diff --git a/ftp_check.py b/ftp_check.py
index 49a8157..aaefa7a 100644
--- a/ftp_check.py
+++ b/ftp_check.py
@@ -4,6 +4,9 @@
 import sys
 import shutil
 import urllib
+import logging
+
+logging.basicConfig(level=logging.DEBUG)
 
 tobechecked = sys.argv[1]
 totalsize = 0
@@ -14,150 +17,196 @@
 if not os.path.exists('archive'):
     os.makedirs('archive')
 
+
 def fixurl(itemurl):
+    """Remove an explicit ':21' default-port suffix from an FTP URL."""
     if re.search(r'^ftp:\/\/[^\/]+:21\/', itemurl):
         itemurl = itemurl.replace(':21', '', 1)
     return itemurl
+
+
+def make_initial_dirslist(ftp):
+    """Return the starting directory list: the path part of ftp, or '/'."""
+    startdir = '/'
+    has_dir = re.search(r'^[^\/]+(\/.+)', ftp)
+    if has_dir:
+        startdir = has_dir.group(1)
+    if not startdir.endswith('/'):
+        startdir += '/'
+    logging.debug('startdir = ' + startdir)
+    return [startdir]
+
+
+def check_ftp(ftp_str):
+    # strip off the protocol and any trailing slash
+    ftp = re.search(r'^(?:ftp:\/\/)?(.+)\/?$', ftp_str).group(1)
+    ftp_basename = re.search(r'^([^\/]+)', ftp).group(1)
+    output_doc = ftp_basename + '.html'
+
+    logging.info('ftp_basename = ' + ftp_basename)
+
+    os.makedirs(ftp_basename)
+    os.chdir(ftp_basename)
+
+    itemftps = []
+    itemslist = []
+    itemsizes = []
+    dirslist = make_initial_dirslist(ftp)
+    donedirs = []
+
+    def add_if_missing(s):
+        if s not in itemslist:
+            itemslist.append(s)
+            itemftps.append(ftp_basename)
+            itemsizes.append(0)
+
+    def process_dir(dir, dir_url):
+        if dir in donedirs:
+            return
+
+        os.system('wget --no-glob --timeout=20 --output-document=' + output_doc +
+                  ' "' + dir_url + '"')
+        if os.path.isfile(output_doc):
+            with open(output_doc, 'r') as index:
+                for line in index.read().splitlines():
+                    # links in wget's HTML listing, e.g. <a href="ftp://host/path">
+                    match = re.search(r'<a href="(ftp:\/\/[^"]+)">', line)
+                    if match:
+                        itemslist.append(match.group(1))
+                        itemftps.append(ftp_basename)
+                    match = re.search(r'<\/a>.*\(([0-9]+)', line)
+                    match2 = re.search(r'<a href="ftp:\/\/[^\/"]+(\/[^"]*)">', line)
+                    if match:
+                        itemsizes.append(int(match.group(1)))
+                    elif match2 and ' Directory ' in line:
+                        dirslist.append(match2.group(1))
+                        itemsizes.append(0)
+                    elif match2:
+                        itemsizes.append(0)
+        donedirs.append(dir)
+        if os.path.isfile(output_doc):
+            os.remove(output_doc)
+        if os.path.isfile('wget-log'):
+            os.remove('wget-log')
+
+    while all(dir not in donedirs for dir in dirslist):
+        for dir in dirslist:
+            dir = dir.replace(' ', '%20').replace('&amp;', '&')
+            if re.search(r'&#[0-9]+;', dir):
+                raise Exception(dir)
+            dir = dir.replace('#', '%23')
+            dir_url = 'ftp://' + ftp_basename + dir
+            logging.info('dir_url = ' + dir_url)
+
+            add_if_missing(dir_url)
+            add_if_missing(dir_url + './')
+            add_if_missing(dir_url + '../')
+
+            # break directory loops: skip paths where a component repeats
+            # five times in a row
+            for match in re.findall(r'([^\/]+)', dir):
+                if '/' + match + '/' + match + '/' + match + '/' + match + '/' + match in dir:
+                    break
+            else:
+                process_dir(dir, dir_url)
+    os.chdir('..')
+    shutil.rmtree(ftp_basename)
+
+    make_output(zip(itemftps, itemslist, itemsizes))
+
+    # probe a nonexistent path; the FTP error text is recorded as this
+    # server's "not found" marker
+    try:
+        urllib.urlopen('ftp://' + ftp_basename + '/NONEXISTINGFILEdgdjahxnedadbacxjbc/')
+    except Exception as error:
+        dir_not_found = str(error).replace('[Errno ftp error] ', '')
+        print(dir_not_found)
+
+    try:
+        urllib.urlopen('ftp://' + ftp_basename + '/NONEXISTINGFILEdgdjahxnedadbacxjbc')
+    except Exception as error:
+        file_not_found = str(error).replace('[Errno ftp error] ', '')
+        print(file_not_found)
+
+    if os.path.isfile('items/' + ftp_basename + '_dir_not_found'):
+        os.remove('items/' + ftp_basename + '_dir_not_found')
+    if os.path.isfile('items/' + ftp_basename + '_file_not_found'):
+        os.remove('items/' + ftp_basename + '_file_not_found')
+
+    with open('items/' + ftp_basename + '_dir_not_found', 'w') as file:
+        file.write(dir_not_found)
+    with open('items/' + ftp_basename + '_file_not_found', 'w') as file:
+        file.write(file_not_found)
+
+    if not tobechecked == 'to_be_rechecked':
+        with open('to_be_rechecked', 'a') as file:
+            if os.path.isfile('to_be_checked'):
+                file.write('\n' + ftp)
+            else:
+                file.write(ftp)
+
+
+def make_output(totalitems):
+    global totalsize
+    itemsize = 0
+    itemlinks = 0
+    archive_file = 'archive/' + totalitems[0][0]
+    archive_data_file = archive_file + '-data'
+
+    if os.path.isfile(archive_file):
+        with open(archive_file) as file:
+            archivelist = [list(ast.literal_eval(line)) for line in file]
+    else:
+        archivelist = []
+
+    if os.path.isfile(archive_data_file):
+        with open(archive_data_file, 'r') as file:
+            itemnum = int(file.read()) + 1
+    else:
+        itemnum = 0
+
+    # make sure every parent directory of an item is itself listed
+    for item in totalitems:
+        match = re.search(r'^(ftp:\/\/.+\/)[^\/]+\/', item[1])
+        if match and (item[0], match.group(1), 0) not in totalitems:
+            parent = match.group(1)
+            totalitems.append((item[0], parent, 0))
+            totalitems.append((item[0], parent + './', 0))
+            totalitems.append((item[0], parent + '../', 0))
+
+    newitems = []
+    for item in totalitems:
+        itemurl = fixurl(item[1])
+        if '&' in itemurl or [item[2], itemurl] not in archivelist:
+            newitems.append(item)
+
+    for item in newitems:
+        itemdir = re.search(r'^(ftp:\/\/.+\/)', item[1]).group(1)
+        while True:
+            if (item[0], itemdir, 0) not in newitems:
+                newitems.append((item[0], itemdir, 0))
+            if re.search(r'^ftp:\/\/[^\/]+\/$', itemdir):
+                break
+            itemdir = re.search(r'^(ftp:\/\/.+\/)[^\/]+\/', itemdir).group(1)
+        itemurl = fixurl(item[1])
+        with open('items/' + item[0] + '_' + str(itemnum), 'a') as file:
+            file.write(itemurl + '\n')
+            itemsize += item[2]
+            totalsize += item[2]
+            itemlinks += 1
+            if itemsize > maxitemsize or newitems[len(newitems)-1] == item:
+                file.write('ITEM_NAME: ' + item[0] + '_' + str(itemnum) + '\n')
+                file.write('ITEM_TOTAL_SIZE: ' + str(itemsize) + '\n')
+                file.write('ITEM_TOTAL_LINKS: ' + str(itemlinks) + '\n')
+                itemnum += 1
+                itemsize = 0
+                itemlinks = 0
+        if not [item[2], itemurl] in archivelist:
+            quote = '"' if "'" in itemurl else "'"
+            with open('archive/' + item[0], 'a') as file:
+                file.write(str(item[2]) + ", " + quote + itemurl + quote + "\n")
+    with open(archive_data_file, 'w') as file:
+        if os.path.isfile('items/' + item[0] + '_' + str(itemnum-1)):
+            file.write(str(itemnum-1))
+
+
 with open(tobechecked, 'r') as file:
     ftps = file.read().splitlines()
 
 for ftp in ftps:
-    ftp = re.search(r'^(?:ftp:\/\/)?(.+)\/?$', ftp).group(1)
-    os.makedirs(re.search(r'^([^\/]+)', ftp).group(1))
-    os.chdir(re.search(r'^([^\/]+)', ftp).group(1))
-    itemftps = []
-    itemslist = []
-    itemsizes = []
-    startdir = '/'
-    if re.search(r'^[^\/]+\/.+', ftp):
-        startdir = re.search(r'^[^\/]+(\/.+)', ftp).group(1)
-    if not startdir.endswith('/'):
-        startdir += '/'
-    dirslist = [startdir]
-    donedirs = []
-    while all(dir not in donedirs for dir in dirslist):
-        for dir in dirslist:
-            dir = dir.replace(' ', '%20').replace('&amp;', '&')
-            if re.search(r'&#[0-9]+;', dir):
-                raise Exception(dir)
-            dir = dir.replace('#', '%23')
-            if not 'ftp://' + re.search(r'^([^\/]+)', ftp).group(1) + dir in itemslist:
-                itemslist.append('ftp://' + re.search(r'^([^\/]+)', ftp).group(1) + dir)
-                itemftps.append(re.search(r'^([^\/]+)', ftp).group(1))
-                itemsizes.append(0)
-            if not 'ftp://' + re.search(r'^([^\/]+)', ftp).group(1) + dir + './' in itemslist:
-                itemslist.append('ftp://' + re.search(r'^([^\/]+)', ftp).group(1) + dir + './')
-                itemftps.append(re.search(r'^([^\/]+)', ftp).group(1))
-                itemsizes.append(0)
-            if not 'ftp://' + re.search(r'^([^\/]+)', ftp).group(1) + dir + '../' in itemslist:
-                itemslist.append('ftp://' + re.search(r'^([^\/]+)', ftp).group(1) + dir + '../')
-                itemftps.append(re.search(r'^([^\/]+)', ftp).group(1))
-                itemsizes.append(0)
-            for match in re.findall(r'([^\/]+)', dir):
-                if '/' + match + '/' + match + '/' + match + '/' + match + '/' + match in dir:
-                    break
-            else:
-                if not dir in donedirs:
-                    os.system('wget --no-glob --timeout=20 --output-document=' + re.search(r'^([^\/]+)', ftp).group(1) + '.html "ftp://' + re.search(r'^([^\/]+)', ftp).group(1) + dir + '"')
-                    if os.path.isfile(re.search(r'^([^\/]+)', ftp).group(1) + '.html'):
-                        with open(re.search(r'^([^\/]+)', ftp).group(1) + '.html', 'r') as index:
-                            for line in index.read().splitlines():
-                                if re.search(r'<a href="(ftp:\/\/[^"]+)">', line):
-                                    itemslist.append(re.search(r'<a href="(ftp:\/\/[^"]+)">', line).group(1))
-                                    itemftps.append(re.search(r'^([^\/]+)', ftp).group(1))
-                                    if re.search(r'<\/a>.*\(', line):
-                                        itemsizes.append(int(re.search(r'<\/a>.*\(([0-9]+)', line).group(1)))
-                                    elif re.search(r'<a href="ftp:\/\/[^\/"]+(\/[^"]*)">', line) and ' Directory ' in line:
-                                        dirslist.append(re.search(r'<a href="ftp:\/\/[^\/"]+(\/[^"]*)">', line).group(1))
-                                        itemsizes.append(0)
-                                    elif re.search(r'<a href=', line):
-                                        itemsizes.append(0)
-                    donedirs.append(dir)
-                    if os.path.isfile(re.search(r'^([^\/]+)', ftp).group(1) + '.html'):
-                        os.remove(re.search(r'^([^\/]+)', ftp).group(1) + '.html')
-                    if os.path.isfile('wget-log'):
-                        os.remove('wget-log')
-    os.chdir('..')
-    shutil.rmtree(re.search(r'^([^\/]+)', ftp).group(1))
-    totalitems = zip(itemftps, itemslist, itemsizes)
-    archivelist = []
-    newitems = []
-    itemsize = 0
-    itemnum = 0
-    itemlinks = 0
-    if os.path.isfile('archive/' + totalitems[0][0]):
-        with open('archive/' + totalitems[0][0]) as file:
-            archivelist = [list(ast.literal_eval(line)) for line in file]
-    if os.path.isfile('archive/' + totalitems[0][0] + '-data'):
-        with open('archive/' + totalitems[0][0] + '-data', 'r') as file:
-            itemnum = int(file.read()) + 1
-    for item in totalitems:
-        if re.search(r'^(ftp:\/\/.+\/)[^\/]+\/', item[1]):
-            if not (item[0], re.search(r'^(ftp:\/\/.+\/)[^\/]+\/', item[1]).group(1), 0) in totalitems:
-                totalitems.append((item[0], re.search(r'^(.+\/)[^\/]+\/', item[1]).group(1), 0))
-                totalitems.append((item[0], re.search(r'^(.+\/)[^\/]+\/', item[1]).group(1) + './', 0))
-                totalitems.append((item[0], re.search(r'^(.+\/)[^\/]+\/', item[1]).group(1) + '../', 0))
-    for item in totalitems:
-        itemurl = fixurl(item[1])
-        if '&' in itemurl or not [item[2], itemurl] in archivelist:
-            newitems.append(item)
-    for item in newitems:
-        itemdir = re.search(r'^(ftp:\/\/.+\/)', item[1]).group(1)
-        while True:
-            if not (item[0], itemdir, 0) in newitems:
-                newitems.append((item[0], itemdir, 0))
-            if re.search(r'^ftp:\/\/[^\/]+\/$', itemdir):
-                break
-            itemdir = re.search(r'^(ftp:\/\/.+\/)[^\/]+\/', itemdir).group(1)
-        itemurl = fixurl(item[1])
-        with open('items/' + item[0] + '_' + str(itemnum), 'a') as file:
-            file.write(itemurl + '\n')
-            itemsize += item[2]
-            totalsize += item[2]
-            itemlinks += 1
-            if itemsize > maxitemsize or newitems[len(newitems)-1] == item:
-                file.write('ITEM_NAME: ' + item[0] + '_' + str(itemnum) + '\n')
-                file.write('ITEM_TOTAL_SIZE: ' + str(itemsize) + '\n')
-                file.write('ITEM_TOTAL_LINKS: ' + str(itemlinks) + '\n')
-                itemnum += 1
-                itemsize = 0
-                itemlinks = 0
-        if not [item[2], itemurl] in archivelist:
-            with open('archive/' + item[0], 'a') as file:
-                if "'" in itemurl:
-                    file.write(str(item[2]) + ", \"" + itemurl + "\"\n")
-                else:
-                    file.write(str(item[2]) + ', \'' + itemurl + '\'\n')
-    with open('archive/' + totalitems[0][0] + '-data', 'w') as file:
-        if os.path.isfile('items/' + item[0] + '_' + str(itemnum-1)):
-            file.write(str(itemnum-1))
-    try:
-        urllib.urlopen('ftp://' + re.search(r'^([^\/]+)', ftp).group(1) + '/NONEXISTINGFILEdgdjahxnedadbacxjbc/')
-    except Exception as error:
-        dir_not_found = str(error).replace('[Errno ftp error] ', '')
-        print(dir_not_found)
-
-    try:
-        urllib.urlopen('ftp://' + re.search(r'^([^\/]+)', ftp).group(1) + '/NONEXISTINGFILEdgdjahxnedadbacxjbc')
-    except Exception as error:
-        file_not_found = str(error).replace('[Errno ftp error] ', '')
-        print(file_not_found)
-
-    if os.path.isfile('items/' + re.search(r'^([^\/]+)', ftp).group(1) + '_dir_not_found'):
-        os.remove('items/' + re.search(r'^([^\/]+)', ftp).group(1) + '_dir_not_found')
-    if os.path.isfile('items/' + re.search(r'^([^\/]+)', ftp).group(1) + '_file_not_found'):
-        os.remove('items/' + re.search(r'^([^\/]+)', ftp).group(1) + '_file_not_found')
-
-    with open('items/' + re.search(r'^([^\/]+)', ftp).group(1) + '_dir_not_found', 'w') as file:
-        file.write(dir_not_found)
-    with open('items/' + re.search(r'^([^\/]+)', ftp).group(1) + '_file_not_found', 'w') as file:
-        file.write(file_not_found)
-
-    if not tobechecked == 'to_be_rechecked':
-        with open('to_be_rechecked', 'a') as file:
-            if os.path.isfile('to_be_checked'):
-                file.write('\n' + ftp)
-            else:
-                file.write(ftp)
+    check_ftp(ftp)
 
 print(totalsize)
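
Reviewer note: process_dir() assumes wget writes an HTML index when fetching an
FTP directory. As a minimal sketch of what the three patterns are expected to
capture, the sample lines below are assumed for illustration only (wget's
listing format varies by version) and are not output captured from the patch:

    import re

    # Hypothetical wget HTML index lines (format assumed, not real output).
    samples = [
        '<a href="ftp://example.org/pub/file.tar.gz">file.tar.gz</a>  2016-01-01  (12345 bytes)',
        '<a href="ftp://example.org/pub/subdir/">subdir</a>  2016-01-01   Directory ',
    ]

    for line in samples:
        link = re.search(r'<a href="(ftp:\/\/[^"]+)">', line)
        size = re.search(r'<\/a>.*\(([0-9]+)', line)
        subdir = re.search(r'<a href="ftp:\/\/[^\/"]+(\/[^"]*)">', line)
        if link:
            print('item url : ' + link.group(1))    # full ftp:// URL -> itemslist
        if size:
            print('size     : ' + size.group(1))    # byte count -> itemsizes
        elif subdir and ' Directory ' in line:
            print('queue dir: ' + subdir.group(1))  # path only -> dirslist

Given urllib.urlopen, this is Python 2; the script is presumably invoked as
"python ftp_check.py to_be_checked", or with "to_be_rechecked" as the argument
to skip re-queueing hosts.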