diff --git a/ftp_check.py b/ftp_check.py
index 49a8157..aaefa7a 100644
--- a/ftp_check.py
+++ b/ftp_check.py
@@ -4,6 +4,10 @@
import sys
import shutil
import urllib
+import logging
+
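+# log everything: the INFO/DEBUG lines below trace each host and directory as it is crawled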
+logging.basicConfig(level=logging.DEBUG)
tobechecked = sys.argv[1]
totalsize = 0
@@ -14,150 +18,218 @@
if not os.path.exists('archive'):
os.makedirs('archive')
+
def fixurl(itemurl):
+    """Drop a redundant default-port suffix: 'ftp://host:21/path' -> 'ftp://host/path'."""
if re.search(r'^ftp:\/\/[^\/]+:21\/', itemurl):
itemurl = itemurl.replace(':21', '', 1)
return itemurl
+
+def make_initial_dirslist(ftp):
+ startdir = '/'
+ has_dir = re.search(r'^[^\/]+(\/.+)', ftp)
+ if has_dir:
+ startdir = has_dir.group(1)
+ if not startdir.endswith('/'):
+ startdir += '/'
+ logging.debug('startdir = ' + startdir)
+ return [startdir]
+
+
+def check_ftp(ftp_str):
+    # strip off the protocol prefix and any trailing slash
+    ftp = re.search(r'^(?:ftp:\/\/)?(.+?)\/?$', ftp_str).group(1)
+ ftp_basename = re.search(r'^([^\/]+)', ftp).group(1)
+ output_doc = ftp_basename + '.html'
+
+ logging.info('ftp_basename = ' + ftp_basename)
+
+ os.makedirs(ftp_basename)
+ os.chdir(ftp_basename)
+
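+    # three parallel lists: itemftps[i]/itemslist[i]/itemsizes[i] describe one URL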
+ itemftps = []
+ itemslist = []
+ itemsizes = []
+ dirslist = make_initial_dirslist(ftp)
+ donedirs = []
+
+ def add_if_missing(s):
+ if s not in itemslist:
+ itemslist.append(s)
+ itemftps.append(ftp_basename)
+ itemsizes.append(0)
+
+    def process_dir(dir, dir_url):
+        if dir in donedirs:
+            return
+
+        # wget saves the FTP directory listing as an HTML index page
+        os.system('wget --no-glob --timeout=20 --output-document=' + output_doc +
+                  ' "' + dir_url + '"')
+ if os.path.isfile(output_doc):
+ with open(output_doc, 'r') as index:
+ for line in index.read().splitlines():
+                    # wget's HTML listing is expected to contain lines like
+                    #   <a href="ftp://host/path">name</a> ... (1234) ... Directory
+                    # (the href patterns below assume that format)
+                    match = re.search(r'<a href="([^"]+)">', line)
+                    if match:
+                        itemslist.append(match.group(1))
+                        itemftps.append(ftp_basename)
+                    match = re.search(r'<\/a>.*\(([0-9]+)', line)
+                    match2 = re.search(r'<a href="ftp:\/\/[^\/"]+(\/[^"]+)">', line)
+                    if match:
+                        itemsizes.append(int(match.group(1)))
+                    elif match2 and ' Directory ' in line:
+                        dirslist.append(match2.group(1))
+                        itemsizes.append(0)
+                    elif match2:
+                        itemsizes.append(0)
+ donedirs.append(dir)
+ if os.path.isfile(output_doc):
+ os.remove(output_doc)
+ if os.path.isfile('wget-log'):
+ os.remove('wget-log')
+
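+    # One pass suffices: the inner for-loop keeps picking up directories that
+    # are appended to dirslist while it runs, and the while-condition goes
+    # False once any directory has been processed.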
+ while all(dir not in donedirs for dir in dirslist):
+ for dir in dirslist:
+            dir = dir.replace(' ', '%20').replace('&amp;', '&')
+            if re.search(r'&#[0-9]+;', dir):
+                # an unresolved HTML entity means the listing was parsed wrong
+                raise Exception(dir)
+ dir = dir.replace('#', '%23')
+ dir_url = 'ftp://' + ftp_basename + dir
+ logging.info('dir_url = ' + dir_url)
+
+ add_if_missing(dir_url)
+ add_if_missing(dir_url + './')
+ add_if_missing(dir_url + '../')
+
+            # Break directory loops: a path component repeated five times in
+            # a row is taken to be a recursive symlink.
+            for match in re.findall(r'([^\/]+)', dir):
+                if '/' + '/'.join([match] * 5) in dir:
+                    break
+            else:
+                process_dir(dir, dir_url)
+ os.chdir('..')
+ shutil.rmtree(ftp_basename)
+
+ make_output(zip(itemftps, itemslist, itemsizes))
+
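+    # Probe a name that should not exist: the expected failure captures this
+    # server's "not found" error strings for a directory and for a file.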
+ try:
+ urllib.urlopen('ftp://' + ftp_basename + '/NONEXISTINGFILEdgdjahxnedadbacxjbc/')
+ except Exception as error:
+ dir_not_found = str(error).replace('[Errno ftp error] ', '')
+ print(dir_not_found)
+
+ try:
+ urllib.urlopen('ftp://' + ftp_basename + '/NONEXISTINGFILEdgdjahxnedadbacxjbc')
+ except Exception as error:
+ file_not_found = str(error).replace('[Errno ftp error] ', '')
+ print(file_not_found)
+
+ if os.path.isfile('items/' + ftp_basename + '_dir_not_found'):
+ os.remove('items/' + ftp_basename + '_dir_not_found')
+ if os.path.isfile('items/' + ftp_basename + '_file_not_found'):
+ os.remove('items/' + ftp_basename + '_file_not_found')
+
+ with open('items/' + ftp_basename + '_dir_not_found', 'w') as file:
+ file.write(dir_not_found)
+ with open('items/' + ftp_basename + '_file_not_found', 'w') as file:
+ file.write(file_not_found)
+
+    if tobechecked != 'to_be_rechecked':
+        # check before opening: 'a' mode creates the file, so the test must
+        # run first to know whether a separating newline is needed
+        needs_newline = os.path.isfile('to_be_rechecked')
+        with open('to_be_rechecked', 'a') as file:
+            if needs_newline:
+                file.write('\n' + ftp)
+            else:
+                file.write(ftp)
+
+
+def make_output(totalitems):
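+    """Split the collected (host, url, size) tuples into item files and update the per-host archive."""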
+ global totalsize
+ itemsize = 0
+ itemlinks = 0
+ archive_file = 'archive/' + totalitems[0][0]
+ archive_data_file = archive_file + '-data'
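+    # archive/<host> holds "size, 'url'" lines from previous runs;
+    # archive/<host>-data stores the number of the last completed item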
+
+ if os.path.isfile(archive_file):
+ with open(archive_file) as file:
+ archivelist = [list(ast.literal_eval(line)) for line in file]
+ else:
+ archivelist = []
+
+ if os.path.isfile(archive_data_file):
+ with open(archive_data_file, 'r') as file:
+ itemnum = int(file.read()) + 1
+ else:
+ itemnum = 0
+
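+    # Backfill parent directories for every URL; appending during iteration is
+    # deliberate, since the for-loop also visits the newly added entries.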
+    for item in totalitems:
+        match = re.search(r'^(ftp:\/\/.+\/)[^\/]+\/', item[1])
+        if match and (item[0], match.group(1), 0) not in totalitems:
+            parent = match.group(1)
+            totalitems.append((item[0], parent, 0))
+            totalitems.append((item[0], parent + './', 0))
+            totalitems.append((item[0], parent + '../', 0))
+
+ newitems = []
+ for item in totalitems:
+ itemurl = fixurl(item[1])
+ if '&' in itemurl or [item[2], itemurl] not in archivelist:
+ newitems.append(item)
+
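+    # Write out item files, closing one whenever it holds more than
+    # maxitemsize bytes worth of listed content or the last URL is reached.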
+ for item in newitems:
+ itemdir = re.search(r'^(ftp:\/\/.+\/)', item[1]).group(1)
+ while True:
+ if (item[0], itemdir, 0) not in newitems:
+ newitems.append((item[0], itemdir, 0))
+ if re.search(r'^ftp:\/\/[^\/]+\/$', itemdir):
+ break
+ itemdir = re.search(r'^(ftp:\/\/.+\/)[^\/]+\/', itemdir).group(1)
+ itemurl = fixurl(item[1])
+ with open('items/' + item[0] + '_' + str(itemnum), 'a') as file:
+ file.write(itemurl + '\n')
+ itemsize += item[2]
+ totalsize += item[2]
+ itemlinks += 1
+            if itemsize > maxitemsize or item == newitems[-1]:
+ file.write('ITEM_NAME: ' + item[0] + '_' + str(itemnum) + '\n')
+ file.write('ITEM_TOTAL_SIZE: ' + str(itemsize) + '\n')
+ file.write('ITEM_TOTAL_LINKS: ' + str(itemlinks) + '\n')
+ itemnum += 1
+ itemsize = 0
+ itemlinks = 0
+        if [item[2], itemurl] not in archivelist:
+ quote = '"' if "'" in itemurl else "'"
+ with open('archive/' + item[0], 'a') as file:
+ file.write(str(item[2]) + ", " + quote + itemurl + quote + "\n")
+ with open(archive_data_file, 'w') as file:
+ if os.path.isfile('items/' + item[0] + '_' + str(itemnum-1)):
+ file.write(str(itemnum-1))
+
+
with open(tobechecked, 'r') as file:
ftps = file.read().splitlines()
for ftp in ftps:
- ftp = re.search(r'^(?:ftp:\/\/)?(.+)\/?$', ftp).group(1)
- os.makedirs(re.search(r'^([^\/]+)', ftp).group(1))
- os.chdir(re.search(r'^([^\/]+)', ftp).group(1))
- itemftps = []
- itemslist = []
- itemsizes = []
- startdir = '/'
- if re.search(r'^[^\/]+\/.+', ftp):
- startdir = re.search(r'^[^\/]+(\/.+)', ftp).group(1)
- if not startdir.endswith('/'):
- startdir += '/'
- dirslist = [startdir]
- donedirs = []
- while all(dir not in donedirs for dir in dirslist):
- for dir in dirslist:
-            dir = dir.replace(' ', '%20').replace('&amp;', '&')
-            if re.search(r'&#[0-9]+;', dir):
- raise Exception(dir)
- dir = dir.replace('#', '%23')
- if not 'ftp://' + re.search(r'^([^\/]+)', ftp).group(1) + dir in itemslist:
- itemslist.append('ftp://' + re.search(r'^([^\/]+)', ftp).group(1) + dir)
- itemftps.append(re.search(r'^([^\/]+)', ftp).group(1))
- itemsizes.append(0)
- if not 'ftp://' + re.search(r'^([^\/]+)', ftp).group(1) + dir + './' in itemslist:
- itemslist.append('ftp://' + re.search(r'^([^\/]+)', ftp).group(1) + dir + './')
- itemftps.append(re.search(r'^([^\/]+)', ftp).group(1))
- itemsizes.append(0)
- if not 'ftp://' + re.search(r'^([^\/]+)', ftp).group(1) + dir + '../' in itemslist:
- itemslist.append('ftp://' + re.search(r'^([^\/]+)', ftp).group(1) + dir + '../')
- itemftps.append(re.search(r'^([^\/]+)', ftp).group(1))
- itemsizes.append(0)
- for match in re.findall(r'([^\/]+)', dir):
- if '/' + match + '/' + match + '/' + match + '/' + match + '/' + match in dir:
- break
- else:
- if not dir in donedirs:
- os.system('wget --no-glob --timeout=20 --output-document=' + re.search(r'^([^\/]+)', ftp).group(1) + '.html "ftp://' + re.search(r'^([^\/]+)', ftp).group(1) + dir + '"')
- if os.path.isfile(re.search(r'^([^\/]+)', ftp).group(1) + '.html'):
- with open(re.search(r'^([^\/]+)', ftp).group(1) + '.html', 'r') as index:
- for line in index.read().splitlines():
-                                if re.search(r'<a href="([^"]+)">', line):
-                                    itemslist.append(re.search(r'<a href="([^"]+)">', line).group(1))
-                                    itemftps.append(re.search(r'^([^\/]+)', ftp).group(1))
-                                    if re.search(r'<\/a>.*\(', line):
-                                        itemsizes.append(int(re.search(r'<\/a>.*\(([0-9]+)', line).group(1)))
-                                    elif re.search(r'<a href="ftp:\/\/[^\/"]+(\/[^"]+)">', line) and ' Directory ' in line:
-                                        dirslist.append(re.search(r'<a href="ftp:\/\/[^\/"]+(\/[^"]+)">', line).group(1))
-                                        itemsizes.append(0)
-                                    elif re.search(r'<a href="([^"]+)">', line):
-                                        itemsizes.append(0)
- donedirs.append(dir)
- if os.path.isfile(re.search(r'^([^\/]+)', ftp).group(1) + '.html'):
- os.remove(re.search(r'^([^\/]+)', ftp).group(1) + '.html')
- if os.path.isfile('wget-log'):
- os.remove('wget-log')
- os.chdir('..')
- shutil.rmtree(re.search(r'^([^\/]+)', ftp).group(1))
- totalitems = zip(itemftps, itemslist, itemsizes)
- archivelist = []
- newitems = []
- itemsize = 0
- itemnum = 0
- itemlinks = 0
- if os.path.isfile('archive/' + totalitems[0][0]):
- with open('archive/' + totalitems[0][0]) as file:
- archivelist = [list(ast.literal_eval(line)) for line in file]
- if os.path.isfile('archive/' + totalitems[0][0] + '-data'):
- with open('archive/' + totalitems[0][0] + '-data', 'r') as file:
- itemnum = int(file.read()) + 1
- for item in totalitems:
- if re.search(r'^(ftp:\/\/.+\/)[^\/]+\/', item[1]):
- if not (item[0], re.search(r'^(ftp:\/\/.+\/)[^\/]+\/', item[1]).group(1), 0) in totalitems:
- totalitems.append((item[0], re.search(r'^(.+\/)[^\/]+\/', item[1]).group(1), 0))
- totalitems.append((item[0], re.search(r'^(.+\/)[^\/]+\/', item[1]).group(1) + './', 0))
- totalitems.append((item[0], re.search(r'^(.+\/)[^\/]+\/', item[1]).group(1) + '../', 0))
- for item in totalitems:
- itemurl = fixurl(item[1])
- if '&' in itemurl or not [item[2], itemurl] in archivelist:
- newitems.append(item)
- for item in newitems:
- itemdir = re.search(r'^(ftp:\/\/.+\/)', item[1]).group(1)
- while True:
- if not (item[0], itemdir, 0) in newitems:
- newitems.append((item[0], itemdir, 0))
- if re.search(r'^ftp:\/\/[^\/]+\/$', itemdir):
- break
- itemdir = re.search(r'^(ftp:\/\/.+\/)[^\/]+\/', itemdir).group(1)
- itemurl = fixurl(item[1])
- with open('items/' + item[0] + '_' + str(itemnum), 'a') as file:
- file.write(itemurl + '\n')
- itemsize += item[2]
- totalsize += item[2]
- itemlinks += 1
- if itemsize > maxitemsize or newitems[len(newitems)-1] == item:
- file.write('ITEM_NAME: ' + item[0] + '_' + str(itemnum) + '\n')
- file.write('ITEM_TOTAL_SIZE: ' + str(itemsize) + '\n')
- file.write('ITEM_TOTAL_LINKS: ' + str(itemlinks) + '\n')
- itemnum += 1
- itemsize = 0
- itemlinks = 0
- if not [item[2], itemurl] in archivelist:
- with open('archive/' + item[0], 'a') as file:
- if "'" in itemurl:
- file.write(str(item[2]) + ", \"" + itemurl + "\"\n")
- else:
- file.write(str(item[2]) + ', \'' + itemurl + '\'\n')
- with open('archive/' + totalitems[0][0] + '-data', 'w') as file:
- if os.path.isfile('items/' + item[0] + '_' + str(itemnum-1)):
- file.write(str(itemnum-1))
- try:
- urllib.urlopen('ftp://' + re.search(r'^([^\/]+)', ftp).group(1) + '/NONEXISTINGFILEdgdjahxnedadbacxjbc/')
- except Exception as error:
- dir_not_found = str(error).replace('[Errno ftp error] ', '')
- print(dir_not_found)
-
- try:
- urllib.urlopen('ftp://' + re.search(r'^([^\/]+)', ftp).group(1) + '/NONEXISTINGFILEdgdjahxnedadbacxjbc')
- except Exception as error:
- file_not_found = str(error).replace('[Errno ftp error] ', '')
- print(file_not_found)
-
- if os.path.isfile('items/' + re.search(r'^([^\/]+)', ftp).group(1) + '_dir_not_found'):
- os.remove('items/' + re.search(r'^([^\/]+)', ftp).group(1) + '_dir_not_found')
- if os.path.isfile('items/' + re.search(r'^([^\/]+)', ftp).group(1) + '_file_not_found'):
- os.remove('items/' + re.search(r'^([^\/]+)', ftp).group(1) + '_file_not_found')
-
- with open('items/' + re.search(r'^([^\/]+)', ftp).group(1) + '_dir_not_found', 'w') as file:
- file.write(dir_not_found)
- with open('items/' + re.search(r'^([^\/]+)', ftp).group(1) + '_file_not_found', 'w') as file:
- file.write(file_not_found)
-
- if not tobechecked == 'to_be_rechecked':
- with open('to_be_rechecked', 'a') as file:
- if os.path.isfile('to_be_checked'):
- file.write('\n' + ftp)
- else:
- file.write(ftp)
+ check_ftp(ftp)
print(totalsize)