Add --skip-dups option

Maks Snegov 2019-10-22 14:39:36 +03:00
parent bdceede4f2
commit 3198361266


@@ -94,16 +94,19 @@ def embed_scripts(page, script_urls, base_url=None):
     return page


-def url_duplicate(url):
+def is_downloaded(url: str) -> bool:
     """Check if url was already downloaded"""
     for htmlfile in os.listdir(path='.'):
         if not htmlfile.endswith('.html'):
             continue
         with open(htmlfile) as h:
             h_url = h.readline()
         if url in URLDUP.findall(h_url):
-            raise UrlDuplicateError(
-                'URL is already saved in file "%s"' % htmlfile)
+            print("URL is already saved in file '%s'" % htmlfile)
+            return True
+    return False


 def write_file(page, title, comment=None):
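
The renamed helper reports a duplicate with a boolean instead of raising UrlDuplicateError. It still relies on two things outside this hunk: every saved page's first line embeds its source URL, and URLDUP is a module-level regex that extracts URLs from that line. A minimal, self-contained sketch of that convention, with a stand-in pattern and a throwaway file, since URLDUP's real definition and write_file's exact header format are not part of this diff:

import os
import re
import tempfile

# Stand-in for the script's module-level URLDUP regex; the real pattern may differ.
URLDUP = re.compile(r'https?://[^\s<>"\']+')


def is_downloaded(url: str) -> bool:
    """Check if url was already downloaded"""
    for htmlfile in os.listdir(path='.'):
        if not htmlfile.endswith('.html'):
            continue
        with open(htmlfile) as h:
            h_url = h.readline()  # first line of a saved page records its URL
        if url in URLDUP.findall(h_url):
            print("URL is already saved in file '%s'" % htmlfile)
            return True
    return False


if __name__ == '__main__':
    old_cwd = os.getcwd()
    with tempfile.TemporaryDirectory() as tmp:
        os.chdir(tmp)
        try:
            # Simulate a previously saved page whose first line holds its source URL
            # (hypothetical file name and URL, for demonstration only).
            with open('example.html', 'w') as f:
                f.write('<!-- saved from https://example.com/page -->\n<html></html>\n')
            print(is_downloaded('https://example.com/page'))   # True
            print(is_downloaded('https://example.com/other'))  # False
        finally:
            os.chdir(old_cwd)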
@@ -135,13 +138,12 @@ def complete_url(url, base_url=None):
     return url


-def process_url(url):
+def process_url(url: str, dup_check: bool = False):
     """Save single URL to a file"""
+    url = url.strip()
     print('Processing URL: %s' % url)
-    try:
-        url_duplicate(url)
-    except UrlDuplicateError as e:
-        print(e)
+    if dup_check and is_downloaded(url):
         return

     try:
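
process_url now takes an opt-in dup_check flag and strips the URL itself, replacing the old try/except around url_duplicate with a plain early return. A small sketch of the new call pattern, with stubs standing in for is_downloaded and the real download/save logic, and made-up URLs for illustration:

def is_downloaded(url: str) -> bool:
    # Stub: pretend this one URL is already saved on disk.
    return url == 'https://example.com/saved'


def process_url(url: str, dup_check: bool = False):
    """Save single URL to a file (download step stubbed out)"""
    url = url.strip()
    print('Processing URL: %s' % url)
    if dup_check and is_downloaded(url):
        return
    print('downloading %s' % url)  # stand-in for the real fetch-and-write code


process_url('https://example.com/saved\n')                # downloaded again: check is off by default
process_url('https://example.com/saved', dup_check=True)  # skipped: already saved
process_url('https://example.com/new', dup_check=True)    # downloaded: not saved yet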
@@ -166,7 +168,11 @@ def main():
     )
     parser.add_argument("-i", "--infile",
                         help="File with URLs to download")
+    parser.add_argument("-s", "--skip-dups", action="store_false",
+                        default=True, dest="dup_check",
+                        help="Rewrite already downloaded files")
     parser.add_argument('urls', metavar='URL', type=str, nargs='*',
+                        default=sys.stdin,
                         help='URL of page to download')
     args = parser.parse_args()
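
The new option uses action="store_false" with default=True, so the duplicate check runs by default and passing -s/--skip-dups switches it off, which is what the help text ("Rewrite already downloaded files") describes. A minimal standalone argparse sketch of just that flag, separate from the script itself:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("-s", "--skip-dups", action="store_false",
                    default=True, dest="dup_check",
                    help="Rewrite already downloaded files")

print(parser.parse_args([]).dup_check)      # True  -> duplicates are skipped
print(parser.parse_args(["-s"]).dup_check)  # False -> already downloaded pages are rewritten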
@@ -178,12 +184,12 @@ def main():
         print(err)
         return 1
     for url in fd.readlines():
-        process_url(url.strip())
+        process_url(url, dup_check=args.dup_check)
     fd.close()

     # Process URLs from CLI
     for arg in args.urls:
-        process_url(arg)
+        process_url(arg, dup_check=args.dup_check)


 class UrlDuplicateError(Exception):