From 31983612660648108eafd8a6c106d4600478b904 Mon Sep 17 00:00:00 2001 From: Maks Snegov Date: Tue, 22 Oct 2019 14:39:36 +0300 Subject: [PATCH] Add --skip-dups option --- nevernote.py | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/nevernote.py b/nevernote.py index 2fd8cf6..cbd63d4 100755 --- a/nevernote.py +++ b/nevernote.py @@ -94,16 +94,19 @@ def embed_scripts(page, script_urls, base_url=None): return page -def url_duplicate(url): +def is_downloaded(url: str) -> bool: """Check if url was already downloaded""" for htmlfile in os.listdir(path='.'): if not htmlfile.endswith('.html'): continue + with open(htmlfile) as h: h_url = h.readline() if url in URLDUP.findall(h_url): - raise UrlDuplicateError( - 'URL is already saved in file "%s"' % htmlfile) + print("URL is already saved in file '%s'" % htmlfile) + return True + + return False def write_file(page, title, comment=None): @@ -135,13 +138,12 @@ def complete_url(url, base_url=None): return url -def process_url(url): +def process_url(url: str, dup_check: bool = False): """Save single URL to a file""" + url = url.strip() print('Processing URL: %s' % url) - try: - url_duplicate(url) - except UrlDuplicateError as e: - print(e) + + if dup_check and is_downloaded(url): return try: @@ -166,7 +168,11 @@ def main(): ) parser.add_argument("-i", "--infile", help="File with URLs to download") + parser.add_argument("-s", "--skip-dups", action="store_false", + default=True, dest="dup_check", + help="Rewrite already downloaded files") parser.add_argument('urls', metavar='URL', type=str, nargs='*', + default=sys.stdin, help='URL of page to download') args = parser.parse_args() @@ -178,12 +184,12 @@ def main(): print(err) return 1 for url in fd.readlines(): - process_url(url.strip()) + process_url(url, dup_check=args.dup_check) fd.close() # Process URLs from CLI for arg in args.urls: - process_url(arg) + process_url(arg, dup_check=args.dup_check) class UrlDuplicateError(Exception):