add duplicate checking
parent a0fbb414a7
commit c523d025af
nevernote.py | 23
@@ -3,13 +3,16 @@
 import argparse
 import base64
 import html.parser
 import os
 import os.path
 import re
 import sys
 from urllib.parse import urlparse
 from urllib.request import urlopen
 
 class InfiniteRedirects(Exception): pass
+class UrlDuplicateError(Exception): pass
+
+URLDUP = re.compile(r'^<!-- URL: (.*) -->$')
 
 class TitleParser(html.parser.HTMLParser):
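The new URLDUP pattern recognizes a marker comment of the form
"<!-- URL: ... -->" on the first line of a saved page and captures the
original URL. A minimal sketch of how the capture behaves (the example URL
is made up for illustration):

    import re

    URLDUP = re.compile(r'^<!-- URL: (.*) -->$')
    first_line = '<!-- URL: https://example.com/article -->'
    # findall() returns the captured groups, so a plain membership
    # test against the result is enough to detect a match:
    assert URLDUP.findall(first_line) == ['https://example.com/article']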
@@ -66,7 +69,7 @@ def embed_pictures(page, pict_urls, base_url=None):
         try:
             page = page.replace(
                 url, embedded_image(complete_url(url, base_url)))
-        except (ValueError, InfiniteRedirects, ConnectionRefusedError):
+        except (ValueError, ConnectionRefusedError):
             pass
     return page
 
@@ -84,6 +87,17 @@ def embed_css(page, css_urls, base_url=None):
     return page
 
 
+def url_duplicate(url):
+    for htmlfile in os.listdir():
+        if not htmlfile.endswith('.html'):
+            continue
+        with open(htmlfile) as h:
+            h_url = h.readline()
+            if url in URLDUP.findall(h_url):
+                raise UrlDuplicateError(
+                    'URL is already saved in file "%s"' % htmlfile)
+
+
 def write_file(page, title, comment=None):
     write_inc = lambda i: '_%d' % i if i > 1 else ''
     inc = 0
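url_duplicate() scans every .html file in the working directory, reads only
its first line, and raises as soon as that line's URL marker matches. A
self-contained usage sketch, assuming the patched module is importable as
nevernote (the directory, file name, and URL below are hypothetical):

    import os
    import tempfile

    from nevernote import UrlDuplicateError, url_duplicate

    os.chdir(tempfile.mkdtemp())       # isolate the scan in an empty dir
    with open('note.html', 'w') as f:  # fake a previously saved page
        f.write('<!-- URL: https://example.com/note -->\n<html></html>\n')
    try:
        url_duplicate('https://example.com/note')
    except UrlDuplicateError as e:
        print(e)  # -> URL is already saved in file "note.html"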
@@ -117,6 +131,11 @@ def main():
     args = parser.parse_args()
 
     for url in args.urls:
+        try:
+            url_duplicate(url)
+        except UrlDuplicateError as e:
+            print(e)
+            continue
         page = get_text(url)
         parser = TitleParser(strict=False)
         parser.feed(page)
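In main(), a duplicate URL is reported and skipped with continue, so the
remaining command-line URLs are still fetched instead of the whole run
aborting. Assuming URLs are passed as positional arguments (only args.urls
is visible in this diff), a repeated invocation would look roughly like
this; the saved file name is hypothetical:

    $ python nevernote.py https://example.com/note
    $ python nevernote.py https://example.com/note
    URL is already saved in file "note.html"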