add duplicate checking

parent a0fbb414a7
commit c523d025af

nevernote.py: 23 lines changed (+21 −2)
@@ -3,13 +3,16 @@
 import argparse
 import base64
 import html.parser
+import os
 import os.path
+import re
 import sys
 from urllib.parse import urlparse
 from urllib.request import urlopen
 
 
-class InfiniteRedirects(Exception): pass
+class UrlDuplicateError(Exception): pass
+URLDUP = re.compile(r'^<!-- URL: (.*) -->$')
 
 
 class TitleParser(html.parser.HTMLParser):
@@ -66,7 +69,7 @@ def embed_pictures(page, pict_urls, base_url=None):
         try:
             page = page.replace(
                 url, embedded_image(complete_url(url, base_url)))
-        except (ValueError, InfiniteRedirects, ConnectionRefusedError):
+        except (ValueError, ConnectionRefusedError):
             pass
     return page
 
@@ -84,6 +87,17 @@ def embed_css(page, css_urls, base_url=None):
     return page
 
 
+def url_duplicate(url):
+    for htmlfile in os.listdir():
+        if not htmlfile.endswith('.html'):
+            continue
+        with open(htmlfile) as h:
+            h_url = h.readline()
+        if url in URLDUP.findall(h_url):
+            raise UrlDuplicateError(
+                'URL is already saved in file "%s"' % htmlfile)
+
+
 def write_file(page, title, comment=None):
     write_inc = lambda i: '_%d' % i if i > 1 else ''
     inc = 0
@@ -117,6 +131,11 @@ def main():
     args = parser.parse_args()
 
     for url in args.urls:
+        try:
+            url_duplicate(url)
+        except UrlDuplicateError as e:
+            print(e)
+            continue
         page = get_text(url)
         parser = TitleParser(strict=False)
         parser.feed(page)
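
For context, a minimal sketch of the mechanism behind the new check: url_duplicate() assumes, as the URLDUP pattern implies, that every page nevernote.py saves begins with an HTML comment naming its source URL on the first line. The file name and URL below are invented for illustration.

import os
import re
import tempfile

URLDUP = re.compile(r'^<!-- URL: (.*) -->$')

os.chdir(tempfile.mkdtemp())  # scratch directory so no real saved pages are touched

# Simulate a previously saved page (hypothetical name and URL).
with open('saved_page.html', 'w') as f:
    f.write('<!-- URL: https://example.com/article -->\n<html></html>\n')

with open('saved_page.html') as h:
    first_line = h.readline()  # keeps the trailing '\n'

# '$' also matches just before a trailing newline, so the readline()
# output needs no stripping; findall() returns the captured URL.
print(URLDUP.findall(first_line))  # ['https://example.com/article']

This is why main() can simply call url_duplicate(url) before fetching: a match against any existing .html file raises UrlDuplicateError, which the loop prints and then skips that URL.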