add duplicate checking

This commit is contained in:
Maks Snegov 2014-07-20 13:06:51 +04:00
parent a0fbb414a7
commit c523d025af

View File

@@ -3,13 +3,16 @@
import argparse import argparse
import base64 import base64
import html.parser import html.parser
import os
import os.path import os.path
import re
import sys import sys
from urllib.parse import urlparse from urllib.parse import urlparse
from urllib.request import urlopen from urllib.request import urlopen
class InfiniteRedirects(Exception): pass class UrlDuplicateError(Exception): pass
URLDUP = re.compile(r'^<!-- URL: (.*) -->$')
class TitleParser(html.parser.HTMLParser): class TitleParser(html.parser.HTMLParser):
@@ -66,7 +69,7 @@ def embed_pictures(page, pict_urls, base_url=None):
try: try:
page = page.replace( page = page.replace(
url, embedded_image(complete_url(url, base_url))) url, embedded_image(complete_url(url, base_url)))
except (ValueError, InfiniteRedirects, ConnectionRefusedError): except (ValueError, ConnectionRefusedError):
pass pass
return page return page
@@ -84,6 +87,17 @@ def embed_css(page, css_urls, base_url=None):
return page return page
def url_duplicate(url):
    """Raise UrlDuplicateError if *url* was already saved to a local file.

    Scans the current working directory for ``*.html`` files whose first
    line is a ``<!-- URL: ... -->`` marker (matched by the module-level
    ``URLDUP`` regex) and compares the recorded URL against *url*.

    :param url: the URL about to be saved.
    :raises UrlDuplicateError: if an existing ``.html`` file already
        records *url* in its first-line marker.
    """
    for htmlfile in os.listdir():
        if not htmlfile.endswith('.html'):
            continue
        try:
            # Only the first line can carry the URL marker, so there is
            # no need to read the whole (possibly large) page.
            with open(htmlfile) as h:
                first_line = h.readline()
        except (OSError, UnicodeDecodeError):
            # A stray directory named *.html, or a binary/non-UTF-8 file,
            # must not abort the duplicate scan for unrelated pages.
            continue
        if url in URLDUP.findall(first_line):
            raise UrlDuplicateError(
                'URL is already saved in file "%s"' % htmlfile)
def write_file(page, title, comment=None): def write_file(page, title, comment=None):
write_inc = lambda i: '_%d' % i if i > 1 else '' write_inc = lambda i: '_%d' % i if i > 1 else ''
inc = 0 inc = 0
@@ -117,6 +131,11 @@ def main():
args = parser.parse_args() args = parser.parse_args()
for url in args.urls: for url in args.urls:
try:
url_duplicate(url)
except UrlDuplicateError as e:
print(e)
continue
page = get_text(url) page = get_text(url)
parser = TitleParser(strict=False) parser = TitleParser(strict=False)
parser.feed(page) parser.feed(page)