From af948ff6fcfa35016944b5f843b5e61c9e1eefc4 Mon Sep 17 00:00:00 2001 From: Maks Snegov Date: Sun, 1 Jun 2014 21:28:06 +0400 Subject: [PATCH 01/42] move shell script to deprecated dir --- nevernote.sh => deprecated/nevernote.sh | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename nevernote.sh => deprecated/nevernote.sh (100%) diff --git a/nevernote.sh b/deprecated/nevernote.sh similarity index 100% rename from nevernote.sh rename to deprecated/nevernote.sh From 7e431629209787df3d7fe48b203da475fc9e32a3 Mon Sep 17 00:00:00 2001 From: Maks Snegov Date: Sun, 1 Jun 2014 23:20:42 +0400 Subject: [PATCH 02/42] rewrite HTML title parser --- nevernote.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/nevernote.py b/nevernote.py index 0ce97fc..c805ad2 100755 --- a/nevernote.py +++ b/nevernote.py @@ -2,11 +2,17 @@ import argparse import http.client +import html.parser import sys - -from bs4 import BeautifulSoup from urllib.parse import urlparse + +class TitleParser(html.parser.HTMLParser): + def handle_data(self, data): + if self.lasttag == 'title': + self.title = data + + def get_page(url): '''download page and decode it to utf-8''' charset = 'utf-8' @@ -50,13 +56,11 @@ def get_page(url): return page -def get_title(page): - soup = BeautifulSoup(page) - return soup.title.string - - def write_file(page): - fname = get_title(page) + '.html' + parser = TitleParser(strict=False) + parser.feed(page) + + fname = parser.title + '.html' with open(fname, 'w') as a_file: a_file.write(page) From 2f6c8774931c49349846f24f274275095dabe5e6 Mon Sep 17 00:00:00 2001 From: Maks Snegov Date: Sun, 15 Jun 2014 20:16:35 +0400 Subject: [PATCH 03/42] fix: URL with no schema will raise error --- nevernote.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/nevernote.py b/nevernote.py index c805ad2..a240297 100755 --- a/nevernote.py +++ b/nevernote.py @@ -18,6 +18,8 @@ def get_page(url): charset = 'utf-8' up = urlparse(url) + if not up.scheme: + up = urlparse('http://' + url) headers = { "Host": up.netloc, @@ -30,7 +32,7 @@ def get_page(url): elif up.scheme == 'https': conn = http.client.HTTPSConnection(up.netloc) else: - print("ERROR: invalid protocol set in '{0}'".format(url)) + raise NotImplementedError("protocol %s is not implemented" % up.scheme) return False conn.request("GET", up.path, None, headers) From 5b05f3e8d0a58310a5d94d5592bfe7d53dc99a0b Mon Sep 17 00:00:00 2001 From: Maks Snegov Date: Tue, 17 Jun 2014 22:26:12 +0400 Subject: [PATCH 04/42] separate download_content() from get_page() --- nevernote.py | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/nevernote.py b/nevernote.py index a240297..55a5860 100755 --- a/nevernote.py +++ b/nevernote.py @@ -13,10 +13,8 @@ class TitleParser(html.parser.HTMLParser): self.title = data -def get_page(url): +def download_content(url): '''download page and decode it to utf-8''' - charset = 'utf-8' - up = urlparse(url) if not up.scheme: up = urlparse('http://' + url) @@ -33,7 +31,6 @@ def get_page(url): conn = http.client.HTTPSConnection(up.netloc) else: raise NotImplementedError("protocol %s is not implemented" % up.scheme) - return False conn.request("GET", up.path, None, headers) response = conn.getresponse() @@ -43,14 +40,22 @@ def get_page(url): or (response.status == http.client.FOUND): new_url = response.getheader('Location') print('Redirect to ' + new_url) - return get_page(new_url) + return download_content(new_url) + return response + + +def get_page(url): + response = download_content(url) # get page charset from response header - contenttype = response.getheader('Content-Type') - if contenttype: - ct_spl = contenttype.split('; ') - if len(ct_spl) > 1: - charset = ct_spl[1].split('=')[1] + c_type = response.getheader('Content-Type') + if not c_type.startswith('text'): + raise ValueError('incorrect Content-Type for HTML page: %s' % c_type) + + charset = 'iso-8859-1' + ct_spl = c_type.split('; ') + if len(ct_spl) > 1: + charset = ct_spl[1].split('=')[1] page_binary = response.read() page = page_binary.decode(charset) From 2666d7911ac212dd1fa2a53aefcb940bc6d4b034 Mon Sep 17 00:00:00 2001 From: Maks Snegov Date: Tue, 17 Jun 2014 22:28:54 +0400 Subject: [PATCH 05/42] no scheme in url fix --- nevernote.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/nevernote.py b/nevernote.py index 55a5860..626c570 100755 --- a/nevernote.py +++ b/nevernote.py @@ -17,7 +17,7 @@ def download_content(url): '''download page and decode it to utf-8''' up = urlparse(url) if not up.scheme: - up = urlparse('http://' + url) + up = urlparse('//' + url) headers = { "Host": up.netloc, @@ -25,7 +25,7 @@ def download_content(url): "Connection": "keep-alive", } - if up.scheme == 'http': + if not up.scheme or up.scheme == 'http': conn = http.client.HTTPConnection(up.netloc) elif up.scheme == 'https': conn = http.client.HTTPSConnection(up.netloc) @@ -36,8 +36,8 @@ def download_content(url): response = conn.getresponse() # follow redirects - if (response.status == http.client.MOVED_PERMANENTLY) \ - or (response.status == http.client.FOUND): + if ((response.status == http.client.MOVED_PERMANENTLY) + or (response.status == http.client.FOUND)): new_url = response.getheader('Location') print('Redirect to ' + new_url) return download_content(new_url) @@ -73,10 +73,10 @@ def write_file(page): def main(): - parser = argparse.ArgumentParser(description= - 'Nevernote - download pages locally.') - parser.add_argument('urls', metavar='URL', type=str, nargs='+', help= - 'URL of page to download') + parser = argparse.ArgumentParser( + description='Nevernote - download pages locally.') + parser.add_argument('urls', metavar='URL', type=str, nargs='+', + help='URL of page to download') args = parser.parse_args() From ae4a9b986e0c9a93ae74697eb806a6932bf420d8 Mon Sep 17 00:00:00 2001 From: Maks Snegov Date: Tue, 17 Jun 2014 22:31:02 +0400 Subject: [PATCH 06/42] add gzip support --- nevernote.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/nevernote.py b/nevernote.py index 626c570..8baeffa 100755 --- a/nevernote.py +++ b/nevernote.py @@ -5,6 +5,7 @@ import http.client import html.parser import sys from urllib.parse import urlparse +import zlib class TitleParser(html.parser.HTMLParser): @@ -52,12 +53,20 @@ def get_page(url): if not c_type.startswith('text'): raise ValueError('incorrect Content-Type for HTML page: %s' % c_type) + c_encoding = response.getheader('Content-Encoding') + if c_encoding: + if c_encoding == 'gzip': + page_binary = zlib.decompress(response.read(), 16+zlib.MAX_WBITS) + else: + raise NotImplementedError( + 'content encoding %s is not implemented' % c_encoding) + else: + page_binary = response.read() + charset = 'iso-8859-1' ct_spl = c_type.split('; ') if len(ct_spl) > 1: charset = ct_spl[1].split('=')[1] - - page_binary = response.read() page = page_binary.decode(charset) return page From aead01258d9c341ead486c68a1258ff2e4c3ed34 Mon Sep 17 00:00:00 2001 From: Maks Snegov Date: Sat, 21 Jun 2014 09:43:12 +0400 Subject: [PATCH 07/42] remove never used if condition --- nevernote.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nevernote.py b/nevernote.py index 8baeffa..b715eaa 100755 --- a/nevernote.py +++ b/nevernote.py @@ -26,7 +26,7 @@ def download_content(url): "Connection": "keep-alive", } - if not up.scheme or up.scheme == 'http': + if up.scheme == 'http': conn = http.client.HTTPConnection(up.netloc) elif up.scheme == 'https': conn = http.client.HTTPSConnection(up.netloc) @@ -40,7 +40,7 @@ def download_content(url): if ((response.status == http.client.MOVED_PERMANENTLY) or (response.status == http.client.FOUND)): new_url = response.getheader('Location') - print('Redirect to ' + new_url) + print('Redirecting to ' + new_url) return download_content(new_url) return response From ab9a7e34c122698dbdb67a53cb60d3299bf5bd75 Mon Sep 17 00:00:00 2001 From: Maks Snegov Date: Sat, 21 Jun 2014 09:58:47 +0400 Subject: [PATCH 08/42] get title name --- nevernote.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/nevernote.py b/nevernote.py index b715eaa..2287ff2 100755 --- a/nevernote.py +++ b/nevernote.py @@ -9,9 +9,11 @@ import zlib class TitleParser(html.parser.HTMLParser): - def handle_data(self, data): - if self.lasttag == 'title': - self.title = data + def handle_starttag(self, name, attribs): + if name == 'title': + title_start = self.rawdata.index('') + len('<title>') + title_end = self.rawdata.index('', title_start) + self.title = self.rawdata[title_start:title_end] def download_content(url): From e2009e7f089561da1e622931d12bc343f52ab526 Mon Sep 17 00:00:00 2001 From: Maks Snegov Date: Sat, 21 Jun 2014 20:09:15 +0400 Subject: [PATCH 09/42] skip fname duplicates --- nevernote.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/nevernote.py b/nevernote.py index 2287ff2..6dc697b 100755 --- a/nevernote.py +++ b/nevernote.py @@ -78,9 +78,17 @@ def write_file(page): parser = TitleParser(strict=False) parser.feed(page) - fname = parser.title + '.html' - with open(fname, 'w') as a_file: - a_file.write(page) + fname = parser.title.replace('/', '_') + '.html' + inc = 1 + while True: + try: + with open(fname, 'x') as a_file: + print('Saving in file "%s"' % fname) + a_file.write(page) + break + except FileExistsError: + inc += 1 + fname = parser.title.replace('/', '_') + '_%d.html' % inc def main(): From 5837451ed71e6c7bfc49ab1482e4b444b5ea2808 Mon Sep 17 00:00:00 2001 From: Maks Snegov Date: Sat, 21 Jun 2014 20:23:25 +0400 Subject: [PATCH 10/42] add url as comment to saved pages --- nevernote.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/nevernote.py b/nevernote.py index 6dc697b..68fc254 100755 --- a/nevernote.py +++ b/nevernote.py @@ -3,6 +3,7 @@ import argparse import http.client import html.parser +import os.path import sys from urllib.parse import urlparse import zlib @@ -74,21 +75,23 @@ def get_page(url): return page -def write_file(page): +def write_file(page, comment=None): parser = TitleParser(strict=False) parser.feed(page) fname = parser.title.replace('/', '_') + '.html' inc = 1 while True: - try: - with open(fname, 'x') as a_file: - print('Saving in file "%s"' % fname) - a_file.write(page) - break - except FileExistsError: - inc += 1 - fname = parser.title.replace('/', '_') + '_%d.html' % inc + if not os.path.exists(fname): + break + inc += 1 + fname = parser.title.replace('/', '_') + '_%d.html' % inc + + with open(fname, 'x', newline='\n') as a_file: + print('Saving in file "%s"' % fname) + if comment: + a_file.write('' % comment) + a_file.write(page) def main(): @@ -101,7 +104,7 @@ def main(): for url in args.urls: page = get_page(url) - write_file(page) + write_file(page, comment=url) if __name__ == '__main__': From 11de3578656a31fb38c5fc400278b40682c287a5 Mon Sep 17 00:00:00 2001 From: Maks Snegov Date: Sun, 22 Jun 2014 11:45:37 +0400 Subject: [PATCH 11/42] add image embedding --- nevernote.py | 54 +++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 45 insertions(+), 9 deletions(-) diff --git a/nevernote.py b/nevernote.py index 68fc254..28e9905 100755 --- a/nevernote.py +++ b/nevernote.py @@ -1,6 +1,7 @@ #!/usr/bin/python3 import argparse +import base64 import http.client import html.parser import os.path @@ -10,8 +11,16 @@ import zlib class TitleParser(html.parser.HTMLParser): + def __init__(self, *args, **kwargs): + html.parser.HTMLParser.__init__(self, *args, **kwargs) + self.images = set() + def handle_starttag(self, name, attribs): - if name == 'title': + if name == 'img': + for attr, value in attribs: + if attr == 'src': + self.images.add(value) + elif name == 'title': title_start = self.rawdata.index('') + len('<title>') title_end = self.rawdata.index('', title_start) self.title = self.rawdata[title_start:title_end] @@ -75,23 +84,40 @@ def get_page(url): return page -def write_file(page, comment=None): - parser = TitleParser(strict=False) - parser.feed(page) +def embedded_image(url): + '''Download content from URL and return bytes if target is image''' + response = download_content(url) + ctype = response.getheader('Content-Type') + if not ctype or not ctype.startswith('image'): + raise ValueError('incorrect Content-Type for image: %s' % ctype) + b64pict = base64.b64encode(response.read()).decode() + return 'data:%s;base64,%s' % (ctype, b64pict) - fname = parser.title.replace('/', '_') + '.html' + +def embed_pictures(page, pict_urls): + for url in pict_urls: + print('New picture: %s' % url) + try: + page = page.replace(url, embedded_image(url)) + except (ValueError): + pass + return page + + +def write_file(page, title, comment=None): + fname = title.replace('/', '_') + '.html' inc = 1 while True: if not os.path.exists(fname): break inc += 1 - fname = parser.title.replace('/', '_') + '_%d.html' % inc + fname = title.replace('/', '_') + '_%d.html' % inc with open(fname, 'x', newline='\n') as a_file: print('Saving in file "%s"' % fname) + a_file.write(page) if comment: a_file.write('' % comment) - a_file.write(page) def main(): @@ -99,12 +125,22 @@ def main(): description='Nevernote - download pages locally.') parser.add_argument('urls', metavar='URL', type=str, nargs='+', help='URL of page to download') - args = parser.parse_args() for url in args.urls: page = get_page(url) - write_file(page, comment=url) + parser = TitleParser(strict=False) + parser.feed(page) + + for picturl in parser.images: + up = urlparse(picturl) + if not up.netloc: + parser.images.remove(picturl) + picturl = '//' + urlparse(url).netloc + picturl + parser.images.add(picturl) + + full_page = embed_pictures(page, parser.images) + write_file(full_page, parser.title, comment=url) if __name__ == '__main__': From 5b91bef8968cae8be8d8d083b7f2d498c0c8052c Mon Sep 17 00:00:00 2001 From: Maks Snegov Date: Sun, 22 Jun 2014 11:47:21 +0400 Subject: [PATCH 12/42] add infinite redirects blocking --- nevernote.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/nevernote.py b/nevernote.py index 28e9905..87e99c8 100755 --- a/nevernote.py +++ b/nevernote.py @@ -10,6 +10,9 @@ from urllib.parse import urlparse import zlib +class InfiniteRedirects(Exception): pass + + class TitleParser(html.parser.HTMLParser): def __init__(self, *args, **kwargs): html.parser.HTMLParser.__init__(self, *args, **kwargs) @@ -26,8 +29,11 @@ class TitleParser(html.parser.HTMLParser): self.title = self.rawdata[title_start:title_end] -def download_content(url): +def download_content(url, depth=0): '''download page and decode it to utf-8''' + if depth > 10: + raise InfiniteRedirects('too much redirects: %s' % url) + up = urlparse(url) if not up.scheme: up = urlparse('//' + url) @@ -53,7 +59,7 @@ def download_content(url): or (response.status == http.client.FOUND)): new_url = response.getheader('Location') print('Redirecting to ' + new_url) - return download_content(new_url) + return download_content(new_url, depth+1) return response @@ -99,7 +105,7 @@ def embed_pictures(page, pict_urls): print('New picture: %s' % url) try: page = page.replace(url, embedded_image(url)) - except (ValueError): + except (ValueError, InfiniteRedirects): pass return page From ab03e18ce22343d0b6f9eb602742bff6ee1a352d Mon Sep 17 00:00:00 2001 From: Maks Snegov Date: Sun, 22 Jun 2014 11:48:04 +0400 Subject: [PATCH 13/42] fix relative urls --- nevernote.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nevernote.py b/nevernote.py index 87e99c8..1796713 100755 --- a/nevernote.py +++ b/nevernote.py @@ -35,7 +35,7 @@ def download_content(url, depth=0): raise InfiniteRedirects('too much redirects: %s' % url) up = urlparse(url) - if not up.scheme: + if not up.netloc: up = urlparse('//' + url) headers = { @@ -44,7 +44,7 @@ def download_content(url, depth=0): "Connection": "keep-alive", } - if up.scheme == 'http': + if not up.scheme or up.scheme == 'http': conn = http.client.HTTPConnection(up.netloc) elif up.scheme == 'https': conn = http.client.HTTPSConnection(up.netloc) From 36be68d78d72fb9bad074f94b424bd8a74cc336d Mon Sep 17 00:00:00 2001 From: Maks Snegov Date: Sun, 22 Jun 2014 11:59:02 +0400 Subject: [PATCH 14/42] fix title with attributes parsing --- nevernote.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/nevernote.py b/nevernote.py index 1796713..3b387e2 100755 --- a/nevernote.py +++ b/nevernote.py @@ -24,7 +24,8 @@ class TitleParser(html.parser.HTMLParser): if attr == 'src': self.images.add(value) elif name == 'title': - title_start = self.rawdata.index('') + len('<title>') + titletag_start = self.rawdata.index('<title') + title_start = self.rawdata.index('>', titletag_start) + 1 title_end = self.rawdata.index('', title_start) self.title = self.rawdata[title_start:title_end] From ae63ca63182aa57b4236063f5d9bf04c6964c9f7 Mon Sep 17 00:00:00 2001 From: Maks Snegov Date: Sun, 22 Jun 2014 12:16:10 +0400 Subject: [PATCH 15/42] skip connRefusedError pictures --- nevernote.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nevernote.py b/nevernote.py index 3b387e2..2eae6b5 100755 --- a/nevernote.py +++ b/nevernote.py @@ -106,7 +106,7 @@ def embed_pictures(page, pict_urls): print('New picture: %s' % url) try: page = page.replace(url, embedded_image(url)) - except (ValueError, InfiniteRedirects): + except (ValueError, InfiniteRedirects, ConnectionRefusedError): pass return page From 5c87f241d1914b75618f4df4c32562128726c6fe Mon Sep 17 00:00:00 2001 From: Maks Snegov Date: Sun, 22 Jun 2014 12:24:10 +0400 Subject: [PATCH 16/42] clean title from multiple whitespaces --- nevernote.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nevernote.py b/nevernote.py index 2eae6b5..db746bb 100755 --- a/nevernote.py +++ b/nevernote.py @@ -112,7 +112,7 @@ def embed_pictures(page, pict_urls): def write_file(page, title, comment=None): - fname = title.replace('/', '_') + '.html' + fname = ' '.join(title.replace('/', '_').split()) + '.html' inc = 1 while True: if not os.path.exists(fname): From fe69eff79b5867955e2dd9767d842a2038cf4910 Mon Sep 17 00:00:00 2001 From: Maks Snegov Date: Sun, 22 Jun 2014 12:38:05 +0400 Subject: [PATCH 17/42] fix increment postfix in filenames --- nevernote.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/nevernote.py b/nevernote.py index db746bb..3e86ab0 100755 --- a/nevernote.py +++ b/nevernote.py @@ -112,13 +112,13 @@ def embed_pictures(page, pict_urls): def write_file(page, title, comment=None): - fname = ' '.join(title.replace('/', '_').split()) + '.html' - inc = 1 + write_inc = lambda i: '_%d' % i if i > 1 else '' + inc = 0 while True: + inc += 1 + fname = ' '.join(title.replace('/', '_').split()) + write_inc(inc) + '.html' if not os.path.exists(fname): break - inc += 1 - fname = title.replace('/', '_') + '_%d.html' % inc with open(fname, 'x', newline='\n') as a_file: print('Saving in file "%s"' % fname) From 35f755005da5ed1fbefd64316071f24053ac8411 Mon Sep 17 00:00:00 2001 From: Maks Snegov Date: Sun, 22 Jun 2014 13:12:35 +0400 Subject: [PATCH 18/42] fix: do not work with GET arguments --- nevernote.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nevernote.py b/nevernote.py index 3e86ab0..5b778fc 100755 --- a/nevernote.py +++ b/nevernote.py @@ -52,7 +52,7 @@ def download_content(url, depth=0): else: raise NotImplementedError("protocol %s is not implemented" % up.scheme) - conn.request("GET", up.path, None, headers) + conn.request("GET", '?'.join((up.path, up.query)), None, headers) response = conn.getresponse() # follow redirects From a7ef8a8b7b4dd9a2057334101c2f5c37d954c75a Mon Sep 17 00:00:00 2001 From: Maks Snegov Date: Sun, 22 Jun 2014 22:56:43 +0400 Subject: [PATCH 19/42] separate complete_url function --- nevernote.py | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/nevernote.py b/nevernote.py index 5b778fc..ee5ec47 100755 --- a/nevernote.py +++ b/nevernote.py @@ -101,11 +101,12 @@ def embedded_image(url): return 'data:%s;base64,%s' % (ctype, b64pict) -def embed_pictures(page, pict_urls): +def embed_pictures(page, pict_urls, base_url=None): for url in pict_urls: print('New picture: %s' % url) try: - page = page.replace(url, embedded_image(url)) + page = page.replace( + url, embedded_image(complete_url(url, base_url))) except (ValueError, InfiniteRedirects, ConnectionRefusedError): pass return page @@ -127,6 +128,14 @@ def write_file(page, title, comment=None): a_file.write('' % comment) +def complete_url(url, base_url): + if base_url is not None: + up = urlparse(url) + if not up.netloc: + url = '//' + urlparse(base_url).netloc + url + return url + + def main(): parser = argparse.ArgumentParser( description='Nevernote - download pages locally.') @@ -139,15 +148,8 @@ def main(): parser = TitleParser(strict=False) parser.feed(page) - for picturl in parser.images: - up = urlparse(picturl) - if not up.netloc: - parser.images.remove(picturl) - picturl = '//' + urlparse(url).netloc + picturl - parser.images.add(picturl) - - full_page = embed_pictures(page, parser.images) - write_file(full_page, parser.title, comment=url) + page = embed_pictures(page, parser.images, base_url=url) + write_file(page, parser.title, comment=url) if __name__ == '__main__': From 754411b6b76f26acc3c87ef083d13ba867233e1a Mon Sep 17 00:00:00 2001 From: Maks Snegov Date: Sun, 22 Jun 2014 22:57:42 +0400 Subject: [PATCH 20/42] remove unused header from request --- nevernote.py | 1 - 1 file changed, 1 deletion(-) diff --git a/nevernote.py b/nevernote.py index ee5ec47..885420e 100755 --- a/nevernote.py +++ b/nevernote.py @@ -41,7 +41,6 @@ def download_content(url, depth=0): headers = { "Host": up.netloc, - "Content-Type": "text/html; charset=utf-8", "Connection": "keep-alive", } From 594ff719914462c409d259e89c21e7eff5fd818b Mon Sep 17 00:00:00 2001 From: Maks Snegov Date: Sun, 22 Jun 2014 23:51:18 +0400 Subject: [PATCH 21/42] add css embedding --- nevernote.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/nevernote.py b/nevernote.py index 885420e..26be95b 100755 --- a/nevernote.py +++ b/nevernote.py @@ -17,6 +17,7 @@ class TitleParser(html.parser.HTMLParser): def __init__(self, *args, **kwargs): html.parser.HTMLParser.__init__(self, *args, **kwargs) self.images = set() + self.css = set() def handle_starttag(self, name, attribs): if name == 'img': @@ -28,6 +29,10 @@ class TitleParser(html.parser.HTMLParser): title_start = self.rawdata.index('>', titletag_start) + 1 title_end = self.rawdata.index('', title_start) self.title = self.rawdata[title_start:title_end] + elif name == 'link': + attr_dict = dict(attribs) + if attr_dict.get('rel') == 'stylesheet': + self.css.add(attr_dict['href']) def download_content(url, depth=0): @@ -111,6 +116,22 @@ def embed_pictures(page, pict_urls, base_url=None): return page +def embed_css(page, css_urls, base_url=None): + for url in css_urls: + if not url: + continue + print('New CSS: %s' % url) + try: + css_start = page.rindex('<', 0, page.index(url)) + css_end = page.index('>', css_start) + 1 + css = ('' + % get_page(complete_url(url, base_url))) + page = page[:css_start] + css + page[css_end:] + except (InfiniteRedirects, ConnectionRefusedError): + pass + return page + + def write_file(page, title, comment=None): write_inc = lambda i: '_%d' % i if i > 1 else '' inc = 0 @@ -148,6 +169,7 @@ def main(): parser.feed(page) page = embed_pictures(page, parser.images, base_url=url) + page = embed_css(page, parser.css, base_url=url) write_file(page, parser.title, comment=url) From 6a818f4bb46989903d42392d31e3d6f0b9e5bc09 Mon Sep 17 00:00:00 2001 From: Maks Snegov Date: Mon, 23 Jun 2014 00:50:21 +0400 Subject: [PATCH 22/42] fix: error with empty GET urls --- nevernote.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/nevernote.py b/nevernote.py index 26be95b..ecf5aba 100755 --- a/nevernote.py +++ b/nevernote.py @@ -56,7 +56,8 @@ def download_content(url, depth=0): else: raise NotImplementedError("protocol %s is not implemented" % up.scheme) - conn.request("GET", '?'.join((up.path, up.query)), None, headers) + requrl = ('?'.join((up.path, up.query)) if up.query else up.path) or '/' + conn.request("GET", requrl, None, headers) response = conn.getresponse() # follow redirects From eb2c43f438c49159e2da0d88e73e3b84486b3447 Mon Sep 17 00:00:00 2001 From: Maks Snegov Date: Wed, 25 Jun 2014 08:38:43 +0400 Subject: [PATCH 23/42] ignore UTF-8 errors --- nevernote.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nevernote.py b/nevernote.py index ecf5aba..6d9118d 100755 --- a/nevernote.py +++ b/nevernote.py @@ -91,7 +91,7 @@ def get_page(url): ct_spl = c_type.split('; ') if len(ct_spl) > 1: charset = ct_spl[1].split('=')[1] - page = page_binary.decode(charset) + page = page_binary.decode(charset, errors='ignore') return page From 716c61f6f1cadee73dbc09af9967118871a3d862 Mon Sep 17 00:00:00 2001 From: Maks Snegov Date: Sun, 20 Jul 2014 08:09:07 +0400 Subject: [PATCH 24/42] replace http.client with urllib --- nevernote.py | 102 ++++++++++++++------------------------------------- 1 file changed, 27 insertions(+), 75 deletions(-) diff --git a/nevernote.py b/nevernote.py index 6d9118d..b9dcffe 100755 --- a/nevernote.py +++ b/nevernote.py @@ -2,12 +2,11 @@ import argparse import base64 -import http.client import html.parser import os.path import sys from urllib.parse import urlparse -import zlib +from urllib.request import urlopen class InfiniteRedirects(Exception): pass @@ -35,74 +34,29 @@ class TitleParser(html.parser.HTMLParser): self.css.add(attr_dict['href']) -def download_content(url, depth=0): - '''download page and decode it to utf-8''' - if depth > 10: - raise InfiniteRedirects('too much redirects: %s' % url) - - up = urlparse(url) - if not up.netloc: - up = urlparse('//' + url) - - headers = { - "Host": up.netloc, - "Connection": "keep-alive", - } - - if not up.scheme or up.scheme == 'http': - conn = http.client.HTTPConnection(up.netloc) - elif up.scheme == 'https': - conn = http.client.HTTPSConnection(up.netloc) - else: - raise NotImplementedError("protocol %s is not implemented" % up.scheme) - - requrl = ('?'.join((up.path, up.query)) if up.query else up.path) or '/' - conn.request("GET", requrl, None, headers) - response = conn.getresponse() - - # follow redirects - if ((response.status == http.client.MOVED_PERMANENTLY) - or (response.status == http.client.FOUND)): - new_url = response.getheader('Location') - print('Redirecting to ' + new_url) - return download_content(new_url, depth+1) - return response - - -def get_page(url): - response = download_content(url) - - # get page charset from response header - c_type = response.getheader('Content-Type') - if not c_type.startswith('text'): - raise ValueError('incorrect Content-Type for HTML page: %s' % c_type) - - c_encoding = response.getheader('Content-Encoding') - if c_encoding: - if c_encoding == 'gzip': - page_binary = zlib.decompress(response.read(), 16+zlib.MAX_WBITS) - else: - raise NotImplementedError( - 'content encoding %s is not implemented' % c_encoding) - else: - page_binary = response.read() - - charset = 'iso-8859-1' - ct_spl = c_type.split('; ') - if len(ct_spl) > 1: - charset = ct_spl[1].split('=')[1] - page = page_binary.decode(charset, errors='ignore') - +def get_text(url, content='text/html'): + u = urlopen(url) + if u.status != 200: + raise RuntimeError('Incorrect HTTP status for %s' % url) + ctype = u.headers.get('content-type') + if ctype is None: + raise RuntimeError('None content type for %s' % url) + if not ctype.startswith(content): + raise RuntimeError('Incorrect content-type for %s: %s' % (url, ctype)) + encoding = ctype.split(';')[1].split('=')[1].lower() + data = u.read() + page = data.decode(encoding) return page def embedded_image(url): '''Download content from URL and return bytes if target is image''' - response = download_content(url) - ctype = response.getheader('Content-Type') - if not ctype or not ctype.startswith('image'): - raise ValueError('incorrect Content-Type for image: %s' % ctype) - b64pict = base64.b64encode(response.read()).decode() + u = urlopen(url) + if u.getcode() != 200: + raise RuntimeError('Incorrect status for %s' % url) + ctype = u.headers.get('Content-Type') + data = u.read() + b64pict = base64.b64encode(data).decode() return 'data:%s;base64,%s' % (ctype, b64pict) @@ -122,14 +76,11 @@ def embed_css(page, css_urls, base_url=None): if not url: continue print('New CSS: %s' % url) - try: - css_start = page.rindex('<', 0, page.index(url)) - css_end = page.index('>', css_start) + 1 - css = ('' - % get_page(complete_url(url, base_url))) - page = page[:css_start] + css + page[css_end:] - except (InfiniteRedirects, ConnectionRefusedError): - pass + css_start = page.rindex('<', 0, page.index(url)) + css_end = page.index('>', css_start) + 1 + css_tag = ('' + % get_text(complete_url(url, base_url), 'text/css')) + page = page[:css_start] + css_tag + page[css_end:] return page @@ -150,10 +101,11 @@ def write_file(page, title, comment=None): def complete_url(url, base_url): + base_up = urlparse(base_url) if base_url is not None: up = urlparse(url) if not up.netloc: - url = '//' + urlparse(base_url).netloc + url + url = base_up.scheme + '://' + base_up.netloc + url return url @@ -165,7 +117,7 @@ def main(): args = parser.parse_args() for url in args.urls: - page = get_page(url) + page = get_text(url) parser = TitleParser(strict=False) parser.feed(page) From a0fbb414a7838744f681531cd1986290bfb4054b Mon Sep 17 00:00:00 2001 From: Maks Snegov Date: Sun, 20 Jul 2014 12:17:01 +0400 Subject: [PATCH 25/42] write url in the beginning of the file --- nevernote.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nevernote.py b/nevernote.py index b9dcffe..79523df 100755 --- a/nevernote.py +++ b/nevernote.py @@ -95,9 +95,9 @@ def write_file(page, title, comment=None): with open(fname, 'x', newline='\n') as a_file: print('Saving in file "%s"' % fname) - a_file.write(page) if comment: - a_file.write('' % comment) + a_file.write('\n' % comment) + a_file.write(page) def complete_url(url, base_url): From c523d025af91ace21879d29f9734d158adfdd004 Mon Sep 17 00:00:00 2001 From: Maks Snegov Date: Sun, 20 Jul 2014 13:06:51 +0400 Subject: [PATCH 26/42] add duplicate checking --- nevernote.py | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/nevernote.py b/nevernote.py index 79523df..86de4c9 100755 --- a/nevernote.py +++ b/nevernote.py @@ -3,13 +3,16 @@ import argparse import base64 import html.parser +import os import os.path +import re import sys from urllib.parse import urlparse from urllib.request import urlopen -class InfiniteRedirects(Exception): pass +class UrlDuplicateError(Exception): pass +URLDUP = re.compile(r'^$') class TitleParser(html.parser.HTMLParser): @@ -66,7 +69,7 @@ def embed_pictures(page, pict_urls, base_url=None): try: page = page.replace( url, embedded_image(complete_url(url, base_url))) - except (ValueError, InfiniteRedirects, ConnectionRefusedError): + except (ValueError, ConnectionRefusedError): pass return page @@ -84,6 +87,17 @@ def embed_css(page, css_urls, base_url=None): return page +def url_duplicate(url): + for htmlfile in os.listdir(): + if not htmlfile.endswith('.html'): + continue + with open(htmlfile) as h: + h_url = h.readline() + if url in URLDUP.findall(h_url): + raise UrlDuplicateError( + 'URL is already saved in file "%s"' % htmlfile) + + def write_file(page, title, comment=None): write_inc = lambda i: '_%d' % i if i > 1 else '' inc = 0 @@ -117,6 +131,11 @@ def main(): args = parser.parse_args() for url in args.urls: + try: + url_duplicate(url) + except UrlDuplicateError as e: + print(e) + continue page = get_text(url) parser = TitleParser(strict=False) parser.feed(page) From b58188b7b74dd1e3e732105bc5df81eeacee7356 Mon Sep 17 00:00:00 2001 From: Maks Snegov Date: Sun, 20 Jul 2014 13:29:56 +0400 Subject: [PATCH 27/42] remove import --- nevernote.py | 1 - 1 file changed, 1 deletion(-) diff --git a/nevernote.py b/nevernote.py index 86de4c9..ebd07c1 100755 --- a/nevernote.py +++ b/nevernote.py @@ -4,7 +4,6 @@ import argparse import base64 import html.parser import os -import os.path import re import sys from urllib.parse import urlparse From 45f30ca9de034b6713b397b3ae7b2d306f7d7dcd Mon Sep 17 00:00:00 2001 From: Maks Snegov Date: Sun, 20 Jul 2014 13:30:22 +0400 Subject: [PATCH 28/42] fix: error with urls without scheme ('//ya.ru/index.html') --- nevernote.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/nevernote.py b/nevernote.py index ebd07c1..a913a0a 100755 --- a/nevernote.py +++ b/nevernote.py @@ -119,6 +119,8 @@ def complete_url(url, base_url): up = urlparse(url) if not up.netloc: url = base_up.scheme + '://' + base_up.netloc + url + elif not up.scheme: + url = base_up.scheme + ':' + url return url From 514b39d28756eb56ad71be53365586a5907cd8c6 Mon Sep 17 00:00:00 2001 From: Maks Snegov Date: Sun, 20 Jul 2014 13:31:20 +0400 Subject: [PATCH 29/42] use default charset utf-8 if not set in headers --- nevernote.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/nevernote.py b/nevernote.py index a913a0a..dcfeabc 100755 --- a/nevernote.py +++ b/nevernote.py @@ -45,9 +45,11 @@ def get_text(url, content='text/html'): raise RuntimeError('None content type for %s' % url) if not ctype.startswith(content): raise RuntimeError('Incorrect content-type for %s: %s' % (url, ctype)) - encoding = ctype.split(';')[1].split('=')[1].lower() + + # get charset from 'Content-type' header + charset = ctype.split(';')[1].split('=')[1] if 'charset' in ctype else 'utf-8' data = u.read() - page = data.decode(encoding) + page = data.decode(charset.lower()) return page From 5c9d04cf3d860abf209ee4c9fba663ec1bd429e9 Mon Sep 17 00:00:00 2001 From: Maks Snegov Date: Sun, 20 Jul 2014 13:48:18 +0400 Subject: [PATCH 30/42] use file with links as arguments --- nevernote.py | 36 +++++++++++++++++++++++------------- 1 file changed, 23 insertions(+), 13 deletions(-) diff --git a/nevernote.py b/nevernote.py index dcfeabc..02fbb91 100755 --- a/nevernote.py +++ b/nevernote.py @@ -126,6 +126,22 @@ def complete_url(url, base_url): return url +def process_url(url): + print('Processing URL: %s' % url) + try: + url_duplicate(url) + except UrlDuplicateError as e: + print(e) + return + page = get_text(url) + parser = TitleParser(strict=False) + parser.feed(page) + + page = embed_pictures(page, parser.images, base_url=url) + page = embed_css(page, parser.css, base_url=url) + write_file(page, parser.title, comment=url) + + def main(): parser = argparse.ArgumentParser( description='Nevernote - download pages locally.') @@ -133,19 +149,13 @@ def main(): help='URL of page to download') args = parser.parse_args() - for url in args.urls: - try: - url_duplicate(url) - except UrlDuplicateError as e: - print(e) - continue - page = get_text(url) - parser = TitleParser(strict=False) - parser.feed(page) - - page = embed_pictures(page, parser.images, base_url=url) - page = embed_css(page, parser.css, base_url=url) - write_file(page, parser.title, comment=url) + for arg in args.urls: + if os.path.isfile(arg): + print('Found file %s' % arg) + for url in (line.strip() for line in open(arg)): + process_url(url) + else: + process_url(arg) if __name__ == '__main__': From 964e79f97b4fcf9ad2adcaa12098096149ccc05b Mon Sep 17 00:00:00 2001 From: Maks Snegov Date: Sun, 20 Jul 2014 14:03:49 +0400 Subject: [PATCH 31/42] add gzip encoding support --- nevernote.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/nevernote.py b/nevernote.py index 02fbb91..9abfc93 100755 --- a/nevernote.py +++ b/nevernote.py @@ -8,6 +8,7 @@ import re import sys from urllib.parse import urlparse from urllib.request import urlopen +import zlib class UrlDuplicateError(Exception): pass @@ -37,10 +38,10 @@ class TitleParser(html.parser.HTMLParser): def get_text(url, content='text/html'): - u = urlopen(url) - if u.status != 200: + response = urlopen(url) + if response.status != 200: raise RuntimeError('Incorrect HTTP status for %s' % url) - ctype = u.headers.get('content-type') + ctype = response.headers.get('content-type') if ctype is None: raise RuntimeError('None content type for %s' % url) if not ctype.startswith(content): @@ -48,7 +49,11 @@ def get_text(url, content='text/html'): # get charset from 'Content-type' header charset = ctype.split(';')[1].split('=')[1] if 'charset' in ctype else 'utf-8' - data = u.read() + + if response.info().get('Content-Encoding') == 'gzip': + data = zlib.decompress(response.read(), 16+zlib.MAX_WBITS) + else: + data = response.read() page = data.decode(charset.lower()) return page From b5ddae0ef8088289427bb4d07c27d2c31523e9fb Mon Sep 17 00:00:00 2001 From: Maks Snegov Date: Sun, 20 Jul 2014 17:04:56 +0400 Subject: [PATCH 32/42] fix css charset error, add urllib.error.httperror --- nevernote.py | 33 +++++++++++++++++++++++++-------- 1 file changed, 25 insertions(+), 8 deletions(-) diff --git a/nevernote.py b/nevernote.py index 9abfc93..e72e574 100755 --- a/nevernote.py +++ b/nevernote.py @@ -6,6 +6,7 @@ import html.parser import os import re import sys +import urllib.error from urllib.parse import urlparse from urllib.request import urlopen import zlib @@ -37,10 +38,22 @@ class TitleParser(html.parser.HTMLParser): self.css.add(attr_dict['href']) -def get_text(url, content='text/html'): +def charset_header(content_type): + """ Parse charset from 'content-type' header + :param content_type: string + :return: string with character set + """ + if 'charset' in content_type: + return content_type.split(';')[1].split('=')[1] + else: + return None + + +def get_text(url, content='text/html', charset='utf-8'): response = urlopen(url) if response.status != 200: - raise RuntimeError('Incorrect HTTP status for %s' % url) + raise urllib.error.HTTPError('Incorrect HTTP status (%d, %s) for %s' % ( + response.status, response.reason, url)) ctype = response.headers.get('content-type') if ctype is None: raise RuntimeError('None content type for %s' % url) @@ -48,7 +61,7 @@ def get_text(url, content='text/html'): raise RuntimeError('Incorrect content-type for %s: %s' % (url, ctype)) # get charset from 'Content-type' header - charset = ctype.split(';')[1].split('=')[1] if 'charset' in ctype else 'utf-8' + charset = charset_header(ctype) or charset if response.info().get('Content-Encoding') == 'gzip': data = zlib.decompress(response.read(), 16+zlib.MAX_WBITS) @@ -61,8 +74,9 @@ def get_text(url, content='text/html'): def embedded_image(url): '''Download content from URL and return bytes if target is image''' u = urlopen(url) - if u.getcode() != 200: - raise RuntimeError('Incorrect status for %s' % url) + if u.status != 200: + raise urllib.error.HTTPError('Incorrect HTTP status (%d, %s) for %s' % ( + u.status, u.reason, url)) ctype = u.headers.get('Content-Type') data = u.read() b64pict = base64.b64encode(data).decode() @@ -75,20 +89,23 @@ def embed_pictures(page, pict_urls, base_url=None): try: page = page.replace( url, embedded_image(complete_url(url, base_url))) - except (ValueError, ConnectionRefusedError): + except (IncorrectHTTPStatus, urllib.error.HTTPError): pass return page def embed_css(page, css_urls, base_url=None): + if base_url is not None: + hdr = urlopen(base_url).headers.get('content-type') + base_char = charset_header(hdr) if hdr is not None else 'utf-8' for url in css_urls: if not url: continue print('New CSS: %s' % url) css_start = page.rindex('<', 0, page.index(url)) css_end = page.index('>', css_start) + 1 - css_tag = ('' - % get_text(complete_url(url, base_url), 'text/css')) + css_tag = ('' % get_text( + complete_url(url, base_url), content='text/css',charset=base_char)) page = page[:css_start] + css_tag + page[css_end:] return page From 61d3d84a9c93a0491f11dfe3606eede204a7478c Mon Sep 17 00:00:00 2001 From: Maks Snegov Date: Sun, 20 Jul 2014 17:30:48 +0400 Subject: [PATCH 33/42] remove unused exception --- nevernote.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nevernote.py b/nevernote.py index e72e574..7750c1d 100755 --- a/nevernote.py +++ b/nevernote.py @@ -89,7 +89,7 @@ def embed_pictures(page, pict_urls, base_url=None): try: page = page.replace( url, embedded_image(complete_url(url, base_url))) - except (IncorrectHTTPStatus, urllib.error.HTTPError): + except urllib.error.HTTPError: pass return page From 09346f4a701b25e41040af640a69275ba1ad039e Mon Sep 17 00:00:00 2001 From: Maks Snegov Date: Sun, 20 Jul 2014 17:31:15 +0400 Subject: [PATCH 34/42] fix: error with css charsets if no base charset --- nevernote.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/nevernote.py b/nevernote.py index 7750c1d..efba1f0 100755 --- a/nevernote.py +++ b/nevernote.py @@ -95,9 +95,11 @@ def embed_pictures(page, pict_urls, base_url=None): def embed_css(page, css_urls, base_url=None): + # fetch charset from base URL or use default UTF-8 if base_url is not None: hdr = urlopen(base_url).headers.get('content-type') - base_char = charset_header(hdr) if hdr is not None else 'utf-8' + base_char = charset_header(hdr) if hdr is not None else None + base_char = base_char or 'utf-8' for url in css_urls: if not url: continue From fb3870e9dd3b0ae920d77aee91b5807b75ee5bff Mon Sep 17 00:00:00 2001 From: Maks Snegov Date: Sun, 20 Jul 2014 17:31:43 +0400 Subject: [PATCH 35/42] skip http error pages --- nevernote.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/nevernote.py b/nevernote.py index efba1f0..1ae7eff 100755 --- a/nevernote.py +++ b/nevernote.py @@ -157,12 +157,18 @@ def process_url(url): except UrlDuplicateError as e: print(e) return - page = get_text(url) - parser = TitleParser(strict=False) - parser.feed(page) - page = embed_pictures(page, parser.images, base_url=url) - page = embed_css(page, parser.css, base_url=url) + try: + page = get_text(url) + parser = TitleParser(strict=False) + parser.feed(page) + + page = embed_pictures(page, parser.images, base_url=url) + page = embed_css(page, parser.css, base_url=url) + except urllib.error.HTTPError as e: + print('Error with URL "%s": %s' % (url,e)) + return False + write_file(page, parser.title, comment=url) From 41e984e1f04d34c1caf39b0436d73468f14053ff Mon Sep 17 00:00:00 2001 From: Maks Snegov Date: Sun, 20 Jul 2014 21:40:14 +0400 Subject: [PATCH 36/42] fix urllib.error.HTTPError calls --- nevernote.py | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/nevernote.py b/nevernote.py index 1ae7eff..1e403bc 100755 --- a/nevernote.py +++ b/nevernote.py @@ -52,8 +52,11 @@ def charset_header(content_type): def get_text(url, content='text/html', charset='utf-8'): response = urlopen(url) if response.status != 200: - raise urllib.error.HTTPError('Incorrect HTTP status (%d, %s) for %s' % ( - response.status, response.reason, url)) + raise urllib.error.HTTPError( + url, response.status, + 'Incorrect HTTP status (%d, %s) for %s' % (response.status, response.reason, url), + None, None + ) ctype = response.headers.get('content-type') if ctype is None: raise RuntimeError('None content type for %s' % url) @@ -73,12 +76,15 @@ def get_text(url, content='text/html', charset='utf-8'): def embedded_image(url): '''Download content from URL and return bytes if target is image''' - u = urlopen(url) - if u.status != 200: - raise urllib.error.HTTPError('Incorrect HTTP status (%d, %s) for %s' % ( - u.status, u.reason, url)) - ctype = u.headers.get('Content-Type') - data = u.read() + response = urlopen(url) + if response.status != 200: + raise urllib.error.HTTPError( + url, response.status, + 'Incorrect HTTP status (%d, %s) for %s' % (response.status, response.reason, url), + None, None + ) + ctype = response.headers.get('Content-Type') + data = response.read() b64pict = base64.b64encode(data).decode() return 'data:%s;base64,%s' % (ctype, b64pict) From 7ce2bfb97fcc3f1b3bbc21af732eb072d6c7ec11 Mon Sep 17 00:00:00 2001 From: Maks Snegov Date: Sun, 20 Jul 2014 21:42:13 +0400 Subject: [PATCH 37/42] fix urllib.error.HTTPError print --- nevernote.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nevernote.py b/nevernote.py index 1e403bc..cc98181 100755 --- a/nevernote.py +++ b/nevernote.py @@ -172,7 +172,7 @@ def process_url(url): page = embed_pictures(page, parser.images, base_url=url) page = embed_css(page, parser.css, base_url=url) except urllib.error.HTTPError as e: - print('Error with URL "%s": %s' % (url,e)) + print(e) return False write_file(page, parser.title, comment=url) From fbf52e95441f8f7d3d690e8fee513e8fe8d9cc2d Mon Sep 17 00:00:00 2001 From: Maks Snegov Date: Mon, 21 Jul 2014 00:46:30 +0400 Subject: [PATCH 38/42] add script parsing --- nevernote.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/nevernote.py b/nevernote.py index cc98181..1a072b1 100755 --- a/nevernote.py +++ b/nevernote.py @@ -21,12 +21,17 @@ class TitleParser(html.parser.HTMLParser): html.parser.HTMLParser.__init__(self, *args, **kwargs) self.images = set() self.css = set() + self.scripts = set() def handle_starttag(self, name, attribs): if name == 'img': for attr, value in attribs: if attr == 'src': self.images.add(value) + elif name == 'script': + for attr, value in attribs: + if attr == 'src': + self.scripts.add(value) elif name == 'title': titletag_start = self.rawdata.index('', titletag_start) + 1 @@ -118,6 +123,22 @@ def embed_css(page, css_urls, base_url=None): return page +def embed_scripts(page, script_urls, base_url=None): + # fetch charset from base URL or use default UTF-8 + if base_url is not None: + hdr = urlopen(base_url).headers.get('content-type') + base_char = charset_header(hdr) if hdr is not None else None + base_char = base_char or 'utf-8' + for url in script_urls: + if not url: + continue + print('New script: %s' % url) + script_link = ' src="%s"' % url + print(script_link) + page = page.replace(script_link, '') + return page + + def url_duplicate(url): for htmlfile in os.listdir(): if not htmlfile.endswith('.html'): @@ -171,6 +192,7 @@ def process_url(url): page = embed_pictures(page, parser.images, base_url=url) page = embed_css(page, parser.css, base_url=url) + page = embed_scripts(page, parser.scripts, base_url=url) except urllib.error.HTTPError as e: print(e) return False From cf626546e7c132342209bb912b135608e65c95bf Mon Sep 17 00:00:00 2001 From: Maks Snegov Date: Wed, 23 Jul 2014 08:45:12 +0400 Subject: [PATCH 39/42] use set of content-types for checking --- nevernote.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/nevernote.py b/nevernote.py index 1a072b1..13cd4ef 100755 --- a/nevernote.py +++ b/nevernote.py @@ -54,7 +54,7 @@ def charset_header(content_type): return None -def get_text(url, content='text/html', charset='utf-8'): +def get_text(url, content={'text/html'}, charset='utf-8'): response = urlopen(url) if response.status != 200: raise urllib.error.HTTPError( @@ -65,7 +65,10 @@ def get_text(url, content='text/html', charset='utf-8'): ctype = response.headers.get('content-type') if ctype is None: raise RuntimeError('None content type for %s' % url) - if not ctype.startswith(content): + for cnt in content: + if ctype.startswith(cnt): + break + else: raise RuntimeError('Incorrect content-type for %s: %s' % (url, ctype)) # get charset from 'Content-type' header @@ -118,7 +121,7 @@ def embed_css(page, css_urls, base_url=None): css_start = page.rindex('<', 0, page.index(url)) css_end = page.index('>', css_start) + 1 css_tag = ('' % get_text( - complete_url(url, base_url), content='text/css',charset=base_char)) + complete_url(url, base_url), content={'text/css'}, charset=base_char)) page = page[:css_start] + css_tag + page[css_end:] return page From 6b3aa602ef4f10f20d785eecbafceac9602e1ae5 Mon Sep 17 00:00:00 2001 From: Maks Snegov Date: Sat, 4 Oct 2014 03:24:38 +0400 Subject: [PATCH 40/42] add script embedding --- nevernote.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/nevernote.py b/nevernote.py index 13cd4ef..23309c2 100755 --- a/nevernote.py +++ b/nevernote.py @@ -137,8 +137,18 @@ def embed_scripts(page, script_urls, base_url=None): continue print('New script: %s' % url) script_link = ' src="%s"' % url - print(script_link) - page = page.replace(script_link, '') + script_link_idx = page.index(script_link) + script_content = get_text( + complete_url(url, base_url), + content={'application/x-javascript', 'text/javascript'}, + charset=base_char + ) + script_start = page.index('>', script_link_idx) + 1 + script_end = page.index('', script_start) + # add script content to page + page = page[:script_start] + script_content + page[script_end:] + # remove script src link + page = page[:script_link_idx] + page[script_link_idx+len(script_link):] return page From c1724b5921a0aa4b56139cdf36100d60f7294824 Mon Sep 17 00:00:00 2001 From: Maks Snegov Date: Sat, 4 Oct 2014 03:38:34 +0400 Subject: [PATCH 41/42] use base64 encoding for embedded scripts can avoid some issues in browsers' renderers (habrahabr pages was broken because of nested in script content. --- nevernote.py | 25 +++++-------------------- 1 file changed, 5 insertions(+), 20 deletions(-) diff --git a/nevernote.py b/nevernote.py index 23309c2..f4361c4 100755 --- a/nevernote.py +++ b/nevernote.py @@ -127,28 +127,13 @@ def embed_css(page, css_urls, base_url=None): def embed_scripts(page, script_urls, base_url=None): - # fetch charset from base URL or use default UTF-8 - if base_url is not None: - hdr = urlopen(base_url).headers.get('content-type') - base_char = charset_header(hdr) if hdr is not None else None - base_char = base_char or 'utf-8' for url in script_urls: - if not url: - continue print('New script: %s' % url) - script_link = ' src="%s"' % url - script_link_idx = page.index(script_link) - script_content = get_text( - complete_url(url, base_url), - content={'application/x-javascript', 'text/javascript'}, - charset=base_char - ) - script_start = page.index('>', script_link_idx) + 1 - script_end = page.index('', script_start) - # add script content to page - page = page[:script_start] + script_content + page[script_end:] - # remove script src link - page = page[:script_link_idx] + page[script_link_idx+len(script_link):] + try: + page = page.replace( + url, embedded_image(complete_url(url, base_url))) + except urllib.error.HTTPError: + pass return page From 23f648e1adea229c8f706644abe1fdbe8ece3eb9 Mon Sep 17 00:00:00 2001 From: Maks Snegov Date: Sat, 4 Oct 2014 10:59:32 -0400 Subject: [PATCH 42/42] limit filename length with 128 chars plus extension --- nevernote.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nevernote.py b/nevernote.py index 23309c2..86a9c2b 100755 --- a/nevernote.py +++ b/nevernote.py @@ -168,7 +168,7 @@ def write_file(page, title, comment=None): inc = 0 while True: inc += 1 - fname = ' '.join(title.replace('/', '_').split()) + write_inc(inc) + '.html' + fname = (' '.join(title.replace('/', '_').split()) + write_inc(inc))[:128] + '.html' if not os.path.exists(fname): break