diff --git a/nevernote.py b/nevernote.py index 9abfc93..e72e574 100755 --- a/nevernote.py +++ b/nevernote.py @@ -6,6 +6,7 @@ import html.parser import os import re import sys +import urllib.error from urllib.parse import urlparse from urllib.request import urlopen import zlib @@ -37,10 +38,22 @@ class TitleParser(html.parser.HTMLParser): self.css.add(attr_dict['href']) -def get_text(url, content='text/html'): +def charset_header(content_type): + """ Parse charset from 'content-type' header + :param content_type: string + :return: string with character set + """ + if 'charset' in content_type: + return content_type.split(';')[1].split('=')[1] + else: + return None + + +def get_text(url, content='text/html', charset='utf-8'): response = urlopen(url) if response.status != 200: - raise RuntimeError('Incorrect HTTP status for %s' % url) + raise urllib.error.HTTPError('Incorrect HTTP status (%d, %s) for %s' % ( + response.status, response.reason, url)) ctype = response.headers.get('content-type') if ctype is None: raise RuntimeError('None content type for %s' % url) @@ -48,7 +61,7 @@ def get_text(url, content='text/html'): raise RuntimeError('Incorrect content-type for %s: %s' % (url, ctype)) # get charset from 'Content-type' header - charset = ctype.split(';')[1].split('=')[1] if 'charset' in ctype else 'utf-8' + charset = charset_header(ctype) or charset if response.info().get('Content-Encoding') == 'gzip': data = zlib.decompress(response.read(), 16+zlib.MAX_WBITS) @@ -61,8 +74,9 @@ def get_text(url, content='text/html'): def embedded_image(url): '''Download content from URL and return bytes if target is image''' u = urlopen(url) - if u.getcode() != 200: - raise RuntimeError('Incorrect status for %s' % url) + if u.status != 200: + raise urllib.error.HTTPError('Incorrect HTTP status (%d, %s) for %s' % ( + u.status, u.reason, url)) ctype = u.headers.get('Content-Type') data = u.read() b64pict = base64.b64encode(data).decode() @@ -75,20 +89,23 @@ def embed_pictures(page, pict_urls, base_url=None): try: page = page.replace( url, embedded_image(complete_url(url, base_url))) - except (ValueError, ConnectionRefusedError): + except (IncorrectHTTPStatus, urllib.error.HTTPError): pass return page def embed_css(page, css_urls, base_url=None): + if base_url is not None: + hdr = urlopen(base_url).headers.get('content-type') + base_char = charset_header(hdr) if hdr is not None else 'utf-8' for url in css_urls: if not url: continue print('New CSS: %s' % url) css_start = page.rindex('<', 0, page.index(url)) css_end = page.index('>', css_start) + 1 - css_tag = ('' - % get_text(complete_url(url, base_url), 'text/css')) + css_tag = ('' % get_text( + complete_url(url, base_url), content='text/css',charset=base_char)) page = page[:css_start] + css_tag + page[css_end:] return page