def embed_scripts(page, script_urls, base_url=None):
    """Strip references to external scripts from an HTML page.

    Despite the name, nothing is downloaded or inlined: for every URL in
    *script_urls* the corresponding `` src=...`` attribute is deleted from
    *page*, so the stored copy no longer pulls in remote JavaScript.

    Args:
        page: HTML source as a string.
        script_urls: iterable of ``src`` attribute values to remove.
            Empty or ``None`` entries are skipped.
        base_url: accepted so the signature stays parallel to
            ``embed_css(page, css_urls, base_url=None)``; it is not used,
            and no network request is made.

    Returns:
        The page with every matching `` src=...`` attribute removed.
    """
    # Function-scope import keeps the block self-contained.
    from html import escape

    for url in script_urls:
        if not url:
            continue
        print('New script: %s' % url)
        # html.parser reports attribute values with entities already
        # decoded, while the raw page may still contain the escaped form
        # (e.g. "&amp;"), so try every spelling of the same URL.
        spellings = dict.fromkeys(
            (url, escape(url, quote=False), escape(url, quote=True)))
        for spelling in spellings:
            # Attributes may be wrapped in either quote character.
            for quote in ('"', "'"):
                attribute = ' src=%s%s%s' % (quote, spelling, quote)
                page = page.replace(attribute, '')
    return page