add script parsing
This commit is contained in:
parent
7ce2bfb97f
commit
fbf52e9544
22
nevernote.py
22
nevernote.py
@ -21,12 +21,17 @@ class TitleParser(html.parser.HTMLParser):
|
|||||||
html.parser.HTMLParser.__init__(self, *args, **kwargs)
|
html.parser.HTMLParser.__init__(self, *args, **kwargs)
|
||||||
self.images = set()
|
self.images = set()
|
||||||
self.css = set()
|
self.css = set()
|
||||||
|
self.scripts = set()
|
||||||
|
|
||||||
def handle_starttag(self, name, attribs):
|
def handle_starttag(self, name, attribs):
|
||||||
if name == 'img':
|
if name == 'img':
|
||||||
for attr, value in attribs:
|
for attr, value in attribs:
|
||||||
if attr == 'src':
|
if attr == 'src':
|
||||||
self.images.add(value)
|
self.images.add(value)
|
||||||
|
elif name == 'script':
|
||||||
|
for attr, value in attribs:
|
||||||
|
if attr == 'src':
|
||||||
|
self.scripts.add(value)
|
||||||
elif name == 'title':
|
elif name == 'title':
|
||||||
titletag_start = self.rawdata.index('<title')
|
titletag_start = self.rawdata.index('<title')
|
||||||
title_start = self.rawdata.index('>', titletag_start) + 1
|
title_start = self.rawdata.index('>', titletag_start) + 1
|
||||||
@ -118,6 +123,22 @@ def embed_css(page, css_urls, base_url=None):
|
|||||||
return page
|
return page
|
||||||
|
|
||||||
|
|
||||||
|
def embed_scripts(page, script_urls, base_url=None):
    """Strip external script references from *page*.

    For every non-empty URL in *script_urls*, each occurrence of the exact
    text `` src="<url>"`` is removed from the page. The surrounding
    ``<script>`` tag itself is left in place, just without its ``src``.

    Args:
        page: HTML document as a string.
        script_urls: Iterable of ``src`` values collected from ``<script>``
            tags. Empty or ``None`` entries are skipped.
        base_url: Accepted so the signature stays parallel to ``embed_css``.
            It is currently unused: the previous implementation fetched this
            URL only to compute a charset that was never read, which cost a
            network round-trip, left the response unclosed and could leave
            the charset variable unbound when ``base_url`` was ``None``.

    Returns:
        The page with the matching ``src`` attributes removed.
    """
    for url in script_urls:
        if not url:
            continue
        print('New script: %s' % url)
        # NOTE: this is a plain text match, so only the double-quoted form
        # preceded by a single space is removed, and any other tag carrying
        # the very same ``src`` value is affected too.
        script_link = ' src="%s"' % url
        print(script_link)
        page = page.replace(script_link, '')
    return page
|
||||||
|
|
||||||
|
|
||||||
def url_duplicate(url):
|
def url_duplicate(url):
|
||||||
for htmlfile in os.listdir():
|
for htmlfile in os.listdir():
|
||||||
if not htmlfile.endswith('.html'):
|
if not htmlfile.endswith('.html'):
|
||||||
@ -171,6 +192,7 @@ def process_url(url):
|
|||||||
|
|
||||||
page = embed_pictures(page, parser.images, base_url=url)
|
page = embed_pictures(page, parser.images, base_url=url)
|
||||||
page = embed_css(page, parser.css, base_url=url)
|
page = embed_css(page, parser.css, base_url=url)
|
||||||
|
page = embed_scripts(page, parser.scripts, base_url=url)
|
||||||
except urllib.error.HTTPError as e:
|
except urllib.error.HTTPError as e:
|
||||||
print(e)
|
print(e)
|
||||||
return False
|
return False
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user