Use BS4 for HTML parsing
parent 3198361266
commit 89a8dd90cc
nevernote.py (106 changed lines)
@@ -2,44 +2,17 @@
 import argparse
 import base64
-import html.parser
 import os
 import re
 import sys
 from urllib.parse import urlparse
 
 import requests
+from bs4 import BeautifulSoup
 
 URLDUP = re.compile(r'^<!-- URL: (.*) -->$')
 
 
-class TitleParser(html.parser.HTMLParser):
-    def __init__(self, *args, **kwargs):
-        html.parser.HTMLParser.__init__(self, *args, **kwargs)
-        self.images = set()
-        self.css = set()
-        self.scripts = set()
-
-    def handle_starttag(self, name, attribs):
-        if name == 'img':
-            for attr, value in attribs:
-                if attr == 'src':
-                    self.images.add(value)
-        elif name == 'script':
-            for attr, value in attribs:
-                if attr == 'src':
-                    self.scripts.add(value)
-        elif name == 'title':
-            titletag_start = self.rawdata.index('<title')
-            title_start = self.rawdata.index('>', titletag_start) + 1
-            title_end = self.rawdata.index('</title>', title_start)
-            self.title = self.rawdata[title_start:title_end]
-        elif name == 'link':
-            attr_dict = dict(attribs)
-            if attr_dict.get('rel') == 'stylesheet':
-                self.css.add(attr_dict['href'])
-
-
 def get_text(url):
     response = requests.get(url)
     response.raise_for_status()
@@ -56,44 +29,6 @@ def get_embedded_binary(url):
     return 'data:%s;base64,%s' % (ctype, b64pict)
-
-
-def embed_pictures(page, pict_urls, base_url=None):
-    """Write all pictures in HTML file"""
-    for url in pict_urls:
-        print('New picture: %s' % url)
-        try:
-            page = page.replace(
-                url, get_embedded_binary(complete_url(url, base_url)))
-        except requests.exceptions.HTTPError:
-            pass
-    return page
-
-
-def embed_css(page, css_urls, base_url=None):
-    """Write all CSS's in HTML file"""
-    for url in css_urls:
-        if not url:
-            continue
-        print('New CSS: %s' % url)
-        css_start = page.rindex('<', 0, page.index(url))
-        css_end = page.index('>', css_start) + 1
-        css_tag = ('<style media="screen" type="text/css">%s</style>' % get_text(
-            complete_url(url, base_url)))
-        page = page[:css_start] + css_tag + page[css_end:]
-    return page
-
-
-def embed_scripts(page, script_urls, base_url=None):
-    """Write all scripts in HTML file"""
-    for url in script_urls:
-        print('New script: %s' % url)
-        try:
-            page = page.replace(
-                url, get_embedded_binary(complete_url(url, base_url)))
-        except requests.exceptions.HTTPError:
-            pass
-    return page
 
 
 def is_downloaded(url: str) -> bool:
     """Check if url was already downloaded"""
     for htmlfile in os.listdir(path='.'):
@@ -146,19 +81,32 @@ def process_url(url: str, dup_check: bool = False):
     if dup_check and is_downloaded(url):
         return
 
-    try:
-        page = get_text(url)
-        parser = TitleParser()
-        parser.feed(page)
-
-        page = embed_pictures(page, parser.images, base_url=url)
-        page = embed_css(page, parser.css, base_url=url)
-        page = embed_scripts(page, parser.scripts, base_url=url)
-    except requests.exceptions.HTTPError as e:
-        print(e)
-        return False
-
-    write_file(page, parser.title, comment=url)
+    page_content = get_text(url)
+    soup = BeautifulSoup(page_content, 'html.parser')
+
+    for img_tag in soup.find_all('img'):
+        img_url = complete_url(img_tag['src'], base_url=url)
+        print('New picture: %s' % img_url)
+        img_b64 = get_embedded_binary(img_url)
+        img_tag['src'] = img_b64
+
+    for link_tag in soup.find_all('link'):
+        link_url = complete_url(link_tag['href'], base_url=url)
+        if 'stylesheet' in link_tag['rel']:
+            print('New CSS: %s' % link_url)
+            css_tag = soup.new_tag('style', media='screen', type='text/css')
+            css_tag.string = get_text(link_url)
+            link_tag.replace_with(css_tag)
+
+    for script_tag in soup.find_all('script'):
+        if script_tag.get('src') is None:
+            continue
+        script_url = complete_url(script_tag['src'], base_url=url)
+        print('New script: %s' % script_url)
+        script_b64 = get_embedded_binary(script_url)
+        script_tag['src'] = script_b64
+
+    write_file(soup.prettify(), soup.title.text, comment=url)
 
 
 def main():
@@ -192,9 +140,5 @@ def main():
         process_url(arg, dup_check=args.dup_check)
 
 
-class UrlDuplicateError(Exception):
-    pass
-
-
 if __name__ == '__main__':
     sys.exit(main())
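A condensed sketch of the pattern the rewritten process_url relies on: parse the page once with BeautifulSoup, mutate the tree, then serialize. This is illustrative only; to_data_uri and inline_images are hypothetical names, and urljoin stands in for nevernote's complete_url helper, whose definition is outside this diff.

import base64
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup


def to_data_uri(url):
    # Assumed equivalent of nevernote's get_embedded_binary: fetch a
    # resource and pack it into a data: URI.
    response = requests.get(url)
    response.raise_for_status()
    ctype = response.headers.get('Content-Type', 'application/octet-stream')
    b64 = base64.b64encode(response.content).decode('ascii')
    return 'data:%s;base64,%s' % (ctype, b64)


def inline_images(html, base_url):
    # Rewrite attributes on the parsed tree instead of splicing strings
    # into raw HTML.
    soup = BeautifulSoup(html, 'html.parser')
    for img in soup.find_all('img'):
        src = img.get('src')  # .get() avoids a KeyError on srcless <img>
        if src:
            img['src'] = to_data_uri(urljoin(base_url, src))
    return soup.prettify()

Working on the parsed tree and re-serializing with prettify() is what lets the commit drop TitleParser and the string-splicing embed_* helpers: replacements no longer depend on locating raw character offsets in the page text.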
requirements.txt
@@ -1 +1,2 @@
 requests
+beautifulsoup4
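Note that beautifulsoup4 is the PyPI distribution name, while the module it installs is imported as bs4, which is why the requirement and the import differ. A one-line sanity check, assuming the requirements are installed:

from bs4 import BeautifulSoup
print(BeautifulSoup('<p>ok</p>', 'html.parser').p.text)  # prints: ok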