diff --git a/nevernote.py b/nevernote.py
index cbd63d4..58eaee5 100755
--- a/nevernote.py
+++ b/nevernote.py
@@ -2,44 +2,17 @@
import argparse
import base64
-import html.parser
import os
import re
import sys
from urllib.parse import urlparse
import requests
+from bs4 import BeautifulSoup
URLDUP = re.compile(r'^$')
-class TitleParser(html.parser.HTMLParser):
- def __init__(self, *args, **kwargs):
- html.parser.HTMLParser.__init__(self, *args, **kwargs)
- self.images = set()
- self.css = set()
- self.scripts = set()
-
- def handle_starttag(self, name, attribs):
- if name == 'img':
- for attr, value in attribs:
- if attr == 'src':
- self.images.add(value)
- elif name == 'script':
- for attr, value in attribs:
- if attr == 'src':
- self.scripts.add(value)
- elif name == 'title':
- titletag_start = self.rawdata.index('
', titletag_start) + 1
- title_end = self.rawdata.index('', title_start)
- self.title = self.rawdata[title_start:title_end]
- elif name == 'link':
- attr_dict = dict(attribs)
- if attr_dict.get('rel') == 'stylesheet':
- self.css.add(attr_dict['href'])
-
-
def get_text(url):
response = requests.get(url)
response.raise_for_status()
@@ -56,44 +29,6 @@ def get_embedded_binary(url):
return 'data:%s;base64,%s' % (ctype, b64pict)
-def embed_pictures(page, pict_urls, base_url=None):
- """Write all pictures in HTML file"""
- for url in pict_urls:
- print('New picture: %s' % url)
- try:
- page = page.replace(
- url, get_embedded_binary(complete_url(url, base_url)))
- except requests.exceptions.HTTPError:
- pass
- return page
-
-
-def embed_css(page, css_urls, base_url=None):
- """Write all CSS's in HTML file"""
- for url in css_urls:
- if not url:
- continue
- print('New CSS: %s' % url)
- css_start = page.rindex('<', 0, page.index(url))
- css_end = page.index('>', css_start) + 1
- css_tag = ('' % get_text(
- complete_url(url, base_url)))
- page = page[:css_start] + css_tag + page[css_end:]
- return page
-
-
-def embed_scripts(page, script_urls, base_url=None):
- """Write all scripts in HTML file"""
- for url in script_urls:
- print('New script: %s' % url)
- try:
- page = page.replace(
- url, get_embedded_binary(complete_url(url, base_url)))
- except requests.exceptions.HTTPError:
- pass
- return page
-
-
def is_downloaded(url: str) -> bool:
"""Check if url was already downloaded"""
for htmlfile in os.listdir(path='.'):
@@ -146,19 +81,32 @@ def process_url(url: str, dup_check: bool = False):
if dup_check and is_downloaded(url):
return
- try:
- page = get_text(url)
- parser = TitleParser()
- parser.feed(page)
+ page_content = get_text(url)
+ soup = BeautifulSoup(page_content, 'html.parser')
- page = embed_pictures(page, parser.images, base_url=url)
- page = embed_css(page, parser.css, base_url=url)
- page = embed_scripts(page, parser.scripts, base_url=url)
- except requests.exceptions.HTTPError as e:
- print(e)
- return False
+ for img_tag in soup.find_all('img'):
+ img_url = complete_url(img_tag['src'], base_url=url)
+ print('New picture: %s' % img_url)
+ img_b64 = get_embedded_binary(img_url)
+ img_tag['src'] = img_b64
- write_file(page, parser.title, comment=url)
+ for link_tag in soup.find_all('link'):
+ link_url = complete_url(link_tag['href'], base_url=url)
+ if 'stylesheet' in link_tag['rel']:
+ print('New CSS: %s' % link_url)
+ css_tag = soup.new_tag('style', media='screen', type='text/css')
+ css_tag.string = get_text(link_url)
+ link_tag.replace_with(css_tag)
+
+ for script_tag in soup.find_all('script'):
+ if script_tag.get('src') is None:
+ continue
+ script_url = complete_url(script_tag['src'], base_url=url)
+ print('New script: %s' % script_url)
+ script_b64 = get_embedded_binary(script_url)
+ script_tag['src'] = script_b64
+
+ write_file(soup.prettify(), soup.title.text, comment=url)
def main():
@@ -192,9 +140,5 @@ def main():
process_url(arg, dup_check=args.dup_check)
-class UrlDuplicateError(Exception):
- pass
-
-
if __name__ == '__main__':
sys.exit(main())
diff --git a/requirements.txt b/requirements.txt
index f229360..a98ae43 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1 +1,2 @@
requests
+beautifulsoup4
\ No newline at end of file