Use BS4 for HTML parsing

This commit is contained in:
Maks Snegov 2019-10-22 16:05:29 +03:00
parent 3198361266
commit 89a8dd90cc
2 changed files with 26 additions and 81 deletions

View File

@ -2,44 +2,17 @@
 import argparse
 import base64
-import html.parser
 import os
 import re
 import sys
 from urllib.parse import urlparse

 import requests
+from bs4 import BeautifulSoup

 URLDUP = re.compile(r'^<!-- URL: (.*) -->$')
class TitleParser(html.parser.HTMLParser):
    """Collect asset URLs and the page title from an HTML document.

    After ``feed()`` the results are available as:
      * ``images``  -- set of ``<img src=...>`` values
      * ``css``     -- set of ``<link rel="stylesheet" href=...>`` values
      * ``scripts`` -- set of ``<script src=...>`` values
      * ``title``   -- text of the ``<title>`` element ('' if absent)
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.images = set()
        self.css = set()
        self.scripts = set()
        # Initialized eagerly so callers never hit AttributeError on
        # pages without a <title> element (the original set it lazily
        # inside handle_starttag only when a <title> tag was seen).
        self.title = ''
        self._in_title = False

    def handle_starttag(self, name, attribs):
        attr_dict = dict(attribs)
        if name == 'img':
            if 'src' in attr_dict:
                self.images.add(attr_dict['src'])
        elif name == 'script':
            if 'src' in attr_dict:
                self.scripts.add(attr_dict['src'])
        elif name == 'title':
            # Title text is accumulated in handle_data instead of
            # indexing into self.rawdata, which broke on chunked feeds.
            self._in_title = True
        elif name == 'link':
            # .get() guards both 'rel' and 'href': a stylesheet <link>
            # without an href raised KeyError in the original.
            if attr_dict.get('rel') == 'stylesheet' and 'href' in attr_dict:
                self.css.add(attr_dict['href'])

    def handle_data(self, data):
        # Collect text only while inside <title>...</title>.
        if self._in_title:
            self.title += data

    def handle_endtag(self, name):
        if name == 'title':
            self._in_title = False
 def get_text(url):
     response = requests.get(url)
     response.raise_for_status()
@ -56,44 +29,6 @@ def get_embedded_binary(url):
     return 'data:%s;base64,%s' % (ctype, b64pict)
def embed_pictures(page, pict_urls, base_url=None):
    """Inline every picture URL found in *page* as a base64 data URI.

    URLs that fail to download (HTTPError) are left untouched, so the
    archiving run is best-effort. Returns the modified page text.
    """
    for pict_url in pict_urls:
        print('New picture: %s' % pict_url)
        try:
            data_uri = get_embedded_binary(complete_url(pict_url, base_url))
        except requests.exceptions.HTTPError:
            continue
        page = page.replace(pict_url, data_uri)
    return page
def embed_css(page, css_urls, base_url=None):
    """Replace each stylesheet <link> tag in *page* with an inline
    <style> element containing the downloaded CSS text.

    Falsy URLs are skipped. Returns the modified page text.
    """
    for css_url in css_urls:
        if not css_url:
            continue
        print('New CSS: %s' % css_url)
        # Locate the tag referencing this URL: the last '<' before the
        # URL's first occurrence, up to and including the next '>'.
        tag_open = page.rindex('<', 0, page.index(css_url))
        tag_close = page.index('>', tag_open) + 1
        stylesheet = get_text(complete_url(css_url, base_url))
        inline_tag = '<style media="screen" type="text/css">%s</style>' % stylesheet
        page = ''.join((page[:tag_open], inline_tag, page[tag_close:]))
    return page
def embed_scripts(page, script_urls, base_url=None):
    """Inline every script URL found in *page* as a base64 data URI.

    Scripts that fail to download (HTTPError) are left as-is, matching
    the best-effort behavior of embed_pictures. Returns the modified
    page text.
    """
    for script_url in script_urls:
        print('New script: %s' % script_url)
        try:
            data_uri = get_embedded_binary(complete_url(script_url, base_url))
        except requests.exceptions.HTTPError:
            continue
        page = page.replace(script_url, data_uri)
    return page
 def is_downloaded(url: str) -> bool:
     """Check if url was already downloaded"""
     for htmlfile in os.listdir(path='.'):
@ -146,19 +81,32 @@ def process_url(url: str, dup_check: bool = False):
     if dup_check and is_downloaded(url):
         return
-    try:
-        page = get_text(url)
-        parser = TitleParser()
-        parser.feed(page)
-        page = embed_pictures(page, parser.images, base_url=url)
-        page = embed_css(page, parser.css, base_url=url)
-        page = embed_scripts(page, parser.scripts, base_url=url)
-    except requests.exceptions.HTTPError as e:
-        print(e)
-        return False
-    write_file(page, parser.title, comment=url)
+    page_content = get_text(url)
+    soup = BeautifulSoup(page_content, 'html.parser')
+
+    for img_tag in soup.find_all('img'):
+        img_url = complete_url(img_tag['src'], base_url=url)
+        print('New picture: %s' % img_url)
+        img_b64 = get_embedded_binary(img_url)
+        img_tag['src'] = img_b64
+
+    for link_tag in soup.find_all('link'):
+        link_url = complete_url(link_tag['href'], base_url=url)
+        if 'stylesheet' in link_tag['rel']:
+            print('New CSS: %s' % link_url)
+            css_tag = soup.new_tag('style', media='screen', type='text/css')
+            css_tag.string = get_text(link_url)
+            link_tag.replace_with(css_tag)
+
+    for script_tag in soup.find_all('script'):
+        if script_tag.get('src') is None:
+            continue
+        script_url = complete_url(script_tag['src'], base_url=url)
+        print('New script: %s' % script_url)
+        script_b64 = get_embedded_binary(script_url)
+        script_tag['src'] = script_b64
+
+    write_file(soup.prettify(), soup.title.text, comment=url)
 def main():
@ -192,9 +140,5 @@ def main():
     process_url(arg, dup_check=args.dup_check)
class UrlDuplicateError(Exception):
    """Signals a duplicate URL — presumably one whose page was already
    saved (raised/used elsewhere in this script; confirm against callers)."""
 if __name__ == '__main__':
     sys.exit(main())

View File

@ -1 +1,2 @@
 requests
+beautifulsoup4