Use BS4 for HTML parsing

This commit is contained in:
Maks Snegov 2019-10-22 16:05:29 +03:00
parent 3198361266
commit 89a8dd90cc
2 changed files with 26 additions and 81 deletions

View File

@ -2,44 +2,17 @@
 import argparse
 import base64
-import html.parser
 import os
 import re
 import sys
 from urllib.parse import urlparse

 import requests
+from bs4 import BeautifulSoup

 URLDUP = re.compile(r'^<!-- URL: (.*) -->$')
class TitleParser(html.parser.HTMLParser):
    """Collect asset URLs and the page title from an HTML document.

    After ``feed()`` the results are available as:
      * ``images``  -- set of ``<img src=...>`` values
      * ``css``     -- set of ``<link rel="stylesheet" href=...>`` values
      * ``scripts`` -- set of ``<script src=...>`` values
      * ``title``   -- text of the ``<title>`` element ('' if absent)
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.images = set()
        self.css = set()
        self.scripts = set()
        # Initialized eagerly so callers never hit AttributeError on
        # pages without a <title> element (the original set it lazily
        # inside handle_starttag only when a <title> tag was seen).
        self.title = ''
        self._in_title = False

    def handle_starttag(self, name, attribs):
        attr_dict = dict(attribs)
        if name == 'img':
            if 'src' in attr_dict:
                self.images.add(attr_dict['src'])
        elif name == 'script':
            if 'src' in attr_dict:
                self.scripts.add(attr_dict['src'])
        elif name == 'title':
            # Title text is accumulated in handle_data instead of
            # indexing into self.rawdata, which broke on chunked feeds.
            self._in_title = True
        elif name == 'link':
            # .get() guards both 'rel' and 'href': a stylesheet <link>
            # without an href raised KeyError in the original.
            if attr_dict.get('rel') == 'stylesheet' and 'href' in attr_dict:
                self.css.add(attr_dict['href'])

    def handle_data(self, data):
        # Collect text only while inside <title>...</title>.
        if self._in_title:
            self.title += data

    def handle_endtag(self, name):
        if name == 'title':
            self._in_title = False
 def get_text(url):
     response = requests.get(url)
     response.raise_for_status()
@ -56,44 +29,6 @@ def get_embedded_binary(url):
     return 'data:%s;base64,%s' % (ctype, b64pict)
def embed_pictures(page, pict_urls, base_url=None):
    """Inline every picture URL found in *page* as a base64 data URI.

    URLs that fail to download (HTTPError) are left untouched, so the
    archiving run is best-effort. Returns the modified page text.
    """
    for pict_url in pict_urls:
        print('New picture: %s' % pict_url)
        try:
            data_uri = get_embedded_binary(complete_url(pict_url, base_url))
        except requests.exceptions.HTTPError:
            continue
        page = page.replace(pict_url, data_uri)
    return page
def embed_css(page, css_urls, base_url=None):
    """Replace each stylesheet <link> tag in *page* with an inline
    <style> element containing the downloaded CSS text.

    Falsy URLs are skipped. Returns the modified page text.
    """
    for css_url in css_urls:
        if not css_url:
            continue
        print('New CSS: %s' % css_url)
        # Locate the tag referencing this URL: the last '<' before the
        # URL's first occurrence, up to and including the next '>'.
        tag_open = page.rindex('<', 0, page.index(css_url))
        tag_close = page.index('>', tag_open) + 1
        stylesheet = get_text(complete_url(css_url, base_url))
        inline_tag = '<style media="screen" type="text/css">%s</style>' % stylesheet
        page = ''.join((page[:tag_open], inline_tag, page[tag_close:]))
    return page
def embed_scripts(page, script_urls, base_url=None):
    """Inline every script URL found in *page* as a base64 data URI.

    Scripts that fail to download (HTTPError) are left as-is, matching
    the best-effort behavior of embed_pictures. Returns the modified
    page text.
    """
    for script_url in script_urls:
        print('New script: %s' % script_url)
        try:
            data_uri = get_embedded_binary(complete_url(script_url, base_url))
        except requests.exceptions.HTTPError:
            continue
        page = page.replace(script_url, data_uri)
    return page
 def is_downloaded(url: str) -> bool:
     """Check if url was already downloaded"""
     for htmlfile in os.listdir(path='.'):
@ -146,19 +81,32 @@ def process_url(url: str, dup_check: bool = False):
     if dup_check and is_downloaded(url):
         return
-    try:
-        page = get_text(url)
-        parser = TitleParser()
-        parser.feed(page)
-        page = embed_pictures(page, parser.images, base_url=url)
-        page = embed_css(page, parser.css, base_url=url)
-        page = embed_scripts(page, parser.scripts, base_url=url)
-    except requests.exceptions.HTTPError as e:
-        print(e)
-        return False
-    write_file(page, parser.title, comment=url)
+    page_content = get_text(url)
+    soup = BeautifulSoup(page_content, 'html.parser')
+
+    for img_tag in soup.find_all('img'):
+        img_url = complete_url(img_tag['src'], base_url=url)
+        print('New picture: %s' % img_url)
+        img_b64 = get_embedded_binary(img_url)
+        img_tag['src'] = img_b64
+
+    for link_tag in soup.find_all('link'):
+        link_url = complete_url(link_tag['href'], base_url=url)
+        if 'stylesheet' in link_tag['rel']:
+            print('New CSS: %s' % link_url)
+            css_tag = soup.new_tag('style', media='screen', type='text/css')
+            css_tag.string = get_text(link_url)
+            link_tag.replace_with(css_tag)
+
+    for script_tag in soup.find_all('script'):
+        if script_tag.get('src') is None:
+            continue
+        script_url = complete_url(script_tag['src'], base_url=url)
+        print('New script: %s' % script_url)
+        script_b64 = get_embedded_binary(script_url)
+        script_tag['src'] = script_b64
+
+    write_file(soup.prettify(), soup.title.text, comment=url)
 def main():
@ -192,9 +140,5 @@ def main():
     process_url(arg, dup_check=args.dup_check)
class UrlDuplicateError(Exception):
    """Signals a duplicate URL — presumably one whose page was already
    saved (raised/used elsewhere in this script; confirm against callers)."""
 if __name__ == '__main__':
     sys.exit(main())

View File

@ -1 +1,2 @@
 requests
+beautifulsoup4