Use requests library

This commit is contained in:
Maks Snegov 2019-10-22 11:33:06 +03:00
parent 56a7032b3e
commit 44b8a17841
2 changed files with 14 additions and 57 deletions

View File

@ -6,10 +6,9 @@ import html.parser
import os import os
import re import re
import sys import sys
import urllib.error
from urllib.parse import urlparse from urllib.parse import urlparse
from urllib.request import urlopen
import zlib import requests
class UrlDuplicateError(Exception): pass class UrlDuplicateError(Exception): pass
@ -43,56 +42,18 @@ class TitleParser(html.parser.HTMLParser):
self.css.add(attr_dict['href']) self.css.add(attr_dict['href'])
def charset_header(content_type): def get_text(url):
""" Parse charset from 'content-type' header response = requests.get(url)
:param content_type: string response.raise_for_status()
:return: string with character set return response.text
"""
if 'charset' in content_type:
return content_type.split(';')[1].split('=')[1]
else:
return None
def get_text(url, content={'text/html'}, charset='utf-8'):
response = urlopen(url)
if response.status != 200:
raise urllib.error.HTTPError(
url, response.status,
'Incorrect HTTP status (%d, %s) for %s' % (response.status, response.reason, url),
None, None
)
ctype = response.headers.get('content-type')
if ctype is None:
raise RuntimeError('None content type for %s' % url)
for cnt in content:
if ctype.startswith(cnt):
break
else:
raise RuntimeError('Incorrect content-type for %s: %s' % (url, ctype))
# get charset from 'Content-type' header
charset = charset_header(ctype) or charset
if response.info().get('Content-Encoding') == 'gzip':
data = zlib.decompress(response.read(), 16+zlib.MAX_WBITS)
else:
data = response.read()
page = data.decode(charset.lower())
return page
def embedded_image(url): def embedded_image(url):
'''Download content from URL and return bytes if target is image''' '''Download content from URL and return bytes if target is image'''
response = urlopen(url) response = requests.get(url)
if response.status != 200: response.raise_for_status()
raise urllib.error.HTTPError(
url, response.status,
'Incorrect HTTP status (%d, %s) for %s' % (response.status, response.reason, url),
None, None
)
ctype = response.headers.get('Content-Type') ctype = response.headers.get('Content-Type')
data = response.read() data = response.content
b64pict = base64.b64encode(data).decode() b64pict = base64.b64encode(data).decode()
return 'data:%s;base64,%s' % (ctype, b64pict) return 'data:%s;base64,%s' % (ctype, b64pict)
@ -103,17 +64,12 @@ def embed_pictures(page, pict_urls, base_url=None):
try: try:
page = page.replace( page = page.replace(
url, embedded_image(complete_url(url, base_url))) url, embedded_image(complete_url(url, base_url)))
except urllib.error.HTTPError: except requests.exceptions.HTTPError:
pass pass
return page return page
def embed_css(page, css_urls, base_url=None): def embed_css(page, css_urls, base_url=None):
# fetch charset from base URL or use default UTF-8
if base_url is not None:
hdr = urlopen(base_url).headers.get('content-type')
base_char = charset_header(hdr) if hdr is not None else None
base_char = base_char or 'utf-8'
for url in css_urls: for url in css_urls:
if not url: if not url:
continue continue
@ -121,7 +77,7 @@ def embed_css(page, css_urls, base_url=None):
css_start = page.rindex('<', 0, page.index(url)) css_start = page.rindex('<', 0, page.index(url))
css_end = page.index('>', css_start) + 1 css_end = page.index('>', css_start) + 1
css_tag = ('<style media="screen" type="text/css">%s</style>' % get_text( css_tag = ('<style media="screen" type="text/css">%s</style>' % get_text(
complete_url(url, base_url), content={'text/css'}, charset=base_char)) complete_url(url, base_url)))
page = page[:css_start] + css_tag + page[css_end:] page = page[:css_start] + css_tag + page[css_end:]
return page return page
@ -132,7 +88,7 @@ def embed_scripts(page, script_urls, base_url=None):
try: try:
page = page.replace( page = page.replace(
url, embedded_image(complete_url(url, base_url))) url, embedded_image(complete_url(url, base_url)))
except urllib.error.HTTPError: except requests.exceptions.HTTPError:
pass pass
return page return page
@ -191,7 +147,7 @@ def process_url(url):
page = embed_pictures(page, parser.images, base_url=url) page = embed_pictures(page, parser.images, base_url=url)
page = embed_css(page, parser.css, base_url=url) page = embed_css(page, parser.css, base_url=url)
page = embed_scripts(page, parser.scripts, base_url=url) page = embed_scripts(page, parser.scripts, base_url=url)
except urllib.error.HTTPError as e: except requests.exceptions.HTTPError as e:
print(e) print(e)
return False return False

1
requirements.txt Normal file
View File

@ -0,0 +1 @@
requests