Fix CSS charset error, add urllib.error.HTTPError

This commit is contained in:
Maks Snegov 2014-07-20 17:04:56 +04:00
parent 964e79f97b
commit b5ddae0ef8

View File

@ -6,6 +6,7 @@ import html.parser
import os import os
import re import re
import sys import sys
import urllib.error
from urllib.parse import urlparse from urllib.parse import urlparse
from urllib.request import urlopen from urllib.request import urlopen
import zlib import zlib
@ -37,10 +38,22 @@ class TitleParser(html.parser.HTMLParser):
self.css.add(attr_dict['href']) self.css.add(attr_dict['href'])
def get_text(url, content='text/html'): def charset_header(content_type):
""" Parse charset from 'content-type' header
:param content_type: string
:return: string with character set
"""
if 'charset' in content_type:
return content_type.split(';')[1].split('=')[1]
else:
return None
def get_text(url, content='text/html', charset='utf-8'):
response = urlopen(url) response = urlopen(url)
if response.status != 200: if response.status != 200:
raise RuntimeError('Incorrect HTTP status for %s' % url) raise urllib.error.HTTPError('Incorrect HTTP status (%d, %s) for %s' % (
response.status, response.reason, url))
ctype = response.headers.get('content-type') ctype = response.headers.get('content-type')
if ctype is None: if ctype is None:
raise RuntimeError('None content type for %s' % url) raise RuntimeError('None content type for %s' % url)
@ -48,7 +61,7 @@ def get_text(url, content='text/html'):
raise RuntimeError('Incorrect content-type for %s: %s' % (url, ctype)) raise RuntimeError('Incorrect content-type for %s: %s' % (url, ctype))
# get charset from 'Content-type' header # get charset from 'Content-type' header
charset = ctype.split(';')[1].split('=')[1] if 'charset' in ctype else 'utf-8' charset = charset_header(ctype) or charset
if response.info().get('Content-Encoding') == 'gzip': if response.info().get('Content-Encoding') == 'gzip':
data = zlib.decompress(response.read(), 16+zlib.MAX_WBITS) data = zlib.decompress(response.read(), 16+zlib.MAX_WBITS)
@ -61,8 +74,9 @@ def get_text(url, content='text/html'):
def embedded_image(url): def embedded_image(url):
'''Download content from URL and return bytes if target is image''' '''Download content from URL and return bytes if target is image'''
u = urlopen(url) u = urlopen(url)
if u.getcode() != 200: if u.status != 200:
raise RuntimeError('Incorrect status for %s' % url) raise urllib.error.HTTPError('Incorrect HTTP status (%d, %s) for %s' % (
u.status, u.reason, url))
ctype = u.headers.get('Content-Type') ctype = u.headers.get('Content-Type')
data = u.read() data = u.read()
b64pict = base64.b64encode(data).decode() b64pict = base64.b64encode(data).decode()
@ -75,20 +89,23 @@ def embed_pictures(page, pict_urls, base_url=None):
try: try:
page = page.replace( page = page.replace(
url, embedded_image(complete_url(url, base_url))) url, embedded_image(complete_url(url, base_url)))
except (ValueError, ConnectionRefusedError): except (IncorrectHTTPStatus, urllib.error.HTTPError):
pass pass
return page return page
def embed_css(page, css_urls, base_url=None): def embed_css(page, css_urls, base_url=None):
if base_url is not None:
hdr = urlopen(base_url).headers.get('content-type')
base_char = charset_header(hdr) if hdr is not None else 'utf-8'
for url in css_urls: for url in css_urls:
if not url: if not url:
continue continue
print('New CSS: %s' % url) print('New CSS: %s' % url)
css_start = page.rindex('<', 0, page.index(url)) css_start = page.rindex('<', 0, page.index(url))
css_end = page.index('>', css_start) + 1 css_end = page.index('>', css_start) + 1
css_tag = ('<style media="screen" type="text/css">%s</style>' css_tag = ('<style media="screen" type="text/css">%s</style>' % get_text(
% get_text(complete_url(url, base_url), 'text/css')) complete_url(url, base_url), content='text/css',charset=base_char))
page = page[:css_start] + css_tag + page[css_end:] page = page[:css_start] + css_tag + page[css_end:]
return page return page