replace http.client with urllib.request (urlopen)

This commit is contained in:
Maks Snegov 2014-07-20 08:09:07 +04:00
parent eb2c43f438
commit 716c61f6f1

View File

@ -2,12 +2,11 @@
import argparse import argparse
import base64 import base64
import http.client
import html.parser import html.parser
import os.path import os.path
import sys import sys
from urllib.parse import urlparse from urllib.parse import urlparse
import zlib from urllib.request import urlopen
class InfiniteRedirects(Exception): pass class InfiniteRedirects(Exception): pass
@ -35,74 +34,29 @@ class TitleParser(html.parser.HTMLParser):
self.css.add(attr_dict['href']) self.css.add(attr_dict['href'])
def download_content(url, depth=0): def get_text(url, content='text/html'):
'''download page and decode it to utf-8''' u = urlopen(url)
if depth > 10: if u.status != 200:
raise InfiniteRedirects('too much redirects: %s' % url) raise RuntimeError('Incorrect HTTP status for %s' % url)
ctype = u.headers.get('content-type')
up = urlparse(url) if ctype is None:
if not up.netloc: raise RuntimeError('None content type for %s' % url)
up = urlparse('//' + url) if not ctype.startswith(content):
raise RuntimeError('Incorrect content-type for %s: %s' % (url, ctype))
headers = { encoding = ctype.split(';')[1].split('=')[1].lower()
"Host": up.netloc, data = u.read()
"Connection": "keep-alive", page = data.decode(encoding)
}
if not up.scheme or up.scheme == 'http':
conn = http.client.HTTPConnection(up.netloc)
elif up.scheme == 'https':
conn = http.client.HTTPSConnection(up.netloc)
else:
raise NotImplementedError("protocol %s is not implemented" % up.scheme)
requrl = ('?'.join((up.path, up.query)) if up.query else up.path) or '/'
conn.request("GET", requrl, None, headers)
response = conn.getresponse()
# follow redirects
if ((response.status == http.client.MOVED_PERMANENTLY)
or (response.status == http.client.FOUND)):
new_url = response.getheader('Location')
print('Redirecting to ' + new_url)
return download_content(new_url, depth+1)
return response
def get_page(url):
response = download_content(url)
# get page charset from response header
c_type = response.getheader('Content-Type')
if not c_type.startswith('text'):
raise ValueError('incorrect Content-Type for HTML page: %s' % c_type)
c_encoding = response.getheader('Content-Encoding')
if c_encoding:
if c_encoding == 'gzip':
page_binary = zlib.decompress(response.read(), 16+zlib.MAX_WBITS)
else:
raise NotImplementedError(
'content encoding %s is not implemented' % c_encoding)
else:
page_binary = response.read()
charset = 'iso-8859-1'
ct_spl = c_type.split('; ')
if len(ct_spl) > 1:
charset = ct_spl[1].split('=')[1]
page = page_binary.decode(charset, errors='ignore')
return page return page
def embedded_image(url): def embedded_image(url):
'''Download content from URL and return bytes if target is image''' '''Download content from URL and return bytes if target is image'''
response = download_content(url) u = urlopen(url)
ctype = response.getheader('Content-Type') if u.getcode() != 200:
if not ctype or not ctype.startswith('image'): raise RuntimeError('Incorrect status for %s' % url)
raise ValueError('incorrect Content-Type for image: %s' % ctype) ctype = u.headers.get('Content-Type')
b64pict = base64.b64encode(response.read()).decode() data = u.read()
b64pict = base64.b64encode(data).decode()
return 'data:%s;base64,%s' % (ctype, b64pict) return 'data:%s;base64,%s' % (ctype, b64pict)
@ -122,14 +76,11 @@ def embed_css(page, css_urls, base_url=None):
if not url: if not url:
continue continue
print('New CSS: %s' % url) print('New CSS: %s' % url)
try: css_start = page.rindex('<', 0, page.index(url))
css_start = page.rindex('<', 0, page.index(url)) css_end = page.index('>', css_start) + 1
css_end = page.index('>', css_start) + 1 css_tag = ('<style media="screen" type="text/css">%s</style>'
css = ('<style media="screen" type="text/css">%s</style>' % get_text(complete_url(url, base_url), 'text/css'))
% get_page(complete_url(url, base_url))) page = page[:css_start] + css_tag + page[css_end:]
page = page[:css_start] + css + page[css_end:]
except (InfiniteRedirects, ConnectionRefusedError):
pass
return page return page
@ -150,10 +101,11 @@ def write_file(page, title, comment=None):
def complete_url(url, base_url): def complete_url(url, base_url):
base_up = urlparse(base_url)
if base_url is not None: if base_url is not None:
up = urlparse(url) up = urlparse(url)
if not up.netloc: if not up.netloc:
url = '//' + urlparse(base_url).netloc + url url = base_up.scheme + '://' + base_up.netloc + url
return url return url
@ -165,7 +117,7 @@ def main():
args = parser.parse_args() args = parser.parse_args()
for url in args.urls: for url in args.urls:
page = get_page(url) page = get_text(url)
parser = TitleParser(strict=False) parser = TitleParser(strict=False)
parser.feed(page) parser.feed(page)