add gzip encoding support

This commit is contained in:
Maks Snegov 2014-07-20 14:03:49 +04:00
parent 5c9d04cf3d
commit 964e79f97b

View File

@@ -8,6 +8,7 @@ import re
import sys import sys
from urllib.parse import urlparse from urllib.parse import urlparse
from urllib.request import urlopen from urllib.request import urlopen
import zlib
class UrlDuplicateError(Exception): pass class UrlDuplicateError(Exception): pass
@@ -37,10 +38,10 @@ class TitleParser(html.parser.HTMLParser):
def get_text(url, content='text/html'): def get_text(url, content='text/html'):
u = urlopen(url) response = urlopen(url)
if u.status != 200: if response.status != 200:
raise RuntimeError('Incorrect HTTP status for %s' % url) raise RuntimeError('Incorrect HTTP status for %s' % url)
ctype = u.headers.get('content-type') ctype = response.headers.get('content-type')
if ctype is None: if ctype is None:
raise RuntimeError('None content type for %s' % url) raise RuntimeError('None content type for %s' % url)
if not ctype.startswith(content): if not ctype.startswith(content):
@@ -48,7 +49,11 @@ def get_text(url, content='text/html'):
# get charset from 'Content-type' header # get charset from 'Content-type' header
charset = ctype.split(';')[1].split('=')[1] if 'charset' in ctype else 'utf-8' charset = ctype.split(';')[1].split('=')[1] if 'charset' in ctype else 'utf-8'
data = u.read()
if response.info().get('Content-Encoding') == 'gzip':
data = zlib.decompress(response.read(), 16+zlib.MAX_WBITS)
else:
data = response.read()
page = data.decode(charset.lower()) page = data.decode(charset.lower())
return page return page