add gzip encoding support
This commit is contained in:
parent
5c9d04cf3d
commit
964e79f97b
13
nevernote.py
13
nevernote.py
@ -8,6 +8,7 @@ import re
|
|||||||
import sys
|
import sys
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
from urllib.request import urlopen
|
from urllib.request import urlopen
|
||||||
|
import zlib
|
||||||
|
|
||||||
|
|
||||||
class UrlDuplicateError(Exception): pass
|
class UrlDuplicateError(Exception): pass
|
||||||
@ -37,10 +38,10 @@ class TitleParser(html.parser.HTMLParser):
|
|||||||
|
|
||||||
|
|
||||||
def get_text(url, content='text/html'):
|
def get_text(url, content='text/html'):
|
||||||
u = urlopen(url)
|
response = urlopen(url)
|
||||||
if u.status != 200:
|
if response.status != 200:
|
||||||
raise RuntimeError('Incorrect HTTP status for %s' % url)
|
raise RuntimeError('Incorrect HTTP status for %s' % url)
|
||||||
ctype = u.headers.get('content-type')
|
ctype = response.headers.get('content-type')
|
||||||
if ctype is None:
|
if ctype is None:
|
||||||
raise RuntimeError('None content type for %s' % url)
|
raise RuntimeError('None content type for %s' % url)
|
||||||
if not ctype.startswith(content):
|
if not ctype.startswith(content):
|
||||||
@ -48,7 +49,11 @@ def get_text(url, content='text/html'):
|
|||||||
|
|
||||||
# get charset from 'Content-type' header
|
# get charset from 'Content-type' header
|
||||||
charset = ctype.split(';')[1].split('=')[1] if 'charset' in ctype else 'utf-8'
|
charset = ctype.split(';')[1].split('=')[1] if 'charset' in ctype else 'utf-8'
|
||||||
data = u.read()
|
|
||||||
|
if response.info().get('Content-Encoding') == 'gzip':
|
||||||
|
data = zlib.decompress(response.read(), 16+zlib.MAX_WBITS)
|
||||||
|
else:
|
||||||
|
data = response.read()
|
||||||
page = data.decode(charset.lower())
|
page = data.decode(charset.lower())
|
||||||
return page
|
return page
|
||||||
|
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user