Add support for gzip-encoded HTTP responses (Content-Encoding: gzip) in get_text

2014-07-20 14:03:49 +04:00
parent 5c9d04cf3d
commit 964e79f97b
1 changed files with 9 additions and 4 deletions
--- a/nevernote.py
+++ b/nevernote.py
@@ -8,6 +8,7 @@ import re
 import sys
 from urllib.parse import urlparse
 from urllib.request import urlopen
+import zlib


 class UrlDuplicateError(Exception): pass
@@ -37,10 +38,10 @@ class TitleParser(html.parser.HTMLParser):


 def get_text(url, content='text/html'):
-    u = urlopen(url)
-    if u.status != 200:
+    response = urlopen(url)
+    if response.status != 200:
        raise RuntimeError('Incorrect HTTP status for %s' % url)
-    ctype = u.headers.get('content-type')
+    ctype = response.headers.get('content-type')
    if ctype is None:
        raise RuntimeError('None content type for %s' % url)
    if not ctype.startswith(content):
@@ -48,7 +49,11 @@ def get_text(url, content='text/html'):

    # get charset from 'Content-type' header
    charset = ctype.split(';')[1].split('=')[1] if 'charset' in ctype else 'utf-8'
-    data = u.read()
+
+    if response.info().get('Content-Encoding') == 'gzip':
+        data = zlib.decompress(response.read(), 16+zlib.MAX_WBITS)
+    else:
+        data = response.read()
    page = data.decode(charset.lower())
    return page