separate download_content() from get_page()
This commit is contained in:
parent
2f6c877493
commit
5b05f3e8d0
21
nevernote.py
21
nevernote.py
@@ -13,10 +13,8 @@ class TitleParser(html.parser.HTMLParser):
|
||||
self.title = data
|
||||
|
||||
|
||||
def get_page(url):
|
||||
def download_content(url):
|
||||
'''download page and decode it to utf-8'''
|
||||
charset = 'utf-8'
|
||||
|
||||
up = urlparse(url)
|
||||
if not up.scheme:
|
||||
up = urlparse('http://' + url)
|
||||
@@ -33,7 +31,6 @@ def get_page(url):
|
||||
conn = http.client.HTTPSConnection(up.netloc)
|
||||
else:
|
||||
raise NotImplementedError("protocol %s is not implemented" % up.scheme)
|
||||
return False
|
||||
|
||||
conn.request("GET", up.path, None, headers)
|
||||
response = conn.getresponse()
|
||||
@@ -43,12 +40,20 @@ def get_page(url):
|
||||
or (response.status == http.client.FOUND):
|
||||
new_url = response.getheader('Location')
|
||||
print('Redirect to ' + new_url)
|
||||
return get_page(new_url)
|
||||
return download_content(new_url)
|
||||
return response
|
||||
|
||||
|
||||
def get_page(url):
|
||||
response = download_content(url)
|
||||
|
||||
# get page charset from response header
|
||||
contenttype = response.getheader('Content-Type')
|
||||
if contenttype:
|
||||
ct_spl = contenttype.split('; ')
|
||||
c_type = response.getheader('Content-Type')
|
||||
if not c_type.startswith('text'):
|
||||
raise ValueError('incorrect Content-Type for HTML page: %s' % c_type)
|
||||
|
||||
charset = 'iso-8859-1'
|
||||
ct_spl = c_type.split('; ')
|
||||
if len(ct_spl) > 1:
|
||||
charset = ct_spl[1].split('=')[1]
|
||||
|
||||
|
||||
Loading…
Reference in New Issue
Block a user