Use requests library
This commit is contained in:
parent
56a7032b3e
commit
44b8a17841
70
nevernote.py
70
nevernote.py
@ -6,10 +6,9 @@ import html.parser
|
|||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
import sys
|
import sys
|
||||||
import urllib.error
|
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
from urllib.request import urlopen
|
|
||||||
import zlib
|
import requests
|
||||||
|
|
||||||
|
|
||||||
class UrlDuplicateError(Exception): pass
|
class UrlDuplicateError(Exception): pass
|
||||||
@ -43,56 +42,18 @@ class TitleParser(html.parser.HTMLParser):
|
|||||||
self.css.add(attr_dict['href'])
|
self.css.add(attr_dict['href'])
|
||||||
|
|
||||||
|
|
||||||
def charset_header(content_type):
|
def get_text(url):
|
||||||
""" Parse charset from 'content-type' header
|
response = requests.get(url)
|
||||||
:param content_type: string
|
response.raise_for_status()
|
||||||
:return: string with character set
|
return response.text
|
||||||
"""
|
|
||||||
if 'charset' in content_type:
|
|
||||||
return content_type.split(';')[1].split('=')[1]
|
|
||||||
else:
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
def get_text(url, content={'text/html'}, charset='utf-8'):
|
|
||||||
response = urlopen(url)
|
|
||||||
if response.status != 200:
|
|
||||||
raise urllib.error.HTTPError(
|
|
||||||
url, response.status,
|
|
||||||
'Incorrect HTTP status (%d, %s) for %s' % (response.status, response.reason, url),
|
|
||||||
None, None
|
|
||||||
)
|
|
||||||
ctype = response.headers.get('content-type')
|
|
||||||
if ctype is None:
|
|
||||||
raise RuntimeError('None content type for %s' % url)
|
|
||||||
for cnt in content:
|
|
||||||
if ctype.startswith(cnt):
|
|
||||||
break
|
|
||||||
else:
|
|
||||||
raise RuntimeError('Incorrect content-type for %s: %s' % (url, ctype))
|
|
||||||
|
|
||||||
# get charset from 'Content-type' header
|
|
||||||
charset = charset_header(ctype) or charset
|
|
||||||
|
|
||||||
if response.info().get('Content-Encoding') == 'gzip':
|
|
||||||
data = zlib.decompress(response.read(), 16+zlib.MAX_WBITS)
|
|
||||||
else:
|
|
||||||
data = response.read()
|
|
||||||
page = data.decode(charset.lower())
|
|
||||||
return page
|
|
||||||
|
|
||||||
|
|
||||||
def embedded_image(url):
|
def embedded_image(url):
|
||||||
'''Download content from URL and return bytes if target is image'''
|
'''Download content from URL and return bytes if target is image'''
|
||||||
response = urlopen(url)
|
response = requests.get(url)
|
||||||
if response.status != 200:
|
response.raise_for_status()
|
||||||
raise urllib.error.HTTPError(
|
|
||||||
url, response.status,
|
|
||||||
'Incorrect HTTP status (%d, %s) for %s' % (response.status, response.reason, url),
|
|
||||||
None, None
|
|
||||||
)
|
|
||||||
ctype = response.headers.get('Content-Type')
|
ctype = response.headers.get('Content-Type')
|
||||||
data = response.read()
|
data = response.content
|
||||||
b64pict = base64.b64encode(data).decode()
|
b64pict = base64.b64encode(data).decode()
|
||||||
return 'data:%s;base64,%s' % (ctype, b64pict)
|
return 'data:%s;base64,%s' % (ctype, b64pict)
|
||||||
|
|
||||||
@ -103,17 +64,12 @@ def embed_pictures(page, pict_urls, base_url=None):
|
|||||||
try:
|
try:
|
||||||
page = page.replace(
|
page = page.replace(
|
||||||
url, embedded_image(complete_url(url, base_url)))
|
url, embedded_image(complete_url(url, base_url)))
|
||||||
except urllib.error.HTTPError:
|
except requests.exceptions.HTTPError:
|
||||||
pass
|
pass
|
||||||
return page
|
return page
|
||||||
|
|
||||||
|
|
||||||
def embed_css(page, css_urls, base_url=None):
|
def embed_css(page, css_urls, base_url=None):
|
||||||
# fetch charset from base URL or use default UTF-8
|
|
||||||
if base_url is not None:
|
|
||||||
hdr = urlopen(base_url).headers.get('content-type')
|
|
||||||
base_char = charset_header(hdr) if hdr is not None else None
|
|
||||||
base_char = base_char or 'utf-8'
|
|
||||||
for url in css_urls:
|
for url in css_urls:
|
||||||
if not url:
|
if not url:
|
||||||
continue
|
continue
|
||||||
@ -121,7 +77,7 @@ def embed_css(page, css_urls, base_url=None):
|
|||||||
css_start = page.rindex('<', 0, page.index(url))
|
css_start = page.rindex('<', 0, page.index(url))
|
||||||
css_end = page.index('>', css_start) + 1
|
css_end = page.index('>', css_start) + 1
|
||||||
css_tag = ('<style media="screen" type="text/css">%s</style>' % get_text(
|
css_tag = ('<style media="screen" type="text/css">%s</style>' % get_text(
|
||||||
complete_url(url, base_url), content={'text/css'}, charset=base_char))
|
complete_url(url, base_url)))
|
||||||
page = page[:css_start] + css_tag + page[css_end:]
|
page = page[:css_start] + css_tag + page[css_end:]
|
||||||
return page
|
return page
|
||||||
|
|
||||||
@ -132,7 +88,7 @@ def embed_scripts(page, script_urls, base_url=None):
|
|||||||
try:
|
try:
|
||||||
page = page.replace(
|
page = page.replace(
|
||||||
url, embedded_image(complete_url(url, base_url)))
|
url, embedded_image(complete_url(url, base_url)))
|
||||||
except urllib.error.HTTPError:
|
except requests.exceptions.HTTPError:
|
||||||
pass
|
pass
|
||||||
return page
|
return page
|
||||||
|
|
||||||
@ -191,7 +147,7 @@ def process_url(url):
|
|||||||
page = embed_pictures(page, parser.images, base_url=url)
|
page = embed_pictures(page, parser.images, base_url=url)
|
||||||
page = embed_css(page, parser.css, base_url=url)
|
page = embed_css(page, parser.css, base_url=url)
|
||||||
page = embed_scripts(page, parser.scripts, base_url=url)
|
page = embed_scripts(page, parser.scripts, base_url=url)
|
||||||
except urllib.error.HTTPError as e:
|
except requests.exceptions.HTTPError as e:
|
||||||
print(e)
|
print(e)
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|||||||
1
requirements.txt
Normal file
1
requirements.txt
Normal file
@ -0,0 +1 @@
|
|||||||
|
requests
|
||||||
Loading…
Reference in New Issue
Block a user