Merge branch 'devel'
commit edd12deb37

238  nevernote.py
@@ -1,77 +1,217 @@
#!/usr/bin/python3

import argparse
import base64
import html.parser
import os
import re
import sys
import urllib.error
from urllib.parse import urlparse
from urllib.request import urlopen
import zlib
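
# Duplicate detection: every saved page starts with an HTML comment of the
# form "<!-- URL: ... -->" (written by write_file); URLDUP pulls that URL back
# out so url_duplicate() can skip pages that were already downloaded.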

class UrlDuplicateError(Exception): pass

URLDUP = re.compile(r'^<!-- URL: (.*) -->$')
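
# TitleParser walks the document once and records the src of every <img> and
# <script> tag, the href of every stylesheet <link>, and the text of <title>.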

class TitleParser(html.parser.HTMLParser):
    def __init__(self, *args, **kwargs):
        html.parser.HTMLParser.__init__(self, *args, **kwargs)
        self.images = set()
        self.css = set()
        self.scripts = set()

    def handle_starttag(self, name, attribs):
        if name == 'img':
            for attr, value in attribs:
                if attr == 'src':
                    self.images.add(value)
        elif name == 'script':
            for attr, value in attribs:
                if attr == 'src':
                    self.scripts.add(value)
        elif name == 'title':
            titletag_start = self.rawdata.index('<title')
            title_start = self.rawdata.index('>', titletag_start) + 1
            title_end = self.rawdata.index('</title>', title_start)
            self.title = self.rawdata[title_start:title_end]
        elif name == 'link':
            attr_dict = dict(attribs)
            if attr_dict.get('rel') == 'stylesheet':
                self.css.add(attr_dict['href'])
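
# For example, charset_header('text/html; charset=UTF-8') returns 'UTF-8',
# while a content type without a charset parameter returns None.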

def charset_header(content_type):
    """ Parse charset from 'content-type' header

    :param content_type: string
    :return: string with character set
    """
    if 'charset' in content_type:
        return content_type.split(';')[1].split('=')[1]
    else:
        return None
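
# get_text() downloads a URL, insists on an HTTP 200 response and one of the
# allowed content types, transparently gunzips the body when the server sets
# Content-Encoding: gzip, and decodes it with the charset from the header
# (falling back to the caller-supplied default).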

def get_text(url, content={'text/html'}, charset='utf-8'):
    response = urlopen(url)
    if response.status != 200:
        raise urllib.error.HTTPError(
            url, response.status,
            'Incorrect HTTP status (%d, %s) for %s' % (response.status, response.reason, url),
            None, None
        )
    ctype = response.headers.get('content-type')
    if ctype is None:
        raise RuntimeError('None content type for %s' % url)
    for cnt in content:
        if ctype.startswith(cnt):
            break
    else:
        raise RuntimeError('Incorrect content-type for %s: %s' % (url, ctype))

    # get charset from 'Content-type' header
    charset = charset_header(ctype) or charset

    if response.info().get('Content-Encoding') == 'gzip':
        data = zlib.decompress(response.read(), 16+zlib.MAX_WBITS)
    else:
        data = response.read()
    page = data.decode(charset.lower())
    return page
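
# embedded_image() turns a remote resource into a self-contained data URI,
# e.g. 'data:image/png;base64,iVBORw0...', so the saved page needs no further
# network access to display it.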

def embedded_image(url):
    '''Download content from URL and return it as a data URI string'''
    response = urlopen(url)
    if response.status != 200:
        raise urllib.error.HTTPError(
            url, response.status,
            'Incorrect HTTP status (%d, %s) for %s' % (response.status, response.reason, url),
            None, None
        )
    ctype = response.headers.get('Content-Type')
    data = response.read()
    b64pict = base64.b64encode(data).decode()
    return 'data:%s;base64,%s' % (ctype, b64pict)
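
# embed_pictures() rewrites the page in place: every occurrence of an image
# URL collected by TitleParser is replaced with its base64 data URI; images
# that fail to download are simply left untouched.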

def embed_pictures(page, pict_urls, base_url=None):
    for url in pict_urls:
        print('New picture: %s' % url)
        try:
            page = page.replace(
                url, embedded_image(complete_url(url, base_url)))
        except urllib.error.HTTPError:
            pass
    return page
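
# embed_css() replaces each stylesheet <link> tag with an inline
# <style>...</style> element containing the downloaded CSS, decoded with the
# charset advertised by the base URL (or UTF-8 if none is given).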

def embed_css(page, css_urls, base_url=None):
    # fetch charset from base URL or use default UTF-8
    base_char = None
    if base_url is not None:
        hdr = urlopen(base_url).headers.get('content-type')
        base_char = charset_header(hdr) if hdr is not None else None
    base_char = base_char or 'utf-8'
    for url in css_urls:
        if not url:
            continue
        print('New CSS: %s' % url)
        css_start = page.rindex('<', 0, page.index(url))
        css_end = page.index('>', css_start) + 1
        css_tag = ('<style media="screen" type="text/css">%s</style>' % get_text(
            complete_url(url, base_url), content={'text/css'}, charset=base_char))
        page = page[:css_start] + css_tag + page[css_end:]
    return page
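
# embed_scripts() reuses embedded_image() to inline external scripts as data
# URIs; the replacement logic mirrors embed_pictures().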

def embed_scripts(page, script_urls, base_url=None):
    for url in script_urls:
        print('New script: %s' % url)
        try:
            page = page.replace(
                url, embedded_image(complete_url(url, base_url)))
        except urllib.error.HTTPError:
            pass
    return page

def url_duplicate(url):
    for htmlfile in os.listdir():
        if not htmlfile.endswith('.html'):
            continue
        with open(htmlfile) as h:
            h_url = h.readline()
        if url in URLDUP.findall(h_url):
            raise UrlDuplicateError(
                'URL is already saved in file "%s"' % htmlfile)
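
# write_file() derives the file name from the page title (truncated to 128
# characters), e.g. a title of 'Foo / Bar' becomes 'Foo _ Bar.html', and
# appends '_2', '_3', ... when a file with that name already exists.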

def write_file(page, title, comment=None):
    write_inc = lambda i: '_%d' % i if i > 1 else ''
    inc = 0
    while True:
        inc += 1
        fname = (' '.join(title.replace('/', '_').split()) + write_inc(inc))[:128] + '.html'
        if not os.path.exists(fname):
            break

    with open(fname, 'x', newline='\n') as a_file:
        print('Saving in file "%s"' % fname)
        if comment:
            a_file.write('<!-- URL: %s -->\n' % comment)
        a_file.write(page)
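
# complete_url() resolves resource URLs against the page they came from, e.g.
# complete_url('/style.css', 'http://example.com/page') returns
# 'http://example.com/style.css', and a protocol-relative '//cdn.example.com/x'
# becomes 'http://cdn.example.com/x'.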

def complete_url(url, base_url):
    if base_url is not None:
        base_up = urlparse(base_url)
        up = urlparse(url)
        if not up.netloc:
            url = base_up.scheme + '://' + base_up.netloc + url
        elif not up.scheme:
            url = base_up.scheme + ':' + url
    return url
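
# process_url() ties the steps together: skip URLs that were already saved,
# fetch the page, parse it for resources, inline pictures, CSS and scripts,
# then write the result to the current directory.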

def process_url(url):
    print('Processing URL: %s' % url)
    try:
        url_duplicate(url)
    except UrlDuplicateError as e:
        print(e)
        return

    try:
        page = get_text(url)
        parser = TitleParser(strict=False)
        parser.feed(page)

        page = embed_pictures(page, parser.images, base_url=url)
        page = embed_css(page, parser.css, base_url=url)
        page = embed_scripts(page, parser.scripts, base_url=url)
    except urllib.error.HTTPError as e:
        print(e)
        return False

    write_file(page, parser.title, comment=url)
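
# main() accepts either URLs or plain-text files: a file argument is read
# line by line and every line is treated as a URL to download.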

def main():
    parser = argparse.ArgumentParser(
        description='Nevernote - download pages locally.')
    parser.add_argument('urls', metavar='URL', type=str, nargs='+',
                        help='URL of page to download')
    args = parser.parse_args()

    for arg in args.urls:
        if os.path.isfile(arg):
            print('Found file %s' % arg)
            for url in (line.strip() for line in open(arg)):
                process_url(url)
        else:
            process_url(arg)


if __name__ == '__main__':
    main()