Merge branch 'devel'

Maks Snegov 2016-02-04 09:10:56 +03:00
commit edd12deb37
2 changed files with 189 additions and 49 deletions


@@ -1,77 +1,217 @@
#!/usr/bin/python3
import argparse
import base64
import html.parser
import os
import re
import sys
import urllib.error
from urllib.parse import urlparse
from urllib.request import urlopen
import zlib

class UrlDuplicateError(Exception): pass

URLDUP = re.compile(r'^<!-- URL: (.*) -->$')

class TitleParser(html.parser.HTMLParser):
    """Collect the page title and the URLs of images, stylesheets
    and scripts while parsing."""

    def __init__(self, *args, **kwargs):
        html.parser.HTMLParser.__init__(self, *args, **kwargs)
        self.images = set()
        self.css = set()
        self.scripts = set()

    def handle_starttag(self, name, attribs):
        if name == 'img':
            for attr, value in attribs:
                if attr == 'src':
                    self.images.add(value)
        elif name == 'script':
            for attr, value in attribs:
                if attr == 'src':
                    self.scripts.add(value)
        elif name == 'title':
            # the title text is not available from the start tag itself,
            # so cut it out of the raw, not-yet-consumed input buffer
            titletag_start = self.rawdata.index('<title')
            title_start = self.rawdata.index('>', titletag_start) + 1
            title_end = self.rawdata.index('</title>', title_start)
            self.title = self.rawdata[title_start:title_end]
        elif name == 'link':
            attr_dict = dict(attribs)
            if attr_dict.get('rel') == 'stylesheet':
                self.css.add(attr_dict['href'])
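
# A minimal usage sketch (hypothetical markup; assumes the whole document is
# fed in a single feed() call, since the title is cut out of self.rawdata):
#   p = TitleParser(strict=False)
#   p.feed('<html><head><title>Demo</title>'
#          '<link rel="stylesheet" href="/s.css"></head>'
#          '<body><img src="/i.png"><script src="/a.js"></script></body></html>')
#   # p.title == 'Demo', p.images == {'/i.png'},
#   # p.css == {'/s.css'}, p.scripts == {'/a.js'}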

def charset_header(content_type):
    """Parse charset from a 'Content-Type' header

    :param content_type: string
    :return: string with character set, or None
    """
    if 'charset' in content_type:
        return content_type.split(';')[1].split('=')[1]
    else:
        return None

def get_text(url, content={'text/html'}, charset='utf-8'):
    response = urlopen(url)
    if response.status != 200:
        raise urllib.error.HTTPError(
            url, response.status,
            'Incorrect HTTP status (%d, %s) for %s' % (response.status, response.reason, url),
            None, None
        )
    ctype = response.headers.get('content-type')
    if ctype is None:
        raise RuntimeError('None content type for %s' % url)
    for cnt in content:
        if ctype.startswith(cnt):
            break
    else:
        raise RuntimeError('Incorrect content-type for %s: %s' % (url, ctype))
    # get charset from 'Content-Type' header
    charset = charset_header(ctype) or charset
    if response.info().get('Content-Encoding') == 'gzip':
        data = zlib.decompress(response.read(), 16 + zlib.MAX_WBITS)
    else:
        data = response.read()
    page = data.decode(charset.lower())
    return page

def embedded_image(url):
    '''Download content from URL and return it as a base64 data: URI'''
    response = urlopen(url)
    if response.status != 200:
        raise urllib.error.HTTPError(
            url, response.status,
            'Incorrect HTTP status (%d, %s) for %s' % (response.status, response.reason, url),
            None, None
        )
    ctype = response.headers.get('Content-Type')
    data = response.read()
    b64pict = base64.b64encode(data).decode()
    return 'data:%s;base64,%s' % (ctype, b64pict)
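
# Example of the return shape (hypothetical bytes): for a PNG at
# http://example.com/dot.png this yields a string like
#   'data:image/png;base64,iVBORw0KGgo...'
# which embed_pictures()/embed_scripts() substitute for the original src value.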

def embed_pictures(page, pict_urls, base_url=None):
    for url in pict_urls:
        print('New picture: %s' % url)
        try:
            page = page.replace(
                url, embedded_image(complete_url(url, base_url)))
        except urllib.error.HTTPError:
            pass
    return page

def embed_css(page, css_urls, base_url=None):
    # fetch charset from base URL or use default UTF-8
    base_char = None
    if base_url is not None:
        hdr = urlopen(base_url).headers.get('content-type')
        base_char = charset_header(hdr) if hdr is not None else None
    base_char = base_char or 'utf-8'
    for url in css_urls:
        if not url:
            continue
        print('New CSS: %s' % url)
        # replace the whole <link ...> tag with an inline <style> block
        css_start = page.rindex('<', 0, page.index(url))
        css_end = page.index('>', css_start) + 1
        css_tag = ('<style media="screen" type="text/css">%s</style>' % get_text(
            complete_url(url, base_url), content={'text/css'}, charset=base_char))
        page = page[:css_start] + css_tag + page[css_end:]
    return page

def embed_scripts(page, script_urls, base_url=None):
    # scripts are inlined the same way as images: embedded_image() turns
    # the src URL into a base64 data: URI
    for url in script_urls:
        print('New script: %s' % url)
        try:
            page = page.replace(
                url, embedded_image(complete_url(url, base_url)))
        except urllib.error.HTTPError:
            pass
    return page

def url_duplicate(url):
    for htmlfile in os.listdir():
        if not htmlfile.endswith('.html'):
            continue
        with open(htmlfile) as h:
            h_url = h.readline()
        if url in URLDUP.findall(h_url):
            raise UrlDuplicateError(
                'URL is already saved in file "%s"' % htmlfile)
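
# Note: the marker matched by URLDUP above is the first line that write_file()
# below emits; a saved page beginning with
#   <!-- URL: http://example.com/ -->
# (hypothetical URL) makes a later run on the same URL raise UrlDuplicateError.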

def write_file(page, title, comment=None):
    write_inc = lambda i: '_%d' % i if i > 1 else ''
    inc = 0
    while True:
        inc += 1
        fname = (' '.join(title.replace('/', '_').split()) + write_inc(inc))[:128] + '.html'
        if not os.path.exists(fname):
            break
    with open(fname, 'x', newline='\n') as a_file:
        print('Saving in file "%s"' % fname)
        if comment:
            a_file.write('<!-- URL: %s -->\n' % comment)
        a_file.write(page)

def complete_url(url, base_url):
    if base_url is not None:
        # urlparse(None) raises TypeError, so only parse when a base is given
        base_up = urlparse(base_url)
        up = urlparse(url)
        if not up.netloc:
            url = base_up.scheme + '://' + base_up.netloc + url
        elif not up.scheme:
            url = base_up.scheme + ':' + url
    return url

def process_url(url):
    print('Processing URL: %s' % url)
    try:
        url_duplicate(url)
    except UrlDuplicateError as e:
        print(e)
        return
    try:
        page = get_text(url)
        parser = TitleParser(strict=False)
        parser.feed(page)
        page = embed_pictures(page, parser.images, base_url=url)
        page = embed_css(page, parser.css, base_url=url)
        page = embed_scripts(page, parser.scripts, base_url=url)
    except urllib.error.HTTPError as e:
        print(e)
        return False
    write_file(page, parser.title, comment=url)

def main():
    parser = argparse.ArgumentParser(
        description='Nevernote - download pages locally.')
    parser.add_argument('urls', metavar='URL', type=str, nargs='+',
                        help='URL of page to download')
    args = parser.parse_args()
    for arg in args.urls:
        if os.path.isfile(arg):
            # an argument naming a local file is treated as a list of URLs
            print('Found file %s' % arg)
            for url in (line.strip() for line in open(arg)):
                process_url(url)
        else:
            process_url(arg)

if __name__ == '__main__':
    main()
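
For reference, a minimal sketch of exercising the new entry points directly, assuming the script is importable as a module named nevernote (a hypothetical name; as committed it is only run as a script):

    import nevernote

    # complete_url() resolves root-relative and scheme-relative references
    # against the page URL:
    print(nevernote.complete_url('/style.css', 'http://example.com/page'))
    # -> 'http://example.com/style.css'
    print(nevernote.complete_url('//cdn.example.com/app.js', 'https://example.com/'))
    # -> 'https://cdn.example.com/app.js'

    # process_url() skips URLs already saved in the working directory, embeds
    # images, CSS and scripts, and writes '<title>.html' prefixed with an
    # '<!-- URL: ... -->' marker line (network access required):
    nevernote.process_url('http://example.com/')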