diff --git a/nevernote.sh b/deprecated/nevernote.sh
similarity index 100%
rename from nevernote.sh
rename to deprecated/nevernote.sh
diff --git a/nevernote.py b/nevernote.py
index 0ce97fc..2628a6c 100755
--- a/nevernote.py
+++ b/nevernote.py
@@ -1,77 +1,217 @@
 #!/usr/bin/python3
 import argparse
-import http.client
+import base64
+import html.parser
+import os
+import re
 import sys
-
-from bs4 import BeautifulSoup
+import urllib.error
 from urllib.parse import urlparse
+from urllib.request import urlopen
+import zlib
 
 
-def get_page(url):
-    '''download page and decode it to utf-8'''
-    charset = 'utf-8'
-    up = urlparse(url)
+class UrlDuplicateError(Exception): pass
+URLDUP = re.compile(r'^<!-- (.*) -->$')
 
-    headers = {
-        "Host": up.netloc,
-        "Content-Type": "text/html; charset=utf-8",
-        "Connection": "keep-alive",
-    }
-    if up.scheme == 'http':
-        conn = http.client.HTTPConnection(up.netloc)
-    elif up.scheme == 'https':
-        conn = http.client.HTTPSConnection(up.netloc)
+
+class TitleParser(html.parser.HTMLParser):
+    def __init__(self, *args, **kwargs):
+        html.parser.HTMLParser.__init__(self, *args, **kwargs)
+        self.images = set()
+        self.css = set()
+        self.scripts = set()
+
+    def handle_starttag(self, name, attribs):
+        if name == 'img':
+            for attr, value in attribs:
+                if attr == 'src':
+                    self.images.add(value)
+        elif name == 'script':
+            for attr, value in attribs:
+                if attr == 'src':
+                    self.scripts.add(value)
+        elif name == 'title':
+            titletag_start = self.rawdata.index('<title')
+            title_start = self.rawdata.index('>', titletag_start) + 1
+            title_end = self.rawdata.index('</title>', title_start)
+            self.title = self.rawdata[title_start:title_end]
+        elif name == 'link':
+            attr_dict = dict(attribs)
+            if attr_dict.get('rel') == 'stylesheet':
+                self.css.add(attr_dict['href'])
+
+
+def charset_header(content_type):
+    """ Parse charset from 'content-type' header
+    :param content_type: string
+    :return: string with character set
+    """
+    if 'charset' in content_type:
+        return content_type.split(';')[1].split('=')[1]
     else:
-        print("ERROR: invalid protocol set in '{0}'".format(url))
-        return False
+        return None
 
-    conn.request("GET", up.path, None, headers)
-    response = conn.getresponse()
-    # follow redirects
-    if (response.status == http.client.MOVED_PERMANENTLY) \
-            or (response.status == http.client.FOUND):
-        new_url = response.getheader('Location')
-        print('Redirect to ' + new_url)
-        return get_page(new_url)
 
+def get_text(url, content={'text/html'}, charset='utf-8'):
+    response = urlopen(url)
+    if response.status != 200:
+        raise urllib.error.HTTPError(
+            url, response.status,
+            'Incorrect HTTP status (%d, %s) for %s' % (response.status, response.reason, url),
+            None, None
+        )
+    ctype = response.headers.get('content-type')
+    if ctype is None:
+        raise RuntimeError('None content type for %s' % url)
+    for cnt in content:
+        if ctype.startswith(cnt):
+            break
+    else:
+        raise RuntimeError('Incorrect content-type for %s: %s' % (url, ctype))
 
-    # get page charset from response header
-    contenttype = response.getheader('Content-Type')
-    if contenttype:
-        ct_spl = contenttype.split('; ')
-        if len(ct_spl) > 1:
-            charset = ct_spl[1].split('=')[1]
-
-    page_binary = response.read()
-    page = page_binary.decode(charset)
+    # get charset from 'Content-type' header
+    charset = charset_header(ctype) or charset
+    if response.info().get('Content-Encoding') == 'gzip':
+        data = zlib.decompress(response.read(), 16+zlib.MAX_WBITS)
+    else:
+        data = response.read()
+    page = data.decode(charset.lower())
     return page
 
 
-def get_title(page):
-    soup = BeautifulSoup(page)
-    return soup.title.string
+def embedded_image(url):
+    '''Download content from URL and return bytes if target is image'''
+    response = urlopen(url)
+    if response.status != 200:
+        raise urllib.error.HTTPError(
+            url, response.status,
+            'Incorrect HTTP status (%d, %s) for %s' % (response.status, response.reason, url),
+            None, None
+        )
+    ctype = response.headers.get('Content-Type')
+    data = response.read()
+    b64pict = base64.b64encode(data).decode()
+    return 'data:%s;base64,%s' % (ctype, b64pict)
 
 
-def write_file(page):
-    fname = get_title(page) + '.html'
-    with open(fname, 'w') as a_file:
+def embed_pictures(page, pict_urls, base_url=None):
+    for url in pict_urls:
+        print('New picture: %s' % url)
+        try:
+            page = page.replace(
+                url, embedded_image(complete_url(url, base_url)))
+        except urllib.error.HTTPError:
+            pass
+    return page
+
+
+def embed_css(page, css_urls, base_url=None):
+    # fetch charset from base URL or use default UTF-8
+    base_char = None
+    if base_url is not None:
+        hdr = urlopen(base_url).headers.get('content-type')
+        base_char = charset_header(hdr) if hdr is not None else None
+    base_char = base_char or 'utf-8'
+    for url in css_urls:
+        if not url:
+            continue
+        print('New CSS: %s' % url)
+        css_start = page.rindex('<', 0, page.index(url))
+        css_end = page.index('>', css_start) + 1
+        css_tag = ('<style>%s</style>' % get_text(
+            complete_url(url, base_url), content={'text/css'}, charset=base_char))
+        page = page[:css_start] + css_tag + page[css_end:]
+    return page
+
+
+def embed_scripts(page, script_urls, base_url=None):
+    for url in script_urls:
+        print('New script: %s' % url)
+        try:
+            page = page.replace(
+                url, embedded_image(complete_url(url, base_url)))
+        except urllib.error.HTTPError:
+            pass
+    return page
+
+
+def url_duplicate(url):
+    for htmlfile in os.listdir():
+        if not htmlfile.endswith('.html'):
+            continue
+        with open(htmlfile) as h:
+            h_url = h.readline()
+        if url in URLDUP.findall(h_url):
+            raise UrlDuplicateError(
+                'URL is already saved in file "%s"' % htmlfile)
+
+
+def write_file(page, title, comment=None):
+    write_inc = lambda i: '_%d' % i if i > 1 else ''
+    inc = 0
+    while True:
+        inc += 1
+        fname = (' '.join(title.replace('/', '_').split()) + write_inc(inc))[:128] + '.html'
+        if not os.path.exists(fname):
+            break
+
+    with open(fname, 'x', newline='\n') as a_file:
+        print('Saving in file "%s"' % fname)
+        if comment:
+            a_file.write('<!-- %s -->\n' % comment)
         a_file.write(page)
 
 
-def main():
-    parser = argparse.ArgumentParser(description=
-        'Nevernote - download pages locally.')
-    parser.add_argument('urls', metavar='URL', type=str, nargs='+', help=
-        'URL of page to download')
+def complete_url(url, base_url):
+    if base_url is not None:
+        base_up = urlparse(base_url)
+        up = urlparse(url)
+        if not up.netloc:
+            url = base_up.scheme + '://' + base_up.netloc + url
+        elif not up.scheme:
+            url = base_up.scheme + ':' + url
+    return url
+
+
+def process_url(url):
+    print('Processing URL: %s' % url)
+    try:
+        url_duplicate(url)
+    except UrlDuplicateError as e:
+        print(e)
+        return
+
+    try:
+        page = get_text(url)
+        parser = TitleParser(strict=False)
+        parser.feed(page)
+
+        page = embed_pictures(page, parser.images, base_url=url)
+        page = embed_css(page, parser.css, base_url=url)
+        page = embed_scripts(page, parser.scripts, base_url=url)
+    except urllib.error.HTTPError as e:
+        print(e)
+        return False
+
+    write_file(page, parser.title, comment=url)
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description='Nevernote - download pages locally.')
+    parser.add_argument('urls', metavar='URL', type=str, nargs='+',
+                        help='URL of page to download')
     args = parser.parse_args()
-    for url in args.urls:
-        page = get_page(url)
-        write_file(page)
+    for arg in args.urls:
+        if os.path.isfile(arg):
+            print('Found file %s' % arg)
+            for url in (line.strip() for line in open(arg)):
+                process_url(url)
+        else:
+            process_url(arg)
 
 
 if __name__ == '__main__':
     main()
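
Reviewer note: the patch leans on two idioms that are easy to misread, the wbits=16+zlib.MAX_WBITS form that makes zlib accept a gzip header in get_text(), and the data: URI built by embedded_image(). A minimal standalone sketch of both, with made-up payloads (not part of the patch):

    import base64
    import gzip
    import zlib

    # Content-Encoding: gzip bodies carry a gzip header; wbits=16+zlib.MAX_WBITS
    # tells zlib to expect and skip that header, as get_text() does.
    body = gzip.compress(b'<html><title>hi</title></html>')
    print(zlib.decompress(body, 16 + zlib.MAX_WBITS))

    # embedded_image() inlines a resource as a data: URI, i.e. the content type
    # plus the base64-encoded payload. These bytes are made up, not a real image.
    fake_png = b'\x89PNG\r\n\x1a\n'
    print('data:%s;base64,%s' % ('image/png', base64.b64encode(fake_png).decode()))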