Merge branch 'devel'

Maks Snegov 2016-02-04 09:10:56 +03:00
commit edd12deb37
2 changed files with 189 additions and 49 deletions


@@ -1,77 +1,217 @@
#!/usr/bin/python3
import argparse
import base64
import html.parser
import os
import re
import sys
import urllib.error
from urllib.parse import urlparse
from urllib.request import urlopen
import zlib


class UrlDuplicateError(Exception): pass

URLDUP = re.compile(r'^<!-- URL: (.*) -->$')


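# TitleParser walks the page and records the <title> text plus the src/href
# URLs of images, stylesheets and scripts so they can be inlined later.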
class TitleParser(html.parser.HTMLParser):
    def __init__(self, *args, **kwargs):
        html.parser.HTMLParser.__init__(self, *args, **kwargs)
        self.title = ''  # default so write_file() always gets a string, even if the page has no <title>
        self.images = set()
        self.css = set()
        self.scripts = set()

    def handle_starttag(self, name, attribs):
        if name == 'img':
            for attr, value in attribs:
                if attr == 'src':
                    self.images.add(value)
        elif name == 'script':
            for attr, value in attribs:
                if attr == 'src':
                    self.scripts.add(value)
        elif name == 'title':
            titletag_start = self.rawdata.index('<title')
            title_start = self.rawdata.index('>', titletag_start) + 1
            title_end = self.rawdata.index('</title>', title_start)
            self.title = self.rawdata[title_start:title_end]
        elif name == 'link':
            attr_dict = dict(attribs)
            if attr_dict.get('rel') == 'stylesheet':
                self.css.add(attr_dict['href'])


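# Example (illustrative header value): charset_header('text/html; charset=ISO-8859-1')
# returns 'ISO-8859-1'; a header without a charset parameter yields None.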
def charset_header(content_type):
    """ Parse charset from 'content-type' header

    :param content_type: string
    :return: string with character set
    """
    if 'charset' in content_type:
        return content_type.split(';')[1].split('=')[1]
    else:
        return None


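# get_text() fetches a URL, verifies the HTTP status and content-type,
# transparently un-gzips the body and decodes it with the charset advertised
# in the response headers (falling back to the caller-supplied default).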
def get_text(url, content={'text/html'}, charset='utf-8'):
    response = urlopen(url)
    if response.status != 200:
        raise urllib.error.HTTPError(
            url, response.status,
            'Incorrect HTTP status (%d, %s) for %s' % (response.status, response.reason, url),
            None, None
        )
    ctype = response.headers.get('content-type')
    if ctype is None:
        raise RuntimeError('None content type for %s' % url)
    for cnt in content:
        if ctype.startswith(cnt):
            break
    else:
        raise RuntimeError('Incorrect content-type for %s: %s' % (url, ctype))
    # get charset from 'Content-type' header
    charset = charset_header(ctype) or charset
    if response.info().get('Content-Encoding') == 'gzip':
        data = zlib.decompress(response.read(), 16 + zlib.MAX_WBITS)
    else:
        data = response.read()
    page = data.decode(charset.lower())
    return page


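# The value returned by embedded_image() is a "data:" URI, e.g.
# 'data:image/png;base64,iVBORw0...' (payload shortened here for illustration).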
def embedded_image(url):
    '''Download content from URL and return it as a base64 "data:" URI string'''
    response = urlopen(url)
    if response.status != 200:
        raise urllib.error.HTTPError(
            url, response.status,
            'Incorrect HTTP status (%d, %s) for %s' % (response.status, response.reason, url),
            None, None
        )
    ctype = response.headers.get('Content-Type')
    data = response.read()
    b64pict = base64.b64encode(data).decode()
    return 'data:%s;base64,%s' % (ctype, b64pict)


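# embed_pictures() rewrites every collected <img> URL in the page text to an
# inline data: URI; images that fail to download are silently left as-is.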
def embed_pictures(page, pict_urls, base_url=None):
    for url in pict_urls:
        print('New picture: %s' % url)
        try:
            page = page.replace(
                url, embedded_image(complete_url(url, base_url)))
        except urllib.error.HTTPError:
            pass
    return page


def embed_css(page, css_urls, base_url=None):
    # fetch charset from base URL or use default UTF-8
    base_char = 'utf-8'
    if base_url is not None:
        hdr = urlopen(base_url).headers.get('content-type')
        base_char = charset_header(hdr) if hdr is not None else None
        base_char = base_char or 'utf-8'
    for url in css_urls:
        if not url:
            continue
        print('New CSS: %s' % url)
        css_start = page.rindex('<', 0, page.index(url))
        css_end = page.index('>', css_start) + 1
        css_tag = ('<style media="screen" type="text/css">%s</style>' % get_text(
            complete_url(url, base_url), content={'text/css'}, charset=base_char))
        page = page[:css_start] + css_tag + page[css_end:]
    return page


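# embed_scripts() inlines external scripts by the same substitution mechanism
# as pictures; downloads that fail with an HTTP error are skipped.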
def embed_scripts(page, script_urls, base_url=None):
    for url in script_urls:
        print('New script: %s' % url)
        try:
            page = page.replace(
                url, embedded_image(complete_url(url, base_url)))
        except urllib.error.HTTPError:
            pass
    return page


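# url_duplicate() scans the .html files already saved in the current directory;
# their first line carries the '<!-- URL: ... -->' marker written by write_file(),
# and a match raises UrlDuplicateError instead of saving the page twice.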
def url_duplicate(url):
    for htmlfile in os.listdir():
        if not htmlfile.endswith('.html'):
            continue
        with open(htmlfile) as h:
            h_url = h.readline()
            if url in URLDUP.findall(h_url):
                raise UrlDuplicateError(
                    'URL is already saved in file "%s"' % htmlfile)


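# write_file() derives the file name from the page title, appending _2, _3, ...
# until an unused name is found, and stores the source URL as an HTML comment
# on the first line so later runs can detect duplicates.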
def write_file(page, title, comment=None):
    write_inc = lambda i: '_%d' % i if i > 1 else ''
    inc = 0
    while True:
        inc += 1
        fname = (' '.join(title.replace('/', '_').split()) + write_inc(inc))[:128] + '.html'
        if not os.path.exists(fname):
            break
    with open(fname, 'x', newline='\n') as a_file:
        print('Saving in file "%s"' % fname)
        if comment:
            a_file.write('<!-- URL: %s -->\n' % comment)
        a_file.write(page)


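# complete_url() resolves host-relative ('/style.css') and protocol-relative
# ('//cdn.example.com/app.js') references against the page URL; the example
# URLs here are illustrative only.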
def complete_url(url, base_url):
    if base_url is not None:
        base_up = urlparse(base_url)
        up = urlparse(url)
        if not up.netloc:
            url = base_up.scheme + '://' + base_up.netloc + url
        elif not up.scheme:
            url = base_up.scheme + ':' + url
    return url


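# process_url() is the per-page pipeline: skip known duplicates, download the
# page, parse it for assets, inline pictures, CSS and scripts, then save it.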
def process_url(url):
    print('Processing URL: %s' % url)
    try:
        url_duplicate(url)
    except UrlDuplicateError as e:
        print(e)
        return
    try:
        page = get_text(url)
        parser = TitleParser(strict=False)
        parser.feed(page)
        page = embed_pictures(page, parser.images, base_url=url)
        page = embed_css(page, parser.css, base_url=url)
        page = embed_scripts(page, parser.scripts, base_url=url)
    except urllib.error.HTTPError as e:
        print(e)
        return False
    write_file(page, parser.title, comment=url)


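# Each positional argument is either a URL or the path of a local text file
# listing one URL per line.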
def main():
    parser = argparse.ArgumentParser(
        description='Nevernote - download pages locally.')
    parser.add_argument('urls', metavar='URL', type=str, nargs='+',
                        help='URL of page to download')
    args = parser.parse_args()
    for arg in args.urls:
        if os.path.isfile(arg):
            print('Found file %s' % arg)
            for url in (line.strip() for line in open(arg)):
                process_url(url)
        else:
            process_url(arg)


if __name__ == '__main__':
    main()
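# Typical invocations, assuming the script is saved as nevernote.py (the file
# name is not shown in this diff) and using made-up example URLs:
#   ./nevernote.py https://example.com/article.html
#   ./nevernote.py urls.txt   # plain text file, one URL per line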