#!/usr/bin/python3 import argparse import http.client import sys from bs4 import BeautifulSoup from urllib.parse import urlparse def get_page(url): '''download page and decode it to utf-8''' charset = 'utf-8' up = urlparse(url) headers = { "Host": up.netloc, "Content-Type": "text/html; charset=utf-8", "Connection": "keep-alive", } if up.scheme == 'http': conn = http.client.HTTPConnection(up.netloc) elif up.scheme == 'https': conn = http.client.HTTPSConnection(up.netloc) else: print("ERROR: invalid protocol set in '{0}'".format(url)) return False conn.request("GET", up.path, None, headers) response = conn.getresponse() # follow redirects if (response.status == http.client.MOVED_PERMANENTLY) \ or (response.status == http.client.FOUND): new_url = response.getheader('Location') print('Redirect to ' + new_url) return get_page(new_url) # get page charset from response header contenttype = response.getheader('Content-Type') if contenttype: ct_spl = contenttype.split('; ') if len(ct_spl) > 1: charset = ct_spl[1].split('=')[1] page_binary = response.read() page = page_binary.decode(charset) return page def get_title(page): soup = BeautifulSoup(page) return soup.title.string def write_file(page): fname = get_title(page) + '.html' with open(fname, 'w') as a_file: a_file.write(page) def main(): parser = argparse.ArgumentParser(description= 'Nevernote - download pages locally.') parser.add_argument('urls', metavar='URL', type=str, nargs='+', help= 'URL of page to download') args = parser.parse_args() for url in args.urls: page = get_page(url) write_file(page) if __name__ == '__main__': sys.exit(main())