rewrite HTML title parser
This commit is contained in:
parent
af948ff6fc
commit
7e43162920
20
nevernote.py
20
nevernote.py
@ -2,11 +2,17 @@
|
|||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
import http.client
|
import http.client
|
||||||
|
import html.parser
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
from bs4 import BeautifulSoup
|
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
|
|
||||||
|
class TitleParser(html.parser.HTMLParser):
    '''extract the text of the first <title> element of an HTML document

    after feed(), `title` holds the title text, or None when the document
    has no complete <title>...</title> element
    '''

    def __init__(self, strict=False, **kwargs):
        # `strict` was removed from HTMLParser in Python 3.5; it is accepted
        # here and dropped so that callers passing strict=False keep working
        super().__init__(**kwargs)
        self.title = None
        self._in_title = False
        self._chunks = []

    def handle_starttag(self, tag, attrs):
        # only the first <title> counts; later ones (e.g. inside <svg>)
        # must not overwrite the document title
        if tag == 'title' and self.title is None:
            self._in_title = True

    def handle_endtag(self, tag):
        if tag == 'title' and self._in_title:
            self._in_title = False
            self.title = ''.join(self._chunks)

    def handle_data(self, data):
        # `lasttag` stays 'title' after </title>, so it cannot be used to
        # decide whether this text belongs to the title; track state instead.
        # text may arrive in several pieces, so collect and join them
        if self._in_title:
            self._chunks.append(data)
|
||||||
|
|
||||||
|
|
||||||
def get_page(url):
|
def get_page(url):
|
||||||
'''download page and decode it to utf-8'''
|
'''download page and decode it to utf-8'''
|
||||||
charset = 'utf-8'
|
charset = 'utf-8'
|
||||||
@ -50,13 +56,11 @@ def get_page(url):
|
|||||||
return page
|
return page
|
||||||
|
|
||||||
|
|
||||||
def get_title(page):
    '''return the string of the <title> tag found in page'''
    document = BeautifulSoup(page)
    title_tag = document.title
    return title_tag.string
|
|
||||||
|
|
||||||
|
|
||||||
def write_file(page):
    '''write page to "<title>.html" in the current directory

    page is the HTML document as a str. The file name is built from the
    document title; "untitled" is used when no usable title is found.
    '''
    # HTMLParser no longer accepts the `strict` keyword (removed in
    # Python 3.5), so the parser is created without it
    parser = TitleParser()
    parser.feed(page)

    # the parser may not have set `title` at all when the page has none
    raw_title = getattr(parser, 'title', None) or ''
    # collapse newlines/tabs/runs of spaces so the name stays on one line
    title = ' '.join(raw_title.split())
    # path separators and NUL are not valid inside a file name and would
    # otherwise let the page title choose the directory written to
    for forbidden in ('/', '\\', '\0'):
        title = title.replace(forbidden, '_')

    fname = (title or 'untitled') + '.html'
    # explicit encoding: the platform default may not be able to encode
    # every character of the page
    with open(fname, 'w', encoding='utf-8') as a_file:
        a_file.write(page)
|
||||||
|
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user