Rewrite HTML title parser: replace BeautifulSoup with an html.parser.HTMLParser subclass

This commit is contained in:
Maks Snegov 2014-06-01 23:20:42 +04:00
parent af948ff6fc
commit 7e43162920

View File

@ -2,11 +2,17 @@
import argparse import argparse
import http.client import http.client
import html.parser
import sys import sys
from bs4 import BeautifulSoup
from urllib.parse import urlparse from urllib.parse import urlparse
class TitleParser(html.parser.HTMLParser):
    '''collect the text of the first <title> element of an HTML page

    After feed(), the text is available as the `title` attribute; it is
    None when no title text has been seen.
    '''

    def __init__(self, strict=False, **kwargs):
        # `strict` is accepted only for backward compatibility and is
        # ignored: HTMLParser removed the argument in Python 3.5, so
        # forwarding it would raise TypeError on current interpreters.
        super().__init__(**kwargs)

    def reset(self):
        '''clear parser state, including any previously collected title'''
        super().reset()
        self.title = None
        self._in_title = False
        self._title_done = False

    def handle_starttag(self, tag, attrs):
        # only the first <title> counts; later ones (e.g. inside inline
        # SVG) must not replace the document title
        if tag == 'title' and not self._title_done:
            self._in_title = True

    def handle_endtag(self, tag):
        if tag == 'title' and self._in_title:
            self._in_title = False
            self._title_done = True

    def handle_data(self, data):
        # `lasttag` stays 'title' until the next start tag, so it cannot
        # be used here: text following </title> would overwrite the title.
        # Track the open/close state explicitly and accumulate, because
        # the text of one element may arrive in several chunks.
        if self._in_title:
            self.title = (self.title or '') + data
def get_page(url): def get_page(url):
'''download page and decode it to utf-8''' '''download page and decode it to utf-8'''
charset = 'utf-8' charset = 'utf-8'
@ -50,13 +56,11 @@ def get_page(url):
return page return page
def get_title(page):
    '''return the string of the page's <title> tag, parsed with BeautifulSoup'''
    return BeautifulSoup(page).title.string
def write_file(page):
    '''save page as "<title>.html" in the current directory

    The file name is built from the text of the page's <title>; the name
    "untitled.html" is used when no usable title is found.
    '''
    # `strict` is not passed: HTMLParser removed that argument in
    # Python 3.5 and raises TypeError when it is given
    parser = TitleParser()
    parser.feed(page)
    # the title may be missing or empty when the page has no <title> text
    title = getattr(parser, 'title', None) or ''
    # collapse whitespace/newlines and replace path separators so the
    # page's title cannot break the path or point outside this directory
    title = ' '.join(title.split()).replace('/', '_').replace('\\', '_')
    fname = (title or 'untitled') + '.html'
    # write utf-8 explicitly instead of relying on the platform default,
    # which may be unable to encode some characters of the page
    with open(fname, 'w', encoding='utf-8') as a_file:
        a_file.write(page)