Add protection against infinite redirects

This commit is contained in:
Maks Snegov 2014-06-22 11:47:21 +04:00
parent 11de357865
commit 5b91bef896

View File

@ -10,6 +10,9 @@ from urllib.parse import urlparse
import zlib import zlib
class InfiniteRedirects(Exception):
    """Raised when a download follows more redirects than the allowed limit."""
class TitleParser(html.parser.HTMLParser): class TitleParser(html.parser.HTMLParser):
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
html.parser.HTMLParser.__init__(self, *args, **kwargs) html.parser.HTMLParser.__init__(self, *args, **kwargs)
@ -26,8 +29,11 @@ class TitleParser(html.parser.HTMLParser):
self.title = self.rawdata[title_start:title_end] self.title = self.rawdata[title_start:title_end]
def download_content(url): def download_content(url, depth=0):
'''download page and decode it to utf-8''' '''download page and decode it to utf-8'''
if depth > 10:
raise InfiniteRedirects('too much redirects: %s' % url)
up = urlparse(url) up = urlparse(url)
if not up.scheme: if not up.scheme:
up = urlparse('//' + url) up = urlparse('//' + url)
@ -53,7 +59,7 @@ def download_content(url):
or (response.status == http.client.FOUND)): or (response.status == http.client.FOUND)):
new_url = response.getheader('Location') new_url = response.getheader('Location')
print('Redirecting to ' + new_url) print('Redirecting to ' + new_url)
return download_content(new_url) return download_content(new_url, depth+1)
return response return response
@ -99,7 +105,7 @@ def embed_pictures(page, pict_urls):
print('New picture: %s' % url) print('New picture: %s' % url)
try: try:
page = page.replace(url, embedded_image(url)) page = page.replace(url, embedded_image(url))
except (ValueError): except (ValueError, InfiniteRedirects):
pass pass
return page return page