Retry failed page content downloads (e.g. timeouts).
This commit is contained in:
parent
ec33276fd7
commit
110a67cda4
1 changed file with 7 additions and 1 deletion
|
@@ -108,7 +108,10 @@ def getPageContent(url, session, max_content_bytes=MaxContentBytes):
|
||||||
"""Get text content of given URL."""
|
"""Get text content of given URL."""
|
||||||
check_robotstxt(url, session)
|
check_robotstxt(url, session)
|
||||||
# read page data
|
# read page data
|
||||||
page = urlopen(url, session, max_content_bytes=max_content_bytes, stream=False)
|
try:
|
||||||
|
page = urlopen(url, session, max_content_bytes=max_content_bytes, stream=False)
|
||||||
|
except IOError:
|
||||||
|
page = urlopen(url, session, max_content_bytes=max_content_bytes, stream=False)
|
||||||
data = page.text
|
data = page.text
|
||||||
tries = MaxRetries
|
tries = MaxRetries
|
||||||
while not isValidPageContent(data) and tries > 0:
|
while not isValidPageContent(data) and tries > 0:
|
||||||
|
@@ -147,6 +150,9 @@ def fetchUrls(url, data, baseUrl, urlSearch):
|
||||||
raise ValueError("Pattern %s matched empty URL at %s." % (search.pattern, url))
|
raise ValueError("Pattern %s matched empty URL at %s." % (search.pattern, url))
|
||||||
out.debug('matched URL %r with pattern %s' % (searchUrl, search.pattern))
|
out.debug('matched URL %r with pattern %s' % (searchUrl, search.pattern))
|
||||||
searchUrls.append(normaliseURL(urlparse.urljoin(baseUrl, searchUrl)))
|
searchUrls.append(normaliseURL(urlparse.urljoin(baseUrl, searchUrl)))
|
||||||
|
if searchUrls:
|
||||||
|
# do not search other links if one pattern matched
|
||||||
|
break
|
||||||
if not searchUrls:
|
if not searchUrls:
|
||||||
patterns = [x.pattern for x in searches]
|
patterns = [x.pattern for x in searches]
|
||||||
raise ValueError("Patterns %s not found at URL %s." % (patterns, url))
|
raise ValueError("Patterns %s not found at URL %s." % (patterns, url))
|
||||||
|
|
Loading…
Reference in a new issue