Do not stream page content URLs.
This commit is contained in:
parent
96edb60e01
commit
be1694592e
1 changed file with 6 additions and 5 deletions
|
@@ -109,12 +109,12 @@ def getPageContent(url, session, max_content_bytes=MaxContentBytes):
|
||||||
"""Get text content of given URL."""
|
"""Get text content of given URL."""
|
||||||
check_robotstxt(url, session)
|
check_robotstxt(url, session)
|
||||||
# read page data
|
# read page data
|
||||||
page = urlopen(url, session, max_content_bytes=max_content_bytes)
|
page = urlopen(url, session, max_content_bytes=max_content_bytes, stream=False)
|
||||||
data = page.text
|
data = page.text
|
||||||
tries = MaxRetries
|
tries = MaxRetries
|
||||||
while not isValidPageContent(data) and tries > 0:
|
while not isValidPageContent(data) and tries > 0:
|
||||||
time.sleep(RetryPauseSeconds)
|
time.sleep(RetryPauseSeconds)
|
||||||
page = urlopen(url, session, max_content_bytes=max_content_bytes)
|
page = urlopen(url, session, max_content_bytes=max_content_bytes, stream=False)
|
||||||
data = page.text
|
data = page.text
|
||||||
tries -= 1
|
tries -= 1
|
||||||
if not isValidPageContent(data):
|
if not isValidPageContent(data):
|
||||||
|
@@ -232,7 +232,8 @@ def get_robotstxt_parser(url, session=None):
|
||||||
|
|
||||||
|
|
||||||
def urlopen(url, session, referrer=None, max_content_bytes=None,
|
def urlopen(url, session, referrer=None, max_content_bytes=None,
|
||||||
timeout=ConnectionTimeoutSecs, raise_for_status=True):
|
timeout=ConnectionTimeoutSecs, raise_for_status=True,
|
||||||
|
stream=True):
|
||||||
"""Open an URL and return the response object."""
|
"""Open an URL and return the response object."""
|
||||||
out.debug('Open URL %s' % url)
|
out.debug('Open URL %s' % url)
|
||||||
headers = {'User-Agent': UserAgent}
|
headers = {'User-Agent': UserAgent}
|
||||||
|
@@ -245,10 +246,10 @@ def urlopen(url, session, referrer=None, max_content_bytes=None,
|
||||||
}
|
}
|
||||||
if hasattr(requests, 'adapters'):
|
if hasattr(requests, 'adapters'):
|
||||||
# requests >= 1.0
|
# requests >= 1.0
|
||||||
kwargs["stream"] = True
|
kwargs["stream"] = stream
|
||||||
else:
|
else:
|
||||||
# requests << 1.0
|
# requests << 1.0
|
||||||
kwargs["prefetch"] = False
|
kwargs["prefetch"] = not stream
|
||||||
kwargs["config"] = {"max_retries": MaxRetries}
|
kwargs["config"] = {"max_retries": MaxRetries}
|
||||||
try:
|
try:
|
||||||
req = session.get(url, **kwargs)
|
req = session.get(url, **kwargs)
|
||||||
|
|
Loading…
Reference in a new issue