Always use connection pooling.

Bastian Kleineidam 2013-02-12 17:55:13 +01:00
parent 39f74137de
commit 6d0fffd825
8 changed files with 51 additions and 36 deletions
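The change threads one shared requests.Session through every page and image fetch, so HTTP connections are kept alive and reused instead of being reopened per request. A minimal standalone sketch of the idea (names and values here are illustrative, not the project's own API):

    import requests

    # One Session per run: requests pools keep-alive connections per host,
    # so repeated fetches from the same comic site reuse the same socket.
    session = requests.Session()

    def fetch_page(url, referrer=None, timeout=60):
        headers = {'User-Agent': 'comic-downloader/1.0'}  # placeholder UA
        if referrer:
            headers['Referer'] = referrer
        return session.get(url, headers=headers, timeout=timeout)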

View file

@@ -17,12 +17,13 @@ class FetchComicError(IOError):
 class ComicStrip(object):
     """A list of comic image URLs."""
-    def __init__(self, name, stripUrl, imageUrls, namer):
+    def __init__(self, name, stripUrl, imageUrls, namer, session):
         """Store the image URL list."""
         self.name = name
         self.stripUrl = stripUrl
         self.imageUrls = imageUrls
         self.namer = namer
+        self.session = session
     def getImages(self):
         """Get a list of image downloaders."""
@@ -35,13 +36,15 @@ class ComicStrip(object):
         if filename is None:
             filename = url.rsplit('/', 1)[1]
         dirname = getDirname(self.name)
-        return ComicImage(self.name, url, self.stripUrl, dirname, filename)
+        return ComicImage(self.name, url, self.stripUrl, dirname, filename, self.session)
 class ComicImage(object):
     """A comic image downloader."""
-    def __init__(self, name, url, referrer, dirname, filename):
+    ChunkBytes = 1024 * 100 # 100KB
+    def __init__(self, name, url, referrer, dirname, filename, session):
         """Set URL and filename."""
         self.name = name
         self.referrer = referrer
@@ -49,11 +52,12 @@ class ComicImage(object):
         self.dirname = dirname
         filename = getFilename(filename)
         self.filename, self.ext = os.path.splitext(filename)
+        self.session = session
     def connect(self):
         """Connect to host and get meta information."""
         try:
-            self.urlobj = getImageObject(self.url, self.referrer)
+            self.urlobj = getImageObject(self.url, self.referrer, self.session)
         except IOError as msg:
             raise FetchComicError('Unable to retrieve URL.', self.url, msg)
@@ -100,7 +104,8 @@ class ComicImage(object):
         try:
             out.debug('Writing comic to file %s...' % fn)
             with open(fn, 'wb') as comicOut:
-                comicOut.write(self.urlobj.content)
+                for chunk in self.urlobj.iter_content(chunk_size=self.ChunkBytes):
+                    comicOut.write(chunk)
             self.touch(fn)
         except Exception:
             if os.path.isfile(fn):
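The loop above streams the image body in fixed-size chunks (ChunkBytes) instead of materializing self.urlobj.content in memory at once. A rough standalone equivalent using requests directly (function name and the stream=True call are illustrative; the project itself goes through getImageObject):

    import requests

    CHUNK_BYTES = 1024 * 100  # 100KB, mirroring ChunkBytes above

    def save_image(session, url, path, referrer=None):
        headers = {'Referer': referrer} if referrer else {}
        # stream=True defers the body download so iter_content() can read it
        # in CHUNK_BYTES pieces rather than loading the whole image at once.
        response = session.get(url, headers=headers, stream=True, timeout=60)
        response.raise_for_status()
        with open(path, 'wb') as out_file:
            for chunk in response.iter_content(chunk_size=CHUNK_BYTES):
                out_file.write(chunk)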

View file

@@ -29,9 +29,9 @@ def bounceStarter(url, nextSearch):
     @classmethod
     def _starter(cls):
         """Get bounced start URL."""
-        data, baseUrl = getPageContent(url, session=cls.session)
+        data, baseUrl = getPageContent(url, cls.session)
         url1 = fetchUrl(url, data, baseUrl, cls.prevSearch)
-        data, baseUrl = getPageContent(url1, session=cls.session)
+        data, baseUrl = getPageContent(url1, cls.session)
         return fetchUrl(url1, data, baseUrl, nextSearch)
     return _starter
@@ -41,6 +41,6 @@ def indirectStarter(url, latestSearch):
     @classmethod
     def _starter(cls):
         """Get indirect start URL."""
-        data, baseUrl = getPageContent(url, session=cls.session)
+        data, baseUrl = getPageContent(url, cls.session)
         return fetchUrl(url, data, baseUrl, latestSearch)
     return _starter

View file

@@ -25,14 +25,14 @@ def add(name, shortName, imageFolder=None, lastStrip=None):
     @classmethod
     def _starter(cls):
         # first, try hopping to previous and next comic
-        data, _baseUrl = getPageContent(baseUrl, session=cls.session)
+        data, _baseUrl = getPageContent(baseUrl, cls.session)
         try:
             url = fetchUrl(baseUrl, data, _baseUrl, _prevSearch)
         except ValueError:
             # no previous link found, try hopping to last comic
             return fetchUrl(baseUrl, data, _baseUrl, _lastSearch)
         else:
-            data, _baseUrl = getPageContent(url, session=cls.session)
+            data, _baseUrl = getPageContent(url, cls.session)
             return fetchUrl(url, data, _baseUrl, _nextSearch)
     attrs = dict(

View file

@@ -26,14 +26,14 @@ def add(name, path):
     @classmethod
     def _starter(cls):
         # first, try hopping to previous and next comic
-        data, baseUrl = getPageContent(_url, session=cls.session)
+        data, baseUrl = getPageContent(_url, cls.session)
         try:
             url = fetchUrl(_url, data, baseUrl, _prevSearch)
         except ValueError:
             # no previous link found, try hopping to last comic
             return fetchUrl(_url, data, baseUrl, _lastSearch)
         else:
-            data, baseUrl = getPageContent(url, session=cls.session)
+            data, baseUrl = getPageContent(url, cls.session)
             return fetchUrl(url, data, baseUrl, _nextSearch)
     globals()[classname] = make_scraper(classname,

View file

@@ -31,10 +31,10 @@ def add(name, url, description, adult, bounce):
     def _starter(cls):
         """Get start URL."""
         url1 = modifier(url)
-        data, baseUrl = getPageContent(url1, session=cls.session)
+        data, baseUrl = getPageContent(url1, cls.session)
         url2 = fetchUrl(url1, data, baseUrl, cls.prevSearch)
         if bounce:
-            data, baseUrl = getPageContent(url2, session=cls.session)
+            data, baseUrl = getPageContent(url2, cls.session)
             url3 = fetchUrl(url2, data, baseUrl, _nextSearch)
             return modifier(url3)
         return modifier(url2)

View file

@@ -26,7 +26,7 @@ def add(name, shortname):
         <img alt="Marmaduke" src="http://assets.amuniversal.com/07e7f270fa08012ff506001dd8b71c47" />
         <h4>published: Sunday, November 11, 2012</h4>
         """
-        data = getPageContent(pageUrl)[0]
+        data = getPageContent(pageUrl, cls.session)[0]
         ro = compile(tagre("img", "src", escape(imageUrl)) + r'\s+<h4>published: ([^<]+)')
         mo = ro.search(data)
         if mo:

View file

@@ -62,7 +62,7 @@ class _BasicScraper(object):
     def getStrip(self, url):
         """Get comic strip for given URL."""
-        data, baseUrl = getPageContent(url, session=self.session)
+        data, baseUrl = getPageContent(url, self.session)
         imageUrls = set(fetchUrls(url, data, baseUrl, self.imageSearch))
         if len(imageUrls) > 1 and not self.multipleImagesPerStrip:
             out.warn("found %d images instead of 1 with %s" % (len(imageUrls), self.imageSearch.pattern))
@@ -70,7 +70,7 @@ class _BasicScraper(object):
     def getComicStrip(self, url, imageUrls):
         """Get comic strip downloader for given URL and images."""
-        return ComicStrip(self.get_name(), url, imageUrls, self.namer)
+        return ComicStrip(self.get_name(), url, imageUrls, self.namer, self.session)
     def getAllStrips(self, maxstrips=None):
         """Get all comic strips."""
@@ -98,12 +98,19 @@ class _BasicScraper(object):
         retrieving the given number of strips."""
         seen_urls = set()
         while url:
-            data, baseUrl = getPageContent(url, session=self.session)
+            data, baseUrl = getPageContent(url, self.session)
             imageUrls = set(fetchUrls(url, data, baseUrl, self.imageSearch))
             yield self.getComicStrip(url, imageUrls)
-            prevUrl = fetchUrl(url, data, baseUrl, self.prevSearch)
-            prevUrl = self.prevUrlModifier(prevUrl)
-            out.debug("Matched previous URL %s" % prevUrl)
+            prevUrl = None
+            if self.prevSearch:
+                try:
+                    prevUrl = fetchUrl(url, data, baseUrl, self.prevSearch)
+                except ValueError as msg:
+                    # assume there is no previous URL, but print a warning
+                    out.warn("%s Assuming no previous comic strips exist." % msg)
+                else:
+                    prevUrl = self.prevUrlModifier(prevUrl)
+                    out.debug("Matched previous URL %s" % prevUrl)
             seen_urls.add(url)
             if prevUrl in seen_urls:
                 # avoid recursive URL loops
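The scraper hunks refer to cls.session and self.session, but the attribute's definition lies outside the lines shown. Presumably the base class now carries a single shared Session, roughly like this (an assumption, not visible in this diff):

    import requests

    class _BasicScraper(object):
        # assumed shape: one class-level Session shared by all scrapers,
        # so every page and image fetch reuses pooled connections
        session = requests.Session()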

View file

@@ -95,16 +95,21 @@ def case_insensitive_re(name):
 baseSearch = re.compile(tagre("base", "href", '([^"]*)'))
-def getPageContent(url, max_content_bytes=MaxContentBytes, session=None):
+def getPageContent(url, session, max_content_bytes=MaxContentBytes):
     """Get text content of given URL."""
-    check_robotstxt(url)
+    check_robotstxt(url, session)
     # read page data
-    page = urlopen(url, max_content_bytes=max_content_bytes, session=session)
+    page = urlopen(url, session, max_content_bytes=max_content_bytes)
     data = page.text
-    if not data:
+    tries = 0
+    while not data and tries < 5:
         # sometimes the python requests library is wonky - try again
-        page = urlopen(url, max_content_bytes=max_content_bytes, session=session)
+        time.sleep(5)
+        page = urlopen(url, session, max_content_bytes=max_content_bytes)
         data = page.text
+        tries += 1
+    if not data:
+        raise ValueError("Got empty data from %s" % url)
     # determine base URL
     baseUrl = None
     match = baseSearch.search(data)
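Note that the retry loop above calls time.sleep(5), so the module needs time imported at module level; the import itself is not visible in the hunks shown:

    import time  # required by the sleep in the retry loop above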
@@ -115,9 +120,9 @@ def getPageContent(url, max_content_bytes=MaxContentBytes, session=None):
     return data, baseUrl
-def getImageObject(url, referrer, max_content_bytes=MaxImageBytes):
+def getImageObject(url, referrer, session, max_content_bytes=MaxImageBytes):
     """Get response object for given image URL."""
-    return urlopen(url, referrer=referrer, max_content_bytes=max_content_bytes)
+    return urlopen(url, session, referrer=referrer, max_content_bytes=max_content_bytes)
 def fetchUrls(url, data, baseUrl, urlSearch):
@@ -191,21 +196,21 @@ def get_roboturl(url):
     return urlparse.urlunparse((pu[0], pu[1], "/robots.txt", "", "", ""))
-def check_robotstxt(url):
+def check_robotstxt(url, session):
     """Check if robots.txt allows our user agent for the given URL.
     @raises: IOError if URL is not allowed
     """
     roboturl = get_roboturl(url)
-    rp = get_robotstxt_parser(roboturl)
+    rp = get_robotstxt_parser(roboturl, session)
     if not rp.can_fetch(UserAgent, url):
         raise IOError("%s is disallowed by robots.txt" % url)
 @memoized
-def get_robotstxt_parser(url):
+def get_robotstxt_parser(url, session):
     """Get a RobotFileParser for the given robots.txt URL."""
     rp = robotparser.RobotFileParser()
-    req = urlopen(url, max_content_bytes=MaxContentBytes, raise_for_status=False)
+    req = urlopen(url, session, max_content_bytes=MaxContentBytes, raise_for_status=False)
     if req.status_code in (401, 403):
         rp.disallow_all = True
     elif req.status_code >= 400:
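The next hunk makes session a required positional parameter of urlopen and drops the old fallback to the bare requests module. That fallback defeated pooling for callers that passed no session, because the module-level requests functions build a throwaway Session per call, so no connection is ever reused. A small illustration (URLs are placeholders):

    import requests

    session = requests.Session()
    for url in ('http://example.com/a', 'http://example.com/b'):
        session.get(url, timeout=60)     # reuses the pooled keep-alive connection
        # requests.get(url, timeout=60)  # would open a fresh connection each call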
@@ -215,16 +220,14 @@ def get_robotstxt_parser(url):
     return rp
-def urlopen(url, referrer=None, max_content_bytes=None,
-            timeout=ConnectionTimeoutSecs, session=None, raise_for_status=True):
+def urlopen(url, session, referrer=None, max_content_bytes=None,
+            timeout=ConnectionTimeoutSecs, raise_for_status=True):
     """Open an URL and return the response object."""
     out.debug('Open URL %s' % url)
     headers = {'User-Agent': UserAgent}
     if referrer:
         headers['Referer'] = referrer
     out.debug('Sending headers %s' % headers, level=3)
-    if session is None:
-        session = requests
     kwargs = {
         "headers": headers,
         "timeout": timeout,