Always use connection pooling.

Bastian Kleineidam 2013-02-12 17:55:13 +01:00
parent 39f74137de
commit 6d0fffd825
8 changed files with 51 additions and 36 deletions
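
The change threads one shared requests.Session through ComicStrip, ComicImage, the scraper starters, and the getPageContent/urlopen helpers, so every HTTP request reuses pooled keep-alive connections instead of opening a new one. A minimal sketch of the underlying idea, not code from this repository; the fetch() helper and its timeout value are illustrative only:

    import requests

    # One shared Session: urllib3 keeps TCP connections alive between requests,
    # so repeated fetches from the same host skip the connect/TLS handshake.
    session = requests.Session()

    def fetch(url):
        """Fetch a page through the shared session (illustrative helper)."""
        response = session.get(url, timeout=60)
        response.raise_for_status()
        return response.text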

@@ -17,12 +17,13 @@ class FetchComicError(IOError):
 class ComicStrip(object):
     """A list of comic image URLs."""

-    def __init__(self, name, stripUrl, imageUrls, namer):
+    def __init__(self, name, stripUrl, imageUrls, namer, session):
         """Store the image URL list."""
         self.name = name
         self.stripUrl = stripUrl
         self.imageUrls = imageUrls
         self.namer = namer
+        self.session = session

     def getImages(self):
         """Get a list of image downloaders."""
@@ -35,13 +36,15 @@ class ComicStrip(object):
         if filename is None:
             filename = url.rsplit('/', 1)[1]
         dirname = getDirname(self.name)
-        return ComicImage(self.name, url, self.stripUrl, dirname, filename)
+        return ComicImage(self.name, url, self.stripUrl, dirname, filename, self.session)


 class ComicImage(object):
     """A comic image downloader."""

-    def __init__(self, name, url, referrer, dirname, filename):
+    ChunkBytes = 1024 * 100 # 100KB
+
+    def __init__(self, name, url, referrer, dirname, filename, session):
         """Set URL and filename."""
         self.name = name
         self.referrer = referrer
@@ -49,11 +52,12 @@ class ComicImage(object):
         self.dirname = dirname
         filename = getFilename(filename)
         self.filename, self.ext = os.path.splitext(filename)
+        self.session = session

     def connect(self):
         """Connect to host and get meta information."""
         try:
-            self.urlobj = getImageObject(self.url, self.referrer)
+            self.urlobj = getImageObject(self.url, self.referrer, self.session)
         except IOError as msg:
             raise FetchComicError('Unable to retrieve URL.', self.url, msg)
@@ -100,7 +104,8 @@ class ComicImage(object):
         try:
             out.debug('Writing comic to file %s...' % fn)
             with open(fn, 'wb') as comicOut:
-                comicOut.write(self.urlobj.content)
+                for chunk in self.urlobj.iter_content(chunk_size=self.ChunkBytes):
+                    comicOut.write(chunk)
             self.touch(fn)
         except Exception:
             if os.path.isfile(fn):
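
Besides the shared session, the image writer in the hunk above now streams the download to disk in 100 KB chunks via iter_content() instead of buffering the whole response body. A sketch of that general pattern, assuming a requests response opened with stream=True; save_image() and its arguments are illustrative, not part of this repository:

    import requests

    def save_image(session, url, path, chunk_bytes=1024 * 100):
        """Stream an image to disk chunk by chunk instead of holding it all in memory."""
        response = session.get(url, stream=True, timeout=60)
        response.raise_for_status()
        with open(path, 'wb') as out_file:
            for chunk in response.iter_content(chunk_size=chunk_bytes):
                out_file.write(chunk)

    save_image(requests.Session(), "http://example.com/comic.png", "comic.png")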

@@ -29,9 +29,9 @@ def bounceStarter(url, nextSearch):
     @classmethod
     def _starter(cls):
         """Get bounced start URL."""
-        data, baseUrl = getPageContent(url, session=cls.session)
+        data, baseUrl = getPageContent(url, cls.session)
         url1 = fetchUrl(url, data, baseUrl, cls.prevSearch)
-        data, baseUrl = getPageContent(url1, session=cls.session)
+        data, baseUrl = getPageContent(url1, cls.session)
         return fetchUrl(url1, data, baseUrl, nextSearch)
     return _starter
@@ -41,6 +41,6 @@ def indirectStarter(url, latestSearch):
     @classmethod
     def _starter(cls):
         """Get indirect start URL."""
-        data, baseUrl = getPageContent(url, session=cls.session)
+        data, baseUrl = getPageContent(url, cls.session)
         return fetchUrl(url, data, baseUrl, latestSearch)
     return _starter

@@ -25,14 +25,14 @@ def add(name, shortName, imageFolder=None, lastStrip=None):
     @classmethod
     def _starter(cls):
         # first, try hopping to previous and next comic
-        data, _baseUrl = getPageContent(baseUrl, session=cls.session)
+        data, _baseUrl = getPageContent(baseUrl, cls.session)
         try:
             url = fetchUrl(baseUrl, data, _baseUrl, _prevSearch)
         except ValueError:
             # no previous link found, try hopping to last comic
             return fetchUrl(baseUrl, data, _baseUrl, _lastSearch)
         else:
-            data, _baseUrl = getPageContent(url, session=cls.session)
+            data, _baseUrl = getPageContent(url, cls.session)
             return fetchUrl(url, data, _baseUrl, _nextSearch)

     attrs = dict(

@@ -26,14 +26,14 @@ def add(name, path):
     @classmethod
     def _starter(cls):
         # first, try hopping to previous and next comic
-        data, baseUrl = getPageContent(_url, session=cls.session)
+        data, baseUrl = getPageContent(_url, cls.session)
         try:
             url = fetchUrl(_url, data, baseUrl, _prevSearch)
         except ValueError:
             # no previous link found, try hopping to last comic
             return fetchUrl(_url, data, baseUrl, _lastSearch)
         else:
-            data, baseUrl = getPageContent(url, session=cls.session)
+            data, baseUrl = getPageContent(url, cls.session)
             return fetchUrl(url, data, baseUrl, _nextSearch)

     globals()[classname] = make_scraper(classname,

@@ -31,10 +31,10 @@ def add(name, url, description, adult, bounce):
     def _starter(cls):
         """Get start URL."""
         url1 = modifier(url)
-        data, baseUrl = getPageContent(url1, session=cls.session)
+        data, baseUrl = getPageContent(url1, cls.session)
         url2 = fetchUrl(url1, data, baseUrl, cls.prevSearch)
         if bounce:
-            data, baseUrl = getPageContent(url2, session=cls.session)
+            data, baseUrl = getPageContent(url2, cls.session)
             url3 = fetchUrl(url2, data, baseUrl, _nextSearch)
             return modifier(url3)
         return modifier(url2)

@@ -26,7 +26,7 @@ def add(name, shortname):
        <img alt="Marmaduke" src="http://assets.amuniversal.com/07e7f270fa08012ff506001dd8b71c47" />
        <h4>published: Sunday, November 11, 2012</h4>
        """
-        data = getPageContent(pageUrl)[0]
+        data = getPageContent(pageUrl, cls.session)[0]
         ro = compile(tagre("img", "src", escape(imageUrl)) + r'\s+<h4>published: ([^<]+)')
         mo = ro.search(data)
         if mo:

@@ -62,7 +62,7 @@ class _BasicScraper(object):

     def getStrip(self, url):
         """Get comic strip for given URL."""
-        data, baseUrl = getPageContent(url, session=self.session)
+        data, baseUrl = getPageContent(url, self.session)
         imageUrls = set(fetchUrls(url, data, baseUrl, self.imageSearch))
         if len(imageUrls) > 1 and not self.multipleImagesPerStrip:
             out.warn("found %d images instead of 1 with %s" % (len(imageUrls), self.imageSearch.pattern))
@@ -70,7 +70,7 @@ class _BasicScraper(object):

     def getComicStrip(self, url, imageUrls):
         """Get comic strip downloader for given URL and images."""
-        return ComicStrip(self.get_name(), url, imageUrls, self.namer)
+        return ComicStrip(self.get_name(), url, imageUrls, self.namer, self.session)

     def getAllStrips(self, maxstrips=None):
         """Get all comic strips."""
@@ -98,10 +98,17 @@ class _BasicScraper(object):
         retrieving the given number of strips."""
         seen_urls = set()
         while url:
-            data, baseUrl = getPageContent(url, session=self.session)
+            data, baseUrl = getPageContent(url, self.session)
             imageUrls = set(fetchUrls(url, data, baseUrl, self.imageSearch))
             yield self.getComicStrip(url, imageUrls)
-            prevUrl = fetchUrl(url, data, baseUrl, self.prevSearch)
-            prevUrl = self.prevUrlModifier(prevUrl)
-            out.debug("Matched previous URL %s" % prevUrl)
+            prevUrl = None
+            if self.prevSearch:
+                try:
+                    prevUrl = fetchUrl(url, data, baseUrl, self.prevSearch)
+                except ValueError as msg:
+                    # assume there is no previous URL, but print a warning
+                    out.warn("%s Assuming no previous comic strips exist." % msg)
+                else:
+                    prevUrl = self.prevUrlModifier(prevUrl)
+                    out.debug("Matched previous URL %s" % prevUrl)
             seen_urls.add(url)

@@ -95,16 +95,21 @@ def case_insensitive_re(name):
 baseSearch = re.compile(tagre("base", "href", '([^"]*)'))


-def getPageContent(url, max_content_bytes=MaxContentBytes, session=None):
+def getPageContent(url, session, max_content_bytes=MaxContentBytes):
     """Get text content of given URL."""
-    check_robotstxt(url)
+    check_robotstxt(url, session)
     # read page data
-    page = urlopen(url, max_content_bytes=max_content_bytes, session=session)
+    page = urlopen(url, session, max_content_bytes=max_content_bytes)
     data = page.text
-    if not data:
+    tries = 0
+    while not data and tries < 5:
         # sometimes the python requests library is wonky - try again
-        page = urlopen(url, max_content_bytes=max_content_bytes, session=session)
+        time.sleep(5)
+        page = urlopen(url, session, max_content_bytes=max_content_bytes)
         data = page.text
+        tries += 1
+    if not data:
+        raise ValueError("Got empty data from %s" % url)
     # determine base URL
     baseUrl = None
     match = baseSearch.search(data)
@@ -115,9 +120,9 @@ def getPageContent(url, max_content_bytes=MaxContentBytes, session=None):
     return data, baseUrl


-def getImageObject(url, referrer, max_content_bytes=MaxImageBytes):
+def getImageObject(url, referrer, session, max_content_bytes=MaxImageBytes):
     """Get response object for given image URL."""
-    return urlopen(url, referrer=referrer, max_content_bytes=max_content_bytes)
+    return urlopen(url, session, referrer=referrer, max_content_bytes=max_content_bytes)


 def fetchUrls(url, data, baseUrl, urlSearch):
@@ -191,21 +196,21 @@ def get_roboturl(url):
     return urlparse.urlunparse((pu[0], pu[1], "/robots.txt", "", "", ""))


-def check_robotstxt(url):
+def check_robotstxt(url, session):
     """Check if robots.txt allows our user agent for the given URL.
     @raises: IOError if URL is not allowed
     """
     roboturl = get_roboturl(url)
-    rp = get_robotstxt_parser(roboturl)
+    rp = get_robotstxt_parser(roboturl, session)
     if not rp.can_fetch(UserAgent, url):
         raise IOError("%s is disallowed by robots.txt" % url)


 @memoized
-def get_robotstxt_parser(url):
+def get_robotstxt_parser(url, session):
     """Get a RobotFileParser for the given robots.txt URL."""
     rp = robotparser.RobotFileParser()
-    req = urlopen(url, max_content_bytes=MaxContentBytes, raise_for_status=False)
+    req = urlopen(url, session, max_content_bytes=MaxContentBytes, raise_for_status=False)
     if req.status_code in (401, 403):
         rp.disallow_all = True
     elif req.status_code >= 400:
@@ -215,16 +220,14 @@ def get_robotstxt_parser(url):
     return rp


-def urlopen(url, referrer=None, max_content_bytes=None,
-        timeout=ConnectionTimeoutSecs, session=None, raise_for_status=True):
+def urlopen(url, session, referrer=None, max_content_bytes=None,
+        timeout=ConnectionTimeoutSecs, raise_for_status=True):
     """Open an URL and return the response object."""
     out.debug('Open URL %s' % url)
     headers = {'User-Agent': UserAgent}
     if referrer:
         headers['Referer'] = referrer
     out.debug('Sending headers %s' % headers, level=3)
-    if session is None:
-        session = requests
     kwargs = {
         "headers": headers,
         "timeout": timeout,