Always use connection pooling.
parent 39f74137de
commit 6d0fffd825

8 changed files with 51 additions and 36 deletions
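The point of the change: every page and image fetch is routed through one shared requests session per scraper instead of ad-hoc module-level calls, so HTTP connections to a comic's host are pooled and reused. A minimal sketch of the idea, outside this codebase (the helper name and URLs below are hypothetical, not part of the diff):

    import requests

    # A requests.Session keeps a per-host connection pool, so repeated
    # requests to the same comic site reuse TCP connections.
    session = requests.Session()

    def fetch(url, referrer=None, timeout=60):
        # Send every request through the shared session rather than
        # requests.get(), which opens a fresh connection on each call.
        headers = {'User-Agent': 'example-agent'}
        if referrer:
            headers['Referer'] = referrer
        return session.get(url, headers=headers, timeout=timeout)

    page = fetch('http://example.com/comic/42')
    image = fetch('http://example.com/comic/42/strip.png',
                  referrer='http://example.com/comic/42')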
@@ -17,12 +17,13 @@ class FetchComicError(IOError):
 class ComicStrip(object):
     """A list of comic image URLs."""

-    def __init__(self, name, stripUrl, imageUrls, namer):
+    def __init__(self, name, stripUrl, imageUrls, namer, session):
         """Store the image URL list."""
         self.name = name
         self.stripUrl = stripUrl
         self.imageUrls = imageUrls
         self.namer = namer
+        self.session = session

     def getImages(self):
         """Get a list of image downloaders."""
@@ -35,13 +36,15 @@ class ComicStrip(object):
         if filename is None:
             filename = url.rsplit('/', 1)[1]
         dirname = getDirname(self.name)
-        return ComicImage(self.name, url, self.stripUrl, dirname, filename)
+        return ComicImage(self.name, url, self.stripUrl, dirname, filename, self.session)


 class ComicImage(object):
     """A comic image downloader."""

-    def __init__(self, name, url, referrer, dirname, filename):
+    ChunkBytes = 1024 * 100 # 100KB
+
+    def __init__(self, name, url, referrer, dirname, filename, session):
         """Set URL and filename."""
         self.name = name
         self.referrer = referrer
@@ -49,11 +52,12 @@ class ComicImage(object):
         self.dirname = dirname
         filename = getFilename(filename)
         self.filename, self.ext = os.path.splitext(filename)
+        self.session = session

     def connect(self):
         """Connect to host and get meta information."""
         try:
-            self.urlobj = getImageObject(self.url, self.referrer)
+            self.urlobj = getImageObject(self.url, self.referrer, self.session)
         except IOError as msg:
             raise FetchComicError('Unable to retrieve URL.', self.url, msg)

@@ -100,7 +104,8 @@ class ComicImage(object):
         try:
             out.debug('Writing comic to file %s...' % fn)
             with open(fn, 'wb') as comicOut:
-                comicOut.write(self.urlobj.content)
+                for chunk in self.urlobj.iter_content(chunk_size=self.ChunkBytes):
+                    comicOut.write(chunk)
             self.touch(fn)
         except Exception:
             if os.path.isfile(fn):
@@ -29,9 +29,9 @@ def bounceStarter(url, nextSearch):
     @classmethod
     def _starter(cls):
         """Get bounced start URL."""
-        data, baseUrl = getPageContent(url, session=cls.session)
+        data, baseUrl = getPageContent(url, cls.session)
         url1 = fetchUrl(url, data, baseUrl, cls.prevSearch)
-        data, baseUrl = getPageContent(url1, session=cls.session)
+        data, baseUrl = getPageContent(url1, cls.session)
         return fetchUrl(url1, data, baseUrl, nextSearch)
     return _starter

@@ -41,6 +41,6 @@ def indirectStarter(url, latestSearch):
     @classmethod
     def _starter(cls):
         """Get indirect start URL."""
-        data, baseUrl = getPageContent(url, session=cls.session)
+        data, baseUrl = getPageContent(url, cls.session)
         return fetchUrl(url, data, baseUrl, latestSearch)
     return _starter
@@ -25,14 +25,14 @@ def add(name, shortName, imageFolder=None, lastStrip=None):
     @classmethod
     def _starter(cls):
         # first, try hopping to previous and next comic
-        data, _baseUrl = getPageContent(baseUrl, session=cls.session)
+        data, _baseUrl = getPageContent(baseUrl, cls.session)
         try:
             url = fetchUrl(baseUrl, data, _baseUrl, _prevSearch)
         except ValueError:
             # no previous link found, try hopping to last comic
             return fetchUrl(baseUrl, data, _baseUrl, _lastSearch)
         else:
-            data, _baseUrl = getPageContent(url, session=cls.session)
+            data, _baseUrl = getPageContent(url, cls.session)
             return fetchUrl(url, data, _baseUrl, _nextSearch)

     attrs = dict(
@@ -26,14 +26,14 @@ def add(name, path):
     @classmethod
     def _starter(cls):
         # first, try hopping to previous and next comic
-        data, baseUrl = getPageContent(_url, session=cls.session)
+        data, baseUrl = getPageContent(_url, cls.session)
         try:
             url = fetchUrl(_url, data, baseUrl, _prevSearch)
         except ValueError:
             # no previous link found, try hopping to last comic
             return fetchUrl(_url, data, baseUrl, _lastSearch)
         else:
-            data, baseUrl = getPageContent(url, session=cls.session)
+            data, baseUrl = getPageContent(url, cls.session)
             return fetchUrl(url, data, baseUrl, _nextSearch)

     globals()[classname] = make_scraper(classname,
@@ -31,10 +31,10 @@ def add(name, url, description, adult, bounce):
     def _starter(cls):
         """Get start URL."""
         url1 = modifier(url)
-        data, baseUrl = getPageContent(url1, session=cls.session)
+        data, baseUrl = getPageContent(url1, cls.session)
         url2 = fetchUrl(url1, data, baseUrl, cls.prevSearch)
         if bounce:
-            data, baseUrl = getPageContent(url2, session=cls.session)
+            data, baseUrl = getPageContent(url2, cls.session)
             url3 = fetchUrl(url2, data, baseUrl, _nextSearch)
             return modifier(url3)
         return modifier(url2)
@@ -26,7 +26,7 @@ def add(name, shortname):
         <img alt="Marmaduke" src="http://assets.amuniversal.com/07e7f270fa08012ff506001dd8b71c47" />
         <h4>published: Sunday, November 11, 2012</h4>
         """
-        data = getPageContent(pageUrl)[0]
+        data = getPageContent(pageUrl, cls.session)[0]
         ro = compile(tagre("img", "src", escape(imageUrl)) + r'\s+<h4>published: ([^<]+)')
         mo = ro.search(data)
         if mo:
@@ -62,7 +62,7 @@ class _BasicScraper(object):

     def getStrip(self, url):
         """Get comic strip for given URL."""
-        data, baseUrl = getPageContent(url, session=self.session)
+        data, baseUrl = getPageContent(url, self.session)
         imageUrls = set(fetchUrls(url, data, baseUrl, self.imageSearch))
         if len(imageUrls) > 1 and not self.multipleImagesPerStrip:
             out.warn("found %d images instead of 1 with %s" % (len(imageUrls), self.imageSearch.pattern))
@@ -70,7 +70,7 @@ class _BasicScraper(object):

     def getComicStrip(self, url, imageUrls):
         """Get comic strip downloader for given URL and images."""
-        return ComicStrip(self.get_name(), url, imageUrls, self.namer)
+        return ComicStrip(self.get_name(), url, imageUrls, self.namer, self.session)

     def getAllStrips(self, maxstrips=None):
         """Get all comic strips."""
@@ -98,10 +98,17 @@ class _BasicScraper(object):
         retrieving the given number of strips."""
         seen_urls = set()
         while url:
-            data, baseUrl = getPageContent(url, session=self.session)
+            data, baseUrl = getPageContent(url, self.session)
             imageUrls = set(fetchUrls(url, data, baseUrl, self.imageSearch))
             yield self.getComicStrip(url, imageUrls)
             prevUrl = None
             if self.prevSearch:
                 try:
                     prevUrl = fetchUrl(url, data, baseUrl, self.prevSearch)
                 except ValueError as msg:
                     # assume there is no previous URL, but print a warning
                     out.warn("%s Assuming no previous comic strips exist." % msg)
                 else:
                     prevUrl = self.prevUrlModifier(prevUrl)
                     out.debug("Matched previous URL %s" % prevUrl)
             seen_urls.add(url)
@@ -95,16 +95,21 @@ def case_insensitive_re(name):

 baseSearch = re.compile(tagre("base", "href", '([^"]*)'))

-def getPageContent(url, max_content_bytes=MaxContentBytes, session=None):
+def getPageContent(url, session, max_content_bytes=MaxContentBytes):
     """Get text content of given URL."""
-    check_robotstxt(url)
+    check_robotstxt(url, session)
     # read page data
-    page = urlopen(url, max_content_bytes=max_content_bytes, session=session)
+    page = urlopen(url, session, max_content_bytes=max_content_bytes)
     data = page.text
-    if not data:
+    tries = 0
+    while not data and tries < 5:
         # sometimes the python requests library is wonky - try again
-        page = urlopen(url, max_content_bytes=max_content_bytes, session=session)
+        time.sleep(5)
+        page = urlopen(url, session, max_content_bytes=max_content_bytes)
         data = page.text
+        tries += 1
+    if not data:
+        raise ValueError("Got empty data from %s" % url)
     # determine base URL
     baseUrl = None
     match = baseSearch.search(data)
@@ -115,9 +120,9 @@ def getPageContent(url, max_content_bytes=MaxContentBytes, session=None):
     return data, baseUrl


-def getImageObject(url, referrer, max_content_bytes=MaxImageBytes):
+def getImageObject(url, referrer, session, max_content_bytes=MaxImageBytes):
     """Get response object for given image URL."""
-    return urlopen(url, referrer=referrer, max_content_bytes=max_content_bytes)
+    return urlopen(url, session, referrer=referrer, max_content_bytes=max_content_bytes)


 def fetchUrls(url, data, baseUrl, urlSearch):
@@ -191,21 +196,21 @@ def get_roboturl(url):
     return urlparse.urlunparse((pu[0], pu[1], "/robots.txt", "", "", ""))


-def check_robotstxt(url):
+def check_robotstxt(url, session):
     """Check if robots.txt allows our user agent for the given URL.
     @raises: IOError if URL is not allowed
     """
     roboturl = get_roboturl(url)
-    rp = get_robotstxt_parser(roboturl)
+    rp = get_robotstxt_parser(roboturl, session)
     if not rp.can_fetch(UserAgent, url):
         raise IOError("%s is disallowed by robots.txt" % url)


 @memoized
-def get_robotstxt_parser(url):
+def get_robotstxt_parser(url, session):
     """Get a RobotFileParser for the given robots.txt URL."""
     rp = robotparser.RobotFileParser()
-    req = urlopen(url, max_content_bytes=MaxContentBytes, raise_for_status=False)
+    req = urlopen(url, session, max_content_bytes=MaxContentBytes, raise_for_status=False)
     if req.status_code in (401, 403):
         rp.disallow_all = True
     elif req.status_code >= 400:
@@ -215,16 +220,14 @@ def get_robotstxt_parser(url):
     return rp


-def urlopen(url, referrer=None, max_content_bytes=None,
-            timeout=ConnectionTimeoutSecs, session=None, raise_for_status=True):
+def urlopen(url, session, referrer=None, max_content_bytes=None,
+            timeout=ConnectionTimeoutSecs, raise_for_status=True):
     """Open an URL and return the response object."""
     out.debug('Open URL %s' % url)
     headers = {'User-Agent': UserAgent}
     if referrer:
         headers['Referer'] = referrer
     out.debug('Sending headers %s' % headers, level=3)
-    if session is None:
-        session = requests
     kwargs = {
         "headers": headers,
         "timeout": timeout,
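A shared session also makes the pool itself tunable. As a side note (hypothetical, not part of this commit), requests lets you size the per-host pool by mounting an HTTPAdapter on the session:

    import requests
    from requests.adapters import HTTPAdapter

    # Hypothetical tuning: cap the number of pooled connections per host.
    session = requests.Session()
    adapter = HTTPAdapter(pool_connections=10, pool_maxsize=10)
    session.mount('http://', adapter)
    session.mount('https://', adapter)

    # All requests made through this session now share that sized pool.
    resp = session.get('http://example.com/robots.txt', timeout=60)
    print(resp.status_code)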