Always use connection pooling.
parent 39f74137de
commit 6d0fffd825
8 changed files with 51 additions and 36 deletions
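The change threads one shared requests.Session through every fetch. A Session keeps HTTP connections alive and pools them per host, so repeated requests to the same comic site reuse an open connection instead of doing a fresh TCP/TLS handshake each time. A minimal standalone sketch of that behaviour (the URLs are placeholders, not part of this commit):

import requests

# One shared Session pools keep-alive connections per host, so repeated
# fetches reuse an open connection instead of reconnecting every time.
session = requests.Session()

first = session.get("https://example.com/comics/1")   # opens a connection
second = session.get("https://example.com/comics/2")  # reuses the pooled connection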
@@ -17,12 +17,13 @@ class FetchComicError(IOError):
 class ComicStrip(object):
     """A list of comic image URLs."""
 
-    def __init__(self, name, stripUrl, imageUrls, namer):
+    def __init__(self, name, stripUrl, imageUrls, namer, session):
         """Store the image URL list."""
         self.name = name
         self.stripUrl = stripUrl
         self.imageUrls = imageUrls
         self.namer = namer
+        self.session = session
 
     def getImages(self):
         """Get a list of image downloaders."""
@@ -35,13 +36,15 @@ class ComicStrip(object):
         if filename is None:
             filename = url.rsplit('/', 1)[1]
         dirname = getDirname(self.name)
-        return ComicImage(self.name, url, self.stripUrl, dirname, filename)
+        return ComicImage(self.name, url, self.stripUrl, dirname, filename, self.session)
 
 
 class ComicImage(object):
     """A comic image downloader."""
 
-    def __init__(self, name, url, referrer, dirname, filename):
+    ChunkBytes = 1024 * 100 # 100KB
+
+    def __init__(self, name, url, referrer, dirname, filename, session):
         """Set URL and filename."""
         self.name = name
         self.referrer = referrer
@@ -49,11 +52,12 @@ class ComicImage(object):
         self.dirname = dirname
         filename = getFilename(filename)
         self.filename, self.ext = os.path.splitext(filename)
+        self.session = session
 
     def connect(self):
         """Connect to host and get meta information."""
         try:
-            self.urlobj = getImageObject(self.url, self.referrer)
+            self.urlobj = getImageObject(self.url, self.referrer, self.session)
         except IOError as msg:
             raise FetchComicError('Unable to retrieve URL.', self.url, msg)
 
@@ -100,7 +104,8 @@ class ComicImage(object):
         try:
             out.debug('Writing comic to file %s...' % fn)
             with open(fn, 'wb') as comicOut:
-                comicOut.write(self.urlobj.content)
+                for chunk in self.urlobj.iter_content(chunk_size=self.ChunkBytes):
+                    comicOut.write(chunk)
             self.touch(fn)
         except Exception:
             if os.path.isfile(fn):
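For reference, a standalone sketch of the chunked write the hunk above switches to, assuming the response object is a requests Response fetched with stream=True (URL and filename are placeholders; the project's own urlopen wrapper is not used here):

import requests

CHUNK_BYTES = 1024 * 100  # mirrors the ChunkBytes class attribute above

session = requests.Session()
# stream=True defers downloading the body so it can be written piecewise
# instead of loading the whole image into memory via resp.content.
resp = session.get("https://example.com/comic.png", stream=True)
with open("comic.png", "wb") as comic_out:
    for chunk in resp.iter_content(chunk_size=CHUNK_BYTES):
        comic_out.write(chunk)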
@@ -29,9 +29,9 @@ def bounceStarter(url, nextSearch):
     @classmethod
     def _starter(cls):
         """Get bounced start URL."""
-        data, baseUrl = getPageContent(url, session=cls.session)
+        data, baseUrl = getPageContent(url, cls.session)
         url1 = fetchUrl(url, data, baseUrl, cls.prevSearch)
-        data, baseUrl = getPageContent(url1, session=cls.session)
+        data, baseUrl = getPageContent(url1, cls.session)
         return fetchUrl(url1, data, baseUrl, nextSearch)
     return _starter
 
@@ -41,6 +41,6 @@ def indirectStarter(url, latestSearch):
     @classmethod
     def _starter(cls):
         """Get indirect start URL."""
-        data, baseUrl = getPageContent(url, session=cls.session)
+        data, baseUrl = getPageContent(url, cls.session)
         return fetchUrl(url, data, baseUrl, latestSearch)
     return _starter
@@ -25,14 +25,14 @@ def add(name, shortName, imageFolder=None, lastStrip=None):
     @classmethod
     def _starter(cls):
         # first, try hopping to previous and next comic
-        data, _baseUrl = getPageContent(baseUrl, session=cls.session)
+        data, _baseUrl = getPageContent(baseUrl, cls.session)
         try:
             url = fetchUrl(baseUrl, data, _baseUrl, _prevSearch)
         except ValueError:
             # no previous link found, try hopping to last comic
             return fetchUrl(baseUrl, data, _baseUrl, _lastSearch)
         else:
-            data, _baseUrl = getPageContent(url, session=cls.session)
+            data, _baseUrl = getPageContent(url, cls.session)
             return fetchUrl(url, data, _baseUrl, _nextSearch)
 
     attrs = dict(
@@ -26,14 +26,14 @@ def add(name, path):
     @classmethod
     def _starter(cls):
         # first, try hopping to previous and next comic
-        data, baseUrl = getPageContent(_url, session=cls.session)
+        data, baseUrl = getPageContent(_url, cls.session)
         try:
             url = fetchUrl(_url, data, baseUrl, _prevSearch)
         except ValueError:
             # no previous link found, try hopping to last comic
             return fetchUrl(_url, data, baseUrl, _lastSearch)
         else:
-            data, baseUrl = getPageContent(url, session=cls.session)
+            data, baseUrl = getPageContent(url, cls.session)
             return fetchUrl(url, data, baseUrl, _nextSearch)
 
     globals()[classname] = make_scraper(classname,
@@ -31,10 +31,10 @@ def add(name, url, description, adult, bounce):
     def _starter(cls):
         """Get start URL."""
         url1 = modifier(url)
-        data, baseUrl = getPageContent(url1, session=cls.session)
+        data, baseUrl = getPageContent(url1, cls.session)
         url2 = fetchUrl(url1, data, baseUrl, cls.prevSearch)
         if bounce:
-            data, baseUrl = getPageContent(url2, session=cls.session)
+            data, baseUrl = getPageContent(url2, cls.session)
             url3 = fetchUrl(url2, data, baseUrl, _nextSearch)
             return modifier(url3)
         return modifier(url2)
@@ -26,7 +26,7 @@ def add(name, shortname):
         <img alt="Marmaduke" src="http://assets.amuniversal.com/07e7f270fa08012ff506001dd8b71c47" />
         <h4>published: Sunday, November 11, 2012</h4>
         """
-        data = getPageContent(pageUrl)[0]
+        data = getPageContent(pageUrl, cls.session)[0]
         ro = compile(tagre("img", "src", escape(imageUrl)) + r'\s+<h4>published: ([^<]+)')
         mo = ro.search(data)
         if mo:
@@ -62,7 +62,7 @@ class _BasicScraper(object):
 
     def getStrip(self, url):
         """Get comic strip for given URL."""
-        data, baseUrl = getPageContent(url, session=self.session)
+        data, baseUrl = getPageContent(url, self.session)
         imageUrls = set(fetchUrls(url, data, baseUrl, self.imageSearch))
         if len(imageUrls) > 1 and not self.multipleImagesPerStrip:
             out.warn("found %d images instead of 1 with %s" % (len(imageUrls), self.imageSearch.pattern))
@@ -70,7 +70,7 @@ class _BasicScraper(object):
 
     def getComicStrip(self, url, imageUrls):
         """Get comic strip downloader for given URL and images."""
-        return ComicStrip(self.get_name(), url, imageUrls, self.namer)
+        return ComicStrip(self.get_name(), url, imageUrls, self.namer, self.session)
 
     def getAllStrips(self, maxstrips=None):
         """Get all comic strips."""
@@ -98,12 +98,19 @@ class _BasicScraper(object):
         retrieving the given number of strips."""
         seen_urls = set()
         while url:
-            data, baseUrl = getPageContent(url, session=self.session)
+            data, baseUrl = getPageContent(url, self.session)
             imageUrls = set(fetchUrls(url, data, baseUrl, self.imageSearch))
             yield self.getComicStrip(url, imageUrls)
-            prevUrl = fetchUrl(url, data, baseUrl, self.prevSearch)
-            prevUrl = self.prevUrlModifier(prevUrl)
-            out.debug("Matched previous URL %s" % prevUrl)
+            prevUrl = None
+            if self.prevSearch:
+                try:
+                    prevUrl = fetchUrl(url, data, baseUrl, self.prevSearch)
+                except ValueError as msg:
+                    # assume there is no previous URL, but print a warning
+                    out.warn("%s Assuming no previous comic strips exist." % msg)
+                else:
+                    prevUrl = self.prevUrlModifier(prevUrl)
+                    out.debug("Matched previous URL %s" % prevUrl)
             seen_urls.add(url)
             if prevUrl in seen_urls:
                 # avoid recursive URL loops
@@ -95,16 +95,21 @@ def case_insensitive_re(name):
 
 baseSearch = re.compile(tagre("base", "href", '([^"]*)'))
 
-def getPageContent(url, max_content_bytes=MaxContentBytes, session=None):
+def getPageContent(url, session, max_content_bytes=MaxContentBytes):
     """Get text content of given URL."""
-    check_robotstxt(url)
+    check_robotstxt(url, session)
     # read page data
-    page = urlopen(url, max_content_bytes=max_content_bytes, session=session)
+    page = urlopen(url, session, max_content_bytes=max_content_bytes)
     data = page.text
-    if not data:
+    tries = 0
+    while not data and tries < 5:
         # sometimes the python requests library is wonky - try again
-        page = urlopen(url, max_content_bytes=max_content_bytes, session=session)
+        time.sleep(5)
+        page = urlopen(url, session, max_content_bytes=max_content_bytes)
         data = page.text
+        tries += 1
+    if not data:
+        raise ValueError("Got empty data from %s" % url)
     # determine base URL
     baseUrl = None
     match = baseSearch.search(data)
@@ -115,9 +120,9 @@ def getPageContent(url, max_content_bytes=MaxContentBytes, session=None):
     return data, baseUrl
 
 
-def getImageObject(url, referrer, max_content_bytes=MaxImageBytes):
+def getImageObject(url, referrer, session, max_content_bytes=MaxImageBytes):
     """Get response object for given image URL."""
-    return urlopen(url, referrer=referrer, max_content_bytes=max_content_bytes)
+    return urlopen(url, session, referrer=referrer, max_content_bytes=max_content_bytes)
 
 
 def fetchUrls(url, data, baseUrl, urlSearch):
@@ -191,21 +196,21 @@ def get_roboturl(url):
     return urlparse.urlunparse((pu[0], pu[1], "/robots.txt", "", "", ""))
 
 
-def check_robotstxt(url):
+def check_robotstxt(url, session):
     """Check if robots.txt allows our user agent for the given URL.
     @raises: IOError if URL is not allowed
     """
     roboturl = get_roboturl(url)
-    rp = get_robotstxt_parser(roboturl)
+    rp = get_robotstxt_parser(roboturl, session)
     if not rp.can_fetch(UserAgent, url):
         raise IOError("%s is disallowed by robots.txt" % url)
 
 
 @memoized
-def get_robotstxt_parser(url):
+def get_robotstxt_parser(url, session):
     """Get a RobotFileParser for the given robots.txt URL."""
     rp = robotparser.RobotFileParser()
-    req = urlopen(url, max_content_bytes=MaxContentBytes, raise_for_status=False)
+    req = urlopen(url, session, max_content_bytes=MaxContentBytes, raise_for_status=False)
     if req.status_code in (401, 403):
         rp.disallow_all = True
     elif req.status_code >= 400:
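The robots.txt helpers above now take the pooled session as well. A standalone sketch of the same idea, written against Python 3's urllib.robotparser rather than the older robotparser module used in the hunk (the user agent, error handling, and URL splitting are simplified placeholders, not the project's code):

import urllib.robotparser
from urllib.parse import urlsplit, urlunsplit

import requests

USER_AGENT = "example-bot"  # placeholder; the project defines its own UserAgent
session = requests.Session()

def robots_allows(url):
    """Fetch robots.txt over the shared session and check permission."""
    parts = urlsplit(url)
    roboturl = urlunsplit((parts.scheme, parts.netloc, "/robots.txt", "", ""))
    rp = urllib.robotparser.RobotFileParser()
    resp = session.get(roboturl, headers={"User-Agent": USER_AGENT})
    if resp.status_code in (401, 403):
        rp.disallow_all = True   # authorization required: treat everything as disallowed
    elif resp.status_code >= 400:
        rp.allow_all = True      # no usable robots.txt: treat everything as allowed
    else:
        rp.parse(resp.text.splitlines())
    return rp.can_fetch(USER_AGENT, url)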
@@ -215,16 +220,14 @@ def get_robotstxt_parser(url):
     return rp
 
 
-def urlopen(url, referrer=None, max_content_bytes=None,
-            timeout=ConnectionTimeoutSecs, session=None, raise_for_status=True):
+def urlopen(url, session, referrer=None, max_content_bytes=None,
+            timeout=ConnectionTimeoutSecs, raise_for_status=True):
     """Open an URL and return the response object."""
     out.debug('Open URL %s' % url)
     headers = {'User-Agent': UserAgent}
     if referrer:
         headers['Referer'] = referrer
     out.debug('Sending headers %s' % headers, level=3)
-    if session is None:
-        session = requests
     kwargs = {
         "headers": headers,
         "timeout": timeout,