diff --git a/dosagelib/comic.py b/dosagelib/comic.py
index 84aeb42fe..b31b8e38d 100644
--- a/dosagelib/comic.py
+++ b/dosagelib/comic.py
@@ -17,12 +17,13 @@ class FetchComicError(IOError):
 class ComicStrip(object):
     """A list of comic image URLs."""
 
-    def __init__(self, name, stripUrl, imageUrls, namer):
+    def __init__(self, name, stripUrl, imageUrls, namer, session):
         """Store the image URL list."""
         self.name = name
         self.stripUrl = stripUrl
         self.imageUrls = imageUrls
         self.namer = namer
+        self.session = session
 
     def getImages(self):
         """Get a list of image downloaders."""
@@ -35,13 +36,15 @@ class ComicStrip(object):
         if filename is None:
             filename = url.rsplit('/', 1)[1]
         dirname = getDirname(self.name)
-        return ComicImage(self.name, url, self.stripUrl, dirname, filename)
+        return ComicImage(self.name, url, self.stripUrl, dirname, filename, self.session)
 
 
 class ComicImage(object):
     """A comic image downloader."""
 
-    def __init__(self, name, url, referrer, dirname, filename):
+    ChunkBytes = 1024 * 100 # 100KB
+
+    def __init__(self, name, url, referrer, dirname, filename, session):
         """Set URL and filename."""
         self.name = name
         self.referrer = referrer
@@ -49,11 +52,12 @@ class ComicImage(object):
         self.dirname = dirname
         filename = getFilename(filename)
         self.filename, self.ext = os.path.splitext(filename)
+        self.session = session
 
     def connect(self):
         """Connect to host and get meta information."""
         try:
-            self.urlobj = getImageObject(self.url, self.referrer)
+            self.urlobj = getImageObject(self.url, self.referrer, self.session)
         except IOError as msg:
             raise FetchComicError('Unable to retrieve URL.', self.url, msg)
 
@@ -100,7 +104,8 @@ class ComicImage(object):
         try:
             out.debug('Writing comic to file %s...' % fn)
             with open(fn, 'wb') as comicOut:
-                comicOut.write(self.urlobj.content)
+                for chunk in self.urlobj.iter_content(chunk_size=self.ChunkBytes):
+                    comicOut.write(chunk)
             self.touch(fn)
         except Exception:
             if os.path.isfile(fn):
diff --git a/dosagelib/helpers.py b/dosagelib/helpers.py
index 17be7372b..9a7a5825c 100644
--- a/dosagelib/helpers.py
+++ b/dosagelib/helpers.py
@@ -29,9 +29,9 @@ def bounceStarter(url, nextSearch):
     @classmethod
     def _starter(cls):
         """Get bounced start URL."""
-        data, baseUrl = getPageContent(url, session=cls.session)
+        data, baseUrl = getPageContent(url, cls.session)
         url1 = fetchUrl(url, data, baseUrl, cls.prevSearch)
-        data, baseUrl = getPageContent(url1, session=cls.session)
+        data, baseUrl = getPageContent(url1, cls.session)
         return fetchUrl(url1, data, baseUrl, nextSearch)
     return _starter
 
@@ -41,6 +41,6 @@ def indirectStarter(url, latestSearch):
     @classmethod
     def _starter(cls):
         """Get indirect start URL."""
-        data, baseUrl = getPageContent(url, session=cls.session)
+        data, baseUrl = getPageContent(url, cls.session)
         return fetchUrl(url, data, baseUrl, latestSearch)
     return _starter
diff --git a/dosagelib/plugins/clonemanga.py b/dosagelib/plugins/clonemanga.py
index b0b6ca272..0305eb86b 100644
--- a/dosagelib/plugins/clonemanga.py
+++ b/dosagelib/plugins/clonemanga.py
@@ -25,14 +25,14 @@ def add(name, shortName, imageFolder=None, lastStrip=None):
     @classmethod
     def _starter(cls):
         # first, try hopping to previous and next comic
-        data, _baseUrl = getPageContent(baseUrl, session=cls.session)
+        data, _baseUrl = getPageContent(baseUrl, cls.session)
         try:
             url = fetchUrl(baseUrl, data, _baseUrl, _prevSearch)
         except ValueError:
             # no previous link found, try hopping to last comic
             return fetchUrl(baseUrl, data, _baseUrl, _lastSearch)
         else:
-            data, _baseUrl = getPageContent(url, session=cls.session)
+            data, _baseUrl = getPageContent(url, cls.session)
             return fetchUrl(url, data, _baseUrl, _nextSearch)
 
     attrs = dict(
diff --git a/dosagelib/plugins/drunkduck.py b/dosagelib/plugins/drunkduck.py
index ec62e1c14..df9b55914 100644
--- a/dosagelib/plugins/drunkduck.py
+++ b/dosagelib/plugins/drunkduck.py
@@ -26,14 +26,14 @@ def add(name, path):
     @classmethod
     def _starter(cls):
         # first, try hopping to previous and next comic
-        data, baseUrl = getPageContent(_url, session=cls.session)
+        data, baseUrl = getPageContent(_url, cls.session)
         try:
             url = fetchUrl(_url, data, baseUrl, _prevSearch)
         except ValueError:
             # no previous link found, try hopping to last comic
             return fetchUrl(_url, data, baseUrl, _lastSearch)
         else:
-            data, baseUrl = getPageContent(url, session=cls.session)
+            data, baseUrl = getPageContent(url, cls.session)
             return fetchUrl(url, data, baseUrl, _nextSearch)
 
     globals()[classname] = make_scraper(classname,
diff --git a/dosagelib/plugins/smackjeeves.py b/dosagelib/plugins/smackjeeves.py
index 392feace4..70586b652 100644
--- a/dosagelib/plugins/smackjeeves.py
+++ b/dosagelib/plugins/smackjeeves.py
@@ -31,10 +31,10 @@ def add(name, url, description, adult, bounce):
     def _starter(cls):
         """Get start URL."""
         url1 = modifier(url)
-        data, baseUrl = getPageContent(url1, session=cls.session)
+        data, baseUrl = getPageContent(url1, cls.session)
         url2 = fetchUrl(url1, data, baseUrl, cls.prevSearch)
         if bounce:
-            data, baseUrl = getPageContent(url2, session=cls.session)
+            data, baseUrl = getPageContent(url2, cls.session)
             url3 = fetchUrl(url2, data, baseUrl, _nextSearch)
             return modifier(url3)
         return modifier(url2)
diff --git a/dosagelib/plugins/universal.py b/dosagelib/plugins/universal.py
index f2d924b06..e886dc08e 100644
--- a/dosagelib/plugins/universal.py
+++ b/dosagelib/plugins/universal.py
@@ -26,7 +26,7 @@ def add(name, shortname):
         <img alt="Marmaduke" src="..."/>
         <h4>published: Sunday, November 11, 2012</h4>
         """
-        data = getPageContent(pageUrl)[0]
+        data = getPageContent(pageUrl, cls.session)[0]
         ro = compile(tagre("img", "src", escape(imageUrl)) + r'\s+<h4>published: ([^<]+)</h4>')
         mo = ro.search(data)
         if mo:
diff --git a/dosagelib/scraper.py b/dosagelib/scraper.py
index ef97c5750..64afbe5a3 100644
--- a/dosagelib/scraper.py
+++ b/dosagelib/scraper.py
@@ -62,7 +62,7 @@ class _BasicScraper(object):
 
     def getStrip(self, url):
         """Get comic strip for given URL."""
-        data, baseUrl = getPageContent(url, session=self.session)
+        data, baseUrl = getPageContent(url, self.session)
         imageUrls = set(fetchUrls(url, data, baseUrl, self.imageSearch))
         if len(imageUrls) > 1 and not self.multipleImagesPerStrip:
             out.warn("found %d images instead of 1 with %s" % (len(imageUrls), self.imageSearch.pattern))
@@ -70,7 +70,7 @@ class _BasicScraper(object):
 
     def getComicStrip(self, url, imageUrls):
         """Get comic strip downloader for given URL and images."""
-        return ComicStrip(self.get_name(), url, imageUrls, self.namer)
+        return ComicStrip(self.get_name(), url, imageUrls, self.namer, self.session)
 
     def getAllStrips(self, maxstrips=None):
         """Get all comic strips."""
@@ -98,12 +98,19 @@ class _BasicScraper(object):
         retrieving the given number of strips."""
         seen_urls = set()
         while url:
-            data, baseUrl = getPageContent(url, session=self.session)
+            data, baseUrl = getPageContent(url, self.session)
             imageUrls = set(fetchUrls(url, data, baseUrl, self.imageSearch))
             yield self.getComicStrip(url, imageUrls)
-            prevUrl = fetchUrl(url, data, baseUrl, self.prevSearch)
-            prevUrl = self.prevUrlModifier(prevUrl)
-            out.debug("Matched previous URL %s" % prevUrl)
+            prevUrl = None
+            if self.prevSearch:
+                try:
+                    prevUrl = fetchUrl(url, data, baseUrl, self.prevSearch)
+                except ValueError as msg:
+                    # assume there is no previous URL, but print a warning
+                    out.warn("%s Assuming no previous comic strips exist." % msg)
+                else:
+                    prevUrl = self.prevUrlModifier(prevUrl)
+                    out.debug("Matched previous URL %s" % prevUrl)
             seen_urls.add(url)
             if prevUrl in seen_urls:
                 # avoid recursive URL loops
diff --git a/dosagelib/util.py b/dosagelib/util.py
index 6faaf64e5..4d934c3c7 100644
--- a/dosagelib/util.py
+++ b/dosagelib/util.py
@@ -95,16 +95,21 @@ def case_insensitive_re(name):
 
 baseSearch = re.compile(tagre("base", "href", '([^"]*)'))
 
-def getPageContent(url, max_content_bytes=MaxContentBytes, session=None):
+def getPageContent(url, session, max_content_bytes=MaxContentBytes):
     """Get text content of given URL."""
-    check_robotstxt(url)
+    check_robotstxt(url, session)
     # read page data
-    page = urlopen(url, max_content_bytes=max_content_bytes, session=session)
+    page = urlopen(url, session, max_content_bytes=max_content_bytes)
     data = page.text
-    if not data:
+    tries = 0
+    while not data and tries < 5:
         # sometimes the python requests library is wonky - try again
-        page = urlopen(url, max_content_bytes=max_content_bytes, session=session)
+        time.sleep(5)
+        page = urlopen(url, session, max_content_bytes=max_content_bytes)
         data = page.text
+        tries += 1
+    if not data:
+        raise ValueError("Got empty data from %s" % url)
     # determine base URL
     baseUrl = None
     match = baseSearch.search(data)
@@ -115,9 +120,9 @@ def getPageContent(url, max_content_bytes=MaxContentBytes, session=None):
     return data, baseUrl
 
 
-def getImageObject(url, referrer, max_content_bytes=MaxImageBytes):
+def getImageObject(url, referrer, session, max_content_bytes=MaxImageBytes):
     """Get response object for given image URL."""
-    return urlopen(url, referrer=referrer, max_content_bytes=max_content_bytes)
+    return urlopen(url, session, referrer=referrer, max_content_bytes=max_content_bytes)
 
 
 def fetchUrls(url, data, baseUrl, urlSearch):
@@ -191,21 +196,21 @@ def get_roboturl(url):
     return urlparse.urlunparse((pu[0], pu[1], "/robots.txt", "", "", ""))
 
 
-def check_robotstxt(url):
+def check_robotstxt(url, session):
     """Check if robots.txt allows our user agent for the given URL.
     @raises: IOError if URL is not allowed
     """
     roboturl = get_roboturl(url)
-    rp = get_robotstxt_parser(roboturl)
+    rp = get_robotstxt_parser(roboturl, session)
     if not rp.can_fetch(UserAgent, url):
         raise IOError("%s is disallowed by robots.txt" % url)
 
 
 @memoized
-def get_robotstxt_parser(url):
+def get_robotstxt_parser(url, session):
     """Get a RobotFileParser for the given robots.txt URL."""
     rp = robotparser.RobotFileParser()
-    req = urlopen(url, max_content_bytes=MaxContentBytes, raise_for_status=False)
+    req = urlopen(url, session, max_content_bytes=MaxContentBytes, raise_for_status=False)
     if req.status_code in (401, 403):
         rp.disallow_all = True
     elif req.status_code >= 400:
@@ -215,16 +220,14 @@ def get_robotstxt_parser(url):
     return rp
 
 
-def urlopen(url, referrer=None, max_content_bytes=None,
-        timeout=ConnectionTimeoutSecs, session=None, raise_for_status=True):
+def urlopen(url, session, referrer=None, max_content_bytes=None,
+        timeout=ConnectionTimeoutSecs, raise_for_status=True):
    """Open an URL and return the response object."""
     out.debug('Open URL %s' % url)
     headers = {'User-Agent': UserAgent}
     if referrer:
         headers['Referer'] = referrer
     out.debug('Sending headers %s' % headers, level=3)
-    if session is None:
-        session = requests
     kwargs = {
         "headers": headers,
         "timeout": timeout,
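
The hunks above all serve one change: a shared requests.Session is threaded from each scraper class through getPageContent() and urlopen(), replacing the old fallback to the module-level requests API. The session attribute on the scraper classes is set up outside this diff; the sketch below (hypothetical class name and URL) shows the assumed pattern and why it pays off: one Session per scraper keeps pooled HTTP connections and cookies across all fetches.

    import requests

    class ComicScraper(object):
        # One Session per scraper class: pooled connections and cookies
        # persist across every page fetch for this comic.
        session = requests.Session()

        @classmethod
        def starter(cls):
            # Mirrors the getPageContent(url, cls.session) calls in the patch.
            response = cls.session.get('https://example.com/comic/', timeout=60)
            response.raise_for_status()
            return response.text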
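
getPageContent() also gains a bounded retry loop: instead of a single blind re-fetch when the body comes back empty, it now retries up to five times with a pause and finally raises ValueError. A standalone sketch of that logic with illustrative names (fetch_text is not a dosagelib function):

    import time
    import requests

    def fetch_text(url, session, retries=5, delay=5):
        """Fetch page text, retrying while the body comes back empty."""
        data = session.get(url, timeout=60).text
        tries = 0
        while not data and tries < retries:
            time.sleep(delay)  # give the flaky server a moment, then retry
            data = session.get(url, timeout=60).text
            tries += 1
        if not data:
            raise ValueError("Got empty data from %s" % url)
        return data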
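
Finally, ComicImage.save() stops buffering whole images via response.content and instead writes them in ChunkBytes-sized pieces with iter_content(). Note that iter_content() only streams from the socket when the request was made with stream=True; whether urlopen() sets that flag is not visible in this diff. A minimal sketch with plain requests and a hypothetical URL:

    import requests

    CHUNK_BYTES = 1024 * 100  # mirrors ComicImage.ChunkBytes

    session = requests.Session()
    response = session.get('https://example.com/comic.png', stream=True, timeout=60)
    response.raise_for_status()
    with open('comic.png', 'wb') as comic_out:
        # Write the image piecewise instead of holding it all in memory.
        for chunk in response.iter_content(chunk_size=CHUNK_BYTES):
            comic_out.write(chunk)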