diff --git a/dosagelib/helpers.py b/dosagelib/helpers.py
index 3233dd6d3..17be7372b 100644
--- a/dosagelib/helpers.py
+++ b/dosagelib/helpers.py
@@ -1,7 +1,7 @@
 # -*- coding: iso-8859-1 -*-
 # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
 # Copyright (C) 2012-2013 Bastian Kleineidam
-from .util import fetchUrl, getQueryParams
+from .util import fetchUrl, getPageContent, getQueryParams
 
 def queryNamer(paramName, usePageUrl=False):
     """Get name from URL query part."""
@@ -29,24 +29,18 @@ def bounceStarter(url, nextSearch):
     @classmethod
     def _starter(cls):
         """Get bounced start URL."""
-        url1 = fetchUrl(url, cls.prevSearch, session=cls.session)
-        if not url1:
-            raise ValueError("could not find prevSearch pattern %r in %s" % (cls.prevSearch.pattern, url))
-        url2 = fetchUrl(url1, nextSearch, session=cls.session)
-        if not url2:
-            raise ValueError("could not find nextSearch pattern %r in %s" % (nextSearch.pattern, url1))
-        return url2
+        data, baseUrl = getPageContent(url, session=cls.session)
+        url1 = fetchUrl(url, data, baseUrl, cls.prevSearch)
+        data, baseUrl = getPageContent(url1, session=cls.session)
+        return fetchUrl(url1, data, baseUrl, nextSearch)
     return _starter
 
 
-def indirectStarter(baseUrl, latestSearch):
+def indirectStarter(url, latestSearch):
     """Get start URL by indirection."""
     @classmethod
     def _starter(cls):
         """Get indirect start URL."""
-        url = fetchUrl(baseUrl, latestSearch, session=cls.session)
-        if not url:
-            raise ValueError("could not find latestSearch pattern %r in %s" % (latestSearch.pattern, baseUrl))
-        return url
+        data, baseUrl = getPageContent(url, session=cls.session)
+        return fetchUrl(url, data, baseUrl, latestSearch)
     return _starter
-
diff --git a/dosagelib/plugins/clonemanga.py b/dosagelib/plugins/clonemanga.py
index 00f191c31..b0b6ca272 100644
--- a/dosagelib/plugins/clonemanga.py
+++ b/dosagelib/plugins/clonemanga.py
@@ -3,7 +3,7 @@
 # Copyright (C) 2012-2013 Bastian Kleineidam
 from re import compile
 from ..scraper import make_scraper
-from ..util import tagre, getQueryParams, fetchUrl
+from ..util import tagre, getQueryParams, fetchUrl, getPageContent
 
 _linkTag = tagre("a", "href", r'([^"]+)')
 
@@ -25,17 +25,15 @@ def add(name, shortName, imageFolder=None, lastStrip=None):
     @classmethod
     def _starter(cls):
         # first, try hopping to previous and next comic
-        url = fetchUrl(baseUrl, _prevSearch)
-        if not url:
+        data, _baseUrl = getPageContent(baseUrl, session=cls.session)
+        try:
+            url = fetchUrl(baseUrl, data, _baseUrl, _prevSearch)
+        except ValueError:
             # no previous link found, try hopping to last comic
-            url = fetchUrl(baseUrl, _lastSearch)
-            if not url:
-                raise ValueError("could not find lastSearch pattern %r in %s" % (_lastSearch.pattern, baseUrl))
-            return url
-        url = fetchUrl(url, _nextSearch)
-        if not url:
-            raise ValueError("could not find nextSearch pattern %r in %s" % (_nextSearch.pattern, url))
-        return url
+            return fetchUrl(baseUrl, data, _baseUrl, _lastSearch)
+        else:
+            data, _baseUrl = getPageContent(url, session=cls.session)
+            return fetchUrl(url, data, _baseUrl, _nextSearch)
 
     attrs = dict(
         name='CloneManga/' + name,
diff --git a/dosagelib/plugins/drunkduck.py b/dosagelib/plugins/drunkduck.py
index 13ead0bd0..ec62e1c14 100644
--- a/dosagelib/plugins/drunkduck.py
+++ b/dosagelib/plugins/drunkduck.py
@@ -4,7 +4,7 @@
 from re import compile
 
 from ..scraper import make_scraper
-from ..util import tagre, fetchUrl
+from ..util import tagre, fetchUrl, getPageContent
 
 # note: adding the compile() functions inside add() is a major performance hog
 _imageSearch = compile(tagre("img", "src", r'(http://media\.drunkduck\.com\.s3\.amazonaws\.com:80/[^"]+)', before="page-image"))
@@ -26,17 +26,15 @@ def add(name, path):
     @classmethod
     def _starter(cls):
         # first, try hopping to previous and next comic
-        url = fetchUrl(_url, _prevSearch)
-        if not url:
+        data, baseUrl = getPageContent(_url, session=cls.session)
+        try:
+            url = fetchUrl(_url, data, baseUrl, _prevSearch)
+        except ValueError:
             # no previous link found, try hopping to last comic
-            url = fetchUrl(_url, _lastSearch)
-            if not url:
-                raise ValueError("could not find lastSearch pattern %r in %s" % (_lastSearch.pattern, _url))
-            return url
-        url = fetchUrl(url, _nextSearch)
-        if not url:
-            raise ValueError("could not find nextSearch pattern %r in %s" % (_nextSearch.pattern, url))
-        return url
+            return fetchUrl(_url, data, baseUrl, _lastSearch)
+        else:
+            data, baseUrl = getPageContent(url, session=cls.session)
+            return fetchUrl(url, data, baseUrl, _nextSearch)
 
     globals()[classname] = make_scraper(classname,
         name = 'DrunkDuck/' + name,
diff --git a/dosagelib/plugins/smackjeeves.py b/dosagelib/plugins/smackjeeves.py
index 2e82afe13..392feace4 100644
--- a/dosagelib/plugins/smackjeeves.py
+++ b/dosagelib/plugins/smackjeeves.py
@@ -3,7 +3,7 @@
 # Copyright (C) 2012-2013 Bastian Kleineidam
 from re import compile
 from ..scraper import make_scraper
-from ..util import tagre, quote, fetchUrl, case_insensitive_re
+from ..util import tagre, quote, fetchUrl, case_insensitive_re, getPageContent
 
 _imageSearch = compile(tagre("img", "src", r'([^"]+)', after='id="comic_image"'))
 _linkSearch = tagre("a", "href", r'([^>"]*/comics/\d+/[^>"]*)', quote='"?')
@@ -30,15 +30,14 @@ def add(name, url, description, adult, bounce):
     @classmethod
     def _starter(cls):
         """Get start URL."""
-        url1 = fetchUrl(modifier(url), cls.prevSearch, session=cls.session)
-        if not url1:
-            raise ValueError("could not find prevSearch pattern %r in %s" % (cls.prevSearch.pattern, modifier(url)))
+        url1 = modifier(url)
+        data, baseUrl = getPageContent(url1, session=cls.session)
+        url2 = fetchUrl(url1, data, baseUrl, cls.prevSearch)
         if bounce:
-            url2 = fetchUrl(modifier(url1), _nextSearch, session=cls.session)
-            if not url2:
-                raise ValueError("could not find nextSearch pattern %r in %s" % (_nextSearch.pattern, modifier(url1)))
-            return modifier(url2)
-        return modifier(url1)
+            data, baseUrl = getPageContent(url2, session=cls.session)
+            url3 = fetchUrl(url2, data, baseUrl, _nextSearch)
+            return modifier(url3)
+        return modifier(url2)
 
     @classmethod
     def namer(cls, imageUrl, pageUrl):
diff --git a/dosagelib/scraper.py b/dosagelib/scraper.py
index 373ca26f8..ef97c5750 100644
--- a/dosagelib/scraper.py
+++ b/dosagelib/scraper.py
@@ -3,7 +3,7 @@
 # Copyright (C) 2012-2013 Bastian Kleineidam
 import requests
 from . import loader
-from .util import fetchUrls
+from .util import fetchUrl, fetchUrls, getPageContent
 from .comic import ComicStrip
 from .output import out
 
@@ -62,7 +62,8 @@ class _BasicScraper(object):
 
     def getStrip(self, url):
         """Get comic strip for given URL."""
-        imageUrls = fetchUrls(url, self.imageSearch, session=self.session)[0]
+        data, baseUrl = getPageContent(url, session=self.session)
+        imageUrls = set(fetchUrls(url, data, baseUrl, self.imageSearch))
         if len(imageUrls) > 1 and not self.multipleImagesPerStrip:
             out.warn("found %d images instead of 1 with %s" % (len(imageUrls), self.imageSearch.pattern))
         return self.getComicStrip(url, imageUrls)
@@ -97,12 +98,13 @@ class _BasicScraper(object):
         retrieving the given number of strips."""
         seen_urls = set()
         while url:
-            imageUrls, prevUrl = fetchUrls(url, self.imageSearch,
-                self.prevSearch, session=self.session)
+            data, baseUrl = getPageContent(url, session=self.session)
+            imageUrls = set(fetchUrls(url, data, baseUrl, self.imageSearch))
+            yield self.getComicStrip(url, imageUrls)
+            prevUrl = fetchUrl(url, data, baseUrl, self.prevSearch)
             prevUrl = self.prevUrlModifier(prevUrl)
             out.debug("Matched previous URL %s" % prevUrl)
             seen_urls.add(url)
-            yield self.getComicStrip(url, imageUrls)
             if prevUrl in seen_urls:
                 # avoid recursive URL loops
                 out.warn("Already seen previous URL %r" % prevUrl)
diff --git a/dosagelib/util.py b/dosagelib/util.py
index 28f7859a8..44d6c54da 100644
--- a/dosagelib/util.py
+++ b/dosagelib/util.py
@@ -99,8 +99,7 @@ def getPageContent(url, max_content_bytes=MaxContentBytes, session=None):
     """Get text content of given URL."""
     check_robotstxt(url)
     # read page data
-    page = urlopen(url, max_content_bytes=max_content_bytes,
-        session=session)
+    page = urlopen(url, max_content_bytes=max_content_bytes, session=session)
     data = page.text
     # determine base URL
     baseUrl = None
@@ -117,45 +116,23 @@ def getImageObject(url, referrer, max_content_bytes=MaxImageBytes):
     return urlopen(url, referrer=referrer, max_content_bytes=max_content_bytes)
 
 
-def fetchUrl(url, urlSearch, session=None):
-    """Search for given URL pattern in a HTML page."""
-    data, baseUrl = getPageContent(url, session=session)
-    match = urlSearch.search(data)
-    if match:
+def fetchUrls(url, data, baseUrl, urlSearch):
+    """Search all entries for given URL pattern in a HTML page."""
+    searchUrls = []
+    for match in urlSearch.finditer(data):
         searchUrl = match.group(1)
         if not searchUrl:
-            raise ValueError("Match empty URL at %s with pattern %s" % (url, urlSearch.pattern))
-        out.debug('matched URL %r' % searchUrl)
-        return normaliseURL(urlparse.urljoin(baseUrl, searchUrl))
-    return None
+            raise ValueError("Pattern %s matched empty URL at %s." % (urlSearch.pattern, url))
+        out.debug('matched URL %r with pattern %s' % (searchUrl, urlSearch.pattern))
+        searchUrls.append(normaliseURL(urlparse.urljoin(baseUrl, searchUrl)))
+    if not searchUrls:
+        raise ValueError("Pattern %s not found at URL %s with data %r." % (urlSearch.pattern, url, data))
+    return searchUrls
 
 
-def fetchUrls(url, imageSearch, prevSearch=None, session=None):
-    """Search for given image and previous URL pattern in a HTML page."""
-    data, baseUrl = getPageContent(url, session=session)
-    # match images
-    imageUrls = set()
-    for match in imageSearch.finditer(data):
-        imageUrl = match.group(1)
-        if not imageUrl:
-            raise ValueError("Match empty image URL at %s with pattern %s" % (url, imageSearch.pattern))
-        out.debug('matched image URL %r with pattern %s' % (imageUrl, imageSearch.pattern))
-        imageUrls.add(normaliseURL(urlparse.urljoin(baseUrl, imageUrl)))
-    if not imageUrls:
-        out.warn("no images found at %s with pattern %s" % (url, imageSearch.pattern))
-    if prevSearch is not None:
-        # match previous URL
-        match = prevSearch.search(data)
-        if match:
-            prevUrl = match.group(1)
-            if not prevUrl:
-                raise ValueError("Match empty previous URL at %s with pattern %s" % (url, prevSearch.pattern))
-            prevUrl = normaliseURL(urlparse.urljoin(baseUrl, prevUrl))
-        else:
-            out.debug('no previous URL %s at %s' % (prevSearch.pattern, url))
-            prevUrl = None
-        return imageUrls, prevUrl
-    return imageUrls, None
+def fetchUrl(url, data, baseUrl, urlSearch):
+    """Search first entry for given URL pattern in a HTML page."""
+    return fetchUrls(url, data, baseUrl, urlSearch)[0]
 
 
 def unescape(text):
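
Taken together, the patch inverts the old flow: callers now fetch a page once with getPageContent() and hand the resulting data/baseUrl pair to fetchUrl()/fetchUrls(), which become pure pattern matchers that raise ValueError on failure instead of returning None. Below is a minimal standalone sketch of that new contract, using only the standard library so it runs outside a dosage checkout; the sample HTML, regexes, and example.com URLs are invented for illustration, and Python 3 is assumed for urllib.parse.

import re
from urllib.parse import urljoin

def fetchUrls(url, data, baseUrl, urlSearch):
    """Collect every match of urlSearch in data, resolved against baseUrl."""
    searchUrls = []
    for match in urlSearch.finditer(data):
        searchUrl = match.group(1)
        if not searchUrl:
            raise ValueError("Pattern %s matched empty URL at %s." % (urlSearch.pattern, url))
        searchUrls.append(urljoin(baseUrl, searchUrl))
    if not searchUrls:
        # "not found" is now an error instead of a None return value
        raise ValueError("Pattern %s not found at URL %s." % (urlSearch.pattern, url))
    return searchUrls

def fetchUrl(url, data, baseUrl, urlSearch):
    """Return only the first match."""
    return fetchUrls(url, data, baseUrl, urlSearch)[0]

# Callers fetch the page once and run several searches over the same data:
data = '<a href="/comics/42/">prev</a> <img src="/images/strip42.png"/>'
baseUrl = 'http://example.com/comics/43/'
prevSearch = re.compile(r'<a href="([^"]*/comics/\d+/[^"]*)"')
imageSearch = re.compile(r'<img src="([^"]+)"')

print(fetchUrl(baseUrl, data, baseUrl, prevSearch))
# -> http://example.com/comics/42/
print(fetchUrls(baseUrl, data, baseUrl, imageSearch))
# -> ['http://example.com/images/strip42.png']

Raising ValueError for a missing match, rather than returning None, is what lets the CloneManga and DrunkDuck starters above collapse their "if not url:" fallback chains into a single try/except/else.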