Add imageUrlModifier() for scrapers.
This commit is contained in:
parent
309da4c397
commit
3712799ee0
1 changed file with 15 additions and 7 deletions
|
@@ -105,13 +105,14 @@ class _BasicScraper(object):
|
||||||
def getStrip(self, url):
|
def getStrip(self, url):
|
||||||
"""Get comic strip for given URL."""
|
"""Get comic strip for given URL."""
|
||||||
data, baseUrl = getPageContent(url, self.session)
|
data, baseUrl = getPageContent(url, self.session)
|
||||||
imageUrls = set(fetchUrls(url, data, baseUrl, self.imageSearch))
|
return self.getComicStrip(url, data, baseUrl)
|
||||||
|
|
||||||
|
def getComicStrip(self, url, data, baseUrl):
|
||||||
|
"""Get comic strip downloader for given URL and data."""
|
||||||
|
imageUrls = fetchUrls(url, data, baseUrl, self.imageSearch)
|
||||||
|
imageUrls = set(map(self.imageUrlModifier, imageUrls))
|
||||||
if len(imageUrls) > 1 and not self.multipleImagesPerStrip:
|
if len(imageUrls) > 1 and not self.multipleImagesPerStrip:
|
||||||
out.warn("found %d images instead of 1 with %s" % (len(imageUrls), self.imageSearch.pattern))
|
out.warn("found %d images instead of 1 with %s" % (len(imageUrls), self.imageSearch.pattern))
|
||||||
return self.getComicStrip(url, imageUrls)
|
|
||||||
|
|
||||||
def getComicStrip(self, url, imageUrls):
|
|
||||||
"""Get comic strip downloader for given URL and images."""
|
|
||||||
return ComicStrip(self.get_name(), url, imageUrls, self.namer, self.session)
|
return ComicStrip(self.get_name(), url, imageUrls, self.namer, self.session)
|
||||||
|
|
||||||
def getAllStrips(self, maxstrips=None):
|
def getAllStrips(self, maxstrips=None):
|
||||||
|
@@ -145,8 +146,7 @@ class _BasicScraper(object):
|
||||||
if url in self.noImageUrls:
|
if url in self.noImageUrls:
|
||||||
self.skipUrl(url)
|
self.skipUrl(url)
|
||||||
else:
|
else:
|
||||||
imageUrls = set(fetchUrls(url, data, baseUrl, self.imageSearch))
|
yield self.getComicStrip(url, data, baseUrl)
|
||||||
yield self.getComicStrip(url, imageUrls)
|
|
||||||
if self.firstStripUrl == url:
|
if self.firstStripUrl == url:
|
||||||
out.debug("Stop at first URL %s" % url)
|
out.debug("Stop at first URL %s" % url)
|
||||||
self.hitFirstStripUrl = True
|
self.hitFirstStripUrl = True
|
||||||
|
@@ -201,6 +201,14 @@ class _BasicScraper(object):
|
||||||
"""
|
"""
|
||||||
return prevUrl
|
return prevUrl
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def imageUrlModifier(cls, imageUrl):
|
||||||
|
"""Optional modification of parsed image URLs. Useful if the URL
|
||||||
|
needs to be fixed before usage. The default implementation does
|
||||||
|
not modify the URL.
|
||||||
|
"""
|
||||||
|
return imageUrl
|
||||||
|
|
||||||
def getFilename(self, imageUrl, pageUrl):
|
def getFilename(self, imageUrl, pageUrl):
|
||||||
"""Return filename for given image and page URL."""
|
"""Return filename for given image and page URL."""
|
||||||
return self.namer(imageUrl, pageUrl)
|
return self.namer(imageUrl, pageUrl)
|
||||||
|
|
Loading…
Reference in a new issue