Provide page data to the shouldSkipUrl() function
This commit is contained in:
parent
73e1af7aba
commit
875e431edc
9 changed files with 13 additions and 13 deletions
|
@ -223,7 +223,7 @@ class AmazingSuperPowers(_BasicScraper):
|
|||
prevSearch = compile(tagre("a", "href", r'(%s[^"]+)' % rurl, after="prev"))
|
||||
help = 'Index format: yyyy/mm/name'
|
||||
|
||||
def shouldSkipUrl(self, url):
|
||||
def shouldSkipUrl(self, url, data):
|
||||
"""Skip pages without images."""
|
||||
return url in (
|
||||
# video
|
||||
|
@ -326,7 +326,7 @@ class AstronomyPOTD(_BasicScraper):
|
|||
prevSearch = compile(tagre("a", "href", r'(ap\d{6}\.html)') + "<</a>")
|
||||
help = 'Index format: yymmdd'
|
||||
|
||||
def shouldSkipUrl(self, url):
|
||||
def shouldSkipUrl(self, url, data):
|
||||
"""Skip pages without images."""
|
||||
return url in (
|
||||
self.stripUrl % '130217', # video
|
||||
|
|
|
@ -94,7 +94,7 @@ class Eriadan(_BasicScraper):
|
|||
prevSearch = compile(tagre("a", "href", r'([^"]+)', after="prev"))
|
||||
help = 'Index format: yyyy/mm/dd/nnn (unpadded)'
|
||||
|
||||
def shouldSkipUrl(self, url):
|
||||
def shouldSkipUrl(self, url, data):
|
||||
return url in (
|
||||
self.stripUrl % "2013/04/02/istruzioni-per-il-non-uso", # video
|
||||
)
|
||||
|
|
|
@ -113,7 +113,7 @@ class FonFlatter(_BasicScraper):
|
|||
prevSearch = compile(tagre("a", "href", r'(%s[^"]+)' % rurl, after="prev"))
|
||||
help = 'Index format: yyyy/mm/dd/number-stripname'
|
||||
|
||||
def shouldSkipUrl(self, url):
|
||||
def shouldSkipUrl(self, url, data):
|
||||
return url in (
|
||||
self.stripUrl % "2006/11/30/adventskalender",
|
||||
self.stripUrl % "2006/09/21/danke",
|
||||
|
|
|
@ -120,7 +120,7 @@ class PHDComics(_BasicScraper):
|
|||
tagre("img", "src", r'(?:comics/)?images/prev_button\.gif', quote=""))
|
||||
help = 'Index format: number'
|
||||
|
||||
def shouldSkipUrl(self, url):
|
||||
def shouldSkipUrl(self, url, data):
|
||||
"""Skip pages without images."""
|
||||
return url in (
|
||||
self.stripUrl % '1669', # video
|
||||
|
|
|
@ -305,7 +305,7 @@ class SMBC(_BasicScraper):
|
|||
prevSearch = compile(tagre("a", "href", r'([^"]+)#comic', after="backRollover"))
|
||||
help = 'Index format: nnnn'
|
||||
|
||||
def shouldSkipUrl(self, url):
|
||||
def shouldSkipUrl(self, url, data):
|
||||
"""Skip promo or missing update pages."""
|
||||
return url in (
|
||||
self.stripUrl % '2865',
|
||||
|
@ -347,7 +347,7 @@ class SnowFlakes(_BasicScraper):
|
|||
ext = imageUrl.rsplit('.', 1)[1]
|
||||
return "SnowFlakes-%d.%s" % (index, ext)
|
||||
|
||||
def shouldSkipUrl(self, url):
|
||||
def shouldSkipUrl(self, url, data):
|
||||
"""Skip pages without images."""
|
||||
return url in (
|
||||
self.stripUrl % ('279', '2'), # no comic
|
||||
|
@ -526,7 +526,7 @@ class StuffNoOneToldMe(_BasicScraper):
|
|||
parts, imagename = imageUrl.rsplit('/', 1)
|
||||
return '%s-%s-%s-%s' % (year, month, stripname, imagename)
|
||||
|
||||
def shouldSkipUrl(self, url):
|
||||
def shouldSkipUrl(self, url, data):
|
||||
"""Skip pages without images."""
|
||||
return url in (
|
||||
self.stripUrl % '2012/08/self-rant', # no comic
|
||||
|
|
|
@ -158,7 +158,7 @@ class ToonHole(_BasicScraper):
|
|||
prevSearch = compile(tagre("a", "href", r'(%s\d+/\d+/[^"]+)' % rurl, after="prev"))
|
||||
help = 'Index format: yyyy/mm/stripname'
|
||||
|
||||
def shouldSkipUrl(self, url):
|
||||
def shouldSkipUrl(self, url, data):
|
||||
return url in (self.stripUrl % "2013/03/if-game-of-thrones-was-animated",)
|
||||
|
||||
|
||||
|
|
|
@ -54,7 +54,7 @@ class WebDesignerCOTW(_BasicScraper):
|
|||
help = 'Index format: yyyy/mm/stripname'
|
||||
description = u"The content revolves around web design, blogging and funny situations that we encounter in our daily lives as designers and this week we focus on Christmas. These great cartoons are created by Jerry King, an award-winning cartoonist who’s one of the most published, prolific and versatile cartoonists in the world today."
|
||||
|
||||
def shouldSkipUrl(self, url):
|
||||
def shouldSkipUrl(self, url, data):
|
||||
"""Skip non-comic URLs."""
|
||||
return 'comics-of-the-week' not in url
|
||||
|
||||
|
|
|
@ -80,7 +80,7 @@ class Zwarwald(_BasicScraper):
|
|||
help = 'Index format: number'
|
||||
waitSeconds = 1
|
||||
|
||||
def shouldSkipUrl(self, url):
|
||||
def shouldSkipUrl(self, url, data):
|
||||
"""Some pages have flash content."""
|
||||
return url in (
|
||||
self.stripUrl % "112",
|
||||
|
|
|
@ -102,7 +102,7 @@ class _BasicScraper(object):
|
|||
"""Get hash value from name and index list."""
|
||||
return hash((self.getName(), self.indexes))
|
||||
|
||||
def shouldSkipUrl(self, url):
|
||||
def shouldSkipUrl(self, url, data):
|
||||
"""Determine if search for images in given URL should be skipped."""
|
||||
return False
|
||||
|
||||
|
@ -163,7 +163,7 @@ class _BasicScraper(object):
|
|||
while url:
|
||||
out.info(u'Get strip URL %s' % url, level=1)
|
||||
data, baseUrl = getPageContent(url, self.session)
|
||||
if self.shouldSkipUrl(url):
|
||||
if self.shouldSkipUrl(url, data):
|
||||
out.info(u'Skipping URL %s' % url)
|
||||
self.skippedUrls.add(url)
|
||||
else:
|
||||
|
|
Loading…
Reference in a new issue