Provide page data in shouldSkipUrl() function

Bastian Kleineidam 2014-02-10 21:58:09 +01:00
parent 73e1af7aba
commit 875e431edc
9 changed files with 13 additions and 13 deletions
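
shouldSkipUrl() now receives the page data that has already been fetched for the current URL, in addition to the URL itself, so a scraper can decide to skip a page based on its content instead of relying only on a hard-coded URL list. A minimal sketch of an overriding scraper, assuming data is the raw page text returned by getPageContent() as the last hunk below suggests; the class, URLs, and the "video-post" marker are illustrative and not part of this commit:

    # Illustrative sketch only -- assumes the plugin-style relative import
    # used by the scraper modules and a made-up page marker.
    from ..scraper import _BasicScraper

    class ExampleComic(_BasicScraper):
        url = 'http://example.com/'
        stripUrl = url + 'comic/%s'

        def shouldSkipUrl(self, url, data):
            """Skip pages whose content marks them as video-only posts."""
            return 'class="video-post"' in data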

@@ -223,7 +223,7 @@ class AmazingSuperPowers(_BasicScraper):
     prevSearch = compile(tagre("a", "href", r'(%s[^"]+)' % rurl, after="prev"))
     help = 'Index format: yyyy/mm/name'
-    def shouldSkipUrl(self, url):
+    def shouldSkipUrl(self, url, data):
         """Skip pages without images."""
         return url in (
             # video
@@ -326,7 +326,7 @@ class AstronomyPOTD(_BasicScraper):
     prevSearch = compile(tagre("a", "href", r'(ap\d{6}\.html)') + "&lt;</a>")
     help = 'Index format: yymmdd'
-    def shouldSkipUrl(self, url):
+    def shouldSkipUrl(self, url, data):
         """Skip pages without images."""
         return url in (
             self.stripUrl % '130217', # video

@@ -94,7 +94,7 @@ class Eriadan(_BasicScraper):
     prevSearch = compile(tagre("a", "href", r'([^"]+)', after="prev"))
     help = 'Index format: yyyy/mm/dd/nnn (unpadded)'
-    def shouldSkipUrl(self, url):
+    def shouldSkipUrl(self, url, data):
         return url in (
             self.stripUrl % "2013/04/02/istruzioni-per-il-non-uso", # video
         )

@@ -113,7 +113,7 @@ class FonFlatter(_BasicScraper):
     prevSearch = compile(tagre("a", "href", r'(%s[^"]+)' % rurl, after="prev"))
     help = 'Index format: yyyy/mm/dd/number-stripname'
-    def shouldSkipUrl(self, url):
+    def shouldSkipUrl(self, url, data):
         return url in (
             self.stripUrl % "2006/11/30/adventskalender",
             self.stripUrl % "2006/09/21/danke",

@@ -120,7 +120,7 @@ class PHDComics(_BasicScraper):
         tagre("img", "src", r'(?:comics/)?images/prev_button\.gif', quote=""))
     help = 'Index format: number'
-    def shouldSkipUrl(self, url):
+    def shouldSkipUrl(self, url, data):
         """Skip pages without images."""
         return url in (
             self.stripUrl % '1669', # video

@@ -305,7 +305,7 @@ class SMBC(_BasicScraper):
     prevSearch = compile(tagre("a", "href", r'([^"]+)#comic', after="backRollover"))
     help = 'Index format: nnnn'
-    def shouldSkipUrl(self, url):
+    def shouldSkipUrl(self, url, data):
         """Skip promo or missing update pages."""
         return url in (
             self.stripUrl % '2865',
@@ -347,7 +347,7 @@ class SnowFlakes(_BasicScraper):
         ext = imageUrl.rsplit('.', 1)[1]
         return "SnowFlakes-%d.%s" % (index, ext)
-    def shouldSkipUrl(self, url):
+    def shouldSkipUrl(self, url, data):
         """Skip pages without images."""
         return url in (
             self.stripUrl % ('279', '2'), # no comic
@@ -526,7 +526,7 @@ class StuffNoOneToldMe(_BasicScraper):
         parts, imagename = imageUrl.rsplit('/', 1)
         return '%s-%s-%s-%s' % (year, month, stripname, imagename)
-    def shouldSkipUrl(self, url):
+    def shouldSkipUrl(self, url, data):
         """Skip pages without images."""
         return url in (
             self.stripUrl % '2012/08/self-rant', # no comic

@@ -158,7 +158,7 @@ class ToonHole(_BasicScraper):
     prevSearch = compile(tagre("a", "href", r'(%s\d+/\d+/[^"]+)' % rurl, after="prev"))
     help = 'Index format: yyyy/mm/stripname'
-    def shouldSkipUrl(self, url):
+    def shouldSkipUrl(self, url, data):
         return url in (self.stripUrl % "2013/03/if-game-of-thrones-was-animated",)

@@ -54,7 +54,7 @@ class WebDesignerCOTW(_BasicScraper):
     help = 'Index format: yyyy/mm/stripname'
     description = u"The content revolves around web design, blogging and funny situations that we encounter in our daily lives as designers and this week we focus on Christmas. These great cartoons are created by Jerry King, an award-winning cartoonist whos one of the most published, prolific and versatile cartoonists in the world today."
-    def shouldSkipUrl(self, url):
+    def shouldSkipUrl(self, url, data):
         """Skip non-comic URLs."""
         return 'comics-of-the-week' not in url

@@ -80,7 +80,7 @@ class Zwarwald(_BasicScraper):
     help = 'Index format: number'
     waitSeconds = 1
-    def shouldSkipUrl(self, url):
+    def shouldSkipUrl(self, url, data):
         """Some pages have flash content."""
         return url in (
             self.stripUrl % "112",

@@ -102,7 +102,7 @@ class _BasicScraper(object):
         """Get hash value from name and index list."""
         return hash((self.getName(), self.indexes))
-    def shouldSkipUrl(self, url):
+    def shouldSkipUrl(self, url, data):
         """Determine if search for images in given URL should be skipped."""
         return False
@@ -163,7 +163,7 @@ class _BasicScraper(object):
         while url:
             out.info(u'Get strip URL %s' % url, level=1)
             data, baseUrl = getPageContent(url, self.session)
-            if self.shouldSkipUrl(url):
+            if self.shouldSkipUrl(url, data):
                 out.info(u'Skipping URL %s' % url)
                 self.skippedUrls.add(url)
             else: