Provide page data in shouldSkipUrl() function
parent 73e1af7aba
commit 875e431edc
9 changed files with 13 additions and 13 deletions
@@ -223,7 +223,7 @@ class AmazingSuperPowers(_BasicScraper):
     prevSearch = compile(tagre("a", "href", r'(%s[^"]+)' % rurl, after="prev"))
     help = 'Index format: yyyy/mm/name'

-    def shouldSkipUrl(self, url):
+    def shouldSkipUrl(self, url, data):
         """Skip pages without images."""
         return url in (
             # video
@@ -326,7 +326,7 @@ class AstronomyPOTD(_BasicScraper):
     prevSearch = compile(tagre("a", "href", r'(ap\d{6}\.html)') + "<</a>")
     help = 'Index format: yymmdd'

-    def shouldSkipUrl(self, url):
+    def shouldSkipUrl(self, url, data):
         """Skip pages without images."""
         return url in (
             self.stripUrl % '130217', # video
@@ -94,7 +94,7 @@ class Eriadan(_BasicScraper):
     prevSearch = compile(tagre("a", "href", r'([^"]+)', after="prev"))
     help = 'Index format: yyyy/mm/dd/nnn (unpadded)'

-    def shouldSkipUrl(self, url):
+    def shouldSkipUrl(self, url, data):
         return url in (
             self.stripUrl % "2013/04/02/istruzioni-per-il-non-uso", # video
         )
@@ -113,7 +113,7 @@ class FonFlatter(_BasicScraper):
     prevSearch = compile(tagre("a", "href", r'(%s[^"]+)' % rurl, after="prev"))
     help = 'Index format: yyyy/mm/dd/number-stripname'

-    def shouldSkipUrl(self, url):
+    def shouldSkipUrl(self, url, data):
         return url in (
             self.stripUrl % "2006/11/30/adventskalender",
             self.stripUrl % "2006/09/21/danke",
@@ -120,7 +120,7 @@ class PHDComics(_BasicScraper):
         tagre("img", "src", r'(?:comics/)?images/prev_button\.gif', quote=""))
     help = 'Index format: number'

-    def shouldSkipUrl(self, url):
+    def shouldSkipUrl(self, url, data):
         """Skip pages without images."""
         return url in (
             self.stripUrl % '1669', # video
@@ -305,7 +305,7 @@ class SMBC(_BasicScraper):
     prevSearch = compile(tagre("a", "href", r'([^"]+)#comic', after="backRollover"))
     help = 'Index format: nnnn'

-    def shouldSkipUrl(self, url):
+    def shouldSkipUrl(self, url, data):
         """Skip promo or missing update pages."""
         return url in (
             self.stripUrl % '2865',
@@ -347,7 +347,7 @@ class SnowFlakes(_BasicScraper):
         ext = imageUrl.rsplit('.', 1)[1]
         return "SnowFlakes-%d.%s" % (index, ext)

-    def shouldSkipUrl(self, url):
+    def shouldSkipUrl(self, url, data):
         """Skip pages without images."""
         return url in (
             self.stripUrl % ('279', '2'), # no comic
@@ -526,7 +526,7 @@ class StuffNoOneToldMe(_BasicScraper):
         parts, imagename = imageUrl.rsplit('/', 1)
         return '%s-%s-%s-%s' % (year, month, stripname, imagename)

-    def shouldSkipUrl(self, url):
+    def shouldSkipUrl(self, url, data):
         """Skip pages without images."""
         return url in (
             self.stripUrl % '2012/08/self-rant', # no comic
@@ -158,7 +158,7 @@ class ToonHole(_BasicScraper):
     prevSearch = compile(tagre("a", "href", r'(%s\d+/\d+/[^"]+)' % rurl, after="prev"))
     help = 'Index format: yyyy/mm/stripname'

-    def shouldSkipUrl(self, url):
+    def shouldSkipUrl(self, url, data):
         return url in (self.stripUrl % "2013/03/if-game-of-thrones-was-animated",)


@@ -54,7 +54,7 @@ class WebDesignerCOTW(_BasicScraper):
     help = 'Index format: yyyy/mm/stripname'
     description = u"The content revolves around web design, blogging and funny situations that we encounter in our daily lives as designers and this week we focus on Christmas. These great cartoons are created by Jerry King, an award-winning cartoonist who’s one of the most published, prolific and versatile cartoonists in the world today."

-    def shouldSkipUrl(self, url):
+    def shouldSkipUrl(self, url, data):
         """Skip non-comic URLs."""
         return 'comics-of-the-week' not in url
@@ -80,7 +80,7 @@ class Zwarwald(_BasicScraper):
     help = 'Index format: number'
     waitSeconds = 1

-    def shouldSkipUrl(self, url):
+    def shouldSkipUrl(self, url, data):
         """Some pages have flash content."""
         return url in (
             self.stripUrl % "112",
@@ -102,7 +102,7 @@ class _BasicScraper(object):
         """Get hash value from name and index list."""
         return hash((self.getName(), self.indexes))

-    def shouldSkipUrl(self, url):
+    def shouldSkipUrl(self, url, data):
         """Determine if search for images in given URL should be skipped."""
         return False

@@ -163,7 +163,7 @@ class _BasicScraper(object):
         while url:
             out.info(u'Get strip URL %s' % url, level=1)
             data, baseUrl = getPageContent(url, self.session)
-            if self.shouldSkipUrl(url):
+            if self.shouldSkipUrl(url, data):
                 out.info(u'Skipping URL %s' % url)
                 self.skippedUrls.add(url)
             else:
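With this change a shouldSkipUrl() override can inspect the page content that getStrips() has already fetched, instead of relying only on a hard-coded list of URLs. A minimal sketch of such an override follows; the class name, URL, and marker string are hypothetical and not part of this commit:

    from dosagelib.scraper import _BasicScraper  # module path as in the dosage source tree

    class _HypotheticalComic(_BasicScraper):
        # Illustrative plugin only; the URL and marker text are made up.
        url = 'http://example.com/comic/'
        stripUrl = url + '%s'

        def shouldSkipUrl(self, url, data):
            """Skip pages whose already-fetched HTML marks them as video posts."""
            # 'data' is the page content returned by getPageContent() in
            # getStrips(), so no extra request is needed to inspect the page.
            return 'class="video-post"' in data

The plugins touched by this commit keep their URL-based checks and simply accept the additional parameter without using it.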