Fix AbstruseGoose

This commit is contained in:
Tobias Gruetzmacher 2020-01-09 22:59:12 +01:00
parent 42b5aa9321
commit 3fe40326e0

View file

@@ -19,24 +19,21 @@ class AbbysAgency(_WordPressScraper):
firstStripUrl = stripUrl % 'a' firstStripUrl = stripUrl % 'a'
class AbstruseGoose(_BasicScraper): class AbstruseGoose(_ParserScraper):
url = 'http://abstrusegoose.com/' url = 'https://abstrusegoose.com/'
rurl = escape(url)
starter = bounceStarter starter = bounceStarter
stripUrl = url + '%s' stripUrl = url + '%s'
firstStripUrl = stripUrl % '1' firstStripUrl = stripUrl % '1'
imageSearch = compile(tagre('img', 'src', imageSearch = '//img[contains(@src, "/strips/")]'
r'(http://abstrusegoose\.com/strips/[^<>"]+)')) textSearch = imageSearch + '/@title'
prevSearch = compile(tagre('a', 'href', r'(%s\d+)' % rurl) + textOptional = True
r'&laquo; Previous') prevSearch = '//a[contains(text(), "Previous")]'
nextSearch = compile(tagre('a', 'href', r'(%s\d+)' % rurl) + nextSearch = '//a[contains(text(), "Next")]'
r'Next &raquo;')
help = 'Index format: n (unpadded)' help = 'Index format: n (unpadded)'
textSearch = compile(tagre("img", "title", r'([^"]+)'))
def namer(self, image_url, page_url): def namer(self, imageurl, pageurl):
index = int(page_url.rstrip('/').split('/')[-1]) index = int(pageurl.rsplit('/', 1)[1])
name = image_url.split('/')[-1].split('.')[0] name = imageurl.rsplit('/', 1)[1]
return 'c%03d-%s' % (index, name) return 'c%03d-%s' % (index, name)