Fix AbstruseGoose

2020-01-09 22:59:12 +01:00 · 2020-01-09 22:59:12 +01:00 · 3fe40326e0
commit 3fe40326e0
parent 42b5aa9321
1 changed file with 10 additions and 13 deletions
--- a/dosagelib/plugins/a.py
+++ b/dosagelib/plugins/a.py
@@ -19,24 +19,21 @@ class AbbysAgency(_WordPressScraper):
    firstStripUrl = stripUrl % 'a'
-class AbstruseGoose(_BasicScraper):
+class AbstruseGoose(_ParserScraper):
-    url = 'http://abstrusegoose.com/'
+    url = 'https://abstrusegoose.com/'
-    rurl = escape(url)
    starter = bounceStarter
    stripUrl = url + '%s'
    firstStripUrl = stripUrl % '1'
-    imageSearch = compile(tagre('img', 'src',
+    imageSearch = '//img[contains(@src, "/strips/")]'
-                                r'(http://abstrusegoose\.com/strips/[^<>"]+)'))
+    textSearch = imageSearch + '/@title'
-    prevSearch = compile(tagre('a', 'href', r'(%s\d+)' % rurl) +
+    textOptional = True
-                         r'&laquo; Previous')
+    prevSearch = '//a[contains(text(), "Previous")]'
-    nextSearch = compile(tagre('a', 'href', r'(%s\d+)' % rurl) +
+    nextSearch = '//a[contains(text(), "Next")]'
-                         r'Next &raquo;')
    help = 'Index format: n (unpadded)'
-    textSearch = compile(tagre("img", "title", r'([^"]+)'))
-    def namer(self, image_url, page_url):
+    def namer(self, imageurl, pageurl):
-        index = int(page_url.rstrip('/').split('/')[-1])
+        index = int(pageurl.rsplit('/', 1)[1])
-        name = image_url.split('/')[-1].split('.')[0]
+        name = imageurl.rsplit('/', 1)[1]
        return 'c%03d-%s' % (index, name)