Fix some comic modules.

2016-05-05 20:55:14 +02:00 · 2016-05-05 20:55:14 +02:00 · 77ed0218e0
commit 77ed0218e0
parent bb2ac39639
5 changed files with 101 additions and 146 deletions
--- a/dosagelib/plugins/n.py
+++ b/dosagelib/plugins/n.py
@@ -7,7 +7,7 @@ from __future__ import absolute_import, division, print_function

 from re import compile, escape

-from ..scraper import _BasicScraper
+from ..scraper import _BasicScraper, _ParserScraper
 from ..helpers import indirectStarter
 from ..util import tagre
 from .common import (_ComicControlScraper, _WordPressScraper, WP_LATEST_SEARCH,
@@ -18,13 +18,14 @@ class Namesake(_ComicControlScraper):
    url = 'http://namesakecomic.com/'


-class NamirDeiter(_BasicScraper):
-    url = 'http://www.namirdeiter.com/'
-    rurl = escape(url)
-    stripUrl = url + 'comics/index.php?date=%s'
-    firstStripUrl = stripUrl % '19991128'
-    imageSearch = compile(tagre("img", "src", r"'?(%scomics/\d+\.jpg)'?" % rurl, quote=""))
-    prevSearch = compile(tagre("a", "href", r'(%scomics/index\.php\?date=\d+)' % rurl, quote="'") + "Previous")
+class NamirDeiter(_ParserScraper):
+    baseUrl = 'http://www.namirdeiter.com/comics/'
+    stripUrl = baseUrl + 'index.php?date=%s'
+    url = stripUrl % '20150410'
+    firstStripUrl = baseUrl
+    imageSearch = '//a/img'
+    prevSearch = '//a[text()="Previous"]'
+    endOfLife = True
    help = 'Index format: yyyymmdd'


@@ -89,8 +90,8 @@ class Nicky510(_WordPressScraper):

 class Nimona(_BasicScraper):
    url = 'http://gingerhaze.com/nimona/'
-    stripUrl = url + '%s/'
-    firstStripUrl = stripUrl % "comic/page-1"
+    stripUrl = url + 'comic/%s'
+    firstStripUrl = stripUrl % "page-1"
    imageSearch = compile(tagre("img", "src", r'(http://gingerhaze\.com/sites/default/files/nimona-pages/.+?)'))
    prevSearch = compile(r'<a href="(/nimona/comic/[^"]+)"><img src="http://gingerhaze\.com/sites/default/files/comicdrop/comicdrop_prev_label_file\.png"')
    help = 'Index format: stripname'
@@ -115,31 +116,21 @@ class NoMoreSavePoints(_WordPressScraper):
    starter = indirectStarter


-class NoNeedForBushido(_BasicScraper):
+class NoNeedForBushido(_ParserScraper):
    url = 'http://nn4b.com/'
-    rurl = escape(url)
-    stripUrl = url + '?webcomic1=%s'
-    imageSearch = compile(
-      tagre("a", "rel", "next") +
-      tagre("img", "src", r'(%swp-content/uploads/\d+/\d+/[^"]+)' % rurl,
-            after="attachment-full"))
-    prevSearch = compile(tagre("a", "href", r'(%s\?webcomic1=[^"]+)' % rurl,
-                               after="previous-webcomic"))
-    latestSearch = compile(tagre("a", "href", r'(%s\?webcomic1=[^"]+)' % rurl,
-                                 after="last-webcomic"))
+    stripUrl = url + 'comic/%s'
+    imageSearch = '//div[@id="comic-image"]//img'
+    prevSearch = '//a[@rel="prev"]'
    help = 'Index format: nnn'
-    starter = indirectStarter


-class NotInventedHere(_BasicScraper):
+class NotInventedHere(_ParserScraper):
    url = 'http://notinventedhe.re/'
-    rurl = escape(url)
-    stripUrl = url + '%s/'
-    firstStripUrl = stripUrl % 'on/2009-9-21'
-    imageSearch = compile(tagre("img", "src", r'(http://thiswas.notinventedhe.re/on/\d+-\d+-\d+)'))
-    prevSearch = compile(tagre("a", "href", r'(/on/\d+-\d+-\d+)') +
-                         '\s*Previous')
-    help = 'Index format: yyyy-mm-dd'
+    stripUrl = url + 'on/%s'
+    firstStripUrl = stripUrl % '2009-9-21'
+    imageSearch = '//div[@id="comic-content"]//img'
+    prevSearch = '//a[@id="nav-previous"]'
+    help = 'Index format: yyyy-m-d'


 class Nukees(_BasicScraper):
--- a/dosagelib/plugins/o.py
+++ b/dosagelib/plugins/o.py
@@ -56,8 +56,8 @@ class OkCancel(_BasicScraper):


 class OmakeTheater(_ParserScraper):
-    url = 'http://omaketheater.com/comics/'
-    stripUrl = url + '%s/'
+    url = 'http://omaketheater.com/comic/'
+    stripUrl = url + '%s'
    firstStripUrl = stripUrl % '1'
    css = True
    imageSearch = ".comicImage img"
--- a/dosagelib/plugins/p.py
+++ b/dosagelib/plugins/p.py
@@ -76,28 +76,16 @@ class PennyAndAggie(_BasicScraper):
    help = 'Index format: n (unpadded)'


-class PennyArcade(_BasicScraper):
-    url = 'http://penny-arcade.com/comic/'
+class PennyArcade(_ParserScraper):
+    url = 'http://www.penny-arcade.com/comic/'
    rurl = escape(url)
    stripUrl = url + '%s'
    firstStripUrl = stripUrl % '1998/11/18'
-    imageSearch = compile(tagre("img", "src", r'(http://art\.penny-arcade\.com/photos/[^"]+)'))
-    prevSearch = compile(tagre("a", "href", r'(%s[^"]+)' % rurl,
-                               before="btnPrev"))
-    nextSearch = compile(tagre("a", "href", r'(%s[^"]+)' % rurl,
-                               before="btnNext"))
+    imageSearch = '//div[@id="comicFrame"]//img'
+    prevSearch = '//a[%s]' % xpath_class('btnPrev')
+    nextSearch = '//a[%s]' % xpath_class('btnNext')
    starter = bounceStarter
-    help = 'Index format: yyyy/mm/dd/'
-
-    def prevUrlModifier(self, prev_url):
-        if prev_url:
-            dummy, yyyy, mm, dd = prev_url.rsplit('/', 3)
-            try:
-                int(dd)
-            except ValueError:
-                # URL has form yyyy/mm/dd/stripname
-                prev_url = "%s/%s/%s" % (dummy, yyyy, mm)
-            return prev_url
+    help = 'Index format: yyyy/mm/dd'

    def namer(self, image_url, page_url):
        p = page_url.split('/')
@@ -116,17 +104,15 @@ class PeppermintSaga(_BasicScraper):
    adult = True


-class PHDComics(_BasicScraper):
+class PHDComics(_ParserScraper):
    baseUrl = 'http://phdcomics.com/'
    url = baseUrl + 'comics.php'
    stripUrl = baseUrl + 'comics/archive.php?comicid=%s'
    firstStripUrl = stripUrl % '1'
-    imageSearch = compile(tagre("img", "src", r'(http://www\.phdcomics\.com/comics/archive/phd[^ ]+)', quote=""))
-    prevSearch = compile(
-        tagre("a", "href", r'((?:comics/)?archive\.php\?comicid=\d+)',
-              quote="") +
-        tagre("img", "src", r'(?:comics/)?images/prev_button\.gif', quote=""))
-    help = 'Index format: number'
+    imageSearch = '//img[@id="comic"]'
+    prevSearch = '//a[img[contains(@src, "prev_button")]]'
+    nextSearch = '//a[img[contains(@src, "next_button")]]'
+    help = 'Index format: n (unpadded)'

    def shouldSkipUrl(self, url, data):
        """Skip pages without images."""
@@ -150,15 +136,9 @@ class PicPakDog(_BasicScraper):
    help = 'Index format: stripname'


-class PiledHigherAndDeeper(_BasicScraper):
-    url = 'http://www.phdcomics.com/comics.php'
+# Keep, because naming is different to PHDComics...
+class PiledHigherAndDeeper(PHDComics):
    starter = bounceStarter
-    stripUrl = url + '?comicid=%s'
-    firstStripUrl = stripUrl % '1'
-    imageSearch = compile(tagre("img", "src", r'(http://www\.phdcomics\.com/comics/archive/phd\d+s\d?\.\w{3,4})', quote=""))
-    prevSearch = compile(r'<a href=((comics/)?archive\.php\?comicid=\d+)>.*<img [^>]*prev_button\.gif')
-    nextSearch = compile(r'<a href=(archive\.php\?comicid=\d+)>.*<img [^>]*next_button\.gif')
-    help = 'Index format: n (unpadded)'
    namer = queryNamer('comicid', use_page_url=True)


@@ -232,11 +212,9 @@ class Precocious(_BasicScraper):

 class PS238(_ParserScraper):
    url = 'http://ps238.nodwick.com/'
-    stripUrl = url + '/comic/%s/'
-    starter = bounceStarter
+    stripUrl = url + 'comic/%s/'
    imageSearch = '//div[@id="comic"]//img'
    prevSearch = '//a[@class="comic-nav-base comic-nav-previous"]'
-    nextSearch = '//a[@class="comic-nav-base comic-nav-next"]'
    help = 'Index format: yyyy-mm-dd'


--- a/dosagelib/plugins/r.py
+++ b/dosagelib/plugins/r.py
@@ -5,11 +5,16 @@

 from __future__ import absolute_import, division, print_function

-from re import compile, escape
+from re import compile
+try:
+    from urllib.parse import urljoin
+except ImportError:
+    from urlparse import urljoin

 from ..scraper import _BasicScraper, _ParserScraper
-from ..helpers import bounceStarter
+from ..helpers import indirectStarter
 from ..util import tagre
+from .common import _WordPressScraper, xpath_class


 class RadioactivePanda(_BasicScraper):
@@ -20,23 +25,23 @@ class RadioactivePanda(_BasicScraper):
    help = 'Index format: n (no padding)'


-class RalfTheDestroyer(_ParserScraper):
+class RalfTheDestroyer(_WordPressScraper):
    url = 'http://ralfthedestroyer.com/'
-    stripUrl = url + '%s/'
-    css = True
-    imageSearch = '#comic-1 > a:first-child img'
-    prevSearch = 'td.comic_navi_left > a:nth-of-type(2)'
-    help = 'Index format: stripname'


-class RealLife(_BasicScraper):
+class RealLife(_WordPressScraper):
    url = 'http://reallifecomics.com/'
-    rurl = escape(url)
    stripUrl = url + 'comic.php?comic=%s'
-    firstStripUrl = stripUrl % '991115'
-    imageSearch = compile(tagre("img", "src", r'(%swp-content/uploads/\d+/\d+/[^"]+)' % rurl))
-    prevSearch = compile(tagre("a", "href", r'((?:%s)?comic\.php\?comic=[^"]+)' % rurl, after="nav-previous"))
-    help = 'Index format: monthname-dd-yyyy)'
+    firstStripUrl = stripUrl % 'title-1'
+    help = 'Index format: monthname-dd-yyyy'
+
+    def getPrevUrl(self, url, data):
+        # "Parse" JavaScript
+        prevtag = data.find_class('comic-nav-previous')
+        if not prevtag:
+            return None
+        target = prevtag[0].get('onclick').split("'")[1]
+        return urljoin(url, target)


 class RealmOfAtland(_BasicScraper):
@@ -48,26 +53,14 @@ class RealmOfAtland(_BasicScraper):
    help = 'Index format: nnn'


-class RedMeat(_BasicScraper):
-    baseUrl = 'http://www.redmeat.com/redmeat/'
-    url = baseUrl + 'current/index.html'
-    starter = bounceStarter
-    stripUrl = baseUrl + '%s/index.html'
-    firstStripUrl = stripUrl % '1996-06-10'
-    imageSearch = compile(tagre("img", "src", r'(http://www\.redmeat\.com/imager/b/redmeat/[^"]*\.png)'))
-    prevSearch = compile(tagre("a", "href", r'(http://www\.redmeat\.com/[^"]*)', after="prev"))
-    nextSearch = compile(tagre("a", "href", r'(http://www\.redmeat\.com/[^"]*)', after="next"))
-    help = 'Index format: yyyy-mm-dd'
+class RedMeat(_ParserScraper):
+    url = 'http://www.redmeat.com/max-cannon/FreshMeat'
+    imageSearch = '//div[@class="comicStrip"]//img'
+    prevSearch = '//a[@class="prev"]'

-
-class RedsPlanet(_BasicScraper):
-    url = 'http://www.redsplanet.com/comic/'
-    rurl = escape(url)
-    stripUrl = url + 'rp/%s/'
-    firstStripUrl = stripUrl % 'pro/prologue-01'
-    imageSearch = compile(tagre("img", "src", r'(%scomics/\d+-\d+-\d+_[^"/]+)' % rurl))
-    prevSearch = compile(tagre("a", "href", r'(%srp/[^"/]+/[^"/]+/)' % rurl))
-    help = 'Index format: chapter/stripname'
+    def namer(self, image_url, page_url):
+        parts = image_url.rsplit('/', 2)
+        return '_'.join(parts[1:3])


 class RedString(_BasicScraper):
@@ -79,30 +72,30 @@ class RedString(_BasicScraper):
    help = 'Index format: nnn'


-class RomanticallyApocalyptic(_BasicScraper):
+class RomanticallyApocalyptic(_ParserScraper):
    url = 'http://romanticallyapocalyptic.com/'
-    rurl = escape(url)
-    stripUrl = url + '%s/'
-    firstStripUrl = stripUrl % '1'
-    imageSearch = compile(tagre("img", "src", r'(%sart/\d+[^"]+)' % rurl))
-    prevSearch = compile(tagre("a", "href", r'(%s\d+[^"]+)' % rurl) + "\s*" +
-                         tagre('span', 'class', 'spritePrevious'))
+    stripUrl = url + '%s'
+    firstStripUrl = stripUrl % '0'
+    imageSearch = '//div[%s]/center//img' % xpath_class('comicpanel')
+    prevSearch = '//a[@accesskey="p"]'
+    latestSearch = '//a[span[%s]]' % xpath_class('glyphicon-fast-forward')
+    starter = indirectStarter
    help = 'Index format: n'
    adult = True


-class Roza(_BasicScraper):
+class Roza(_ParserScraper):
    url = 'http://www.junglestudio.com/roza/index.php'
    stripUrl = url + '?date=%s'
    firstStripUrl = stripUrl % '2007-05-01'
-    imageSearch = compile(r'<img src="(pages/.+?)"')
-    prevSearch = compile(r'<a href="(index.php\?date=.+?)">[^>].+?navtable_01.gif')
+    imageSearch = '//img[contains(@src, "pages/")]'
+    prevSearch = '//a[img[contains(@src, "navtable_01.gif")]]'
    help = 'Index format: yyyy-mm-dd'


 class Ruthe(_BasicScraper):
    url = 'http://ruthe.de/'
-    stripUrl = url + 'cartoon/%s/datum/ASC'
+    stripUrl = url + 'cartoon/%s/datum/asc/'
    firstStripUrl = stripUrl % '1'
    lang = 'de'
    imageSearch = compile(tagre("img", "src", r'(/?cartoons/strip_\d+[^"]+)'))
--- a/dosagelib/plugins/s.py
+++ b/dosagelib/plugins/s.py
@@ -46,13 +46,13 @@ class SailorsunOrg(_WordPressScraper):
    url = 'http://sailorsun.org/'


-class SamAndFuzzy(_BasicScraper):
+class SamAndFuzzy(_ParserScraper):
    url = 'http://www.samandfuzzy.com/'
-    stripUrl = 'http://samandfuzzy.com/%s'
+    stripUrl = url + '%s'
    firstStripUrl = stripUrl % '1'
-    imageSearch = compile(r'(/comics/.+?)" alt')
-    prevSearch = compile(r'"><a href="(.+?)"><img src="imgint/nav_prev.gif"')
-    help = 'Index format: nnnn'
+    imageSearch = '//img[@class="comic-image"]'
+    prevSearch = '//li[@class="prev-page"]/a'
+    help = 'Index format: n (unpadded)'


 class SandraOnTheRocks(_BasicScraper):
@@ -78,7 +78,7 @@ class ScandinaviaAndTheWorld(_ParserScraper):

 class ScaryGoRound(_ParserScraper):
    url = 'http://www.scarygoround.com/sgr/ar.php'
-    stripUrl = url + 'ar.php?date=%s'
+    stripUrl = url + '?date=%s'
    firstStripUrl = stripUrl % '20020604'
    imageSearch = '//img[contains(@src, "/strips/")]'
    prevSearch = '//a[contains(text(), "Previous")]'
@@ -104,14 +104,13 @@ class ScenesFromAMultiverse(_BasicScraper):
    help = 'Index format: yyyy/mm/dd/stripname'


-class SchlockMercenary(_BasicScraper):
+class SchlockMercenary(_ParserScraper):
    url = 'http://www.schlockmercenary.com/'
    stripUrl = url + '%s'
    firstStripUrl = stripUrl % '2000-06-12'
-    imageSearch = compile(tagre("img", "src", r'(http://static\.schlockmercenary\.com/comics/[^"]+)'))
+    imageSearch = '//div[@class="strip-image-wrapper"]/img'
    multipleImagesPerStrip = True
-    prevSearch = compile(tagre("a", "href", r'(/\d+-\d+-\d+)', quote="'",
-                               after="nav-previous"))
+    prevSearch = '//a[@class="previous-strip"]'
    help = 'Index format: yyyy-mm-dd'


@@ -267,8 +266,10 @@ class SinFest(_BasicScraper):
    help = 'Index format: yyyy-mm-dd'


-class Sithrah(_WordPressScraper):
+class Sithrah(_ParserScraper):
    url = 'http://sithrah.com/'
+    imageSearch = '//div[@class="webcomic-image"]/img'
+    prevSearch = '//a[%s]' % xpath_class('previous-webcomic-link')


 class SkinDeep(_BasicScraper):
@@ -284,8 +285,9 @@ class SleeplessDomain(_ComicControlScraper):
    url = 'http://www.sleeplessdomain.com/'


-class SlightlyDamned(_WordPressScraper):
+class SlightlyDamned(_ComicControlScraper):
    url = 'http://www.sdamned.com/'
+    firstStripUrl = url + 'comic/part-one-to-hell-and-back'


 class SluggyFreelance(_BasicScraper):
@@ -299,8 +301,7 @@ class SluggyFreelance(_BasicScraper):

 class SMBC(_ParserScraper):
    url = 'http://www.smbc-comics.com/'
-    rurl = escape(url)
-    stripUrl = url + '?id=%s'
+    stripUrl = url + 'index.php?id=%s'
    firstStripUrl = stripUrl % '1'
    multipleImagesPerStrip = True
    imageSearch = ['//img[@id="comic"]', '//div[@id="aftercomic"]/img']
@@ -363,14 +364,15 @@ class SomethingPositive(_BasicScraper):
    help = 'Index format: mmddyyyy'


-class Sorcery101(_BasicScraper):
-    baseUrl = 'http://www.sorcery101.net/'
-    url = baseUrl + 'sorcery-101/'
-    rurl = escape(baseUrl)
-    stripUrl = url + '%s/'
-    imageSearch = compile(tagre("img", "src", r'(%swp-content/uploads/\d+/\d+/[^"]+)' % rurl))
-    prevSearch = compile(tagre("a", "href", r'(%ssorcery-101/[^"]+)' % rurl,
-                               after="previous-"))
+class Sorcery101(_ParserScraper):
+    baseUrl = 'http://www.sorcery101.net/sorcery-101/'
+    stripUrl = baseUrl + '%s/'
+    url = stripUrl % 'sorcery101-ch-01'
+    firstStripUrl = url
+    imageSearch = '//div[@class="webcomic-image"]/img'
+    prevSearch = '//a[@rel="prev"]'
+    latestSearch = '//a[%s]' % xpath_class('last-webcomic-link')
+    starter = indirectStarter
    help = 'Index format: stripname'


@@ -423,7 +425,7 @@ class StandStillStaySilent(_ParserScraper):


 class StarCrossdDestiny(_ParserScraper):
-    baseUrl = 'http://www.starcrossd.net/'
+    baseUrl = 'http://starcrossd.net/'
    url = baseUrl + 'comic.html'
    stripUrl = baseUrl + 'archives/%s.html'
    firstStripUrl = stripUrl % '00000001'
@@ -445,10 +447,10 @@ class StarCrossdDestiny(_ParserScraper):

 class StationV3(_ParserScraper):
    url = 'http://www.stationv3.com/'
-    stripUrl = url + 'd/%s.html'
+    stripUrl = url + 'd2/%s.html'
+    firstStripUrl = stripUrl % '20150628'
    imageSearch = '//img[contains(@src,"/comics2/")]'
    prevSearch = '//a[img[contains(@src,"/previous2")]]'
-
    help = 'Index format: yyyymmdd'


@@ -462,15 +464,6 @@ class StickyDillyBuns(_BasicScraper):
    help = 'Index format: name'


-class StrawberryDeathCake(_BasicScraper):
-    url = 'http://strawberrydeathcake.com/'
-    rurl = escape(url)
-    imageSearch = compile(tagre("img", "src",
-                                r'(%swp-content/webcomic/[^"]+)' % rurl))
-    prevSearch = compile(tagre("a", "href", r'(%sarchive/[^"]+)' % rurl,
-                               after="previous"))
-
-
 class StreetFighter(_ComicControlScraper):
    url = 'http://www.streetfightercomics.com'