Various comics are fixed.

Bastian Kleineidam 2012-12-13 21:05:27 +01:00
parent de1b80fa4d
commit 5f9e5ae3ca
18 changed files with 2857 additions and 170 deletions

File diff suppressed because it is too large.


@@ -226,14 +226,6 @@ class Angband(_BasicScraper):
     help = 'Index format: yyyy-mm-dd'
-class ActionAthena(_BasicScraper):
-    latestUrl = 'http://actionathena.com/'
-    stripUrl = latestUrl + '2%s'
-    imageSearch = compile(r'<img src=\'(http://actionathena.com/comics/.+?)\'>')
-    prevSearch = compile(r'<a href="(http://actionathena.com/.+?)">&laquo; Previous</a>')
-    help = 'Index format: yyyy/mm/dd/strip-name'
 class AlsoBagels(_BasicScraper):
     latestUrl = 'http://alsobagels.com/'
     stripUrl = latestUrl + 'index.php/comic/%s/'
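For orientation: each scraper's stripUrl is a template that is filled with the index named
in help via %-formatting. A minimal illustration (the index value is made up):

    # Illustration of the stripUrl template pattern; the index value is made up.
    latestUrl = 'http://alsobagels.com/'
    stripUrl = latestUrl + 'index.php/comic/%s/'
    print(stripUrl % 'example-strip')
    # http://alsobagels.com/index.php/comic/example-strip/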


@@ -181,7 +181,7 @@ class CatAndGirl(_BasicScraper):
 class CyanideAndHappiness(_BasicScraper):
     latestUrl = 'http://www.explosm.net/comics/'
     stripUrl = latestUrl + '%s/'
-    imageSearch = compile(tagre("img", "src", r'(http:\/\/www\.explosm\.net/db/files/Comics/[^"]+)'))
+    imageSearch = compile(tagre("img", "src", r'(http://(?:www\.)?explosm\.net/db/files/Comics/[^"]+)'))
     prevSearch = compile(tagre("a", "href", r'(/comics/\d+/)', before="prev"))
     help = 'Index format: n (unpadded)'

@@ -234,14 +234,6 @@ class Chester5000XYV(_BasicScraper):
     help = 'Index format: nnn'
-class CalamitiesOfNature(_BasicScraper):
-    latestUrl = 'http://www.calamitiesofnature.com/'
-    stripUrl = latestUrl + 'archive/?c=%s'
-    imageSearch = compile(tagre("img", "src", r'(archive/\d+[^"]+|http://www\.calamitiesofnature\.com/archive/\d+[^"]+)'))
-    prevSearch = compile(r'<a id="previous" href="(http://www.calamitiesofnature.com/archive/\?c\=\d+)">')
-    help = 'Index format: nnn'
 class Champ2010(_BasicScraper):
     # the latest URL is hard coded since the comic is discontinued
     latestUrl = 'http://jedcollins.com/champ2010/champ-12-30-10.html'
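The tagre() helper seen in these matchers builds a regular expression for one HTML tag
with a given attribute value. The following is a plausible sketch inferred from its call
sites; the quote, before and after parameters are assumptions, and the real dosagelib
helper may differ:

    # Plausible sketch of tagre(), inferred from the call sites above.
    import re

    def tagre(tag, attribute, value, quote='"', before="", after=""):
        # Match <tag ... attribute="value" ...>; `before`/`after` anchor
        # extra text that must appear inside the tag.
        prefix = r"[^>]*%s[^>]*\s+" % before if before else r"(?:[^>]*\s+)?"
        suffix = r"[^>]*%s" % after if after else ""
        return r"<%s\s+%s%s\s*=\s*%s%s%s[^>]*%s>" % (
            tag, prefix, attribute, quote, value, quote, suffix)

    html = '<img alt="comic" src="http://explosm.net/db/files/Comics/c.png">'
    pattern = tagre("img", "src", r'(http://(?:www\.)?explosm\.net/db/files/Comics/[^"]+)')
    print(re.search(pattern, html).group(1))
    # http://explosm.net/db/files/Comics/c.png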


@@ -28,7 +28,8 @@ class Damonk(_BasicScraper):
     help = 'Index format: yyyymmdd'
-class DandyAndCompany(_BasicScraper):
+# XXX disallowed /search by robots.txt
+class _DandyAndCompany(_BasicScraper):
     latestUrl = 'http://www.dandyandcompany.com/'
     stripUrl = None
     multipleImagesPerStrip = True


@@ -68,7 +68,8 @@ class EmergencyExit(_BasicScraper):
     help = 'Index format: n'
-class ErrantStory(_BasicScraper):
+# XXX disallowed by robots.txt
+class _ErrantStory(_BasicScraper):
     latestUrl = 'http://www.errantstory.com/'
     stripUrl = latestUrl + '%s'
     imageSearch = compile(r'<img[^>]+?src="([^"]*?comics/.+?)"')
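Several scrapers in this commit are disabled by renaming the class with a leading
underscore because the sites' robots.txt forbids crawling. Whether a path is disallowed
can be checked with the standard library's robot parser; a small sketch, with the URLs
purely illustrative:

    # Check whether a URL may be fetched according to robots.txt;
    # the URLs here are illustrative only.
    try:
        from urllib.robotparser import RobotFileParser  # Python 3
    except ImportError:
        from robotparser import RobotFileParser  # Python 2, current at commit time

    rp = RobotFileParser()
    rp.set_url("http://www.errantstory.com/robots.txt")
    rp.read()
    print(rp.can_fetch("*", "http://www.errantstory.com/"))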


@@ -58,10 +58,19 @@ class KillerKomics(_BasicScraper):
     help = 'Index format: strip-name'
-class Kofightclub(_BasicScraper):
+# XXX disallowed by robots.txt
+class _Kofightclub(_BasicScraper):
     latestUrl = 'http://www.kofightclub.com/'
     stripUrl = latestUrl + 'd/%s.html'
     imageSearch = compile(tagre("img", "src", r'(\.\./images/\d+[^"]+)'))
     prevSearch = compile(tagre("a", "href", r'((?:http://www\.kofightclub\.com)?/d/\d+\.html)')
                          + tagre("img", "alt", "Previous comic"))
     help = 'Index format: yyyymmdd'
+class KuroShouri(_BasicScraper):
+    latestUrl = 'http://kuroshouri.com/'
+    stripUrl = latestUrl + '?webcomic_post=%s'
+    imageSearch = compile(tagre("img", "src", r"(http://kuroshouri\.com/wp-content/webcomic/kuroshouri/[^'\"]+)", quote="['\"]"))
+    prevSearch = compile(tagre("a", "href", r'(http://kuroshouri\.com/\?webcomic_post=[^"]+)', after="previous"))
+    help = 'Index format: chapter-n-page-m'


@@ -72,14 +72,6 @@ class Melonpool(_BasicScraper):
     help = 'Index format: n'
-class MintCondition(_BasicScraper):
-    latestUrl = 'http://www.mintconditioncomic.com/'
-    stripUrl = latestUrl + '%s/'
-    imageSearch = compile(tagre("img", "src", r'(http://www\.mintconditioncomic\.com/comics/[^"]+)'))
-    prevSearch = compile(tagre("a", "href", r'(http://www\.mintconditioncomic\.com/[^"]+)', after="prev"))
-    help = 'Index format: yyyy/mm/dd/stripname'
 class Misfile(_BasicScraper):
     latestUrl = 'http://www.misfile.com/'
     stripUrl = latestUrl + '?date=%s'


@@ -35,7 +35,7 @@ class OnTheEdge(_BasicScraper):
 class OneQuestion(_BasicScraper):
-    latestUrl = 'http://www.onequestioncomic.com/'
+    latestUrl = 'http://onequestioncomic.com/'
     stripUrl = latestUrl + 'comic.php?strip_id=%s'
     imageSearch = compile(tagre("img", "src", r'(istrip_files/strips/\d+\.jpg)'))
     prevSearch = compile(tagre("a", "href", r'(comic\.php\?strip_id=\d+)') + tagre("img", "src", r'img/arrow_prev\.jpg'))


@@ -142,7 +142,8 @@ class SPQRBlues(_BasicScraper):
     help = 'Index format: number'
-class StationV3(_BasicScraper):
+# XXX disallowed by robots.txt
+class _StationV3(_BasicScraper):
     latestUrl = 'http://www.stationv3.com/'
     stripUrl = latestUrl + 'd/%s.html'
     imageSearch = compile(tagre("img", "src", r'(http://www\.stationv3\.com/comics/[^"]+)'))

@@ -228,7 +229,8 @@ class Spamusement(_BasicScraper):
     starter = indirectStarter('http://spamusement.com/', prevSearch)
-class StrangeCandy(_BasicScraper):
+# XXX disallowed by robots.txt
+class _StrangeCandy(_BasicScraper):
     latestUrl = 'http://www.strangecandy.net/'
     stripUrl = latestUrl + 'd/%s.html'
     imageSearch = compile(tagre("img", "src", r'(/comics/\d+\.jpg)'))


@@ -13,12 +13,12 @@ _attrs = dict(
     next = case_insensitive_re("next"),
 )
 _prevSearch = compile(_linkSearch +
-    r'(?:<img[^>]+alt="[^"]*%(prev)s|<img[^>]+(?:button_previous|nav_prev4)\.|[^<]*%(back)s|\s*<<? (?:%(back)s|%(prev)s)|[^<]*%(prev)s)' % _attrs)
+    r'(?:<img[^>]+alt="[^"]*%(prev)s|<img[^>]+(?:button_previous|naviButtons_Previous|nav_prev4|prev|previous|webbuttonback|PrevArrow)\.|[^<]*%(back)s|\s*<<? (?:%(back)s|%(prev)s)|[^<]*%(prev)s)' % _attrs)
 _nextSearch = compile(_linkSearch +
-    r'(?:<img[^>]+alt="%(next)s|<img[^>]+(?:button_next|nav_next4)\.|\s*<?[^<]*%(next)s)' % _attrs)
+    r'(?:<img[^>]+alt="%(next)s|<img[^>]+(?:button_next|naviButtons_Next|nav_next4|next|webbuttonnext-1|NextArrow)\.|\s*<?[^<]*%(next)s)' % _attrs)

 def add(name, url, description, adult, bounce):
-    classname = 'SmackJeeves/' + name
+    classname = 'SmackJeeves_' + name

     def modifier(pageUrl):
         if adult:
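The %(prev)s placeholders above are filled from _attrs with old-style %-interpolation
before the pattern is compiled, and case_insensitive_re presumably expands a word into
per-letter character classes. A sketch under those assumptions:

    # Sketch of the interpolation used above; case_insensitive_re() is
    # inferred from its usage and may differ from the real helper.
    from re import compile

    def case_insensitive_re(word):
        # "prev" -> "[pP][rR][eE][vV]"
        return "".join("[%s%s]" % (c.lower(), c.upper()) for c in word)

    _attrs = dict(prev=case_insensitive_re("prev"))
    pattern = compile(r'alt="[^"]*%(prev)s' % _attrs)
    print(bool(pattern.search('<img alt="Go to PREV page">')))
    # True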
@@ -48,6 +48,7 @@ def add(name, url, description, adult, bounce):
         return "%s_%s" % (name, num)

     globals()[classname] = make_scraper(classname,
         name = 'SmackJeeves/' + name,
+        adult = adult,
         starter = _starter,
         prevUrlModifier = lambda cls, url: modifier(url),
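The globals()[classname] assignment generates one scraper class per comic at import
time, which is presumably why the key changed from 'SmackJeeves/' to 'SmackJeeves_': a
slash cannot appear in a Python identifier, so a class registered under it would be
unreachable by name. A minimal sketch of the pattern, with make_scraper() reduced to a
stand-in (the real factory takes many more attributes):

    # Minimal sketch of generating scraper classes at import time;
    # make_scraper() here is a stand-in for the real dosagelib factory.
    class _BasicScraper(object):
        pass

    def make_scraper(classname, **attributes):
        # type() builds a new class object with the given name and attributes.
        return type(str(classname), (_BasicScraper,), attributes)

    for name in ("Example_Comic",):  # made-up comic name
        classname = 'SmackJeeves_' + name
        globals()[classname] = make_scraper(classname,
            name='SmackJeeves/' + name,
            adult=False,
        )

    print(SmackJeeves_Example_Comic.name)
    # SmackJeeves/Example_Comic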


@@ -71,7 +71,8 @@ class TinyKittenTeeth(_BasicScraper):
     help = 'Index format: yyyy/mm/dd/stripname (unpadded)'
-class TwoLumps(_BasicScraper):
+# XXX disallowed by robots.txt
+class _TwoLumps(_BasicScraper):
     latestUrl = 'http://www.twolumps.net/'
     stripUrl = latestUrl + 'd/%s.html'
     imageSearch = compile(tagre("img", "src", r'(/comics/[^"]+)'))


@@ -26,7 +26,8 @@ class UnicornJelly(_BasicScraper):
     help = 'Index format: nnn'
-class UserFriendly(_BasicScraper):
+# XXX disallowed by robots.txt
+class _UserFriendly(_BasicScraper):
     starter = bounceStarter('http://ars.userfriendly.org/cartoons/?mode=classic', compile(r'<area shape="rect" href="(/cartoons/\?id=\d{8}&mode=classic)" coords="[\d, ]+?" alt="">'))
     stripUrl = 'http://ars.userfriendly.org/cartoons/?id=%s&mode=classic'
     imageSearch = compile(r'<img border="0" src="\s*(http://www.userfriendly.org/cartoons/archives/\d{2}\w{3}/.+?\.gif)"')


@@ -57,7 +57,8 @@ class WotNow(_BasicScraper):
     help = 'Index format: n (unpadded)'
-class WorldOfWarcraftEh(_BasicScraper):
+# XXX disallowed by robots.txt
+class _WorldOfWarcraftEh(_BasicScraper):
     latestUrl = 'http://woweh.com/'
     stripUrl = None
     imageSearch = compile(r'http://woweh.com/(comics/.+?)"')


@@ -30,6 +30,7 @@ exclude_comics = [
     "Apartment_408_Full_Size", # broken images
     "Apple_Valley", # broken images
     "Apt_408_Minis", # broken images
+    "Art_dump", # broken images
     "Atxs", # broken images
     "A_Word_Of_Wisdom", # broken images
     "Brathalla", # broken images

@@ -64,6 +65,7 @@ exclude_comics = [
     "Inside_OuT", # broken images
     "Journey_to_Raifina", # broken images
     "KALA_dan", # broken images
+    "Kuro_Shouri", # page moved
     "Live_to_tell", # start page requires login
     "Locoma", # broken images
     "London_Underworld", # broken images

@@ -141,7 +143,7 @@ exclude_comics = [
     "Weave", # broken images
     "Weirdlings", # template error
     "Welcome_To_Border_City", # broken images
-    "what_comes_first", # start page requires login
+    "What_comes_first", # start page requires login
     "Within_Shadows", # broken images
     "Xolta", # start page requires login
     "XTIN__The_Dragons_Dream_World", # start page requires login


@@ -38,6 +38,7 @@ exclude_comics = [
     "RichardsPoorAlmanac", # missing images
     "SherpaAid", # comic unavailable
     "SparComics", # comic unavailable
+    "SurvivingSingle", # comic unavailable
 ]


@@ -23,6 +23,9 @@ htmltemplate = """
 </head>
 <body>
 <p>Dosage test results from %(date)s</p>
+<p>Note that it is almost impossible to get a 100% OK test run
+due to temporary network failures or sites that are just updating
+the comic page.</p>
 <div id="container">
 %(content)s
 </div>

@@ -70,14 +73,16 @@ def get_content(filename):
     with open(filename, "r") as f:
         print("Tests parsed: 0", end=" ", file=sys.stderr)
         num_tests = 0
+        add_reason = False
         for line in f:
             if line.startswith((". ", "F ")) and "test_comics" in line:
+                add_reason = line.startswith("F ")
                 num_tests += 1
                 try:
                     tests.append(get_test(line))
-                    add_reason = line.startswith("F ")
                 except Exception as msg:
                     print("WARNING:", msg, file=sys.stderr)
                     continue
             elif add_reason and line.startswith(" E "):
                 reason = line[3:].strip()
                 tests[-1][-1] = reason
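The change above initializes add_reason before the loop, so the elif can never hit an
unbound name, and records pass/fail before get_test() can raise. A condensed, runnable
version of the corrected control flow; get_test() and the sample lines are stand-ins for
the real test-log format:

    # Condensed sketch of the corrected loop; get_test() and the sample
    # input lines are stand-ins for the real test-log format.
    from __future__ import print_function
    import sys

    def get_test(line):
        return [line.split()[1], None]  # last slot holds the failure reason

    def parse(lines):
        tests = []
        add_reason = False  # initialized so the elif below never sees an unbound name
        for line in lines:
            if line.startswith((". ", "F ")) and "test_comics" in line:
                add_reason = line.startswith("F ")  # set before get_test() can raise
                try:
                    tests.append(get_test(line))
                except Exception as msg:
                    print("WARNING:", msg, file=sys.stderr)
                    continue
            elif add_reason and line.startswith(" E "):
                tests[-1][-1] = line[3:].strip()  # attach the reason to the failed test
        return tests

    print(parse(["F test_comics.Foo", " E HTTP 404"]))
    # [['test_comics.Foo', 'HTTP 404']]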


@@ -40,6 +40,8 @@ exclude_comics = [
     "MylifewithFel", # does not follow standard layout
     "NegativeZen", # does not follow standard layout
     "NightShot", # does not follow standard layout
+    "NormalIsBoring", # does not follow standard layout
+    "Okamirai", # images are 403 forbidden
     "OmnisSpriteShowcase", # missing images
     "OpticalDisarray", # does not follow standard layout
     "PicturesofYou", # does not follow standard layout

@@ -49,6 +51,7 @@ exclude_comics = [
     "Ribon", # does not follow standard layout
     "SecretSanta2011", # missing images
     "ShinkaTheLastEevee", # does not follow standard layout
+    "SimplePixel", # does not follow standard layout
     "SJArtCollab", # missing images
     "SlightlyDifferent", # missing images
     "TheAfterSubtract", # does not follow standard layout


@@ -30,6 +30,7 @@ exclude_comics = [
     "JamesBond", # not a comic
     "Men", # not a comic
     "NEA", # not a comic
+    "PeanutsPortuguese", # not found
     "Pets", # not a comic
     "SundayOnly", # not a comic
     "WebExclusive", # not a comic