Rework/fix KeenSpot modules.

parent bb6199af65
commit b17d6e5f22
6 changed files with 130 additions and 143 deletions

@@ -119,7 +119,6 @@ class ComicGenesis(_BasicScraper):
         cls('TheAdventuresofKaniraBaxter', 'kanirabaxter'),
         cls('TheAdventuresofVindibuddSuperheroInTraining', 'vindibudd', last='20070720'),
         cls('TheEasyBreather', 'easybreather'),
-        cls('TheLounge', 'thelounge'),
         cls('TheMisadventuresofOkk', 'okk'),
         cls('ThePath', 'thepath'),
         cls('TheTalesofKalduras', 'kalduras'),

@@ -130,6 +130,10 @@ class GoblinsComic(_ParserScraper):
     help = 'Index format: ddmmyyyy'


+class GodChild(_WordPressScraper):
+    url = 'http://godchild.keenspot.com/'
+
+
 class GoGetARoomie(_ComicControlScraper):
     url = 'http://www.gogetaroomie.com'

@@ -5,76 +5,117 @@
 from __future__ import absolute_import, division, print_function

-from re import compile
-
-from ..scraper import _BasicScraper
-from ..util import tagre
+from ..scraper import _ParserScraper


-class KeenSpot(_BasicScraper):
-    imageSearch = compile(tagre("img", "src", r'([^"]*/comics/[^"]+)'))
-    _stripPattern = r'([^"]*/d/\d{8}\.html)'
+class KeenSpot(_ParserScraper):
+    multipleImagesPerStrip = True
+    imageSearch = (
+        '//img[contains(@src, "/comics/")]',
+        # Shockwave Darkside
+        '//img[contains(@src, "/comics2D/")]',
+        '//img[contains(@src, "com/shockwave")]',
+        # Sore Thumbs
+        '//img[contains(@src, "com/st2")]',
+        # Wayward Sons
+        '//img[contains(@src, "com/2")]',
+    )
     prevSearch = (
-        compile(tagre("link", "href", _stripPattern, before="prev")),
-        compile(tagre("a", "href", _stripPattern, after="prev")),
-        compile(tagre("a", "href", _stripPattern) + tagre("img", "id", r"previous_day1")),
-        compile(tagre("a", "href", _stripPattern) + tagre("img", "id", r"katc7")),
+        '//link[@rel="prev"]',
+        '//a[@rel="prev"]',
+        # Exposure
+        '//a[img[@id="exp29"]]',
+        # Hero By Night
+        '//area[contains(@coords, ",-7,")]',
+        # Katrina
+        '//a[img[@id="katc7"]]',
+        # No Room For Magic, Everyone Loves Adis, Wisdom Of Moo
+        '//a[text()="Previous comic"]',
+        # Supernovas
+        '//a[img[@id="p_top_nav"]]',
     )
     help = 'Index format: yyyymmdd'

-    def __init__(self, name, sub):
+    def __init__(self, name, sub, last=None, path='d/%s.html'):
         super(KeenSpot, self).__init__('KeenSpot/' + name)
         self.url = 'http://%s.keenspot.com/' % sub
-        self.stripUrl = self.url + 'd/%s.html'
+        self.stripUrl = self.url + path
+
+        if last:
+            self.url = self.stripUrl % last
+            self.endOfLife = True

     @classmethod
     def getmodules(cls):
-        return [
+        return (
+            # Not on frontpage...
+            cls('Buzzboy', 'buzzboy'),
+            cls('EveryoneLovesAdis', 'adis'),
+
             # do not edit anything below since these entries are generated from
             # scripts/update_plugins.sh
             # START AUTOUPDATE
             cls('27TwentySeven', 'twenty-seven'),
-            cls('Adventurers', 'adventurers'),
-            cls('AntiheroForHire', 'antihero'),
             cls('Avengelyne', 'avengelyne'),
             cls('BanzaiGirl', 'banzaigirl'),
             cls('Barker', 'barkercomic'),
-            cls('Buzzboy', 'buzzboy'),
             cls('ChoppingBlock', 'choppingblock'),
             cls('ClichFlamb', 'clicheflambe'),
             cls('CountYourSheep', 'countyoursheep'),
+            cls('CrowScare', 'crowscare', last="20111031"),
+            cls('Dreamless', 'dreamless', last="20100726"),
             cls('EverythingJake', 'everythingjake'),
+            cls('Exposure', 'exposure'),
             cls('FallOutToyWorks', 'fallouttoyworks'),
             cls('FriarAndBrimstone', 'friarandbrimstone'),
             cls('GeneCatlow', 'genecatlow'),
             cls('GodMode', 'godmode'),
             cls('GreenWake', 'greenwake'),
             cls('HeadTrip', 'headtrip'),
+            cls('HeroByNight', 'herobynight'),
             cls('HoaxHunters', 'hoaxhunters'),
+            cls('InfinityRefugees', 'newshounds'),
             cls('InHere', 'inhere'),
+            cls('JadeWarriors', 'jadewarriors'),
             cls('Katrina', 'katrina'),
             cls('Landis', 'landis'),
             cls('LutherStrode', 'lutherstrode'),
             cls('MakeshiftMiracle', 'makeshiftmiracle'),
             cls('Marksmen', 'marksmen'),
             cls('MarryMe', 'marryme'),
             cls('MedusasDaughter', 'medusasdaughter'),
             cls('MonsterMassacre', 'monstermassacre'),
-            cls('Newshounds', 'newshounds'),
+            cls('MysticRevolution', 'mysticrevolution', path="?cid=%s"),
             cls('NoPinkPonies', 'nopinkponies'),
+            cls('NoRoomForMagic', 'noroomformagic'),
             cls('OutThere', 'outthere'),
             cls('Porcelain', 'porcelain'),
+            cls('PunchAnPie', 'punchanpie', path="daily/%s.html"),
             cls('QUILTBAG', 'quiltbag'),
             cls('RedSpike', 'redspike'),
             cls('RumbleFall', 'rumblefall'),
             cls('SamuraisBlood', 'samuraisblood'),
             cls('Sharky', 'sharky'),
+            cls('ShockwaveDarkside', 'shockwave', path="2d/%s.html"),
             cls('SomethingHappens', 'somethinghappens'),
             cls('SoreThumbs', 'sorethumbs'),
             cls('Striptease', 'striptease'),
+            cls('Supernovas', 'supernovas'),
             cls('Superosity', 'superosity'),
             cls('TheFirstDaughter', 'thefirstdaughter'),
-            cls('TheGodChild', 'godchild'),
-            cls('TheHuntersofSalamanstra', 'salamanstra'),
+            cls('TheHopeVirus', 'hopevirus'),
+            cls('TheHuntersOfSalamanstra', 'salamanstra'),
+            cls('TheLounge', 'thelounge'),
             cls('TheVault', 'thevault'),
             cls('WaywardSons', 'waywardsons'),
             cls('WeirdingWillows', 'weirdingwillows'),
             cls('WICKEDPOWERED', 'wickedpowered'),
+            cls('WisdomOfMoo', 'wisdomofmoo'),
+            cls('Yirmumah', 'yirmumah', path="%s/"),
             # END AUTOUPDATE
-        ]
+        )
+
+    def shouldSkipUrl(self, url, data):
+        return url in (
+            'http://sorethumbs.keenspot.com/d/20160117.html'
+        )

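The reworked constructor is the heart of this change: `last` pins a finished comic to its final strip and flags it end-of-life, while `path` absorbs hosts whose archive URLs deviate from `d/YYYYMMDD.html`. A standalone sketch of just that logic; the `Scraper` base class here is a stand-in for dosage's real one, not an import of it:

```python
class Scraper(object):
    """Stand-in base class; dosagelib's real Scraper does much more."""
    def __init__(self, name):
        self.name = name
        self.endOfLife = False


class KeenSpot(Scraper):
    def __init__(self, name, sub, last=None, path='d/%s.html'):
        super(KeenSpot, self).__init__('KeenSpot/' + name)
        self.url = 'http://%s.keenspot.com/' % sub
        # `path` overrides the default d/YYYYMMDD.html archive scheme.
        self.stripUrl = self.url + path
        # `last` points the module at the final strip and marks it dead.
        if last:
            self.url = self.stripUrl % last
            self.endOfLife = True


mystic = KeenSpot('MysticRevolution', 'mysticrevolution', path="?cid=%s")
assert mystic.stripUrl == 'http://mysticrevolution.keenspot.com/?cid=%s'

crow = KeenSpot('CrowScare', 'crowscare', last="20111031")
assert crow.url == 'http://crowscare.keenspot.com/d/20111031.html'
assert crow.endOfLife
```
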
@@ -16,6 +16,7 @@ class Removed(Scraper):
         'block': 'The comic site is blocking us.',
         'unk': 'Comic was removed for an unknown reason.',
         'brk': 'Comic navigation is broken.',
+        'mov': 'Comic moved to a new hoster and no new module was written.',
     }

     def __init__(self, name, reason='del'):

@@ -238,6 +239,8 @@ class Removed(Scraper):
         cls('GunnerkrigCourt'),
         cls('HorribleVille'),
         cls('KatzenfutterGeleespritzer'),
+        cls('KeenSpot/Adventurers', 'mov'),
+        cls('KeenSpot/AntiheroForHire', 'mov'),
         cls('KillerKomics'),
         cls('Lint'),
         cls('LinuxComFridayFunnies'),

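The new `'mov'` reason code extends the table of user-facing removal messages, and the two KeenSpot entries above use it. A hypothetical replay of the lookup; the `'del'` default text below is assumed, not taken from this diff:

```python
# Reason codes -> messages, mirroring (not importing) dosagelib/plugins/old.py.
REASONS = {
    'del': 'Comic was deleted.',  # assumed default text
    'block': 'The comic site is blocking us.',
    'unk': 'Comic was removed for an unknown reason.',
    'brk': 'Comic navigation is broken.',
    'mov': 'Comic moved to a new hoster and no new module was written.',
}


def removal_message(name, reason='del'):
    """Build the explanation shown for a retired module."""
    return '%s: %s' % (name, REASONS[reason])


print(removal_message('KeenSpot/Adventurers', 'mov'))
```
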
@@ -350,7 +353,7 @@ class Renamed(Scraper):

     @classmethod
     def getmodules(cls):
-        return [
+        return (
             # Renamed in 2.16
             cls('1997', '1977'),
             cls('ComicFury/Alya', 'ComicFury/AlyaTheLastChildOfLight'),

@@ -361,12 +364,15 @@ class Renamed(Scraper):
         cls('ComicFury/ICanSeeYourFeels', 'ComicFury/SeeYourFeels'),
         cls('ComicFury/MAGISAupdatesMonWedFri', 'ComicFury/MAGISAPARASAYOupdatesMonFri'),
         cls('ComicFury/ThomasAndZachary', 'ComicFury/ThomasAndZacharyArchives'),
+        cls('ComicGenesis/TheLounge', 'KeenSpot/TheLounge'),
         cls('Creators/ArchieinSpanish', 'Creators/ArchieSpanish'),
         cls('Creators/HeathcliffinSpanish', 'Creators/HeathcliffSpanish'),
         cls('Creators/TheWizardofIdinSpanish', 'Creators/WizardOfIdSpanish'),
         cls('DarkWings', 'Eryl'),
         cls('FoulLanguage', 'GoComics/FowlLanguage'),
         cls('GoComics/BloomCounty2015', 'GoComics/BloomCounty2016'),
+        cls('KeenSpot/Newshounds', 'KeenSpot/InfinityRefugees'),
+        cls('KeenSpot/TheGodChild', 'GodChild'),
         cls('Wulffmorgenthaler', 'WuMo'),
         cls('ZebraGirl', 'ComicFury/ZebraGirl'),
-    ]
+    )

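With `TheLounge`, `Newshounds`, and `TheGodChild` now routed through `Renamed`, an old module name can be traced to its current home. The sketch below illustrates the mapping only; how dosage actually reacts to a renamed module is not shown in this diff:

```python
# Old module name -> new module name, taken from the entries added above.
RENAMES = {
    'ComicGenesis/TheLounge': 'KeenSpot/TheLounge',
    'KeenSpot/Newshounds': 'KeenSpot/InfinityRefugees',
    'KeenSpot/TheGodChild': 'GodChild',
}


def resolve(name):
    """Follow rename entries until a current module name is reached."""
    while name in RENAMES:
        name = RENAMES[name]
    return name


assert resolve('KeenSpot/TheGodChild') == 'GodChild'
assert resolve('ComicGenesis/TheLounge') == 'KeenSpot/TheLounge'
```
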
@@ -10,7 +10,8 @@ from re import compile, escape, IGNORECASE
 from ..scraper import _BasicScraper, _ParserScraper
 from ..helpers import indirectStarter
 from ..util import tagre
-from .common import _ComicControlScraper, _TumblrScraper, _WordPressScraper
+from .common import (_ComicControlScraper, _TumblrScraper, _WordPressScraper,
+                     xpath_class)


 class TheBrads(_BasicScraper):

@@ -176,6 +177,14 @@ class TwoGuysAndGuy(_BasicScraper):
     adult = True


+class Twokinds(_ParserScraper):
+    url = 'http://twokinds.keenspot.com/'
+    imageSearch = ('//p[@id="cg_img"]//img',
+                   '//article/p//img')
+    prevSearch = ('//a[@id="cg_back"]',
+                  '//a[%s]' % xpath_class('navprev'))
+
+
 class TwoLumps(_BasicScraper):
     url = 'http://www.twolumps.net/'
     stripUrl = url + 'd/%s.html'

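`xpath_class`, newly imported from `.common` for the Twokinds module, emits the standard XPath 1.0 workaround for matching one token of a `class` attribute; assuming it expands roughly as follows:

```python
def xpath_class(name):
    # XPath 1.0 has no native "has CSS class" test, so the usual trick is to
    # pad and normalize @class and look for the space-delimited token.
    return ('contains(concat(" ", normalize-space(@class), " "), " %s ")'
            % name)


# Used as in the Twokinds module above:
prev = '//a[%s]' % xpath_class('navprev')
# => //a[contains(concat(" ", normalize-space(@class), " "), " navprev ")]
print(prev)
```
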
@@ -9,132 +9,60 @@ JSON file for further processing.
 """
 from __future__ import absolute_import, division, print_function

-import codecs
 import re
 import sys
 import os
+from six.moves.urllib.parse import urlsplit

-import requests
-
 sys.path.append(os.path.join(os.path.dirname(__file__), ".."))  # noqa
-from dosagelib.util import get_page, tagre, check_robotstxt
-from dosagelib.scraper import get_scrapers
-from scriptutil import (contains_case_insensitive, save_result, load_result,
-                        truncate_name, format_name)
+from scriptutil import ComicListUpdater
+from dosagelib.util import check_robotstxt


-json_file = __file__.replace(".py", ".json")
+class KeenSpotUpdater(ComicListUpdater):
+    dup_templates = ('Creators/%s', "GoComics/%s", "ComicGenesis/%s")
+
+    # names of comics to exclude
+    excluded_comics = (
+        # non-standard navigation
+        "BrawlInTheFamily",
+        "Flipside",
+        "LastBlood",
+        "TheGodChild",
+        "Twokinds",
+    )

-url_matcher = re.compile(
-    tagre("td", "onmouseover", r'([^"]+)') +
-    tagre("a", "href", r'([^"]+\.keenspot\.com/)[^"]*') +
-    r"(?:<b>)?([^<]+)(?:</b>)?</a>"
-)
+    extra = {
+        'CrowScare': 'last="20111031"',
+        'Dreamless': 'last="20100726"',
+        'MysticRevolution': 'path="?cid=%s"',
+        'PunchAnPie': 'path="daily/%s.html"',
+        'ShockwaveDarkside': 'path="2d/%s.html"',
+        'Yirmumah': 'path="%s/"',
+    }

+    def collect_results(self):
+        """Parse the front page."""
+        data = self.get_url('http://keenspot.com/')

-# names of comics to exclude
-exclude_comics = [
-    "BrawlintheFamily",  # non-standard navigation
-    "CrowScare",  # non-standard navigation
-    "Dreamless",  # non-standard navigation
-    "EV",  # non-standard navigation
-    "Exposure",  # non-standard navigation
-    "Flipside",  # non-standard navigation
-    "HerobyNight",  # non-standard navigation
-    "JadeWarriors",  # non-standard navigation
-    "LastBlood",  # non-standard navigation
-    "MysticRevolution",  # non-standard navigation
-    "NoRoomForMagic",  # non-standard navigation
-    "PunchanPie",  # non-standard navigation
-    "RoadWaffles",  # non-standard navigation
-    "Shadowbinders",  # non-standard navigation
-    "ShockwaveDarkside",  # non-standard navigation
-    "Supernovas",  # non-standard navigation
-    "Twokinds",  # non-standard navigation
-    "WisdomofMoo",  # non-standard navigation
-    "Yirmumah",  # non-standard navigation
-    "YouDamnKid",  # non-standard navigation
-]
-
-
-# links to last valid strips
-url_overrides = {
-}
-
-
-def handle_url(url, session, res):
-    """Parse one search result page."""
-    print("Parsing", url, file=sys.stderr)
-    try:
-        data = get_page(url, session).text
-    except IOError as msg:
-        print("ERROR:", msg, file=sys.stderr)
-        return
-    for match in url_matcher.finditer(data):
-        comicurl = match.group(2)
-        name = format_name(match.group(3))
-        if name in exclude_comics:
-            continue
-        if contains_case_insensitive(res, name):
-            # we cannot handle two comics that only differ in case
-            print("INFO: skipping possible duplicate", repr(name), file=sys.stderr)
-            continue
-        try:
-            if "/d/" not in comicurl:
-                check_robotstxt(comicurl + "d/", session)
-            else:
-                check_robotstxt(comicurl, session)
-        except IOError:
-            print("INFO: robots.txt denied for keenspot", repr(name))
-            continue
-        res[name] = comicurl
-
-
-def get_results():
-    """Parse all search result pages."""
-    # store info in a dictionary {name -> shortname}
-    res = {}
-    session = requests.Session()
-    base = 'http://keenspot.com/'
-    handle_url(base, session, res)
-    save_result(res, json_file)
-
-
-def has_comic(name):
-    """Check if comic name already exists."""
-    names = [
-        ("Creators/%s" % name).lower(),
-        ("GoComics/%s" % name).lower(),
-        ("ComicGenesis/%s" % name).lower(),
-    ]
-    for scraperobj in get_scrapers():
-        lname = scraperobj.name.lower()
-        if lname in names:
-            return True
-    return False
-
-
-def print_results(args):
-    """Print all comics."""
-    min_comics, filename = args
-    with codecs.open(filename, 'a', 'utf-8') as fp:
-        for name, entry in sorted(load_result(json_file).items()):
-            if name in exclude_comics:
-                continue
-            url = entry
-            if has_comic(name):
-                prefix = u'#'
-            else:
-                prefix = u''
-            name = truncate_name(name)
-            fp.write(u"%sadd(%r, %r)\n" % (
-                prefix, str(name), str(url))
-            )
+        for comiclink in data.xpath('//td[@id]/a'):
+            comicurl = comiclink.attrib['href']
+            name = comiclink.xpath("string()")
+            try:
+                if "/d/" not in comicurl:
+                    check_robotstxt(comicurl + "d/", self.session)
+                else:
+                    check_robotstxt(comicurl, self.session)
+            except IOError as e:
+                print("[%s] INFO: robots.txt denied: %s" % (name, e))
+                continue
+
+            self.add_comic(name, comicurl)
+
+    def get_entry(self, name, url):
+        sub = urlsplit(url).hostname.split('.', 1)[0]
+        if name in self.extra:
+            extra = ', ' + self.extra[name]
+        else:
+            extra = ''
+        return u"cls('%s', '%s'%s)," % (name, sub, extra)


 if __name__ == '__main__':
-    if len(sys.argv) > 1:
-        print_results(sys.argv[1:])
-    else:
-        get_results()
+    KeenSpotUpdater(__file__).run()

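Finally, `get_entry()` is what turns a frontpage link into one line of the AUTOUPDATE block above, appending any per-comic override from `extra`. A standalone replay using the values from this diff:

```python
from six.moves.urllib.parse import urlsplit

# Per-comic constructor overrides, as in KeenSpotUpdater.extra above.
EXTRA = {'CrowScare': 'last="20111031"'}


def get_entry(name, url):
    # The subdomain becomes the second cls() argument.
    sub = urlsplit(url).hostname.split('.', 1)[0]
    extra = ', ' + EXTRA[name] if name in EXTRA else ''
    return "cls('%s', '%s'%s)," % (name, sub, extra)


print(get_entry('CrowScare', 'http://crowscare.keenspot.com/'))
# cls('CrowScare', 'crowscare', last="20111031"),
print(get_entry('Sharky', 'http://sharky.keenspot.com/'))
# cls('Sharky', 'sharky'),
```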