From b17d6e5f22ad06d4f842838ef935aa0aa0f7dcb3 Mon Sep 17 00:00:00 2001
From: Tobias Gruetzmacher
Date: Fri, 14 Oct 2016 00:14:53 +0200
Subject: [PATCH] Rework/fix KeenSpot modules.

---
 dosagelib/plugins/comicgenesis.py |   1 -
 dosagelib/plugins/g.py            |   4 +
 dosagelib/plugins/keenspot.py     |  83 +++++++++++----
 dosagelib/plugins/old.py          |  10 +-
 dosagelib/plugins/t.py            |  11 +-
 scripts/keenspot.py               | 164 +++++++++--------------------
 6 files changed, 130 insertions(+), 143 deletions(-)

diff --git a/dosagelib/plugins/comicgenesis.py b/dosagelib/plugins/comicgenesis.py
index 0795f1c43..c2c86b20b 100644
--- a/dosagelib/plugins/comicgenesis.py
+++ b/dosagelib/plugins/comicgenesis.py
@@ -119,7 +119,6 @@ class ComicGenesis(_BasicScraper):
             cls('TheAdventuresofKaniraBaxter', 'kanirabaxter'),
             cls('TheAdventuresofVindibuddSuperheroInTraining', 'vindibudd', last='20070720'),
             cls('TheEasyBreather', 'easybreather'),
-            cls('TheLounge', 'thelounge'),
             cls('TheMisadventuresofOkk', 'okk'),
             cls('ThePath', 'thepath'),
             cls('TheTalesofKalduras', 'kalduras'),
diff --git a/dosagelib/plugins/g.py b/dosagelib/plugins/g.py
index c2125e719..44b781853 100644
--- a/dosagelib/plugins/g.py
+++ b/dosagelib/plugins/g.py
@@ -130,6 +130,10 @@ class GoblinsComic(_ParserScraper):
     help = 'Index format: ddmmyyyy'
 
 
+class GodChild(_WordPressScraper):
+    url = 'http://godchild.keenspot.com/'
+
+
 class GoGetARoomie(_ComicControlScraper):
     url = 'http://www.gogetaroomie.com'
 
diff --git a/dosagelib/plugins/keenspot.py b/dosagelib/plugins/keenspot.py
index 33036243d..fa2680bbf 100644
--- a/dosagelib/plugins/keenspot.py
+++ b/dosagelib/plugins/keenspot.py
@@ -5,76 +5,117 @@
 
 from __future__ import absolute_import, division, print_function
 
-from re import compile
-
-from ..scraper import _BasicScraper
-from ..util import tagre
+from ..scraper import _ParserScraper
 
 
-class KeenSpot(_BasicScraper):
-    imageSearch = compile(tagre("img", "src", r'([^"]*/comics/[^"]+)'))
-    _stripPattern = r'([^"]*/d/\d{8}\.html)'
+class KeenSpot(_ParserScraper):
+    multipleImagesPerStrip = True
+    imageSearch = (
+        '//img[contains(@src, "/comics/")]',
+        # Shockwave Darkside
+        '//img[contains(@src, "/comics2D/")]',
+        '//img[contains(@src, "com/shockwave")]',
+        # Sore Thumbs
+        '//img[contains(@src, "com/st2")]',
+        # Wayward Sons
+        '//img[contains(@src, "com/2")]',
+    )
     prevSearch = (
-        compile(tagre("link", "href", _stripPattern, before="prev")),
-        compile(tagre("a", "href", _stripPattern, after="prev")),
-        compile(tagre("a", "href", _stripPattern) + tagre("img", "id", r"previous_day1")),
-        compile(tagre("a", "href", _stripPattern) + tagre("img", "id", r"katc7")),
+        '//link[@rel="prev"]',
+        '//a[@rel="prev"]',
+        # Exposure
+        '//a[img[@id="exp29"]]',
+        # Hero By Night
+        '//area[contains(@coords, ",-7,")]',
+        # Katrina
+        '//a[img[@id="katc7"]]',
+        # No Room For Magic, Everyone Loves Adis, Wisdom Of Moo
+        '//a[text()="Previous comic"]',
+        # Supernovas
+        '//a[img[@id="p_top_nav"]]',
     )
     help = 'Index format: yyyymmdd'
 
-    def __init__(self, name, sub):
+    def __init__(self, name, sub, last=None, path='d/%s.html'):
         super(KeenSpot, self).__init__('KeenSpot/' + name)
         self.url = 'http://%s.keenspot.com/' % sub
-        self.stripUrl = self.url + 'd/%s.html'
+        self.stripUrl = self.url + path
+
+        if last:
+            self.url = self.stripUrl % last
+            self.endOfLife = True
 
     @classmethod
     def getmodules(cls):
-        return [
+        return (
+            # Not on frontpage...
+            cls('Buzzboy', 'buzzboy'),
+            cls('EveryoneLovesAdis', 'adis'),
+
             # do not edit anything below since these entries are generated from
             # scripts/update_plugins.sh
             # START AUTOUPDATE
             cls('27TwentySeven', 'twenty-seven'),
-            cls('Adventurers', 'adventurers'),
-            cls('AntiheroForHire', 'antihero'),
+            cls('Avengelyne', 'avengelyne'),
             cls('BanzaiGirl', 'banzaigirl'),
             cls('Barker', 'barkercomic'),
-            cls('Buzzboy', 'buzzboy'),
             cls('ChoppingBlock', 'choppingblock'),
             cls('ClichFlamb', 'clicheflambe'),
             cls('CountYourSheep', 'countyoursheep'),
+            cls('CrowScare', 'crowscare', last="20111031"),
+            cls('Dreamless', 'dreamless', last="20100726"),
             cls('EverythingJake', 'everythingjake'),
+            cls('Exposure', 'exposure'),
             cls('FallOutToyWorks', 'fallouttoyworks'),
             cls('FriarAndBrimstone', 'friarandbrimstone'),
             cls('GeneCatlow', 'genecatlow'),
             cls('GodMode', 'godmode'),
             cls('GreenWake', 'greenwake'),
             cls('HeadTrip', 'headtrip'),
+            cls('HeroByNight', 'herobynight'),
             cls('HoaxHunters', 'hoaxhunters'),
+            cls('InfinityRefugees', 'newshounds'),
             cls('InHere', 'inhere'),
+            cls('JadeWarriors', 'jadewarriors'),
             cls('Katrina', 'katrina'),
             cls('Landis', 'landis'),
+            cls('LutherStrode', 'lutherstrode'),
             cls('MakeshiftMiracle', 'makeshiftmiracle'),
             cls('Marksmen', 'marksmen'),
             cls('MarryMe', 'marryme'),
             cls('MedusasDaughter', 'medusasdaughter'),
             cls('MonsterMassacre', 'monstermassacre'),
-            cls('Newshounds', 'newshounds'),
+            cls('MysticRevolution', 'mysticrevolution', path="?cid=%s"),
             cls('NoPinkPonies', 'nopinkponies'),
+            cls('NoRoomForMagic', 'noroomformagic'),
             cls('OutThere', 'outthere'),
             cls('Porcelain', 'porcelain'),
+            cls('PunchAnPie', 'punchanpie', path="daily/%s.html"),
             cls('QUILTBAG', 'quiltbag'),
             cls('RedSpike', 'redspike'),
             cls('RumbleFall', 'rumblefall'),
             cls('SamuraisBlood', 'samuraisblood'),
             cls('Sharky', 'sharky'),
+            cls('ShockwaveDarkside', 'shockwave', path="2d/%s.html"),
             cls('SomethingHappens', 'somethinghappens'),
             cls('SoreThumbs', 'sorethumbs'),
             cls('Striptease', 'striptease'),
+            cls('Supernovas', 'supernovas'),
             cls('Superosity', 'superosity'),
             cls('TheFirstDaughter', 'thefirstdaughter'),
-            cls('TheGodChild', 'godchild'),
-            cls('TheHuntersofSalamanstra', 'salamanstra'),
+            cls('TheHopeVirus', 'hopevirus'),
+            cls('TheHuntersOfSalamanstra', 'salamanstra'),
             cls('TheLounge', 'thelounge'),
+            cls('TheVault', 'thevault'),
+            cls('WaywardSons', 'waywardsons'),
+            cls('WeirdingWillows', 'weirdingwillows'),
             cls('WICKEDPOWERED', 'wickedpowered'),
+            cls('WisdomOfMoo', 'wisdomofmoo'),
+            cls('Yirmumah', 'yirmumah', path="%s/"),
             # END AUTOUPDATE
-        ]
+        )
+
+    def shouldSkipUrl(self, url, data):
+        return url in (
+            'http://sorethumbs.keenspot.com/d/20160117.html',
+        )
diff --git a/dosagelib/plugins/old.py b/dosagelib/plugins/old.py
index 52994e924..eb5ef00c3 100644
--- a/dosagelib/plugins/old.py
+++ b/dosagelib/plugins/old.py
@@ -16,6 +16,7 @@ class Removed(Scraper):
         'block': 'The comic site is blocking us.',
         'unk': 'Comic was removed for an unknown reason.',
         'brk': 'Comic navigation is broken.',
+        'mov': 'Comic moved to a new hoster and no new module was written.',
     }
 
     def __init__(self, name, reason='del'):
@@ -238,6 +239,8 @@ class Removed(Scraper):
             cls('GunnerkrigCourt'),
             cls('HorribleVille'),
             cls('KatzenfutterGeleespritzer'),
+            cls('KeenSpot/Adventurers', 'mov'),
+            cls('KeenSpot/AntiheroForHire', 'mov'),
             cls('KillerKomics'),
             cls('Lint'),
             cls('LinuxComFridayFunnies'),
@@ -350,7 +353,7 @@ class Renamed(Scraper):
 
     @classmethod
     def getmodules(cls):
-        return [
+        return (
             # Renamed in 2.16
             cls('1997', '1977'),
             cls('ComicFury/Alya', 'ComicFury/AlyaTheLastChildOfLight'),
@@ -361,12 +364,15 @@ class Renamed(Scraper):
             cls('ComicFury/ICanSeeYourFeels', 'ComicFury/SeeYourFeels'),
             cls('ComicFury/MAGISAupdatesMonWedFri', 'ComicFury/MAGISAPARASAYOupdatesMonFri'),
             cls('ComicFury/ThomasAndZachary', 'ComicFury/ThomasAndZacharyArchives'),
+            cls('ComicGenesis/TheLounge', 'KeenSpot/TheLounge'),
             cls('Creators/ArchieinSpanish', 'Creators/ArchieSpanish'),
             cls('Creators/HeathcliffinSpanish', 'Creators/HeathcliffSpanish'),
             cls('Creators/TheWizardofIdinSpanish', 'Creators/WizardOfIdSpanish'),
             cls('DarkWings', 'Eryl'),
             cls('FoulLanguage', 'GoComics/FowlLanguage'),
             cls('GoComics/BloomCounty2015', 'GoComics/BloomCounty2016'),
+            cls('KeenSpot/Newshounds', 'KeenSpot/InfinityRefugees'),
+            cls('KeenSpot/TheGodChild', 'GodChild'),
             cls('Wulffmorgenthaler', 'WuMo'),
             cls('ZebraGirl', 'ComicFury/ZebraGirl'),
-        ]
+        )
diff --git a/dosagelib/plugins/t.py b/dosagelib/plugins/t.py
index 1ee833daa..b503c3b1d 100644
--- a/dosagelib/plugins/t.py
+++ b/dosagelib/plugins/t.py
@@ -10,7 +10,8 @@ from re import compile, escape, IGNORECASE
 from ..scraper import _BasicScraper, _ParserScraper
 from ..helpers import indirectStarter
 from ..util import tagre
-from .common import _ComicControlScraper, _TumblrScraper, _WordPressScraper
+from .common import (_ComicControlScraper, _TumblrScraper, _WordPressScraper,
+                     xpath_class)
 
 
 class TheBrads(_BasicScraper):
@@ -176,6 +177,14 @@ class TwoGuysAndGuy(_BasicScraper):
     adult = True
 
 
+class Twokinds(_ParserScraper):
+    url = 'http://twokinds.keenspot.com/'
+    imageSearch = ('//p[@id="cg_img"]//img',
+                   '//article/p//img')
+    prevSearch = ('//a[@id="cg_back"]',
+                  '//a[%s]' % xpath_class('navprev'))
+
+
 class TwoLumps(_BasicScraper):
     url = 'http://www.twolumps.net/'
     stripUrl = url + 'd/%s.html'
diff --git a/scripts/keenspot.py b/scripts/keenspot.py
index 3891c82b6..a339d6395 100755
--- a/scripts/keenspot.py
+++ b/scripts/keenspot.py
@@ -9,132 +9,60 @@ JSON file for further processing.
 """
 from __future__ import absolute_import, division, print_function
 
-import codecs
-import re
-import sys
-import os
+from six.moves.urllib.parse import urlsplit
 
-import requests
-
-sys.path.append(os.path.join(os.path.dirname(__file__), ".."))  # noqa
-from dosagelib.util import get_page, tagre, check_robotstxt
-from dosagelib.scraper import get_scrapers
-from scriptutil import (contains_case_insensitive, save_result, load_result,
-                        truncate_name, format_name)
+from scriptutil import ComicListUpdater
+from dosagelib.util import check_robotstxt
 
-json_file = __file__.replace(".py", ".json")
 
+class KeenSpotUpdater(ComicListUpdater):
+    dup_templates = ('Creators/%s', "GoComics/%s", "ComicGenesis/%s")
 
+    # names of comics to exclude
+    excluded_comics = (
+        # non-standard navigation
+        "BrawlInTheFamily",
+        "Flipside",
+        "LastBlood",
+        "TheGodChild",
+        "Twokinds",
+    )
 
-url_matcher = re.compile(
-    tagre("td", "onmouseover", r'([^"]+)') +
-    tagre("a", "href", r'([^"]+\.keenspot\.com/)[^"]*') +
-    r"(?:<b>)?([^<]+)(?:</b>)?"
-)
+    extra = {
+        'CrowScare': 'last="20111031"',
+        'Dreamless': 'last="20100726"',
+        'MysticRevolution': 'path="?cid=%s"',
+        'PunchAnPie': 'path="daily/%s.html"',
+        'ShockwaveDarkside': 'path="2d/%s.html"',
+        'Yirmumah': 'path="%s/"',
+    }
 
+    def collect_results(self):
+        """Parse the front page."""
+        data = self.get_url('http://keenspot.com/')
 
-# names of comics to exclude
-exclude_comics = [
-    "BrawlintheFamily",  # non-standard navigation
-    "CrowScare",  # non-standard navigation
-    "Dreamless",  # non-standard navigation
-    "EV",  # non-standard navigation
-    "Exposure",  # non-standard navigation
-    "Flipside",  # non-standard navigation
-    "HerobyNight",  # non-standard navigation
-    "JadeWarriors",  # non-standard navigation
-    "LastBlood",  # non-standard navigation
-    "MysticRevolution",  # non-standard navigation
-    "NoRoomForMagic",  # non-standard navigation
-    "PunchanPie",  # non-standard navigation
-    "RoadWaffles",  # non-standard navigation
-    "Shadowbinders",  # non-standard navigation
-    "ShockwaveDarkside",  # non-standard navigation
-    "Supernovas",  # non-standard navigation
-    "Twokinds",  # non-standard navigation
-    "WisdomofMoo",  # non-standard navigation
-    "Yirmumah",  # non-standard navigation
-    "YouDamnKid",  # non-standard navigation
-]
-
-
-# links to last valid strips
-url_overrides = {
-}
-
-
-def handle_url(url, session, res):
-    """Parse one search result page."""
-    print("Parsing", url, file=sys.stderr)
-    try:
-        data = get_page(url, session).text
-    except IOError as msg:
-        print("ERROR:", msg, file=sys.stderr)
-        return
-    for match in url_matcher.finditer(data):
-        comicurl = match.group(2)
-        name = format_name(match.group(3))
-        if name in exclude_comics:
-            continue
-        if contains_case_insensitive(res, name):
-            # we cannot handle two comics that only differ in case
-            print("INFO: skipping possible duplicate", repr(name), file=sys.stderr)
-            continue
-        try:
-            if "/d/" not in comicurl:
-                check_robotstxt(comicurl + "d/", session)
-            else:
-                check_robotstxt(comicurl, session)
-        except IOError:
-            print("INFO: robots.txt denied for keenspot", repr(name))
-            continue
-        res[name] = comicurl
-
-
-def get_results():
-    """Parse all search result pages."""
-    # store info in a dictionary {name -> shortname}
-    res = {}
-    session = requests.Session()
-    base = 'http://keenspot.com/'
-    handle_url(base, session, res)
-    save_result(res, json_file)
-
-
-def has_comic(name):
-    """Check if comic name already exists."""
-    names = [
-        ("Creators/%s" % name).lower(),
-        ("GoComics/%s" % name).lower(),
-        ("ComicGenesis/%s" % name).lower(),
-    ]
-    for scraperobj in get_scrapers():
-        lname = scraperobj.name.lower()
-        if lname in names:
-            return True
-    return False
-
-
-def print_results(args):
-    """Print all comics."""
-    min_comics, filename = args
-    with codecs.open(filename, 'a', 'utf-8') as fp:
-        for name, entry in sorted(load_result(json_file).items()):
-            if name in exclude_comics:
+        for comiclink in data.xpath('//td[@id]/a'):
+            comicurl = comiclink.attrib['href']
+            name = comiclink.xpath("string()")
+            try:
+                if "/d/" not in comicurl:
+                    check_robotstxt(comicurl + "d/", self.session)
+                else:
+                    check_robotstxt(comicurl, self.session)
+            except IOError as e:
+                print("[%s] INFO: robots.txt denied: %s" % (name, e))
                 continue
-            url = entry
-            if has_comic(name):
-                prefix = u'#'
-            else:
-                prefix = u''
-            name = truncate_name(name)
-            fp.write(u"%sadd(%r, %r)\n" % (
-                prefix, str(name), str(url))
-            )
+
+            self.add_comic(name, comicurl)
+
+    def get_entry(self, name, url):
+        sub = urlsplit(url).hostname.split('.', 1)[0]
+        if name in self.extra:
+            extra = ', ' + self.extra[name]
+        else:
+            extra = ''
+        return u"cls('%s', '%s'%s)," % (name, sub, extra)
 
 
 if __name__ == '__main__':
-    if len(sys.argv) > 1:
-        print_results(sys.argv[1:])
-    else:
-        get_results()
+    KeenSpotUpdater(__file__).run()
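
---

Note (illustration, not part of the patch): all the per-comic variation in
getmodules() above is driven by the two new constructor arguments, `last` and
`path`, and the `extra` table in scripts/keenspot.py simply emits those same
keyword arguments into the generated `cls(...)` entries. Below is a minimal
stand-alone sketch of that URL wiring; `KeenSpotSketch` is a hypothetical
stand-in for the real `_ParserScraper` subclass and reproduces only the
`__init__` logic shown in the diff:

    class KeenSpotSketch(object):
        """Reproduces only the URL wiring of KeenSpot.__init__ above."""

        def __init__(self, name, sub, last=None, path='d/%s.html'):
            self.name = 'KeenSpot/' + name
            self.url = 'http://%s.keenspot.com/' % sub
            self.stripUrl = self.url + path
            self.endOfLife = False
            if last:
                # Ended comics: point the entry URL at the last known strip.
                self.url = self.stripUrl % last
                self.endOfLife = True


    # Default path: strips live under /d/YYYYMMDD.html.
    s = KeenSpotSketch('Sharky', 'sharky')
    assert s.stripUrl % '20161014' == 'http://sharky.keenspot.com/d/20161014.html'

    # Custom path: Shockwave Darkside keeps strips under /2d/.
    s = KeenSpotSketch('ShockwaveDarkside', 'shockwave', path='2d/%s.html')
    assert s.stripUrl % '20161014' == 'http://shockwave.keenspot.com/2d/20161014.html'

    # 'last' freezes the start URL and marks the module end-of-life.
    s = KeenSpotSketch('CrowScare', 'crowscare', last='20111031')
    assert s.url == 'http://crowscare.keenspot.com/d/20111031.html' and s.endOfLife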