From b17d6e5f22ad06d4f842838ef935aa0aa0f7dcb3 Mon Sep 17 00:00:00 2001
From: Tobias Gruetzmacher
Date: Fri, 14 Oct 2016 00:14:53 +0200
Subject: [PATCH] Rework/fix KeenSpot modules.

---
 dosagelib/plugins/comicgenesis.py |   1 -
 dosagelib/plugins/g.py            |   4 +
 dosagelib/plugins/keenspot.py     |  83 +++++++++++----
 dosagelib/plugins/old.py          |  10 +-
 dosagelib/plugins/t.py            |  11 +-
 scripts/keenspot.py               | 164 +++++++++--------------------
 6 files changed, 130 insertions(+), 143 deletions(-)

diff --git a/dosagelib/plugins/comicgenesis.py b/dosagelib/plugins/comicgenesis.py
index 0795f1c43..c2c86b20b 100644
--- a/dosagelib/plugins/comicgenesis.py
+++ b/dosagelib/plugins/comicgenesis.py
@@ -119,7 +119,6 @@ class ComicGenesis(_BasicScraper):
             cls('TheAdventuresofKaniraBaxter', 'kanirabaxter'),
             cls('TheAdventuresofVindibuddSuperheroInTraining', 'vindibudd', last='20070720'),
             cls('TheEasyBreather', 'easybreather'),
-            cls('TheLounge', 'thelounge'),
             cls('TheMisadventuresofOkk', 'okk'),
             cls('ThePath', 'thepath'),
             cls('TheTalesofKalduras', 'kalduras'),
diff --git a/dosagelib/plugins/g.py b/dosagelib/plugins/g.py
index c2125e719..44b781853 100644
--- a/dosagelib/plugins/g.py
+++ b/dosagelib/plugins/g.py
@@ -130,6 +130,10 @@ class GoblinsComic(_ParserScraper):
     help = 'Index format: ddmmyyyy'
 
 
+class GodChild(_WordPressScraper):
+    url = 'http://godchild.keenspot.com/'
+
+
 class GoGetARoomie(_ComicControlScraper):
     url = 'http://www.gogetaroomie.com'
 
diff --git a/dosagelib/plugins/keenspot.py b/dosagelib/plugins/keenspot.py
index 33036243d..fa2680bbf 100644
--- a/dosagelib/plugins/keenspot.py
+++ b/dosagelib/plugins/keenspot.py
@@ -5,76 +5,117 @@
 
 from __future__ import absolute_import, division, print_function
 
-from re import compile
-
-from ..scraper import _BasicScraper
-from ..util import tagre
+from ..scraper import _ParserScraper
 
 
-class KeenSpot(_BasicScraper):
-    imageSearch = compile(tagre("img", "src", r'([^"]*/comics/[^"]+)'))
-    _stripPattern = r'([^"]*/d/\d{8}\.html)'
+class KeenSpot(_ParserScraper):
+    multipleImagesPerStrip = True
+    imageSearch = (
+        '//img[contains(@src, "/comics/")]',
+        # Shockwave Darkside
+        '//img[contains(@src, "/comics2D/")]',
+        '//img[contains(@src, "com/shockwave")]',
+        # Sore Thumbs
+        '//img[contains(@src, "com/st2")]',
+        # Wayward Sons
+        '//img[contains(@src, "com/2")]',
+    )
     prevSearch = (
-        compile(tagre("link", "href", _stripPattern, before="prev")),
-        compile(tagre("a", "href", _stripPattern, after="prev")),
-        compile(tagre("a", "href", _stripPattern) + tagre("img", "id", r"previous_day1")),
-        compile(tagre("a", "href", _stripPattern) + tagre("img", "id", r"katc7")),
+        '//link[@rel="prev"]',
+        '//a[@rel="prev"]',
+        # Exposure
+        '//a[img[@id="exp29"]]',
+        # Hero By Night
+        '//area[contains(@coords, ",-7,")]',
+        # Katrina
+        '//a[img[@id="katc7"]]',
+        # No Room For Magic, Everyone Loves Adis, Wisdom Of Moo
+        '//a[text()="Previous comic"]',
+        # Supernovas
+        '//a[img[@id="p_top_nav"]]',
     )
     help = 'Index format: yyyymmdd'
 
-    def __init__(self, name, sub):
+    def __init__(self, name, sub, last=None, path='d/%s.html'):
         super(KeenSpot, self).__init__('KeenSpot/' + name)
         self.url = 'http://%s.keenspot.com/' % sub
-        self.stripUrl = self.url + 'd/%s.html'
+        self.stripUrl = self.url + path
+
+        if last:
+            self.url = self.stripUrl % last
+            self.endOfLife = True
 
     @classmethod
     def getmodules(cls):
-        return [
+        return (
+            # Not on frontpage...
+            cls('Buzzboy', 'buzzboy'),
+            cls('EveryoneLovesAdis', 'adis'),
+
             # do not edit anything below since these entries are generated from
             # scripts/update_plugins.sh
             # START AUTOUPDATE
             cls('27TwentySeven', 'twenty-seven'),
-            cls('Adventurers', 'adventurers'),
-            cls('AntiheroForHire', 'antihero'),
+            cls('Avengelyne', 'avengelyne'),
             cls('BanzaiGirl', 'banzaigirl'),
             cls('Barker', 'barkercomic'),
-            cls('Buzzboy', 'buzzboy'),
             cls('ChoppingBlock', 'choppingblock'),
             cls('ClichFlamb', 'clicheflambe'),
             cls('CountYourSheep', 'countyoursheep'),
+            cls('CrowScare', 'crowscare', last="20111031"),
+            cls('Dreamless', 'dreamless', last="20100726"),
             cls('EverythingJake', 'everythingjake'),
+            cls('Exposure', 'exposure'),
             cls('FallOutToyWorks', 'fallouttoyworks'),
             cls('FriarAndBrimstone', 'friarandbrimstone'),
             cls('GeneCatlow', 'genecatlow'),
             cls('GodMode', 'godmode'),
             cls('GreenWake', 'greenwake'),
             cls('HeadTrip', 'headtrip'),
+            cls('HeroByNight', 'herobynight'),
             cls('HoaxHunters', 'hoaxhunters'),
+            cls('InfinityRefugees', 'newshounds'),
             cls('InHere', 'inhere'),
+            cls('JadeWarriors', 'jadewarriors'),
             cls('Katrina', 'katrina'),
             cls('Landis', 'landis'),
+            cls('LutherStrode', 'lutherstrode'),
             cls('MakeshiftMiracle', 'makeshiftmiracle'),
             cls('Marksmen', 'marksmen'),
             cls('MarryMe', 'marryme'),
             cls('MedusasDaughter', 'medusasdaughter'),
             cls('MonsterMassacre', 'monstermassacre'),
-            cls('Newshounds', 'newshounds'),
+            cls('MysticRevolution', 'mysticrevolution', path="?cid=%s"),
             cls('NoPinkPonies', 'nopinkponies'),
+            cls('NoRoomForMagic', 'noroomformagic'),
             cls('OutThere', 'outthere'),
             cls('Porcelain', 'porcelain'),
+            cls('PunchAnPie', 'punchanpie', path="daily/%s.html"),
             cls('QUILTBAG', 'quiltbag'),
             cls('RedSpike', 'redspike'),
             cls('RumbleFall', 'rumblefall'),
             cls('SamuraisBlood', 'samuraisblood'),
             cls('Sharky', 'sharky'),
+            cls('ShockwaveDarkside', 'shockwave', path="2d/%s.html"),
             cls('SomethingHappens', 'somethinghappens'),
             cls('SoreThumbs', 'sorethumbs'),
             cls('Striptease', 'striptease'),
+            cls('Supernovas', 'supernovas'),
             cls('Superosity', 'superosity'),
             cls('TheFirstDaughter', 'thefirstdaughter'),
-            cls('TheGodChild', 'godchild'),
-            cls('TheHuntersofSalamanstra', 'salamanstra'),
+            cls('TheHopeVirus', 'hopevirus'),
+            cls('TheHuntersOfSalamanstra', 'salamanstra'),
             cls('TheLounge', 'thelounge'),
+            cls('TheVault', 'thevault'),
+            cls('WaywardSons', 'waywardsons'),
+            cls('WeirdingWillows', 'weirdingwillows'),
             cls('WICKEDPOWERED', 'wickedpowered'),
+            cls('WisdomOfMoo', 'wisdomofmoo'),
+            cls('Yirmumah', 'yirmumah', path="%s/"),
             # END AUTOUPDATE
-        ]
+        )
+
+    def shouldSkipUrl(self, url, data):
+        return url in (
+            'http://sorethumbs.keenspot.com/d/20160117.html',
+        )
diff --git a/dosagelib/plugins/old.py b/dosagelib/plugins/old.py
index 52994e924..eb5ef00c3 100644
--- a/dosagelib/plugins/old.py
+++ b/dosagelib/plugins/old.py
@@ -16,6 +16,7 @@ class Removed(Scraper):
         'block': 'The comic site is blocking us.',
         'unk': 'Comic was removed for an unknown reason.',
         'brk': 'Comic navigation is broken.',
+        'mov': 'Comic moved to a new hoster and no new module was written.',
     }
 
     def __init__(self, name, reason='del'):
@@ -238,6 +239,8 @@ class Removed(Scraper):
             cls('GunnerkrigCourt'),
             cls('HorribleVille'),
             cls('KatzenfutterGeleespritzer'),
+            cls('KeenSpot/Adventurers', 'mov'),
+            cls('KeenSpot/AntiheroForHire', 'mov'),
             cls('KillerKomics'),
             cls('Lint'),
             cls('LinuxComFridayFunnies'),
@@ -350,7 +353,7 @@ class Renamed(Scraper):
 
     @classmethod
     def getmodules(cls):
-        return [
+        return (
             # Renamed in 2.16
             cls('1997', '1977'),
             cls('ComicFury/Alya', 'ComicFury/AlyaTheLastChildOfLight'),
@@ -361,12 +364,15 @@ class Renamed(Scraper):
             cls('ComicFury/ICanSeeYourFeels', 'ComicFury/SeeYourFeels'),
             cls('ComicFury/MAGISAupdatesMonWedFri', 'ComicFury/MAGISAPARASAYOupdatesMonFri'),
             cls('ComicFury/ThomasAndZachary', 'ComicFury/ThomasAndZacharyArchives'),
+            cls('ComicGenesis/TheLounge', 'KeenSpot/TheLounge'),
             cls('Creators/ArchieinSpanish', 'Creators/ArchieSpanish'),
             cls('Creators/HeathcliffinSpanish', 'Creators/HeathcliffSpanish'),
             cls('Creators/TheWizardofIdinSpanish', 'Creators/WizardOfIdSpanish'),
             cls('DarkWings', 'Eryl'),
             cls('FoulLanguage', 'GoComics/FowlLanguage'),
             cls('GoComics/BloomCounty2015', 'GoComics/BloomCounty2016'),
+            cls('KeenSpot/Newshounds', 'KeenSpot/InfinityRefugees'),
+            cls('KeenSpot/TheGodChild', 'GodChild'),
             cls('Wulffmorgenthaler', 'WuMo'),
             cls('ZebraGirl', 'ComicFury/ZebraGirl'),
-        ]
+        )
diff --git a/dosagelib/plugins/t.py b/dosagelib/plugins/t.py
index 1ee833daa..b503c3b1d 100644
--- a/dosagelib/plugins/t.py
+++ b/dosagelib/plugins/t.py
@@ -10,7 +10,8 @@ from re import compile, escape, IGNORECASE
 from ..scraper import _BasicScraper, _ParserScraper
 from ..helpers import indirectStarter
 from ..util import tagre
-from .common import _ComicControlScraper, _TumblrScraper, _WordPressScraper
+from .common import (_ComicControlScraper, _TumblrScraper, _WordPressScraper,
+                     xpath_class)
 
 
 class TheBrads(_BasicScraper):
@@ -176,6 +177,14 @@ class TwoGuysAndGuy(_BasicScraper):
     adult = True
 
 
+class Twokinds(_ParserScraper):
+    url = 'http://twokinds.keenspot.com/'
+    imageSearch = ('//p[@id="cg_img"]//img',
+                   '//article/p//img')
+    prevSearch = ('//a[@id="cg_back"]',
+                  '//a[%s]' % xpath_class('navprev'))
+
+
 class TwoLumps(_BasicScraper):
     url = 'http://www.twolumps.net/'
     stripUrl = url + 'd/%s.html'
diff --git a/scripts/keenspot.py b/scripts/keenspot.py
index 3891c82b6..a339d6395 100755
--- a/scripts/keenspot.py
+++ b/scripts/keenspot.py
@@ -9,132 +9,60 @@ JSON file for further processing.
 """
 from __future__ import absolute_import, division, print_function
 
-import codecs
-import re
-import sys
-import os
+from six.moves.urllib.parse import urlsplit
 
-import requests
-
-sys.path.append(os.path.join(os.path.dirname(__file__), ".."))  # noqa
-from dosagelib.util import get_page, tagre, check_robotstxt
-from dosagelib.scraper import get_scrapers
-from scriptutil import (contains_case_insensitive, save_result, load_result,
-                        truncate_name, format_name)
+from scriptutil import ComicListUpdater
+from dosagelib.util import check_robotstxt
 
-json_file = __file__.replace(".py", ".json")
 
+class KeenSpotUpdater(ComicListUpdater):
+    dup_templates = ('Creators/%s', "GoComics/%s", "ComicGenesis/%s")
 
+    # names of comics to exclude
+    excluded_comics = (
+        # non-standard navigation
+        "BrawlInTheFamily",
+        "Flipside",
+        "LastBlood",
+        "TheGodChild",
+        "Twokinds",
+    )
 
-url_matcher = re.compile(
-    tagre("td", "onmouseover", r'([^"]+)') +
-    tagre("a", "href", r'([^"]+\.keenspot\.com/)[^"]*') +
-    r"(?:<b>)?([^<]+)(?:</b>)?"
-)
+    extra = {
+        'CrowScare': 'last="20111031"',
+        'Dreamless': 'last="20100726"',
+        'MysticRevolution': 'path="?cid=%s"',
+        'PunchAnPie': 'path="daily/%s.html"',
+        'ShockwaveDarkside': 'path="2d/%s.html"',
+        'Yirmumah': 'path="%s/"',
+    }
 
+    def collect_results(self):
+        """Parse the front page."""
+        data = self.get_url('http://keenspot.com/')
 
-# names of comics to exclude
-exclude_comics = [
-    "BrawlintheFamily",  # non-standard navigation
-    "CrowScare",  # non-standard navigation
-    "Dreamless",  # non-standard navigation
-    "EV",  # non-standard navigation
-    "Exposure",  # non-standard navigation
-    "Flipside",  # non-standard navigation
-    "HerobyNight",  # non-standard navigation
-    "JadeWarriors",  # non-standard navigation
-    "LastBlood",  # non-standard navigation
-    "MysticRevolution",  # non-standard navigation
-    "NoRoomForMagic",  # non-standard navigation
-    "PunchanPie",  # non-standard navigation
-    "RoadWaffles",  # non-standard navigation
-    "Shadowbinders",  # non-standard navigation
-    "ShockwaveDarkside",  # non-standard navigation
-    "Supernovas",  # non-standard navigation
-    "Twokinds",  # non-standard navigation
-    "WisdomofMoo",  # non-standard navigation
-    "Yirmumah",  # non-standard navigation
-    "YouDamnKid",  # non-standard navigation
-]
-
-
-# links to last valid strips
-url_overrides = {
-}
-
-
-def handle_url(url, session, res):
-    """Parse one search result page."""
-    print("Parsing", url, file=sys.stderr)
-    try:
-        data = get_page(url, session).text
-    except IOError as msg:
-        print("ERROR:", msg, file=sys.stderr)
-        return
-    for match in url_matcher.finditer(data):
-        comicurl = match.group(2)
-        name = format_name(match.group(3))
-        if name in exclude_comics:
-            continue
-        if contains_case_insensitive(res, name):
-            # we cannot handle two comics that only differ in case
-            print("INFO: skipping possible duplicate", repr(name), file=sys.stderr)
-            continue
-        try:
-            if "/d/" not in comicurl:
-                check_robotstxt(comicurl + "d/", session)
-            else:
-                check_robotstxt(comicurl, session)
-        except IOError:
-            print("INFO: robots.txt denied for keenspot", repr(name))
-            continue
-        res[name] = comicurl
-
-
-def get_results():
-    """Parse all search result pages."""
-    # store info in a dictionary {name -> shortname}
-    res = {}
-    session = requests.Session()
-    base = 'http://keenspot.com/'
-    handle_url(base, session, res)
-    save_result(res, json_file)
-
-
-def has_comic(name):
-    """Check if comic name already exists."""
-    names = [
-        ("Creators/%s" % name).lower(),
-        ("GoComics/%s" % name).lower(),
-        ("ComicGenesis/%s" % name).lower(),
-    ]
-    for scraperobj in get_scrapers():
-        lname = scraperobj.name.lower()
-        if lname in names:
-            return True
-    return False
-
-
-def print_results(args):
-    """Print all comics."""
-    min_comics, filename = args
-    with codecs.open(filename, 'a', 'utf-8') as fp:
-        for name, entry in sorted(load_result(json_file).items()):
-            if name in exclude_comics:
+        for comiclink in data.xpath('//td[@id]/a'):
+            comicurl = comiclink.attrib['href']
+            name = comiclink.xpath("string()")
+            try:
+                if "/d/" not in comicurl:
+                    check_robotstxt(comicurl + "d/", self.session)
+                else:
+                    check_robotstxt(comicurl, self.session)
+            except IOError as e:
+                print("[%s] INFO: robots.txt denied: %s" % (name, e))
                 continue
-            url = entry
-            if has_comic(name):
-                prefix = u'#'
-            else:
-                prefix = u''
-            name = truncate_name(name)
-            fp.write(u"%sadd(%r, %r)\n" % (
-                prefix, str(name), str(url))
-            )
+
+            self.add_comic(name, comicurl)
+
+    def get_entry(self, name, url):
+        sub = urlsplit(url).hostname.split('.', 1)[0]
+        if name in self.extra:
+            extra = ', ' + self.extra[name]
+        else:
+            extra = ''
+        return u"cls('%s', '%s'%s)," % (name, sub, extra)
 
 
 if __name__ == '__main__':
-    if len(sys.argv) > 1:
-        print_results(sys.argv[1:])
-    else:
-        get_results()
+    KeenSpotUpdater(__file__).run()
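
---

Note (illustration, not part of the patch): all the per-comic variation in
getmodules() above is driven by the two new constructor arguments, `last` and
`path`, and the `extra` table in scripts/keenspot.py simply emits those same
keyword arguments into the generated `cls(...)` entries. Below is a minimal
stand-alone sketch of that URL wiring; `KeenSpotSketch` is a hypothetical
stand-in for the real `_ParserScraper` subclass and reproduces only the
`__init__` logic shown in the diff:

    class KeenSpotSketch(object):
        """Reproduces only the URL wiring of KeenSpot.__init__ above."""

        def __init__(self, name, sub, last=None, path='d/%s.html'):
            self.name = 'KeenSpot/' + name
            self.url = 'http://%s.keenspot.com/' % sub
            self.stripUrl = self.url + path
            self.endOfLife = False
            if last:
                # Ended comics: point the entry URL at the last known strip.
                self.url = self.stripUrl % last
                self.endOfLife = True


    # Default path: strips live under /d/YYYYMMDD.html.
    s = KeenSpotSketch('Sharky', 'sharky')
    assert s.stripUrl % '20161014' == 'http://sharky.keenspot.com/d/20161014.html'

    # Custom path: Shockwave Darkside keeps strips under /2d/.
    s = KeenSpotSketch('ShockwaveDarkside', 'shockwave', path='2d/%s.html')
    assert s.stripUrl % '20161014' == 'http://shockwave.keenspot.com/2d/20161014.html'

    # 'last' freezes the start URL and marks the module end-of-life.
    s = KeenSpotSketch('CrowScare', 'crowscare', last='20111031')
    assert s.url == 'http://crowscare.keenspot.com/d/20111031.html' and s.endOfLife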