Rework/fix KeenSpot modules.

Tobias Gruetzmacher 2016-10-14 00:14:53 +02:00
parent bb6199af65
commit b17d6e5f22
6 changed files with 130 additions and 143 deletions

View file

@@ -119,7 +119,6 @@ class ComicGenesis(_BasicScraper):
             cls('TheAdventuresofKaniraBaxter', 'kanirabaxter'),
             cls('TheAdventuresofVindibuddSuperheroInTraining', 'vindibudd', last='20070720'),
             cls('TheEasyBreather', 'easybreather'),
-            cls('TheLounge', 'thelounge'),
             cls('TheMisadventuresofOkk', 'okk'),
             cls('ThePath', 'thepath'),
             cls('TheTalesofKalduras', 'kalduras'),

View file

@@ -130,6 +130,10 @@ class GoblinsComic(_ParserScraper):
     help = 'Index format: ddmmyyyy'
 
 
+class GodChild(_WordPressScraper):
+    url = 'http://godchild.keenspot.com/'
+
+
 class GoGetARoomie(_ComicControlScraper):
     url = 'http://www.gogetaroomie.com'

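Note: GodChild now runs on WordPress rather than KeenSpot's classic d/yyyymmdd archive, so it becomes a standalone _WordPressScraper module instead of a KeenSpot entry. A minimal sketch of the pattern (hypothetical module name; the base class is assumed to centralize the XPath patterns for stock WordPress comic themes):

    from .common import _WordPressScraper  # as in the dosage plugin modules

    class SomeWordPressComic(_WordPressScraper):
        # Hypothetical module: only the start URL is declared, because
        # _WordPressScraper is assumed to supply imageSearch/prevSearch
        # for the standard WordPress comic markup.
        url = 'http://example.com/'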
View file

@@ -5,76 +5,117 @@
 from __future__ import absolute_import, division, print_function
 
-from re import compile
-
-from ..scraper import _BasicScraper
-from ..util import tagre
+from ..scraper import _ParserScraper
 
 
-class KeenSpot(_BasicScraper):
-    imageSearch = compile(tagre("img", "src", r'([^"]*/comics/[^"]+)'))
-    _stripPattern = r'([^"]*/d/\d{8}\.html)'
+class KeenSpot(_ParserScraper):
+    multipleImagesPerStrip = True
+    imageSearch = (
+        '//img[contains(@src, "/comics/")]',
+        # Shockwave Darkside
+        '//img[contains(@src, "/comics2D/")]',
+        '//img[contains(@src, "com/shockwave")]',
+        # Sore Thumbs
+        '//img[contains(@src, "com/st2")]',
+        # Wayward Sons
+        '//img[contains(@src, "com/2")]',
+    )
     prevSearch = (
-        compile(tagre("link", "href", _stripPattern, before="prev")),
-        compile(tagre("a", "href", _stripPattern, after="prev")),
-        compile(tagre("a", "href", _stripPattern) + tagre("img", "id", r"previous_day1")),
-        compile(tagre("a", "href", _stripPattern) + tagre("img", "id", r"katc7")),
+        '//link[@rel="prev"]',
+        '//a[@rel="prev"]',
+        # Exposure
+        '//a[img[@id="exp29"]]',
+        # Hero By Night
+        '//area[contains(@coords, ",-7,")]',
+        # Katrina
+        '//a[img[@id="katc7"]]',
+        # No Room For Magic, Everyone Loves Adis, Wisdom Of Moo
+        '//a[text()="Previous comic"]',
+        # Supernovas
+        '//a[img[@id="p_top_nav"]]',
     )
     help = 'Index format: yyyymmdd'
 
-    def __init__(self, name, sub):
+    def __init__(self, name, sub, last=None, path='d/%s.html'):
         super(KeenSpot, self).__init__('KeenSpot/' + name)
         self.url = 'http://%s.keenspot.com/' % sub
-        self.stripUrl = self.url + 'd/%s.html'
+        self.stripUrl = self.url + path
+        if last:
+            self.url = self.stripUrl % last
+            self.endOfLife = True
 
     @classmethod
     def getmodules(cls):
-        return [
+        return (
+            # Not on frontpage...
+            cls('Buzzboy', 'buzzboy'),
+            cls('EveryoneLovesAdis', 'adis'),
+
             # do not edit anything below since these entries are generated from
             # scripts/update_plugins.sh
             # START AUTOUPDATE
             cls('27TwentySeven', 'twenty-seven'),
-            cls('Adventurers', 'adventurers'),
-            cls('AntiheroForHire', 'antihero'),
+            cls('Avengelyne', 'avengelyne'),
             cls('BanzaiGirl', 'banzaigirl'),
             cls('Barker', 'barkercomic'),
-            cls('Buzzboy', 'buzzboy'),
             cls('ChoppingBlock', 'choppingblock'),
             cls('ClichFlamb', 'clicheflambe'),
             cls('CountYourSheep', 'countyoursheep'),
+            cls('CrowScare', 'crowscare', last="20111031"),
+            cls('Dreamless', 'dreamless', last="20100726"),
             cls('EverythingJake', 'everythingjake'),
+            cls('Exposure', 'exposure'),
             cls('FallOutToyWorks', 'fallouttoyworks'),
             cls('FriarAndBrimstone', 'friarandbrimstone'),
             cls('GeneCatlow', 'genecatlow'),
             cls('GodMode', 'godmode'),
             cls('GreenWake', 'greenwake'),
             cls('HeadTrip', 'headtrip'),
+            cls('HeroByNight', 'herobynight'),
             cls('HoaxHunters', 'hoaxhunters'),
+            cls('InfinityRefugees', 'newshounds'),
             cls('InHere', 'inhere'),
+            cls('JadeWarriors', 'jadewarriors'),
             cls('Katrina', 'katrina'),
             cls('Landis', 'landis'),
+            cls('LutherStrode', 'lutherstrode'),
             cls('MakeshiftMiracle', 'makeshiftmiracle'),
             cls('Marksmen', 'marksmen'),
             cls('MarryMe', 'marryme'),
             cls('MedusasDaughter', 'medusasdaughter'),
             cls('MonsterMassacre', 'monstermassacre'),
-            cls('Newshounds', 'newshounds'),
+            cls('MysticRevolution', 'mysticrevolution', path="?cid=%s"),
             cls('NoPinkPonies', 'nopinkponies'),
+            cls('NoRoomForMagic', 'noroomformagic'),
             cls('OutThere', 'outthere'),
             cls('Porcelain', 'porcelain'),
+            cls('PunchAnPie', 'punchanpie', path="daily/%s.html"),
             cls('QUILTBAG', 'quiltbag'),
             cls('RedSpike', 'redspike'),
             cls('RumbleFall', 'rumblefall'),
             cls('SamuraisBlood', 'samuraisblood'),
             cls('Sharky', 'sharky'),
+            cls('ShockwaveDarkside', 'shockwave', path="2d/%s.html"),
             cls('SomethingHappens', 'somethinghappens'),
             cls('SoreThumbs', 'sorethumbs'),
             cls('Striptease', 'striptease'),
+            cls('Supernovas', 'supernovas'),
             cls('Superosity', 'superosity'),
             cls('TheFirstDaughter', 'thefirstdaughter'),
-            cls('TheGodChild', 'godchild'),
-            cls('TheHuntersofSalamanstra', 'salamanstra'),
+            cls('TheHopeVirus', 'hopevirus'),
+            cls('TheHuntersOfSalamanstra', 'salamanstra'),
             cls('TheLounge', 'thelounge'),
+            cls('TheVault', 'thevault'),
+            cls('WaywardSons', 'waywardsons'),
+            cls('WeirdingWillows', 'weirdingwillows'),
             cls('WICKEDPOWERED', 'wickedpowered'),
+            cls('WisdomOfMoo', 'wisdomofmoo'),
+            cls('Yirmumah', 'yirmumah', path="%s/"),
             # END AUTOUPDATE
-        ]
+        )
+
+    def shouldSkipUrl(self, url, data):
+        return url in (
+            'http://sorethumbs.keenspot.com/d/20160117.html',
+        )

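Note: the new last and path keyword arguments replace what would otherwise be one-off subclasses: path overrides the default d/%s.html strip-URL scheme, and last pins a finished comic to its final strip and flags it end-of-life. A minimal sketch of that constructor logic, using values straight from the module list above (KeenSpotDemo is a stand-in; the real class also registers itself via the scraper superclass):

    class KeenSpotDemo(object):
        """Stand-in reproducing only the URL logic of KeenSpot.__init__."""
        def __init__(self, name, sub, last=None, path='d/%s.html'):
            self.name = 'KeenSpot/' + name
            self.url = 'http://%s.keenspot.com/' % sub
            self.stripUrl = self.url + path
            self.endOfLife = False
            if last:
                # Finished comics start crawling at their final strip
                # instead of a front page that may no longer exist.
                self.url = self.stripUrl % last
                self.endOfLife = True

    demo = KeenSpotDemo('ShockwaveDarkside', 'shockwave', path='2d/%s.html')
    assert demo.stripUrl == 'http://shockwave.keenspot.com/2d/%s.html'

    demo = KeenSpotDemo('CrowScare', 'crowscare', last='20111031')
    assert demo.url == 'http://crowscare.keenspot.com/d/20111031.html'
    assert demo.endOfLife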
View file

@@ -16,6 +16,7 @@ class Removed(Scraper):
         'block': 'The comic site is blocking us.',
         'unk': 'Comic was removed for an unknown reason.',
         'brk': 'Comic navigation is broken.',
+        'mov': 'Comic moved to a new hoster and no new module was written.',
     }
 
     def __init__(self, name, reason='del'):
@@ -238,6 +239,8 @@ class Removed(Scraper):
             cls('GunnerkrigCourt'),
             cls('HorribleVille'),
             cls('KatzenfutterGeleespritzer'),
+            cls('KeenSpot/Adventurers', 'mov'),
+            cls('KeenSpot/AntiheroForHire', 'mov'),
             cls('KillerKomics'),
             cls('Lint'),
             cls('LinuxComFridayFunnies'),
@@ -350,7 +353,7 @@ class Renamed(Scraper):
 
     @classmethod
     def getmodules(cls):
-        return [
+        return (
             # Renamed in 2.16
             cls('1997', '1977'),
             cls('ComicFury/Alya', 'ComicFury/AlyaTheLastChildOfLight'),
@@ -361,12 +364,15 @@ class Renamed(Scraper):
             cls('ComicFury/ICanSeeYourFeels', 'ComicFury/SeeYourFeels'),
             cls('ComicFury/MAGISAupdatesMonWedFri', 'ComicFury/MAGISAPARASAYOupdatesMonFri'),
             cls('ComicFury/ThomasAndZachary', 'ComicFury/ThomasAndZacharyArchives'),
+            cls('ComicGenesis/TheLounge', 'KeenSpot/TheLounge'),
             cls('Creators/ArchieinSpanish', 'Creators/ArchieSpanish'),
             cls('Creators/HeathcliffinSpanish', 'Creators/HeathcliffSpanish'),
             cls('Creators/TheWizardofIdinSpanish', 'Creators/WizardOfIdSpanish'),
             cls('DarkWings', 'Eryl'),
             cls('FoulLanguage', 'GoComics/FowlLanguage'),
             cls('GoComics/BloomCounty2015', 'GoComics/BloomCounty2016'),
+            cls('KeenSpot/Newshounds', 'KeenSpot/InfinityRefugees'),
+            cls('KeenSpot/TheGodChild', 'GodChild'),
             cls('Wulffmorgenthaler', 'WuMo'),
             cls('ZebraGirl', 'ComicFury/ZebraGirl'),
-        ]
+        )

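Note: the new 'mov' reason lets a Removed entry tell users that a comic moved hosts without a replacement module. A sketch of how such a reason code resolves to a user-facing message; the reporting machinery of the real Removed scraper is assumed, and the 'del' wording here is a placeholder, not taken from the diff:

    REASONS = {
        'del': 'Comic was deleted.',  # placeholder wording
        'block': 'The comic site is blocking us.',
        'unk': 'Comic was removed for an unknown reason.',
        'brk': 'Comic navigation is broken.',
        'mov': 'Comic moved to a new hoster and no new module was written.',
    }

    def removal_notice(name, reason='del'):
        # Mirrors how a Removed entry would explain itself when invoked.
        return '%s: %s' % (name, REASONS[reason])

    print(removal_notice('KeenSpot/Adventurers', 'mov'))
    # KeenSpot/Adventurers: Comic moved to a new hoster and no new module was written.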
View file

@@ -10,7 +10,8 @@ from re import compile, escape, IGNORECASE
 from ..scraper import _BasicScraper, _ParserScraper
 from ..helpers import indirectStarter
 from ..util import tagre
-from .common import _ComicControlScraper, _TumblrScraper, _WordPressScraper
+from .common import (_ComicControlScraper, _TumblrScraper, _WordPressScraper,
+                     xpath_class)
 
 
 class TheBrads(_BasicScraper):
@@ -176,6 +177,14 @@ class TwoGuysAndGuy(_BasicScraper):
     adult = True
 
 
+class Twokinds(_ParserScraper):
+    url = 'http://twokinds.keenspot.com/'
+    imageSearch = ('//p[@id="cg_img"]//img',
+                   '//article/p//img')
+    prevSearch = ('//a[@id="cg_back"]',
+                  '//a[%s]' % xpath_class('navprev'))
+
+
 class TwoLumps(_BasicScraper):
     url = 'http://www.twolumps.net/'
     stripUrl = url + 'd/%s.html'

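Note: Twokinds carries two search patterns each because the site serves both a legacy template (cg_img/cg_back) and a newer one matched through xpath_class. That helper builds the usual XPath 1.0 whole-word class test; the exact definition lives in dosage, but an equivalent sketch under that assumption:

    def xpath_class(name):
        # Whole-word match on @class: "navprev" matches class="nav navprev"
        # but not class="navprevious". Equivalent in spirit to dosage's
        # helper; the real implementation may differ slightly.
        return 'contains(concat(" ", normalize-space(@class), " "), " %s ")' % name

    print('//a[%s]' % xpath_class('navprev'))
    # //a[contains(concat(" ", normalize-space(@class), " "), " navprev ")]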
View file

@@ -9,132 +9,60 @@ JSON file for further processing.
 """
 from __future__ import absolute_import, division, print_function
 
-import codecs
-import re
-import sys
-import os
-
-import requests
-
-sys.path.append(os.path.join(os.path.dirname(__file__), ".."))  # noqa
-from dosagelib.util import get_page, tagre, check_robotstxt
-from dosagelib.scraper import get_scrapers
-from scriptutil import (contains_case_insensitive, save_result, load_result,
-                        truncate_name, format_name)
-
-json_file = __file__.replace(".py", ".json")
-
-url_matcher = re.compile(
-    tagre("td", "onmouseover", r'([^"]+)') +
-    tagre("a", "href", r'([^"]+\.keenspot\.com/)[^"]*') +
-    r"(?:<b>)?([^<]+)(?:</b>)?</a>"
-)
-
-# names of comics to exclude
-exclude_comics = [
-    "BrawlintheFamily",  # non-standard navigation
-    "CrowScare",  # non-standard navigation
-    "Dreamless",  # non-standard navigation
-    "EV",  # non-standard navigation
-    "Exposure",  # non-standard navigation
-    "Flipside",  # non-standard navigation
-    "HerobyNight",  # non-standard navigation
-    "JadeWarriors",  # non-standard navigation
-    "LastBlood",  # non-standard navigation
-    "MysticRevolution",  # non-standard navigation
-    "NoRoomForMagic",  # non-standard navigation
-    "PunchanPie",  # non-standard navigation
-    "RoadWaffles",  # non-standard navigation
-    "Shadowbinders",  # non-standard navigation
-    "ShockwaveDarkside",  # non-standard navigation
-    "Supernovas",  # non-standard navigation
-    "Twokinds",  # non-standard navigation
-    "WisdomofMoo",  # non-standard navigation
-    "Yirmumah",  # non-standard navigation
-    "YouDamnKid",  # non-standard navigation
-]
-
-# links to last valid strips
-url_overrides = {
-}
-
-
-def handle_url(url, session, res):
-    """Parse one search result page."""
-    print("Parsing", url, file=sys.stderr)
-    try:
-        data = get_page(url, session).text
-    except IOError as msg:
-        print("ERROR:", msg, file=sys.stderr)
-        return
-    for match in url_matcher.finditer(data):
-        comicurl = match.group(2)
-        name = format_name(match.group(3))
-        if name in exclude_comics:
-            continue
-        if contains_case_insensitive(res, name):
-            # we cannot handle two comics that only differ in case
-            print("INFO: skipping possible duplicate", repr(name), file=sys.stderr)
-            continue
-        try:
-            if "/d/" not in comicurl:
-                check_robotstxt(comicurl + "d/", session)
-            else:
-                check_robotstxt(comicurl, session)
-        except IOError:
-            print("INFO: robots.txt denied for keenspot", repr(name))
-            continue
-        res[name] = comicurl
-
-
-def get_results():
-    """Parse all search result pages."""
-    # store info in a dictionary {name -> shortname}
-    res = {}
-    session = requests.Session()
-    base = 'http://keenspot.com/'
-    handle_url(base, session, res)
-    save_result(res, json_file)
-
-
-def has_comic(name):
-    """Check if comic name already exists."""
-    names = [
-        ("Creators/%s" % name).lower(),
-        ("GoComics/%s" % name).lower(),
-        ("ComicGenesis/%s" % name).lower(),
-    ]
-    for scraperobj in get_scrapers():
-        lname = scraperobj.name.lower()
-        if lname in names:
-            return True
-    return False
-
-
-def print_results(args):
-    """Print all comics."""
-    min_comics, filename = args
-    with codecs.open(filename, 'a', 'utf-8') as fp:
-        for name, entry in sorted(load_result(json_file).items()):
-            if name in exclude_comics:
-                continue
-            url = entry
-            if has_comic(name):
-                prefix = u'#'
-            else:
-                prefix = u''
-            name = truncate_name(name)
-            fp.write(u"%sadd(%r, %r)\n" % (
-                prefix, str(name), str(url))
-            )
+from six.moves.urllib.parse import urlsplit
+
+from scriptutil import ComicListUpdater
+from dosagelib.util import check_robotstxt
+
+
+class KeenSpotUpdater(ComicListUpdater):
+    dup_templates = ('Creators/%s', "GoComics/%s", "ComicGenesis/%s")
+
+    # names of comics to exclude
+    excluded_comics = (
+        # non-standard navigation
+        "BrawlInTheFamily",
+        "Flipside",
+        "LastBlood",
+        "TheGodChild",
+        "Twokinds",
+    )
+
+    extra = {
+        'CrowScare': 'last="20111031"',
+        'Dreamless': 'last="20100726"',
+        'MysticRevolution': 'path="?cid=%s"',
+        'PunchAnPie': 'path="daily/%s.html"',
+        'ShockwaveDarkside': 'path="2d/%s.html"',
+        'Yirmumah': 'path="%s/"',
+    }
+
+    def collect_results(self):
+        """Parse the front page."""
+        data = self.get_url('http://keenspot.com/')
+
+        for comiclink in data.xpath('//td[@id]/a'):
+            comicurl = comiclink.attrib['href']
+            name = comiclink.xpath("string()")
+            try:
+                if "/d/" not in comicurl:
+                    check_robotstxt(comicurl + "d/", self.session)
+                else:
+                    check_robotstxt(comicurl, self.session)
+            except IOError as e:
+                print("[%s] INFO: robots.txt denied: %s" % (name, e))
+                continue
+
+            self.add_comic(name, comicurl)
+
+    def get_entry(self, name, url):
+        sub = urlsplit(url).hostname.split('.', 1)[0]
+        if name in self.extra:
+            extra = ', ' + self.extra[name]
+        else:
+            extra = ''
+        return u"cls('%s', '%s'%s)," % (name, sub, extra)
 
 
 if __name__ == '__main__':
-    if len(sys.argv) > 1:
-        print_results(sys.argv[1:])
-    else:
-        get_results()
+    KeenSpotUpdater(__file__).run()
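
Note: with the extra table above, get_entry renders each discovered comic as a ready-to-paste line for the AUTOUPDATE block of the KeenSpot module, appending the per-comic keyword overrides where needed. A usage sketch (run from the scripts directory, where scriptutil is importable; the subdomain comes from the urlsplit logic above):

    updater = KeenSpotUpdater(__file__)
    print(updater.get_entry('CrowScare', 'http://crowscare.keenspot.com/'))
    # cls('CrowScare', 'crowscare', last="20111031"),
    print(updater.get_entry('Sharky', 'http://sharky.keenspot.com/'))
    # cls('Sharky', 'sharky'),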