Rework/fix KeenSpot modules.

Tobias Gruetzmacher 2016-10-14 00:14:53 +02:00
parent bb6199af65
commit b17d6e5f22
6 changed files with 130 additions and 143 deletions

View file

@@ -119,7 +119,6 @@ class ComicGenesis(_BasicScraper):
             cls('TheAdventuresofKaniraBaxter', 'kanirabaxter'),
             cls('TheAdventuresofVindibuddSuperheroInTraining', 'vindibudd', last='20070720'),
             cls('TheEasyBreather', 'easybreather'),
-            cls('TheLounge', 'thelounge'),
             cls('TheMisadventuresofOkk', 'okk'),
             cls('ThePath', 'thepath'),
             cls('TheTalesofKalduras', 'kalduras'),

View file

@@ -130,6 +130,10 @@ class GoblinsComic(_ParserScraper):
     help = 'Index format: ddmmyyyy'


+class GodChild(_WordPressScraper):
+    url = 'http://godchild.keenspot.com/'
+
+
 class GoGetARoomie(_ComicControlScraper):
     url = 'http://www.gogetaroomie.com'


View file

@@ -5,76 +5,117 @@
 from __future__ import absolute_import, division, print_function

-from re import compile
-
-from ..scraper import _BasicScraper
-from ..util import tagre
+from ..scraper import _ParserScraper


-class KeenSpot(_BasicScraper):
-    imageSearch = compile(tagre("img", "src", r'([^"]*/comics/[^"]+)'))
-    _stripPattern = r'([^"]*/d/\d{8}\.html)'
+class KeenSpot(_ParserScraper):
+    multipleImagesPerStrip = True
+    imageSearch = (
+        '//img[contains(@src, "/comics/")]',
+        # Shockwave Darkside
+        '//img[contains(@src, "/comics2D/")]',
+        '//img[contains(@src, "com/shockwave")]',
+        # Sore Thumbs
+        '//img[contains(@src, "com/st2")]',
+        # Wayward Sons
+        '//img[contains(@src, "com/2")]',
+    )
     prevSearch = (
-        compile(tagre("link", "href", _stripPattern, before="prev")),
-        compile(tagre("a", "href", _stripPattern, after="prev")),
-        compile(tagre("a", "href", _stripPattern) + tagre("img", "id", r"previous_day1")),
-        compile(tagre("a", "href", _stripPattern) + tagre("img", "id", r"katc7")),
+        '//link[@rel="prev"]',
+        '//a[@rel="prev"]',
+        # Exposure
+        '//a[img[@id="exp29"]]',
+        # Hero By Night
+        '//area[contains(@coords, ",-7,")]',
+        # Katrina
+        '//a[img[@id="katc7"]]',
+        # No Room For Magic, Everyone Loves Adis, Wisdom Of Moo
+        '//a[text()="Previous comic"]',
+        # Supernovas
+        '//a[img[@id="p_top_nav"]]',
     )
     help = 'Index format: yyyymmdd'

-    def __init__(self, name, sub):
+    def __init__(self, name, sub, last=None, path='d/%s.html'):
         super(KeenSpot, self).__init__('KeenSpot/' + name)
         self.url = 'http://%s.keenspot.com/' % sub
-        self.stripUrl = self.url + 'd/%s.html'
+        self.stripUrl = self.url + path
+        if last:
+            self.url = self.stripUrl % last
+            self.endOfLife = True

     @classmethod
     def getmodules(cls):
-        return [
+        return (
+            # Not on frontpage...
+            cls('Buzzboy', 'buzzboy'),
+            cls('EveryoneLovesAdis', 'adis'),
+
             # do not edit anything below since these entries are generated from
             # scripts/update_plugins.sh
             # START AUTOUPDATE
             cls('27TwentySeven', 'twenty-seven'),
-            cls('Adventurers', 'adventurers'),
-            cls('AntiheroForHire', 'antihero'),
             cls('Avengelyne', 'avengelyne'),
             cls('BanzaiGirl', 'banzaigirl'),
             cls('Barker', 'barkercomic'),
-            cls('Buzzboy', 'buzzboy'),
             cls('ChoppingBlock', 'choppingblock'),
             cls('ClichFlamb', 'clicheflambe'),
             cls('CountYourSheep', 'countyoursheep'),
+            cls('CrowScare', 'crowscare', last="20111031"),
+            cls('Dreamless', 'dreamless', last="20100726"),
             cls('EverythingJake', 'everythingjake'),
+            cls('Exposure', 'exposure'),
             cls('FallOutToyWorks', 'fallouttoyworks'),
             cls('FriarAndBrimstone', 'friarandbrimstone'),
             cls('GeneCatlow', 'genecatlow'),
             cls('GodMode', 'godmode'),
             cls('GreenWake', 'greenwake'),
             cls('HeadTrip', 'headtrip'),
+            cls('HeroByNight', 'herobynight'),
             cls('HoaxHunters', 'hoaxhunters'),
+            cls('InfinityRefugees', 'newshounds'),
             cls('InHere', 'inhere'),
+            cls('JadeWarriors', 'jadewarriors'),
             cls('Katrina', 'katrina'),
             cls('Landis', 'landis'),
             cls('LutherStrode', 'lutherstrode'),
             cls('MakeshiftMiracle', 'makeshiftmiracle'),
             cls('Marksmen', 'marksmen'),
             cls('MarryMe', 'marryme'),
             cls('MedusasDaughter', 'medusasdaughter'),
             cls('MonsterMassacre', 'monstermassacre'),
-            cls('Newshounds', 'newshounds'),
+            cls('MysticRevolution', 'mysticrevolution', path="?cid=%s"),
             cls('NoPinkPonies', 'nopinkponies'),
+            cls('NoRoomForMagic', 'noroomformagic'),
             cls('OutThere', 'outthere'),
             cls('Porcelain', 'porcelain'),
+            cls('PunchAnPie', 'punchanpie', path="daily/%s.html"),
             cls('QUILTBAG', 'quiltbag'),
             cls('RedSpike', 'redspike'),
             cls('RumbleFall', 'rumblefall'),
             cls('SamuraisBlood', 'samuraisblood'),
             cls('Sharky', 'sharky'),
+            cls('ShockwaveDarkside', 'shockwave', path="2d/%s.html"),
             cls('SomethingHappens', 'somethinghappens'),
             cls('SoreThumbs', 'sorethumbs'),
             cls('Striptease', 'striptease'),
+            cls('Supernovas', 'supernovas'),
             cls('Superosity', 'superosity'),
             cls('TheFirstDaughter', 'thefirstdaughter'),
-            cls('TheGodChild', 'godchild'),
-            cls('TheHuntersofSalamanstra', 'salamanstra'),
+            cls('TheHopeVirus', 'hopevirus'),
+            cls('TheHuntersOfSalamanstra', 'salamanstra'),
+            cls('TheLounge', 'thelounge'),
             cls('TheVault', 'thevault'),
             cls('WaywardSons', 'waywardsons'),
             cls('WeirdingWillows', 'weirdingwillows'),
             cls('WICKEDPOWERED', 'wickedpowered'),
+            cls('WisdomOfMoo', 'wisdomofmoo'),
+            cls('Yirmumah', 'yirmumah', path="%s/"),
             # END AUTOUPDATE
-        ]
+        )
+
+    def shouldSkipUrl(self, url, data):
+        return url in (
+            'http://sorethumbs.keenspot.com/d/20160117.html'
+        )
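A quick illustration of what the new last= and path= constructor parameters produce (illustration only, not part of the commit; it assumes the KeenSpot class from the diff above is in scope, and the attribute values follow directly from __init__):

# Sketch: dead comics get pinned to their final strip via last=,
# non-standard archives get a custom path= template.
s = KeenSpot('CrowScare', 'crowscare', last='20111031')
print(s.stripUrl)   # http://crowscare.keenspot.com/d/%s.html
print(s.url)        # http://crowscare.keenspot.com/d/20111031.html
print(s.endOfLife)  # True

m = KeenSpot('MysticRevolution', 'mysticrevolution', path='?cid=%s')
print(m.stripUrl)   # http://mysticrevolution.keenspot.com/?cid=%s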

View file

@@ -16,6 +16,7 @@ class Removed(Scraper):
         'block': 'The comic site is blocking us.',
         'unk': 'Comic was removed for an unknown reason.',
         'brk': 'Comic navigation is broken.',
+        'mov': 'Comic moved to a new hoster and no new module was written.',
     }

     def __init__(self, name, reason='del'):
@@ -238,6 +239,8 @@ class Removed(Scraper):
             cls('GunnerkrigCourt'),
             cls('HorribleVille'),
             cls('KatzenfutterGeleespritzer'),
+            cls('KeenSpot/Adventurers', 'mov'),
+            cls('KeenSpot/AntiheroForHire', 'mov'),
             cls('KillerKomics'),
             cls('Lint'),
             cls('LinuxComFridayFunnies'),
@@ -350,7 +353,7 @@ class Renamed(Scraper):

     @classmethod
     def getmodules(cls):
-        return [
+        return (
             # Renamed in 2.16
             cls('1997', '1977'),
             cls('ComicFury/Alya', 'ComicFury/AlyaTheLastChildOfLight'),
@@ -361,12 +364,15 @@ class Renamed(Scraper):
             cls('ComicFury/ICanSeeYourFeels', 'ComicFury/SeeYourFeels'),
             cls('ComicFury/MAGISAupdatesMonWedFri', 'ComicFury/MAGISAPARASAYOupdatesMonFri'),
             cls('ComicFury/ThomasAndZachary', 'ComicFury/ThomasAndZacharyArchives'),
+            cls('ComicGenesis/TheLounge', 'KeenSpot/TheLounge'),
             cls('Creators/ArchieinSpanish', 'Creators/ArchieSpanish'),
             cls('Creators/HeathcliffinSpanish', 'Creators/HeathcliffSpanish'),
             cls('Creators/TheWizardofIdinSpanish', 'Creators/WizardOfIdSpanish'),
             cls('DarkWings', 'Eryl'),
             cls('FoulLanguage', 'GoComics/FowlLanguage'),
             cls('GoComics/BloomCounty2015', 'GoComics/BloomCounty2016'),
+            cls('KeenSpot/Newshounds', 'KeenSpot/InfinityRefugees'),
+            cls('KeenSpot/TheGodChild', 'GodChild'),
             cls('Wulffmorgenthaler', 'WuMo'),
             cls('ZebraGirl', 'ComicFury/ZebraGirl'),
-        ]
+        )
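For context, a minimal re-creation of the reason lookup that the new 'mov' key plugs into (a sketch with assumed names; only the 'mov' text and the __init__ signature are from this diff, the dict name and 'del' text are placeholders):

# Hypothetical mini-version of the Removed reason table shown above.
REASONS = {
    'del': 'Comic was deleted.',  # assumed default text
    'mov': 'Comic moved to a new hoster and no new module was written.',
}

class RemovedSketch(object):
    def __init__(self, name, reason='del'):
        self.name = name
        self.message = REASONS[reason]  # resolve the reason code

print(RemovedSketch('KeenSpot/Adventurers', 'mov').message)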

View file

@@ -10,7 +10,8 @@ from re import compile, escape, IGNORECASE
 from ..scraper import _BasicScraper, _ParserScraper
 from ..helpers import indirectStarter
 from ..util import tagre
-from .common import _ComicControlScraper, _TumblrScraper, _WordPressScraper
+from .common import (_ComicControlScraper, _TumblrScraper, _WordPressScraper,
+                     xpath_class)


 class TheBrads(_BasicScraper):
@@ -176,6 +177,14 @@ class TwoGuysAndGuy(_BasicScraper):
     adult = True


+class Twokinds(_ParserScraper):
+    url = 'http://twokinds.keenspot.com/'
+    imageSearch = ('//p[@id="cg_img"]//img',
+                   '//article/p//img')
+    prevSearch = ('//a[@id="cg_back"]',
+                  '//a[%s]' % xpath_class('navprev'))
+
+
 class TwoLumps(_BasicScraper):
     url = 'http://www.twolumps.net/'
     stripUrl = url + 'd/%s.html'
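A note on xpath_class: its implementation lives in plugins/common.py and is not shown in this diff, but for '//a[%s]' % xpath_class('navprev') to match a link whose class attribute contains the token navprev, it presumably expands to a token-safe contains() test, roughly like this sketch (not the actual dosagelib code):

def xpath_class(name):
    # Match `name` as a whole class token, so class="navprev big"
    # matches while class="navprevious" does not.
    return 'contains(concat(" ", normalize-space(@class), " "), " %s ")' % name

print('//a[%s]' % xpath_class('navprev'))
# //a[contains(concat(" ", normalize-space(@class), " "), " navprev ")]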

View file

@@ -9,132 +9,60 @@ JSON file for further processing.
 """
 from __future__ import absolute_import, division, print_function

-import codecs
-import re
 import sys
 import os

-import requests
+from six.moves.urllib.parse import urlsplit

 sys.path.append(os.path.join(os.path.dirname(__file__), ".."))  # noqa
-from dosagelib.util import get_page, tagre, check_robotstxt
-from dosagelib.scraper import get_scrapers
-from scriptutil import (contains_case_insensitive, save_result, load_result,
-                        truncate_name, format_name)
+from scriptutil import ComicListUpdater
+from dosagelib.util import check_robotstxt

-json_file = __file__.replace(".py", ".json")

-url_matcher = re.compile(
-    tagre("td", "onmouseover", r'([^"]+)') +
-    tagre("a", "href", r'([^"]+\.keenspot\.com/)[^"]*') +
-    r"(?:<b>)?([^<]+)(?:</b>)?</a>"
-)
+class KeenSpotUpdater(ComicListUpdater):
+    dup_templates = ('Creators/%s', "GoComics/%s", "ComicGenesis/%s")

     # names of comics to exclude
-exclude_comics = [
-    "BrawlintheFamily",  # non-standard navigation
-    "CrowScare",  # non-standard navigation
-    "Dreamless",  # non-standard navigation
-    "EV",  # non-standard navigation
-    "Exposure",  # non-standard navigation
-    "Flipside",  # non-standard navigation
-    "HerobyNight",  # non-standard navigation
-    "JadeWarriors",  # non-standard navigation
-    "LastBlood",  # non-standard navigation
-    "MysticRevolution",  # non-standard navigation
-    "NoRoomForMagic",  # non-standard navigation
-    "PunchanPie",  # non-standard navigation
-    "RoadWaffles",  # non-standard navigation
-    "Shadowbinders",  # non-standard navigation
-    "ShockwaveDarkside",  # non-standard navigation
-    "Supernovas",  # non-standard navigation
-    "Twokinds",  # non-standard navigation
-    "WisdomofMoo",  # non-standard navigation
-    "Yirmumah",  # non-standard navigation
-    "YouDamnKid",  # non-standard navigation
-]
+    excluded_comics = (
+        # non-standard navigation
+        "BrawlInTheFamily",
+        "Flipside",
+        "LastBlood",
+        "TheGodChild",
+        "Twokinds",
+    )

     # links to last valid strips
-url_overrides = {
+    extra = {
+        'CrowScare': 'last="20111031"',
+        'Dreamless': 'last="20100726"',
+        'MysticRevolution': 'path="?cid=%s"',
+        'PunchAnPie': 'path="daily/%s.html"',
+        'ShockwaveDarkside': 'path="2d/%s.html"',
+        'Yirmumah': 'path="%s/"',
     }

-def handle_url(url, session, res):
-    """Parse one search result page."""
-    print("Parsing", url, file=sys.stderr)
-    try:
-        data = get_page(url, session).text
-    except IOError as msg:
-        print("ERROR:", msg, file=sys.stderr)
-        return
+    def collect_results(self):
+        """Parse the front page."""
+        data = self.get_url('http://keenspot.com/')

-    for match in url_matcher.finditer(data):
-        comicurl = match.group(2)
-        name = format_name(match.group(3))
-        if name in exclude_comics:
-            continue
-        if contains_case_insensitive(res, name):
-            # we cannot handle two comics that only differ in case
-            print("INFO: skipping possible duplicate", repr(name), file=sys.stderr)
-            continue
+        for comiclink in data.xpath('//td[@id]/a'):
+            comicurl = comiclink.attrib['href']
+            name = comiclink.xpath("string()")
            try:
                if "/d/" not in comicurl:
-                check_robotstxt(comicurl + "d/", session)
+                    check_robotstxt(comicurl + "d/", self.session)
                else:
-                check_robotstxt(comicurl, session)
-        except IOError:
-            print("INFO: robots.txt denied for keenspot", repr(name))
+                    check_robotstxt(comicurl, self.session)
+            except IOError as e:
+                print("[%s] INFO: robots.txt denied: %s" % (name, e))
                continue

-        res[name] = comicurl
+            self.add_comic(name, comicurl)

-def get_results():
-    """Parse all search result pages."""
-    # store info in a dictionary {name -> shortname}
-    res = {}
-    session = requests.Session()
-    base = 'http://keenspot.com/'
-    handle_url(base, session, res)
-    save_result(res, json_file)

-def has_comic(name):
-    """Check if comic name already exists."""
-    names = [
-        ("Creators/%s" % name).lower(),
-        ("GoComics/%s" % name).lower(),
-        ("ComicGenesis/%s" % name).lower(),
-    ]
-    for scraperobj in get_scrapers():
-        lname = scraperobj.name.lower()
-        if lname in names:
-            return True
-    return False

-def print_results(args):
-    """Print all comics."""
-    min_comics, filename = args
-    with codecs.open(filename, 'a', 'utf-8') as fp:
-        for name, entry in sorted(load_result(json_file).items()):
-            if name in exclude_comics:
-                continue
-            url = entry
-            if has_comic(name):
-                prefix = u'#'
+    def get_entry(self, name, url):
+        sub = urlsplit(url).hostname.split('.', 1)[0]
+        if name in self.extra:
+            extra = ', ' + self.extra[name]
        else:
-                prefix = u''
-            name = truncate_name(name)
-            fp.write(u"%sadd(%r, %r)\n" % (
-                prefix, str(name), str(url))
-            )
+            extra = ''
+        return u"cls('%s', '%s'%s)," % (name, sub, extra)

 if __name__ == '__main__':
-    if len(sys.argv) > 1:
-        print_results(sys.argv[1:])
-    else:
-        get_results()
+    KeenSpotUpdater(__file__).run()
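To make the updater's output concrete, here is the kind of line get_entry() emits (illustration only; calling it directly outside run() assumes ComicListUpdater's constructor just records the script path, which is not shown in this diff):

u = KeenSpotUpdater(__file__)
# The subdomain comes from the link URL; the extra dict supplies overrides.
print(u.get_entry('CrowScare', 'http://crowscare.keenspot.com/'))
# cls('CrowScare', 'crowscare', last="20111031"),
print(u.get_entry('Sharky', 'http://sharky.keenspot.com/'))
# cls('Sharky', 'sharky'),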