Rework/fix KeenSpot modules.

parent bb6199af65
commit b17d6e5f22
6 changed files with 130 additions and 143 deletions

@@ -119,7 +119,6 @@ class ComicGenesis(_BasicScraper):
         cls('TheAdventuresofKaniraBaxter', 'kanirabaxter'),
         cls('TheAdventuresofVindibuddSuperheroInTraining', 'vindibudd', last='20070720'),
         cls('TheEasyBreather', 'easybreather'),
-        cls('TheLounge', 'thelounge'),
         cls('TheMisadventuresofOkk', 'okk'),
         cls('ThePath', 'thepath'),
         cls('TheTalesofKalduras', 'kalduras'),

@@ -130,6 +130,10 @@ class GoblinsComic(_ParserScraper):
     help = 'Index format: ddmmyyyy'


+class GodChild(_WordPressScraper):
+    url = 'http://godchild.keenspot.com/'
+
+
 class GoGetARoomie(_ComicControlScraper):
     url = 'http://www.gogetaroomie.com'

@@ -5,76 +5,117 @@
 from __future__ import absolute_import, division, print_function

-from re import compile
-
-from ..scraper import _BasicScraper
-from ..util import tagre
+from ..scraper import _ParserScraper


-class KeenSpot(_BasicScraper):
-    imageSearch = compile(tagre("img", "src", r'([^"]*/comics/[^"]+)'))
-    _stripPattern = r'([^"]*/d/\d{8}\.html)'
+class KeenSpot(_ParserScraper):
+    multipleImagesPerStrip = True
+    imageSearch = (
+        '//img[contains(@src, "/comics/")]',
+        # Shockwave Darkside
+        '//img[contains(@src, "/comics2D/")]',
+        '//img[contains(@src, "com/shockwave")]',
+        # Sore Thumbs
+        '//img[contains(@src, "com/st2")]',
+        # Wayward Sons
+        '//img[contains(@src, "com/2")]',
+    )
     prevSearch = (
-        compile(tagre("link", "href", _stripPattern, before="prev")),
-        compile(tagre("a", "href", _stripPattern, after="prev")),
-        compile(tagre("a", "href", _stripPattern) + tagre("img", "id", r"previous_day1")),
-        compile(tagre("a", "href", _stripPattern) + tagre("img", "id", r"katc7")),
+        '//link[@rel="prev"]',
+        '//a[@rel="prev"]',
+        # Exposure
+        '//a[img[@id="exp29"]]',
+        # Hero By Night
+        '//area[contains(@coords, ",-7,")]',
+        # Katrina
+        '//a[img[@id="katc7"]]',
+        # No Room For Magic, Everyone Loves Adis, Wisdom Of Moo
+        '//a[text()="Previous comic"]',
+        # Supernovas
+        '//a[img[@id="p_top_nav"]]',
     )
     help = 'Index format: yyyymmdd'

-    def __init__(self, name, sub):
+    def __init__(self, name, sub, last=None, path='d/%s.html'):
         super(KeenSpot, self).__init__('KeenSpot/' + name)
         self.url = 'http://%s.keenspot.com/' % sub
-        self.stripUrl = self.url + 'd/%s.html'
+        self.stripUrl = self.url + path
+
+        if last:
+            self.url = self.stripUrl % last
+            self.endOfLife = True

     @classmethod
     def getmodules(cls):
-        return [
+        return (
+            # Not on frontpage...
+            cls('Buzzboy', 'buzzboy'),
+            cls('EveryoneLovesAdis', 'adis'),
+
             # do not edit anything below since these entries are generated from
             # scripts/update_plugins.sh
             # START AUTOUPDATE
             cls('27TwentySeven', 'twenty-seven'),
-            cls('Adventurers', 'adventurers'),
-            cls('AntiheroForHire', 'antihero'),
             cls('Avengelyne', 'avengelyne'),
             cls('BanzaiGirl', 'banzaigirl'),
             cls('Barker', 'barkercomic'),
-            cls('Buzzboy', 'buzzboy'),
             cls('ChoppingBlock', 'choppingblock'),
             cls('ClichFlamb', 'clicheflambe'),
             cls('CountYourSheep', 'countyoursheep'),
+            cls('CrowScare', 'crowscare', last="20111031"),
+            cls('Dreamless', 'dreamless', last="20100726"),
             cls('EverythingJake', 'everythingjake'),
+            cls('Exposure', 'exposure'),
             cls('FallOutToyWorks', 'fallouttoyworks'),
             cls('FriarAndBrimstone', 'friarandbrimstone'),
             cls('GeneCatlow', 'genecatlow'),
             cls('GodMode', 'godmode'),
             cls('GreenWake', 'greenwake'),
             cls('HeadTrip', 'headtrip'),
+            cls('HeroByNight', 'herobynight'),
             cls('HoaxHunters', 'hoaxhunters'),
+            cls('InfinityRefugees', 'newshounds'),
             cls('InHere', 'inhere'),
+            cls('JadeWarriors', 'jadewarriors'),
             cls('Katrina', 'katrina'),
             cls('Landis', 'landis'),
             cls('LutherStrode', 'lutherstrode'),
             cls('MakeshiftMiracle', 'makeshiftmiracle'),
             cls('Marksmen', 'marksmen'),
             cls('MarryMe', 'marryme'),
             cls('MedusasDaughter', 'medusasdaughter'),
             cls('MonsterMassacre', 'monstermassacre'),
-            cls('Newshounds', 'newshounds'),
+            cls('MysticRevolution', 'mysticrevolution', path="?cid=%s"),
             cls('NoPinkPonies', 'nopinkponies'),
+            cls('NoRoomForMagic', 'noroomformagic'),
             cls('OutThere', 'outthere'),
             cls('Porcelain', 'porcelain'),
+            cls('PunchAnPie', 'punchanpie', path="daily/%s.html"),
             cls('QUILTBAG', 'quiltbag'),
             cls('RedSpike', 'redspike'),
             cls('RumbleFall', 'rumblefall'),
             cls('SamuraisBlood', 'samuraisblood'),
             cls('Sharky', 'sharky'),
+            cls('ShockwaveDarkside', 'shockwave', path="2d/%s.html"),
             cls('SomethingHappens', 'somethinghappens'),
             cls('SoreThumbs', 'sorethumbs'),
             cls('Striptease', 'striptease'),
+            cls('Supernovas', 'supernovas'),
             cls('Superosity', 'superosity'),
             cls('TheFirstDaughter', 'thefirstdaughter'),
-            cls('TheGodChild', 'godchild'),
-            cls('TheHuntersofSalamanstra', 'salamanstra'),
+            cls('TheHopeVirus', 'hopevirus'),
+            cls('TheHuntersOfSalamanstra', 'salamanstra'),
+            cls('TheLounge', 'thelounge'),
             cls('TheVault', 'thevault'),
             cls('WaywardSons', 'waywardsons'),
             cls('WeirdingWillows', 'weirdingwillows'),
             cls('WICKEDPOWERED', 'wickedpowered'),
+            cls('WisdomOfMoo', 'wisdomofmoo'),
+            cls('Yirmumah', 'yirmumah', path="%s/"),
             # END AUTOUPDATE
-        ]
+        )
+
+    def shouldSkipUrl(self, url, data):
+        return url in (
+            'http://sorethumbs.keenspot.com/d/20160117.html'
+        )

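The reworked constructor is the heart of this change: `last` pins a finished comic to its final strip and flags it end-of-life, while `path` absorbs hosts whose archive URLs deviate from `d/YYYYMMDD.html`. A standalone sketch of just that logic; the `Scraper` base class here is a stand-in for dosage's real one, not an import of it:

```python
class Scraper(object):
    """Stand-in base class; dosagelib's real Scraper does much more."""
    def __init__(self, name):
        self.name = name
        self.endOfLife = False


class KeenSpot(Scraper):
    def __init__(self, name, sub, last=None, path='d/%s.html'):
        super(KeenSpot, self).__init__('KeenSpot/' + name)
        self.url = 'http://%s.keenspot.com/' % sub
        # `path` overrides the default d/YYYYMMDD.html archive scheme.
        self.stripUrl = self.url + path
        # `last` points the module at the final strip and marks it dead.
        if last:
            self.url = self.stripUrl % last
            self.endOfLife = True


mystic = KeenSpot('MysticRevolution', 'mysticrevolution', path="?cid=%s")
assert mystic.stripUrl == 'http://mysticrevolution.keenspot.com/?cid=%s'

crow = KeenSpot('CrowScare', 'crowscare', last="20111031")
assert crow.url == 'http://crowscare.keenspot.com/d/20111031.html'
assert crow.endOfLife
```
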
@@ -16,6 +16,7 @@ class Removed(Scraper):
         'block': 'The comic site is blocking us.',
         'unk': 'Comic was removed for an unknown reason.',
         'brk': 'Comic navigation is broken.',
+        'mov': 'Comic moved to a new hoster and no new module was written.',
     }

     def __init__(self, name, reason='del'):

@@ -238,6 +239,8 @@ class Removed(Scraper):
         cls('GunnerkrigCourt'),
         cls('HorribleVille'),
         cls('KatzenfutterGeleespritzer'),
+        cls('KeenSpot/Adventurers', 'mov'),
+        cls('KeenSpot/AntiheroForHire', 'mov'),
         cls('KillerKomics'),
         cls('Lint'),
         cls('LinuxComFridayFunnies'),

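The new `'mov'` reason code extends the table of user-facing removal messages, and the two KeenSpot entries above use it. A hypothetical replay of the lookup; the `'del'` default text below is assumed, not taken from this diff:

```python
# Reason codes -> messages, mirroring (not importing) dosagelib/plugins/old.py.
REASONS = {
    'del': 'Comic was deleted.',  # assumed default text
    'block': 'The comic site is blocking us.',
    'unk': 'Comic was removed for an unknown reason.',
    'brk': 'Comic navigation is broken.',
    'mov': 'Comic moved to a new hoster and no new module was written.',
}


def removal_message(name, reason='del'):
    """Build the explanation shown for a retired module."""
    return '%s: %s' % (name, REASONS[reason])


print(removal_message('KeenSpot/Adventurers', 'mov'))
```
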
@@ -350,7 +353,7 @@ class Renamed(Scraper):

     @classmethod
     def getmodules(cls):
-        return [
+        return (
             # Renamed in 2.16
             cls('1997', '1977'),
             cls('ComicFury/Alya', 'ComicFury/AlyaTheLastChildOfLight'),

@@ -361,12 +364,15 @@ class Renamed(Scraper):
         cls('ComicFury/ICanSeeYourFeels', 'ComicFury/SeeYourFeels'),
         cls('ComicFury/MAGISAupdatesMonWedFri', 'ComicFury/MAGISAPARASAYOupdatesMonFri'),
         cls('ComicFury/ThomasAndZachary', 'ComicFury/ThomasAndZacharyArchives'),
+        cls('ComicGenesis/TheLounge', 'KeenSpot/TheLounge'),
         cls('Creators/ArchieinSpanish', 'Creators/ArchieSpanish'),
         cls('Creators/HeathcliffinSpanish', 'Creators/HeathcliffSpanish'),
         cls('Creators/TheWizardofIdinSpanish', 'Creators/WizardOfIdSpanish'),
         cls('DarkWings', 'Eryl'),
         cls('FoulLanguage', 'GoComics/FowlLanguage'),
         cls('GoComics/BloomCounty2015', 'GoComics/BloomCounty2016'),
+        cls('KeenSpot/Newshounds', 'KeenSpot/InfinityRefugees'),
+        cls('KeenSpot/TheGodChild', 'GodChild'),
         cls('Wulffmorgenthaler', 'WuMo'),
         cls('ZebraGirl', 'ComicFury/ZebraGirl'),
-    ]
+    )

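With `TheLounge`, `Newshounds`, and `TheGodChild` now routed through `Renamed`, an old module name can be traced to its current home. The sketch below illustrates the mapping only; how dosage actually reacts to a renamed module is not shown in this diff:

```python
# Old module name -> new module name, taken from the entries added above.
RENAMES = {
    'ComicGenesis/TheLounge': 'KeenSpot/TheLounge',
    'KeenSpot/Newshounds': 'KeenSpot/InfinityRefugees',
    'KeenSpot/TheGodChild': 'GodChild',
}


def resolve(name):
    """Follow rename entries until a current module name is reached."""
    while name in RENAMES:
        name = RENAMES[name]
    return name


assert resolve('KeenSpot/TheGodChild') == 'GodChild'
assert resolve('ComicGenesis/TheLounge') == 'KeenSpot/TheLounge'
```
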
@@ -10,7 +10,8 @@ from re import compile, escape, IGNORECASE
 from ..scraper import _BasicScraper, _ParserScraper
 from ..helpers import indirectStarter
 from ..util import tagre
-from .common import _ComicControlScraper, _TumblrScraper, _WordPressScraper
+from .common import (_ComicControlScraper, _TumblrScraper, _WordPressScraper,
+                     xpath_class)


 class TheBrads(_BasicScraper):

@@ -176,6 +177,14 @@ class TwoGuysAndGuy(_BasicScraper):
     adult = True


+class Twokinds(_ParserScraper):
+    url = 'http://twokinds.keenspot.com/'
+    imageSearch = ('//p[@id="cg_img"]//img',
+                   '//article/p//img')
+    prevSearch = ('//a[@id="cg_back"]',
+                  '//a[%s]' % xpath_class('navprev'))
+
+
 class TwoLumps(_BasicScraper):
     url = 'http://www.twolumps.net/'
     stripUrl = url + 'd/%s.html'

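`xpath_class`, newly imported from `.common` for the Twokinds module, emits the standard XPath 1.0 workaround for matching one token of a `class` attribute; assuming it expands roughly as follows:

```python
def xpath_class(name):
    # XPath 1.0 has no native "has CSS class" test, so the usual trick is to
    # pad and normalize @class and look for the space-delimited token.
    return ('contains(concat(" ", normalize-space(@class), " "), " %s ")'
            % name)


# Used as in the Twokinds module above:
prev = '//a[%s]' % xpath_class('navprev')
# => //a[contains(concat(" ", normalize-space(@class), " "), " navprev ")]
print(prev)
```
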
@@ -9,132 +9,60 @@ JSON file for further processing.
 """
 from __future__ import absolute_import, division, print_function

-import codecs
 import re
 import sys
 import os
+from six.moves.urllib.parse import urlsplit

-import requests
-
 sys.path.append(os.path.join(os.path.dirname(__file__), ".."))  # noqa
-from dosagelib.util import get_page, tagre, check_robotstxt
-from dosagelib.scraper import get_scrapers
-from scriptutil import (contains_case_insensitive, save_result, load_result,
-                        truncate_name, format_name)
+from scriptutil import ComicListUpdater
+from dosagelib.util import check_robotstxt


-json_file = __file__.replace(".py", ".json")
+class KeenSpotUpdater(ComicListUpdater):
+    dup_templates = ('Creators/%s', "GoComics/%s", "ComicGenesis/%s")
+
+    # names of comics to exclude
+    excluded_comics = (
+        # non-standard navigation
+        "BrawlInTheFamily",
+        "Flipside",
+        "LastBlood",
+        "TheGodChild",
+        "Twokinds",
+    )

-url_matcher = re.compile(
-    tagre("td", "onmouseover", r'([^"]+)') +
-    tagre("a", "href", r'([^"]+\.keenspot\.com/)[^"]*') +
-    r"(?:<b>)?([^<]+)(?:</b>)?</a>"
-)
+    extra = {
+        'CrowScare': 'last="20111031"',
+        'Dreamless': 'last="20100726"',
+        'MysticRevolution': 'path="?cid=%s"',
+        'PunchAnPie': 'path="daily/%s.html"',
+        'ShockwaveDarkside': 'path="2d/%s.html"',
+        'Yirmumah': 'path="%s/"',
+    }

+    def collect_results(self):
+        """Parse the front page."""
+        data = self.get_url('http://keenspot.com/')

-# names of comics to exclude
-exclude_comics = [
-    "BrawlintheFamily",  # non-standard navigation
-    "CrowScare",  # non-standard navigation
-    "Dreamless",  # non-standard navigation
-    "EV",  # non-standard navigation
-    "Exposure",  # non-standard navigation
-    "Flipside",  # non-standard navigation
-    "HerobyNight",  # non-standard navigation
-    "JadeWarriors",  # non-standard navigation
-    "LastBlood",  # non-standard navigation
-    "MysticRevolution",  # non-standard navigation
-    "NoRoomForMagic",  # non-standard navigation
-    "PunchanPie",  # non-standard navigation
-    "RoadWaffles",  # non-standard navigation
-    "Shadowbinders",  # non-standard navigation
-    "ShockwaveDarkside",  # non-standard navigation
-    "Supernovas",  # non-standard navigation
-    "Twokinds",  # non-standard navigation
-    "WisdomofMoo",  # non-standard navigation
-    "Yirmumah",  # non-standard navigation
-    "YouDamnKid",  # non-standard navigation
-]
-
-
-# links to last valid strips
-url_overrides = {
-}
-
-
-def handle_url(url, session, res):
-    """Parse one search result page."""
-    print("Parsing", url, file=sys.stderr)
-    try:
-        data = get_page(url, session).text
-    except IOError as msg:
-        print("ERROR:", msg, file=sys.stderr)
-        return
-    for match in url_matcher.finditer(data):
-        comicurl = match.group(2)
-        name = format_name(match.group(3))
-        if name in exclude_comics:
-            continue
-        if contains_case_insensitive(res, name):
-            # we cannot handle two comics that only differ in case
-            print("INFO: skipping possible duplicate", repr(name), file=sys.stderr)
-            continue
-        try:
-            if "/d/" not in comicurl:
-                check_robotstxt(comicurl + "d/", session)
-            else:
-                check_robotstxt(comicurl, session)
-        except IOError:
-            print("INFO: robots.txt denied for keenspot", repr(name))
-            continue
-        res[name] = comicurl
-
-
-def get_results():
-    """Parse all search result pages."""
-    # store info in a dictionary {name -> shortname}
-    res = {}
-    session = requests.Session()
-    base = 'http://keenspot.com/'
-    handle_url(base, session, res)
-    save_result(res, json_file)
-
-
-def has_comic(name):
-    """Check if comic name already exists."""
-    names = [
-        ("Creators/%s" % name).lower(),
-        ("GoComics/%s" % name).lower(),
-        ("ComicGenesis/%s" % name).lower(),
-    ]
-    for scraperobj in get_scrapers():
-        lname = scraperobj.name.lower()
-        if lname in names:
-            return True
-    return False
-
-
-def print_results(args):
-    """Print all comics."""
-    min_comics, filename = args
-    with codecs.open(filename, 'a', 'utf-8') as fp:
-        for name, entry in sorted(load_result(json_file).items()):
-            if name in exclude_comics:
-                continue
-            url = entry
-            if has_comic(name):
-                prefix = u'#'
-            else:
-                prefix = u''
-            name = truncate_name(name)
-            fp.write(u"%sadd(%r, %r)\n" % (
-                prefix, str(name), str(url))
-            )
+        for comiclink in data.xpath('//td[@id]/a'):
+            comicurl = comiclink.attrib['href']
+            name = comiclink.xpath("string()")
+            try:
+                if "/d/" not in comicurl:
+                    check_robotstxt(comicurl + "d/", self.session)
+                else:
+                    check_robotstxt(comicurl, self.session)
+            except IOError as e:
+                print("[%s] INFO: robots.txt denied: %s" % (name, e))
+                continue
+
+            self.add_comic(name, comicurl)
+
+    def get_entry(self, name, url):
+        sub = urlsplit(url).hostname.split('.', 1)[0]
+        if name in self.extra:
+            extra = ', ' + self.extra[name]
+        else:
+            extra = ''
+        return u"cls('%s', '%s'%s)," % (name, sub, extra)


 if __name__ == '__main__':
-    if len(sys.argv) > 1:
-        print_results(sys.argv[1:])
-    else:
-        get_results()
+    KeenSpotUpdater(__file__).run()

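Finally, `get_entry()` is what turns a frontpage link into one line of the AUTOUPDATE block above, appending any per-comic override from `extra`. A standalone replay using the values from this diff:

```python
from six.moves.urllib.parse import urlsplit

# Per-comic constructor overrides, as in KeenSpotUpdater.extra above.
EXTRA = {'CrowScare': 'last="20111031"'}


def get_entry(name, url):
    # The subdomain becomes the second cls() argument.
    sub = urlsplit(url).hostname.split('.', 1)[0]
    extra = ', ' + EXTRA[name] if name in EXTRA else ''
    return "cls('%s', '%s'%s)," % (name, sub, extra)


print(get_entry('CrowScare', 'http://crowscare.keenspot.com/'))
# cls('CrowScare', 'crowscare', last="20111031"),
print(get_entry('Sharky', 'http://sharky.keenspot.com/'))
# cls('Sharky', 'sharky'),
```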