Remove make_scraper magic from Arcamax.

This commit is contained in:
Tobias Gruetzmacher 2016-04-14 00:17:59 +02:00
parent db87ed95e7
commit 497653c448
2 changed files with 235 additions and 128 deletions

View file

@ -1,111 +1,214 @@
# -*- coding: iso-8859-1 -*- # -*- coding: utf-8 -*-
# Copyright (C) 2013-2014 Bastian Kleineidam # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
""" # Copyright (C) 2012-2014 Bastian Kleineidam
Arcamax comic strips # Copyright (C) 2015-2016 Tobias Gruetzmacher
"""
from re import compile from __future__ import absolute_import, division, print_function
from ..scraper import make_scraper
from ..util import tagre from ..scraper import _ParserScraper
_imageSearch = compile(tagre("img", "data-zoom-image", r'(/newspics/[^"]+)')) class _Arcamax(_ParserScraper):
_prevSearch = compile(tagre("a", "href", r'(/[^"]+)', before='prev')) imageSearch = '//img[@id="comic-zoom"]'
prevSearch = '//a[@class="prev"]'
def add(name, shortname): @property
url = 'http://www.arcamax.com%s' % shortname def url(self):
classname = 'Arcamax_%s' % name return 'http://www.arcamax.com/thefunnies/' + self.path + '/'
globals()[classname] = make_scraper(classname, @property
name='Arcamax/' + name, def name(self):
url = url, return 'Arcamax/' + super(_Arcamax, self).name
stripUrl = url + '%s',
imageSearch = _imageSearch,
prevSearch = _prevSearch,
help = 'Index format: none',
)
# do not edit anything below since these entries are generated from scripts/update.sh
# do not edit anything below since these entries are generated from
# scripts/update_plugins.sh
# DO NOT REMOVE # DO NOT REMOVE
#add('9ChickweedLane', '/thefunnies/ninechickweedlane/') # 9ChickweedLane has a duplicate in GoComics/9ChickweedLane
#add('Agnes', '/thefunnies/agnes/') # Agnes has a duplicate in GoComics/Agnes
#add('AndyCapp', '/thefunnies/andycapp/') # AndyCapp has a duplicate in GoComics/AndyCapp
#add('Archie', '/thefunnies/archie/') # Archie has a duplicate in Creators/Archie
add('ArcticCircle', '/thefunnies/arcticcircle/')
#add('AskShagg', '/thefunnies/askshagg/')
#add('BC', '/thefunnies/bc/') class ArcticCircle(_Arcamax):
add('BabyBlues', '/thefunnies/babyblues/') path = 'arcticcircle'
#add('BallardStreet', '/thefunnies/ballardstreet/') # AskShagg has a duplicate in GoComics/AskShagg
#add('BarneyAndClyde', '/thefunnies/barneyandclyde/')
add('BarneyGoogleAndSnuffySmith', '/thefunnies/barneygoogle/')
add('BeetleBailey', '/thefunnies/beetlebailey/') class BabyBlues(_Arcamax):
add('Bizarro', '/thefunnies/bizarro/') path = 'babyblues'
add('BleekerTheRechargeableDog', '/thefunnies/bleekertherechargeabledog/') # BallardStreet has a duplicate in GoComics/BallardStreet
add('Blondie', '/thefunnies/blondie/') # BarneyAndClyde has a duplicate in GoComics/BarneyAndClyde
add('Boondocks', '/thefunnies/boondocks/')
add('BrilliantMindofEdisonLee', '/thefunnies/brilliantmindofedisonlee/')
#add('Candorville', '/thefunnies/candorville/') class BarneyGoogleAndSnuffySmith(_Arcamax):
#add('Cathy', '/thefunnies/cathy/') path = 'barneygoogle'
#add('ChuckleBros', '/thefunnies/chucklebros/') # BC has a duplicate in GoComics/BC
add('Crankshaft', '/thefunnies/crankshaft/')
#add('CuldeSac', '/thefunnies/culdesac/')
add('Curtis', '/thefunnies/curtis/') class BeetleBailey(_Arcamax):
#add('DaddysHome', '/thefunnies/daddyshome/') path = 'beetlebailey'
add('DeFlocked', '/thefunnies/deflocked/')
add('DennistheMenace', '/thefunnies/dennisthemenace/')
#add('DiamondLil', '/thefunnies/diamondlil/') class Bizarro(_Arcamax):
#add('Dilbert', '/thefunnies/dilbert/') path = 'bizarro'
add('DinetteSet', '/thefunnies/thedinetteset/') # BleekerTheRechargeableDog has a duplicate in GoComics/BleekerTheRechargeableDog
#add('DogEatDoug', '/thefunnies/dogeatdoug/')
#add('DogsofCKennel', '/thefunnies/dogsofckennel/')
#add('Doonesbury', '/thefunnies/doonesbury/') class Blondie(_Arcamax):
add('Dustin', '/thefunnies/dustin/') path = 'blondie'
add('FamilyCircus', '/thefunnies/familycircus/')
#add('FloAndFriends', '/thefunnies/floandfriends/')
#add('ForHeavensSake', '/thefunnies/forheavenssake/') class Boondocks(_Arcamax):
#add('FortKnox', '/thefunnies/fortknox/') path = 'boondocks'
#add('FreeRange', '/thefunnies/freerange/')
#add('Garfield', '/thefunnies/garfield/')
#add('GetFuzzy', '/thefunnies/getfuzzy/') class BrilliantMindOfEdisonLee(_Arcamax):
#add('Heathcliff', '/thefunnies/heathcliff/') path = 'brilliantmindofedisonlee'
#add('HerbandJamaal', '/thefunnies/herbandjamaal/') # Candorville has a duplicate in GoComics/Candorville
add('HiandLois', '/thefunnies/hiandlois/')
#add('HomeAndAway', '/thefunnies/homeandaway/')
add('IntelligentLife', '/thefunnies/intelligentlife/') class CarpeDiem(_Arcamax):
add('JerryKingCartoons', '/thefunnies/humorcartoon/') path = 'carpediem'
#add('LittleDogLost', '/thefunnies/littledoglost/') # Cathy has a duplicate in GoComics/Cathy
#add('LongStoryShort', '/thefunnies/longstoryshort/') # ChipBok has a duplicate in GoComics/ChipBok
#add('LooseParts', '/thefunnies/looseparts/') # ChuckleBros has a duplicate in GoComics/ChuckleBros
#add('Luann', '/thefunnies/luann/') # ClayBennett has a duplicate in GoComics/ClayBennett
add('MallardFillmore', '/thefunnies/mallardfillmore/')
add('Marvin', '/thefunnies/marvin/')
add('MeaningofLila', '/thefunnies/meaningoflila/') class Crankshaft(_Arcamax):
#add('MikeDuJour', '/thefunnies/mikedujour/') path = 'crankshaft'
#add('Momma', '/thefunnies/momma/') # CulDeSac has a duplicate in GoComics/CulDeSac
add('MotherGooseAndGrimm', '/thefunnies/mothergooseandgrimm/')
add('Mutts', '/thefunnies/mutts/')
#add('NestHeads', '/thefunnies/nestheads/') class Curtis(_Arcamax):
#add('NonSequitur', '/thefunnies/nonsequitur/') path = 'curtis'
#add('OneBigHappy', '/thefunnies/onebighappy/') # DaddysHome has a duplicate in GoComics/DaddysHome
#add('Peanuts', '/thefunnies/peanuts/') # DarrinBell has a duplicate in GoComics/DarrinBell
#add('PearlsBeforeSwine', '/thefunnies/pearlsbeforeswine/')
#add('Pickles', '/thefunnies/pickles/')
#add('RedandRover', '/thefunnies/redandrover/') class DeFlocked(_Arcamax):
#add('ReplyAll', '/thefunnies/replyall/') path = 'deflocked'
add('RhymeswithOrange', '/thefunnies/rhymeswithorange/')
#add('Rubes', '/thefunnies/rubes/')
#add('RudyPark', '/thefunnies/rudypark/') class DennisTheMenace(_Arcamax):
#add('Rugrats', '/thefunnies/rugrats/') path = 'dennisthemenace'
#add('ScaryGary', '/thefunnies/scarygary/') # DiamondLil has a duplicate in GoComics/DiamondLil
#add('SpeedBump', '/thefunnies/speedbump/') # Dilbert has a duplicate in Dilbert
#add('StrangeBrew', '/thefunnies/strangebrew/')
add('TakeItFromTheTinkersons', '/thefunnies/takeitfromthetinkersons/')
#add('TheBarn', '/thefunnies/thebarn/') class DinetteSet(_Arcamax):
add('TheLockhorns', '/thefunnies/thelockhorns/') path = 'thedinetteset'
#add('TheOtherCoast', '/thefunnies/theothercoast/') # DogEatDoug has a duplicate in GoComics/DogEatDoug
add('TinasGroove', '/thefunnies/tinasgroove/') # DogsOfCKennel has a duplicate in GoComics/DogsOfCKennel
#add('WeePals', '/thefunnies/weepals/') # Doonesbury has a duplicate in GoComics/Doonesbury
#add('WizardofId', '/thefunnies/wizardofid/')
#add('WorkingitOut', '/thefunnies/workingitout/')
#add('Wumo', '/thefunnies/wumo/') class Dustin(_Arcamax):
#add('ZackHill', '/thefunnies/zackhill/') path = 'dustin'
add('Zits', '/thefunnies/zits/')
class FamilyCircus(_Arcamax):
path = 'familycircus'
# FloAndFriends has a duplicate in GoComics/FloAndFriends
# ForBetterOrForWorse has a duplicate in GoComics/ForBetterOrForWorse
# ForHeavensSake has a duplicate in GoComics/ForHeavensSake
# FortKnox has a duplicate in GoComics/FortKnox
# FreeRange has a duplicate in GoComics/FreeRange
# Garfield has a duplicate in GoComics/Garfield
# GetFuzzy has a duplicate in GoComics/GetFuzzy
# HagarTheHorrible has a duplicate in HagarTheHorrible
# Heathcliff has a duplicate in GoComics/Heathcliff
# HerbAndJamaal has a duplicate in GoComics/HerbAndJamaal
class HiAndLois(_Arcamax):
path = 'hiandlois'
class IntelligentLife(_Arcamax):
path = 'intelligentlife'
class JerryKingCartoons(_Arcamax):
path = 'humorcartoon'
# LisaBenson has a duplicate in GoComics/LisaBenson
# LittleDogLost has a duplicate in GoComics/LittleDogLost
# LongStoryShort has a duplicate in Creators/LongStoryShort
# LooseParts has a duplicate in GoComics/LooseParts
# Luann has a duplicate in GoComics/Luann
class MallardFillmore(_Arcamax):
path = 'mallardfillmore'
class Marvin(_Arcamax):
path = 'marvin'
class MasterStrokesGolfTips(_Arcamax):
path = 'masterstrokes'
class MeaningOfLila(_Arcamax):
path = 'meaningoflila'
# MichaelRamirez has a duplicate in GoComics/MichaelRamirez
# MikeDuJour has a duplicate in GoComics/MikeDuJour
# MikeLester has a duplicate in GoComics/MikeLester
# MikeLuckovich has a duplicate in GoComics/MikeLuckovich
# Momma has a duplicate in GoComics/Momma
class MotherGooseAndGrimm(_Arcamax):
path = 'mothergooseandgrimm'
class Mutts(_Arcamax):
path = 'mutts'
# NestHeads has a duplicate in GoComics/NestHeads
# NickAnderson has a duplicate in GoComics/NickAnderson
# NonSequitur has a duplicate in GoComics/NonSequitur
# OneBigHappy has a duplicate in GoComics/OneBigHappy
# Peanuts has a duplicate in GoComics/Peanuts
# PearlsBeforeSwine has a duplicate in GoComics/PearlsBeforeSwine
# Pickles has a duplicate in GoComics/Pickles
# RedAndRover has a duplicate in GoComics/RedAndRover
# ReplyAll has a duplicate in GoComics/ReplyAll
class RhymesWithOrange(_Arcamax):
path = 'rhymeswithorange'
# Rubes has a duplicate in GoComics/Rubes
# RudyPark has a duplicate in GoComics/RudyPark
# Rugrats has a duplicate in Creators/Rugrats
# ScaryGary has a duplicate in GoComics/ScaryGary
# Shoe has a duplicate in GoComics/Shoe
# SigneWilkinson has a duplicate in GoComics/SigneWilkinson
# SpeedBump has a duplicate in GoComics/SpeedBump
# SteveBenson has a duplicate in GoComics/SteveBenson
# SteveBreen has a duplicate in GoComics/SteveBreen
# StrangeBrew has a duplicate in GoComics/StrangeBrew
class TakeItFromTheTinkersons(_Arcamax):
path = 'takeitfromthetinkersons'
# TheBarn has a duplicate in GoComics/TheBarn
class TheLockhorns(_Arcamax):
path = 'thelockhorns'
# TheOtherCoast has a duplicate in GoComics/TheOtherCoast
class TinasGroove(_Arcamax):
path = 'tinasgroove'
# WeePals has a duplicate in GoComics/WeePals
# WizardOfId has a duplicate in GoComics/WizardOfId
# WorkingItOut has a duplicate in GoComics/WorkingItOut
# Wumo has a duplicate in GoComics/WuMo
# ZackHill has a duplicate in GoComics/ZackHill
class Zits(_Arcamax):
path = 'zits'

View file

@ -10,21 +10,20 @@ processing.
from __future__ import absolute_import, division, print_function from __future__ import absolute_import, division, print_function
import codecs import codecs
import re
import sys import sys
import os import os
import requests import requests
from lxml import html
sys.path.append(os.path.join(os.path.dirname(__file__), "..")) # noqa sys.path.append(os.path.join(os.path.dirname(__file__), "..")) # noqa
from dosagelib.util import get_page from dosagelib.util import get_page
from dosagelib.scraper import get_scraperclasses from dosagelib.scraper import get_scrapers
from scriptutil import contains_case_insensitive, save_result, load_result, truncate_name, format_name from scriptutil import (contains_case_insensitive, save_result, load_result,
truncate_name, format_name)
json_file = __file__.replace(".py", ".json") json_file = __file__.replace(".py", ".json")
url_matcher = re.compile(r'<li><a href="(/thefunnies/[^"]+)">([^<]+)</a>')
# names of comics to exclude # names of comics to exclude
exclude_comics = [ exclude_comics = [
"HagartheHorrible", # better source available "HagartheHorrible", # better source available
@ -35,20 +34,22 @@ def handle_url(url, session, res):
"""Parse one search result page.""" """Parse one search result page."""
print("Parsing", url, file=sys.stderr) print("Parsing", url, file=sys.stderr)
try: try:
data = get_page(url, session).text data = html.document_fromstring(get_page(url, session).text)
data.make_links_absolute(url)
except IOError as msg: except IOError as msg:
print("ERROR:", msg, file=sys.stderr) print("ERROR:", msg, file=sys.stderr)
return return
for match in url_matcher.finditer(data):
shortname = match.group(1) for comiclink in data.cssselect('a.comic-icon'):
name = format_name(match.group(2)) path = comiclink.attrib['href']
name = format_name(comiclink.attrib['title'])
if name in exclude_comics: if name in exclude_comics:
continue continue
if contains_case_insensitive(res, name): if contains_case_insensitive(res, name):
# we cannot handle two comics that only differ in case # we cannot handle two comics that only differ in case
print("INFO: skipping possible duplicate", repr(name), file=sys.stderr) print("INFO: skipping possible duplicate", repr(name), file=sys.stderr)
continue continue
res[name] = shortname res[name] = path.rsplit('/', 2)[1]
if not res: if not res:
print("ERROR:", "did not match any comics", file=sys.stderr) print("ERROR:", "did not match any comics", file=sys.stderr)
@ -62,7 +63,7 @@ def get_results():
save_result(res, json_file) save_result(res, json_file)
def has_comic(name): def find_dups(name):
"""Check if comic name already exists.""" """Check if comic name already exists."""
names = [ names = [
("Creators/%s" % name).lower(), ("Creators/%s" % name).lower(),
@ -72,26 +73,29 @@ def has_comic(name):
("ComicGenesis/%s" % name).lower(), ("ComicGenesis/%s" % name).lower(),
("SmackJeeves/%s" % name).lower(), ("SmackJeeves/%s" % name).lower(),
] ]
for scraperclass in get_scraperclasses(): for scraperobj in get_scrapers():
lname = scraperclass.getName().lower() lname = scraperobj.name.lower()
if lname in names or lname == name.lower(): if lname in names or lname == name.lower():
return True return scraperobj.name
return False return None
def first_lower(x):
    """Return the first item of ``x`` converted to lower case.

    Intended as a ``key`` function for ``sorted()`` over ``(name, path)``
    pairs, so that entries are ordered by name without regard to case.
    """
    head = x[0]
    return head.lower()
def print_results(args): def print_results(args):
"""Print all comics that have at least the given number of minimum comic strips.""" """Print all comics that have at least the given number of minimum comic strips."""
min_comics, filename = args min_comics, filename = args
with codecs.open(filename, 'a', 'utf-8') as fp: with codecs.open(filename, 'a', 'utf-8') as fp:
for name, shortname in sorted(load_result(json_file).items()): data = load_result(json_file)
if name in exclude_comics: for name, path in sorted(data.items(), key=first_lower):
continue dup = find_dups(name)
if has_comic(name): if dup is not None:
prefix = u'#' fp.write(u"# %s has a duplicate in %s\n" % (name, dup))
else: else:
prefix = u'' fp.write(u"\n\nclass %s(_Arcamax):\n path = %r\n" % (
fp.write(u"%sadd(%r, %r)\n" % (prefix, str(truncate_name(name)), truncate_name(name), path))
str(shortname)))
if __name__ == '__main__': if __name__ == '__main__':