Remove make_scraper magic from Arcamax.

This commit is contained in:
Tobias Gruetzmacher 2016-04-14 00:17:59 +02:00
parent db87ed95e7
commit 497653c448
2 changed files with 235 additions and 128 deletions

View file

@ -1,111 +1,214 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2013-2014 Bastian Kleineidam
"""
Arcamax comic strips
"""
from re import compile
from ..scraper import make_scraper
from ..util import tagre
# -*- coding: utf-8 -*-
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2016 Tobias Gruetzmacher
from __future__ import absolute_import, division, print_function
from ..scraper import _ParserScraper
_imageSearch = compile(tagre("img", "data-zoom-image", r'(/newspics/[^"]+)'))
_prevSearch = compile(tagre("a", "href", r'(/[^"]+)', before='prev'))
class _Arcamax(_ParserScraper):
imageSearch = '//img[@id="comic-zoom"]'
prevSearch = '//a[@class="prev"]'
def add(name, shortname):
url = 'http://www.arcamax.com%s' % shortname
classname = 'Arcamax_%s' % name
@property
def url(self):
return 'http://www.arcamax.com/thefunnies/' + self.path + '/'
globals()[classname] = make_scraper(classname,
name='Arcamax/' + name,
url = url,
stripUrl = url + '%s',
imageSearch = _imageSearch,
prevSearch = _prevSearch,
help = 'Index format: none',
)
@property
def name(self):
return 'Arcamax/' + super(_Arcamax, self).name
# do not edit anything below since these entries are generated from scripts/update.sh
# do not edit anything below since these entries are generated from
# scripts/update_plugins.sh
# DO NOT REMOVE
#add('9ChickweedLane', '/thefunnies/ninechickweedlane/')
#add('Agnes', '/thefunnies/agnes/')
#add('AndyCapp', '/thefunnies/andycapp/')
#add('Archie', '/thefunnies/archie/')
add('ArcticCircle', '/thefunnies/arcticcircle/')
#add('AskShagg', '/thefunnies/askshagg/')
#add('BC', '/thefunnies/bc/')
add('BabyBlues', '/thefunnies/babyblues/')
#add('BallardStreet', '/thefunnies/ballardstreet/')
#add('BarneyAndClyde', '/thefunnies/barneyandclyde/')
add('BarneyGoogleAndSnuffySmith', '/thefunnies/barneygoogle/')
add('BeetleBailey', '/thefunnies/beetlebailey/')
add('Bizarro', '/thefunnies/bizarro/')
add('BleekerTheRechargeableDog', '/thefunnies/bleekertherechargeabledog/')
add('Blondie', '/thefunnies/blondie/')
add('Boondocks', '/thefunnies/boondocks/')
add('BrilliantMindofEdisonLee', '/thefunnies/brilliantmindofedisonlee/')
#add('Candorville', '/thefunnies/candorville/')
#add('Cathy', '/thefunnies/cathy/')
#add('ChuckleBros', '/thefunnies/chucklebros/')
add('Crankshaft', '/thefunnies/crankshaft/')
#add('CuldeSac', '/thefunnies/culdesac/')
add('Curtis', '/thefunnies/curtis/')
#add('DaddysHome', '/thefunnies/daddyshome/')
add('DeFlocked', '/thefunnies/deflocked/')
add('DennistheMenace', '/thefunnies/dennisthemenace/')
#add('DiamondLil', '/thefunnies/diamondlil/')
#add('Dilbert', '/thefunnies/dilbert/')
add('DinetteSet', '/thefunnies/thedinetteset/')
#add('DogEatDoug', '/thefunnies/dogeatdoug/')
#add('DogsofCKennel', '/thefunnies/dogsofckennel/')
#add('Doonesbury', '/thefunnies/doonesbury/')
add('Dustin', '/thefunnies/dustin/')
add('FamilyCircus', '/thefunnies/familycircus/')
#add('FloAndFriends', '/thefunnies/floandfriends/')
#add('ForHeavensSake', '/thefunnies/forheavenssake/')
#add('FortKnox', '/thefunnies/fortknox/')
#add('FreeRange', '/thefunnies/freerange/')
#add('Garfield', '/thefunnies/garfield/')
#add('GetFuzzy', '/thefunnies/getfuzzy/')
#add('Heathcliff', '/thefunnies/heathcliff/')
#add('HerbandJamaal', '/thefunnies/herbandjamaal/')
add('HiandLois', '/thefunnies/hiandlois/')
#add('HomeAndAway', '/thefunnies/homeandaway/')
add('IntelligentLife', '/thefunnies/intelligentlife/')
add('JerryKingCartoons', '/thefunnies/humorcartoon/')
#add('LittleDogLost', '/thefunnies/littledoglost/')
#add('LongStoryShort', '/thefunnies/longstoryshort/')
#add('LooseParts', '/thefunnies/looseparts/')
#add('Luann', '/thefunnies/luann/')
add('MallardFillmore', '/thefunnies/mallardfillmore/')
add('Marvin', '/thefunnies/marvin/')
add('MeaningofLila', '/thefunnies/meaningoflila/')
#add('MikeDuJour', '/thefunnies/mikedujour/')
#add('Momma', '/thefunnies/momma/')
add('MotherGooseAndGrimm', '/thefunnies/mothergooseandgrimm/')
add('Mutts', '/thefunnies/mutts/')
#add('NestHeads', '/thefunnies/nestheads/')
#add('NonSequitur', '/thefunnies/nonsequitur/')
#add('OneBigHappy', '/thefunnies/onebighappy/')
#add('Peanuts', '/thefunnies/peanuts/')
#add('PearlsBeforeSwine', '/thefunnies/pearlsbeforeswine/')
#add('Pickles', '/thefunnies/pickles/')
#add('RedandRover', '/thefunnies/redandrover/')
#add('ReplyAll', '/thefunnies/replyall/')
add('RhymeswithOrange', '/thefunnies/rhymeswithorange/')
#add('Rubes', '/thefunnies/rubes/')
#add('RudyPark', '/thefunnies/rudypark/')
#add('Rugrats', '/thefunnies/rugrats/')
#add('ScaryGary', '/thefunnies/scarygary/')
#add('SpeedBump', '/thefunnies/speedbump/')
#add('StrangeBrew', '/thefunnies/strangebrew/')
add('TakeItFromTheTinkersons', '/thefunnies/takeitfromthetinkersons/')
#add('TheBarn', '/thefunnies/thebarn/')
add('TheLockhorns', '/thefunnies/thelockhorns/')
#add('TheOtherCoast', '/thefunnies/theothercoast/')
add('TinasGroove', '/thefunnies/tinasgroove/')
#add('WeePals', '/thefunnies/weepals/')
#add('WizardofId', '/thefunnies/wizardofid/')
#add('WorkingitOut', '/thefunnies/workingitout/')
#add('Wumo', '/thefunnies/wumo/')
#add('ZackHill', '/thefunnies/zackhill/')
add('Zits', '/thefunnies/zits/')
# 9ChickweedLane has a duplicate in GoComics/9ChickweedLane
# Agnes has a duplicate in GoComics/Agnes
# AndyCapp has a duplicate in GoComics/AndyCapp
# Archie has a duplicate in Creators/Archie
class ArcticCircle(_Arcamax):
path = 'arcticcircle'
# AskShagg has a duplicate in GoComics/AskShagg
class BabyBlues(_Arcamax):
path = 'babyblues'
# BallardStreet has a duplicate in GoComics/BallardStreet
# BarneyAndClyde has a duplicate in GoComics/BarneyAndClyde
class BarneyGoogleAndSnuffySmith(_Arcamax):
path = 'barneygoogle'
# BC has a duplicate in GoComics/BC
class BeetleBailey(_Arcamax):
path = 'beetlebailey'
class Bizarro(_Arcamax):
path = 'bizarro'
# BleekerTheRechargeableDog has a duplicate in GoComics/BleekerTheRechargeableDog
class Blondie(_Arcamax):
path = 'blondie'
class Boondocks(_Arcamax):
path = 'boondocks'
class BrilliantMindOfEdisonLee(_Arcamax):
path = 'brilliantmindofedisonlee'
# Candorville has a duplicate in GoComics/Candorville
class CarpeDiem(_Arcamax):
path = 'carpediem'
# Cathy has a duplicate in GoComics/Cathy
# ChipBok has a duplicate in GoComics/ChipBok
# ChuckleBros has a duplicate in GoComics/ChuckleBros
# ClayBennett has a duplicate in GoComics/ClayBennett
class Crankshaft(_Arcamax):
path = 'crankshaft'
# CulDeSac has a duplicate in GoComics/CulDeSac
class Curtis(_Arcamax):
path = 'curtis'
# DaddysHome has a duplicate in GoComics/DaddysHome
# DarrinBell has a duplicate in GoComics/DarrinBell
class DeFlocked(_Arcamax):
path = 'deflocked'
class DennisTheMenace(_Arcamax):
path = 'dennisthemenace'
# DiamondLil has a duplicate in GoComics/DiamondLil
# Dilbert has a duplicate in Dilbert
class DinetteSet(_Arcamax):
path = 'thedinetteset'
# DogEatDoug has a duplicate in GoComics/DogEatDoug
# DogsOfCKennel has a duplicate in GoComics/DogsOfCKennel
# Doonesbury has a duplicate in GoComics/Doonesbury
class Dustin(_Arcamax):
path = 'dustin'
class FamilyCircus(_Arcamax):
path = 'familycircus'
# FloAndFriends has a duplicate in GoComics/FloAndFriends
# ForBetterOrForWorse has a duplicate in GoComics/ForBetterOrForWorse
# ForHeavensSake has a duplicate in GoComics/ForHeavensSake
# FortKnox has a duplicate in GoComics/FortKnox
# FreeRange has a duplicate in GoComics/FreeRange
# Garfield has a duplicate in GoComics/Garfield
# GetFuzzy has a duplicate in GoComics/GetFuzzy
# HagarTheHorrible has a duplicate in HagarTheHorrible
# Heathcliff has a duplicate in GoComics/Heathcliff
# HerbAndJamaal has a duplicate in GoComics/HerbAndJamaal
class HiAndLois(_Arcamax):
path = 'hiandlois'
class IntelligentLife(_Arcamax):
path = 'intelligentlife'
class JerryKingCartoons(_Arcamax):
path = 'humorcartoon'
# LisaBenson has a duplicate in GoComics/LisaBenson
# LittleDogLost has a duplicate in GoComics/LittleDogLost
# LongStoryShort has a duplicate in Creators/LongStoryShort
# LooseParts has a duplicate in GoComics/LooseParts
# Luann has a duplicate in GoComics/Luann
class MallardFillmore(_Arcamax):
path = 'mallardfillmore'
class Marvin(_Arcamax):
path = 'marvin'
class MasterStrokesGolfTips(_Arcamax):
path = 'masterstrokes'
class MeaningOfLila(_Arcamax):
path = 'meaningoflila'
# MichaelRamirez has a duplicate in GoComics/MichaelRamirez
# MikeDuJour has a duplicate in GoComics/MikeDuJour
# MikeLester has a duplicate in GoComics/MikeLester
# MikeLuckovich has a duplicate in GoComics/MikeLuckovich
# Momma has a duplicate in GoComics/Momma
class MotherGooseAndGrimm(_Arcamax):
path = 'mothergooseandgrimm'
class Mutts(_Arcamax):
path = 'mutts'
# NestHeads has a duplicate in GoComics/NestHeads
# NickAnderson has a duplicate in GoComics/NickAnderson
# NonSequitur has a duplicate in GoComics/NonSequitur
# OneBigHappy has a duplicate in GoComics/OneBigHappy
# Peanuts has a duplicate in GoComics/Peanuts
# PearlsBeforeSwine has a duplicate in GoComics/PearlsBeforeSwine
# Pickles has a duplicate in GoComics/Pickles
# RedAndRover has a duplicate in GoComics/RedAndRover
# ReplyAll has a duplicate in GoComics/ReplyAll
class RhymesWithOrange(_Arcamax):
path = 'rhymeswithorange'
# Rubes has a duplicate in GoComics/Rubes
# RudyPark has a duplicate in GoComics/RudyPark
# Rugrats has a duplicate in Creators/Rugrats
# ScaryGary has a duplicate in GoComics/ScaryGary
# Shoe has a duplicate in GoComics/Shoe
# SigneWilkinson has a duplicate in GoComics/SigneWilkinson
# SpeedBump has a duplicate in GoComics/SpeedBump
# SteveBenson has a duplicate in GoComics/SteveBenson
# SteveBreen has a duplicate in GoComics/SteveBreen
# StrangeBrew has a duplicate in GoComics/StrangeBrew
class TakeItFromTheTinkersons(_Arcamax):
path = 'takeitfromthetinkersons'
# TheBarn has a duplicate in GoComics/TheBarn
class TheLockhorns(_Arcamax):
path = 'thelockhorns'
# TheOtherCoast has a duplicate in GoComics/TheOtherCoast
class TinasGroove(_Arcamax):
path = 'tinasgroove'
# WeePals has a duplicate in GoComics/WeePals
# WizardOfId has a duplicate in GoComics/WizardOfId
# WorkingItOut has a duplicate in GoComics/WorkingItOut
# Wumo has a duplicate in GoComics/WuMo
# ZackHill has a duplicate in GoComics/ZackHill
class Zits(_Arcamax):
path = 'zits'

View file

@ -10,21 +10,20 @@ processing.
from __future__ import absolute_import, division, print_function
import codecs
import re
import sys
import os
import requests
from lxml import html
sys.path.append(os.path.join(os.path.dirname(__file__), "..")) # noqa
from dosagelib.util import get_page
from dosagelib.scraper import get_scraperclasses
from scriptutil import contains_case_insensitive, save_result, load_result, truncate_name, format_name
from dosagelib.scraper import get_scrapers
from scriptutil import (contains_case_insensitive, save_result, load_result,
truncate_name, format_name)
json_file = __file__.replace(".py", ".json")
url_matcher = re.compile(r'<li><a href="(/thefunnies/[^"]+)">([^<]+)</a>')
# names of comics to exclude
exclude_comics = [
"HagartheHorrible", # better source available
@ -35,20 +34,22 @@ def handle_url(url, session, res):
"""Parse one search result page."""
print("Parsing", url, file=sys.stderr)
try:
data = get_page(url, session).text
data = html.document_fromstring(get_page(url, session).text)
data.make_links_absolute(url)
except IOError as msg:
print("ERROR:", msg, file=sys.stderr)
return
for match in url_matcher.finditer(data):
shortname = match.group(1)
name = format_name(match.group(2))
for comiclink in data.cssselect('a.comic-icon'):
path = comiclink.attrib['href']
name = format_name(comiclink.attrib['title'])
if name in exclude_comics:
continue
if contains_case_insensitive(res, name):
# we cannot handle two comics that only differ in case
print("INFO: skipping possible duplicate", repr(name), file=sys.stderr)
continue
res[name] = shortname
res[name] = path.rsplit('/', 2)[1]
if not res:
print("ERROR:", "did not match any comics", file=sys.stderr)
@ -62,7 +63,7 @@ def get_results():
save_result(res, json_file)
def has_comic(name):
def find_dups(name):
"""Check if comic name already exists."""
names = [
("Creators/%s" % name).lower(),
@ -72,26 +73,29 @@ def has_comic(name):
("ComicGenesis/%s" % name).lower(),
("SmackJeeves/%s" % name).lower(),
]
for scraperclass in get_scraperclasses():
lname = scraperclass.getName().lower()
for scraperobj in get_scrapers():
lname = scraperobj.name.lower()
if lname in names or lname == name.lower():
return True
return False
return scraperobj.name
return None
def first_lower(x):
    """Return the first element of ``x`` converted to lower case.

    Serves as a sort key for ``(name, path)`` pairs so that entries are
    ordered by name without regard to case.
    """
    head = x[0]
    return head.lower()
def print_results(args):
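"""Append one entry per collected comic to the given file; comics that duplicate an existing scraper are written as comments only (the minimum strip count in args is unpacked but not used)."""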
min_comics, filename = args
with codecs.open(filename, 'a', 'utf-8') as fp:
for name, shortname in sorted(load_result(json_file).items()):
if name in exclude_comics:
continue
if has_comic(name):
prefix = u'#'
data = load_result(json_file)
for name, path in sorted(data.items(), key=first_lower):
dup = find_dups(name)
if dup is not None:
fp.write(u"# %s has a duplicate in %s\n" % (name, dup))
else:
prefix = u''
fp.write(u"%sadd(%r, %r)\n" % (prefix, str(truncate_name(name)),
str(shortname)))
fp.write(u"\n\nclass %s(_Arcamax):\n path = %r\n" % (
truncate_name(name), path))
if __name__ == '__main__':