Remove make_scraper magic from Arcamax.
This commit is contained in:
parent
db87ed95e7
commit
497653c448
2 changed files with 235 additions and 128 deletions
|
@ -1,111 +1,214 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
# Copyright (C) 2013-2014 Bastian Kleineidam
|
||||
"""
|
||||
Arcamax comic strips
|
||||
"""
|
||||
from re import compile
|
||||
from ..scraper import make_scraper
|
||||
from ..util import tagre
|
||||
# -*- coding: utf-8 -*-
|
||||
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
||||
# Copyright (C) 2012-2014 Bastian Kleineidam
|
||||
# Copyright (C) 2015-2016 Tobias Gruetzmacher
|
||||
|
||||
from __future__ import absolute_import, division, print_function
|
||||
|
||||
from ..scraper import _ParserScraper
|
||||
|
||||
|
||||
_imageSearch = compile(tagre("img", "data-zoom-image", r'(/newspics/[^"]+)'))
|
||||
_prevSearch = compile(tagre("a", "href", r'(/[^"]+)', before='prev'))
|
||||
class _Arcamax(_ParserScraper):
|
||||
imageSearch = '//img[@id="comic-zoom"]'
|
||||
prevSearch = '//a[@class="prev"]'
|
||||
|
||||
def add(name, shortname):
|
||||
url = 'http://www.arcamax.com%s' % shortname
|
||||
classname = 'Arcamax_%s' % name
|
||||
@property
|
||||
def url(self):
|
||||
return 'http://www.arcamax.com/thefunnies/' + self.path + '/'
|
||||
|
||||
globals()[classname] = make_scraper(classname,
|
||||
name='Arcamax/' + name,
|
||||
url = url,
|
||||
stripUrl = url + '%s',
|
||||
imageSearch = _imageSearch,
|
||||
prevSearch = _prevSearch,
|
||||
help = 'Index format: none',
|
||||
)
|
||||
@property
|
||||
def name(self):
|
||||
return 'Arcamax/' + super(_Arcamax, self).name
|
||||
|
||||
# do not edit anything below since these entries are generated from scripts/update.sh
|
||||
|
||||
# do not edit anything below since these entries are generated from
|
||||
# scripts/update_plugins.sh
|
||||
# DO NOT REMOVE
|
||||
#add('9ChickweedLane', '/thefunnies/ninechickweedlane/')
|
||||
#add('Agnes', '/thefunnies/agnes/')
|
||||
#add('AndyCapp', '/thefunnies/andycapp/')
|
||||
#add('Archie', '/thefunnies/archie/')
|
||||
add('ArcticCircle', '/thefunnies/arcticcircle/')
|
||||
#add('AskShagg', '/thefunnies/askshagg/')
|
||||
#add('BC', '/thefunnies/bc/')
|
||||
add('BabyBlues', '/thefunnies/babyblues/')
|
||||
#add('BallardStreet', '/thefunnies/ballardstreet/')
|
||||
#add('BarneyAndClyde', '/thefunnies/barneyandclyde/')
|
||||
add('BarneyGoogleAndSnuffySmith', '/thefunnies/barneygoogle/')
|
||||
add('BeetleBailey', '/thefunnies/beetlebailey/')
|
||||
add('Bizarro', '/thefunnies/bizarro/')
|
||||
add('BleekerTheRechargeableDog', '/thefunnies/bleekertherechargeabledog/')
|
||||
add('Blondie', '/thefunnies/blondie/')
|
||||
add('Boondocks', '/thefunnies/boondocks/')
|
||||
add('BrilliantMindofEdisonLee', '/thefunnies/brilliantmindofedisonlee/')
|
||||
#add('Candorville', '/thefunnies/candorville/')
|
||||
#add('Cathy', '/thefunnies/cathy/')
|
||||
#add('ChuckleBros', '/thefunnies/chucklebros/')
|
||||
add('Crankshaft', '/thefunnies/crankshaft/')
|
||||
#add('CuldeSac', '/thefunnies/culdesac/')
|
||||
add('Curtis', '/thefunnies/curtis/')
|
||||
#add('DaddysHome', '/thefunnies/daddyshome/')
|
||||
add('DeFlocked', '/thefunnies/deflocked/')
|
||||
add('DennistheMenace', '/thefunnies/dennisthemenace/')
|
||||
#add('DiamondLil', '/thefunnies/diamondlil/')
|
||||
#add('Dilbert', '/thefunnies/dilbert/')
|
||||
add('DinetteSet', '/thefunnies/thedinetteset/')
|
||||
#add('DogEatDoug', '/thefunnies/dogeatdoug/')
|
||||
#add('DogsofCKennel', '/thefunnies/dogsofckennel/')
|
||||
#add('Doonesbury', '/thefunnies/doonesbury/')
|
||||
add('Dustin', '/thefunnies/dustin/')
|
||||
add('FamilyCircus', '/thefunnies/familycircus/')
|
||||
#add('FloAndFriends', '/thefunnies/floandfriends/')
|
||||
#add('ForHeavensSake', '/thefunnies/forheavenssake/')
|
||||
#add('FortKnox', '/thefunnies/fortknox/')
|
||||
#add('FreeRange', '/thefunnies/freerange/')
|
||||
#add('Garfield', '/thefunnies/garfield/')
|
||||
#add('GetFuzzy', '/thefunnies/getfuzzy/')
|
||||
#add('Heathcliff', '/thefunnies/heathcliff/')
|
||||
#add('HerbandJamaal', '/thefunnies/herbandjamaal/')
|
||||
add('HiandLois', '/thefunnies/hiandlois/')
|
||||
#add('HomeAndAway', '/thefunnies/homeandaway/')
|
||||
add('IntelligentLife', '/thefunnies/intelligentlife/')
|
||||
add('JerryKingCartoons', '/thefunnies/humorcartoon/')
|
||||
#add('LittleDogLost', '/thefunnies/littledoglost/')
|
||||
#add('LongStoryShort', '/thefunnies/longstoryshort/')
|
||||
#add('LooseParts', '/thefunnies/looseparts/')
|
||||
#add('Luann', '/thefunnies/luann/')
|
||||
add('MallardFillmore', '/thefunnies/mallardfillmore/')
|
||||
add('Marvin', '/thefunnies/marvin/')
|
||||
add('MeaningofLila', '/thefunnies/meaningoflila/')
|
||||
#add('MikeDuJour', '/thefunnies/mikedujour/')
|
||||
#add('Momma', '/thefunnies/momma/')
|
||||
add('MotherGooseAndGrimm', '/thefunnies/mothergooseandgrimm/')
|
||||
add('Mutts', '/thefunnies/mutts/')
|
||||
#add('NestHeads', '/thefunnies/nestheads/')
|
||||
#add('NonSequitur', '/thefunnies/nonsequitur/')
|
||||
#add('OneBigHappy', '/thefunnies/onebighappy/')
|
||||
#add('Peanuts', '/thefunnies/peanuts/')
|
||||
#add('PearlsBeforeSwine', '/thefunnies/pearlsbeforeswine/')
|
||||
#add('Pickles', '/thefunnies/pickles/')
|
||||
#add('RedandRover', '/thefunnies/redandrover/')
|
||||
#add('ReplyAll', '/thefunnies/replyall/')
|
||||
add('RhymeswithOrange', '/thefunnies/rhymeswithorange/')
|
||||
#add('Rubes', '/thefunnies/rubes/')
|
||||
#add('RudyPark', '/thefunnies/rudypark/')
|
||||
#add('Rugrats', '/thefunnies/rugrats/')
|
||||
#add('ScaryGary', '/thefunnies/scarygary/')
|
||||
#add('SpeedBump', '/thefunnies/speedbump/')
|
||||
#add('StrangeBrew', '/thefunnies/strangebrew/')
|
||||
add('TakeItFromTheTinkersons', '/thefunnies/takeitfromthetinkersons/')
|
||||
#add('TheBarn', '/thefunnies/thebarn/')
|
||||
add('TheLockhorns', '/thefunnies/thelockhorns/')
|
||||
#add('TheOtherCoast', '/thefunnies/theothercoast/')
|
||||
add('TinasGroove', '/thefunnies/tinasgroove/')
|
||||
#add('WeePals', '/thefunnies/weepals/')
|
||||
#add('WizardofId', '/thefunnies/wizardofid/')
|
||||
#add('WorkingitOut', '/thefunnies/workingitout/')
|
||||
#add('Wumo', '/thefunnies/wumo/')
|
||||
#add('ZackHill', '/thefunnies/zackhill/')
|
||||
add('Zits', '/thefunnies/zits/')
|
||||
# 9ChickweedLane has a duplicate in GoComics/9ChickweedLane
|
||||
# Agnes has a duplicate in GoComics/Agnes
|
||||
# AndyCapp has a duplicate in GoComics/AndyCapp
|
||||
# Archie has a duplicate in Creators/Archie
|
||||
|
||||
|
||||
class ArcticCircle(_Arcamax):
|
||||
path = 'arcticcircle'
|
||||
# AskShagg has a duplicate in GoComics/AskShagg
|
||||
|
||||
|
||||
class BabyBlues(_Arcamax):
|
||||
path = 'babyblues'
|
||||
# BallardStreet has a duplicate in GoComics/BallardStreet
|
||||
# BarneyAndClyde has a duplicate in GoComics/BarneyAndClyde
|
||||
|
||||
|
||||
class BarneyGoogleAndSnuffySmith(_Arcamax):
|
||||
path = 'barneygoogle'
|
||||
# BC has a duplicate in GoComics/BC
|
||||
|
||||
|
||||
class BeetleBailey(_Arcamax):
|
||||
path = 'beetlebailey'
|
||||
|
||||
|
||||
class Bizarro(_Arcamax):
|
||||
path = 'bizarro'
|
||||
# BleekerTheRechargeableDog has a duplicate in GoComics/BleekerTheRechargeableDog
|
||||
|
||||
|
||||
class Blondie(_Arcamax):
|
||||
path = 'blondie'
|
||||
|
||||
|
||||
class Boondocks(_Arcamax):
|
||||
path = 'boondocks'
|
||||
|
||||
|
||||
class BrilliantMindOfEdisonLee(_Arcamax):
|
||||
path = 'brilliantmindofedisonlee'
|
||||
# Candorville has a duplicate in GoComics/Candorville
|
||||
|
||||
|
||||
class CarpeDiem(_Arcamax):
|
||||
path = 'carpediem'
|
||||
# Cathy has a duplicate in GoComics/Cathy
|
||||
# ChipBok has a duplicate in GoComics/ChipBok
|
||||
# ChuckleBros has a duplicate in GoComics/ChuckleBros
|
||||
# ClayBennett has a duplicate in GoComics/ClayBennett
|
||||
|
||||
|
||||
class Crankshaft(_Arcamax):
|
||||
path = 'crankshaft'
|
||||
# CulDeSac has a duplicate in GoComics/CulDeSac
|
||||
|
||||
|
||||
class Curtis(_Arcamax):
|
||||
path = 'curtis'
|
||||
# DaddysHome has a duplicate in GoComics/DaddysHome
|
||||
# DarrinBell has a duplicate in GoComics/DarrinBell
|
||||
|
||||
|
||||
class DeFlocked(_Arcamax):
|
||||
path = 'deflocked'
|
||||
|
||||
|
||||
class DennisTheMenace(_Arcamax):
|
||||
path = 'dennisthemenace'
|
||||
# DiamondLil has a duplicate in GoComics/DiamondLil
|
||||
# Dilbert has a duplicate in Dilbert
|
||||
|
||||
|
||||
class DinetteSet(_Arcamax):
|
||||
path = 'thedinetteset'
|
||||
# DogEatDoug has a duplicate in GoComics/DogEatDoug
|
||||
# DogsOfCKennel has a duplicate in GoComics/DogsOfCKennel
|
||||
# Doonesbury has a duplicate in GoComics/Doonesbury
|
||||
|
||||
|
||||
class Dustin(_Arcamax):
|
||||
path = 'dustin'
|
||||
|
||||
|
||||
class FamilyCircus(_Arcamax):
|
||||
path = 'familycircus'
|
||||
# FloAndFriends has a duplicate in GoComics/FloAndFriends
|
||||
# ForBetterOrForWorse has a duplicate in GoComics/ForBetterOrForWorse
|
||||
# ForHeavensSake has a duplicate in GoComics/ForHeavensSake
|
||||
# FortKnox has a duplicate in GoComics/FortKnox
|
||||
# FreeRange has a duplicate in GoComics/FreeRange
|
||||
# Garfield has a duplicate in GoComics/Garfield
|
||||
# GetFuzzy has a duplicate in GoComics/GetFuzzy
|
||||
# HagarTheHorrible has a duplicate in HagarTheHorrible
|
||||
# Heathcliff has a duplicate in GoComics/Heathcliff
|
||||
# HerbAndJamaal has a duplicate in GoComics/HerbAndJamaal
|
||||
|
||||
|
||||
class HiAndLois(_Arcamax):
|
||||
path = 'hiandlois'
|
||||
|
||||
|
||||
class IntelligentLife(_Arcamax):
|
||||
path = 'intelligentlife'
|
||||
|
||||
|
||||
class JerryKingCartoons(_Arcamax):
|
||||
path = 'humorcartoon'
|
||||
# LisaBenson has a duplicate in GoComics/LisaBenson
|
||||
# LittleDogLost has a duplicate in GoComics/LittleDogLost
|
||||
# LongStoryShort has a duplicate in Creators/LongStoryShort
|
||||
# LooseParts has a duplicate in GoComics/LooseParts
|
||||
# Luann has a duplicate in GoComics/Luann
|
||||
|
||||
|
||||
class MallardFillmore(_Arcamax):
|
||||
path = 'mallardfillmore'
|
||||
|
||||
|
||||
class Marvin(_Arcamax):
|
||||
path = 'marvin'
|
||||
|
||||
|
||||
class MasterStrokesGolfTips(_Arcamax):
|
||||
path = 'masterstrokes'
|
||||
|
||||
|
||||
class MeaningOfLila(_Arcamax):
|
||||
path = 'meaningoflila'
|
||||
# MichaelRamirez has a duplicate in GoComics/MichaelRamirez
|
||||
# MikeDuJour has a duplicate in GoComics/MikeDuJour
|
||||
# MikeLester has a duplicate in GoComics/MikeLester
|
||||
# MikeLuckovich has a duplicate in GoComics/MikeLuckovich
|
||||
# Momma has a duplicate in GoComics/Momma
|
||||
|
||||
|
||||
class MotherGooseAndGrimm(_Arcamax):
|
||||
path = 'mothergooseandgrimm'
|
||||
|
||||
|
||||
class Mutts(_Arcamax):
|
||||
path = 'mutts'
|
||||
# NestHeads has a duplicate in GoComics/NestHeads
|
||||
# NickAnderson has a duplicate in GoComics/NickAnderson
|
||||
# NonSequitur has a duplicate in GoComics/NonSequitur
|
||||
# OneBigHappy has a duplicate in GoComics/OneBigHappy
|
||||
# Peanuts has a duplicate in GoComics/Peanuts
|
||||
# PearlsBeforeSwine has a duplicate in GoComics/PearlsBeforeSwine
|
||||
# Pickles has a duplicate in GoComics/Pickles
|
||||
# RedAndRover has a duplicate in GoComics/RedAndRover
|
||||
# ReplyAll has a duplicate in GoComics/ReplyAll
|
||||
|
||||
|
||||
class RhymesWithOrange(_Arcamax):
|
||||
path = 'rhymeswithorange'
|
||||
# Rubes has a duplicate in GoComics/Rubes
|
||||
# RudyPark has a duplicate in GoComics/RudyPark
|
||||
# Rugrats has a duplicate in Creators/Rugrats
|
||||
# ScaryGary has a duplicate in GoComics/ScaryGary
|
||||
# Shoe has a duplicate in GoComics/Shoe
|
||||
# SigneWilkinson has a duplicate in GoComics/SigneWilkinson
|
||||
# SpeedBump has a duplicate in GoComics/SpeedBump
|
||||
# SteveBenson has a duplicate in GoComics/SteveBenson
|
||||
# SteveBreen has a duplicate in GoComics/SteveBreen
|
||||
# StrangeBrew has a duplicate in GoComics/StrangeBrew
|
||||
|
||||
|
||||
class TakeItFromTheTinkersons(_Arcamax):
|
||||
path = 'takeitfromthetinkersons'
|
||||
# TheBarn has a duplicate in GoComics/TheBarn
|
||||
|
||||
|
||||
class TheLockhorns(_Arcamax):
|
||||
path = 'thelockhorns'
|
||||
# TheOtherCoast has a duplicate in GoComics/TheOtherCoast
|
||||
|
||||
|
||||
class TinasGroove(_Arcamax):
|
||||
path = 'tinasgroove'
|
||||
# WeePals has a duplicate in GoComics/WeePals
|
||||
# WizardOfId has a duplicate in GoComics/WizardOfId
|
||||
# WorkingItOut has a duplicate in GoComics/WorkingItOut
|
||||
# Wumo has a duplicate in GoComics/WuMo
|
||||
# ZackHill has a duplicate in GoComics/ZackHill
|
||||
|
||||
|
||||
class Zits(_Arcamax):
|
||||
path = 'zits'
|
||||
|
|
|
@ -10,21 +10,20 @@ processing.
|
|||
from __future__ import absolute_import, division, print_function
|
||||
|
||||
import codecs
|
||||
import re
|
||||
import sys
|
||||
import os
|
||||
|
||||
import requests
|
||||
from lxml import html
|
||||
|
||||
sys.path.append(os.path.join(os.path.dirname(__file__), "..")) # noqa
|
||||
from dosagelib.util import get_page
|
||||
from dosagelib.scraper import get_scraperclasses
|
||||
from scriptutil import contains_case_insensitive, save_result, load_result, truncate_name, format_name
|
||||
from dosagelib.scraper import get_scrapers
|
||||
from scriptutil import (contains_case_insensitive, save_result, load_result,
|
||||
truncate_name, format_name)
|
||||
|
||||
json_file = __file__.replace(".py", ".json")
|
||||
|
||||
url_matcher = re.compile(r'<li><a href="(/thefunnies/[^"]+)">([^<]+)</a>')
|
||||
|
||||
# names of comics to exclude
|
||||
exclude_comics = [
|
||||
"HagartheHorrible", # better source available
|
||||
|
@ -35,20 +34,22 @@ def handle_url(url, session, res):
|
|||
"""Parse one search result page."""
|
||||
print("Parsing", url, file=sys.stderr)
|
||||
try:
|
||||
data = get_page(url, session).text
|
||||
data = html.document_fromstring(get_page(url, session).text)
|
||||
data.make_links_absolute(url)
|
||||
except IOError as msg:
|
||||
print("ERROR:", msg, file=sys.stderr)
|
||||
return
|
||||
for match in url_matcher.finditer(data):
|
||||
shortname = match.group(1)
|
||||
name = format_name(match.group(2))
|
||||
|
||||
for comiclink in data.cssselect('a.comic-icon'):
|
||||
path = comiclink.attrib['href']
|
||||
name = format_name(comiclink.attrib['title'])
|
||||
if name in exclude_comics:
|
||||
continue
|
||||
if contains_case_insensitive(res, name):
|
||||
# we cannot handle two comics that only differ in case
|
||||
print("INFO: skipping possible duplicate", repr(name), file=sys.stderr)
|
||||
continue
|
||||
res[name] = shortname
|
||||
res[name] = path.rsplit('/', 2)[1]
|
||||
if not res:
|
||||
print("ERROR:", "did not match any comics", file=sys.stderr)
|
||||
|
||||
|
@ -62,7 +63,7 @@ def get_results():
|
|||
save_result(res, json_file)
|
||||
|
||||
|
||||
def has_comic(name):
|
||||
def find_dups(name):
|
||||
"""Return the name of an existing scraper that duplicates the comic name, or None."""
|
||||
names = [
|
||||
("Creators/%s" % name).lower(),
|
||||
|
@ -72,26 +73,29 @@ def has_comic(name):
|
|||
("ComicGenesis/%s" % name).lower(),
|
||||
("SmackJeeves/%s" % name).lower(),
|
||||
]
|
||||
for scraperclass in get_scraperclasses():
|
||||
lname = scraperclass.getName().lower()
|
||||
for scraperobj in get_scrapers():
|
||||
lname = scraperobj.name.lower()
|
||||
if lname in names or lname == name.lower():
|
||||
return True
|
||||
return False
|
||||
return scraperobj.name
|
||||
return None
|
||||
|
||||
|
||||
def first_lower(x):
    """Return the first element of ``x`` converted to lower case.

    Used as a sort key so that items are ordered case-insensitively by
    their first element.
    """
    head = x[0]
    return head.lower()
|
||||
|
||||
|
||||
def print_results(args):
|
||||
"""Append a scraper class for each comic in the saved results to the given file; a comic that duplicates an existing scraper gets a comment line instead."""
|
||||
min_comics, filename = args
|
||||
with codecs.open(filename, 'a', 'utf-8') as fp:
|
||||
for name, shortname in sorted(load_result(json_file).items()):
|
||||
if name in exclude_comics:
|
||||
continue
|
||||
if has_comic(name):
|
||||
prefix = u'#'
|
||||
data = load_result(json_file)
|
||||
for name, path in sorted(data.items(), key=first_lower):
|
||||
dup = find_dups(name)
|
||||
if dup is not None:
|
||||
fp.write(u"# %s has a duplicate in %s\n" % (name, dup))
|
||||
else:
|
||||
prefix = u''
|
||||
fp.write(u"%sadd(%r, %r)\n" % (prefix, str(truncate_name(name)),
|
||||
str(shortname)))
|
||||
fp.write(u"\n\nclass %s(_Arcamax):\n path = %r\n" % (
|
||||
truncate_name(name), path))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
|
Loading…
Reference in a new issue