Remove make_scraper magic from Arcamax.
This commit is contained in:
parent
db87ed95e7
commit
497653c448
2 changed files with 235 additions and 128 deletions
|
@ -1,111 +1,214 @@
|
||||||
# -*- coding: iso-8859-1 -*-
|
# -*- coding: utf-8 -*-
|
||||||
# Copyright (C) 2013-2014 Bastian Kleineidam
|
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
||||||
"""
|
# Copyright (C) 2012-2014 Bastian Kleineidam
|
||||||
Arcamax comic strips
|
# Copyright (C) 2015-2016 Tobias Gruetzmacher
|
||||||
"""
|
|
||||||
from re import compile
|
from __future__ import absolute_import, division, print_function
|
||||||
from ..scraper import make_scraper
|
|
||||||
from ..util import tagre
|
from ..scraper import _ParserScraper
|
||||||
|
|
||||||
|
|
||||||
_imageSearch = compile(tagre("img", "data-zoom-image", r'(/newspics/[^"]+)'))
|
class _Arcamax(_ParserScraper):
|
||||||
_prevSearch = compile(tagre("a", "href", r'(/[^"]+)', before='prev'))
|
imageSearch = '//img[@id="comic-zoom"]'
|
||||||
|
prevSearch = '//a[@class="prev"]'
|
||||||
|
|
||||||
def add(name, shortname):
|
@property
|
||||||
url = 'http://www.arcamax.com%s' % shortname
|
def url(self):
|
||||||
classname = 'Arcamax_%s' % name
|
return 'http://www.arcamax.com/thefunnies/' + self.path + '/'
|
||||||
|
|
||||||
globals()[classname] = make_scraper(classname,
|
@property
|
||||||
name='Arcamax/' + name,
|
def name(self):
|
||||||
url = url,
|
return 'Arcamax/' + super(_Arcamax, self).name
|
||||||
stripUrl = url + '%s',
|
|
||||||
imageSearch = _imageSearch,
|
|
||||||
prevSearch = _prevSearch,
|
|
||||||
help = 'Index format: none',
|
|
||||||
)
|
|
||||||
|
|
||||||
# do not edit anything below since these entries are generated from scripts/update.sh
|
|
||||||
|
# do not edit anything below since these entries are generated from
|
||||||
|
# scripts/update_plugins.sh
|
||||||
# DO NOT REMOVE
|
# DO NOT REMOVE
|
||||||
#add('9ChickweedLane', '/thefunnies/ninechickweedlane/')
|
# 9ChickweedLane has a duplicate in GoComics/9ChickweedLane
|
||||||
#add('Agnes', '/thefunnies/agnes/')
|
# Agnes has a duplicate in GoComics/Agnes
|
||||||
#add('AndyCapp', '/thefunnies/andycapp/')
|
# AndyCapp has a duplicate in GoComics/AndyCapp
|
||||||
#add('Archie', '/thefunnies/archie/')
|
# Archie has a duplicate in Creators/Archie
|
||||||
add('ArcticCircle', '/thefunnies/arcticcircle/')
|
|
||||||
#add('AskShagg', '/thefunnies/askshagg/')
|
|
||||||
#add('BC', '/thefunnies/bc/')
|
class ArcticCircle(_Arcamax):
|
||||||
add('BabyBlues', '/thefunnies/babyblues/')
|
path = 'arcticcircle'
|
||||||
#add('BallardStreet', '/thefunnies/ballardstreet/')
|
# AskShagg has a duplicate in GoComics/AskShagg
|
||||||
#add('BarneyAndClyde', '/thefunnies/barneyandclyde/')
|
|
||||||
add('BarneyGoogleAndSnuffySmith', '/thefunnies/barneygoogle/')
|
|
||||||
add('BeetleBailey', '/thefunnies/beetlebailey/')
|
class BabyBlues(_Arcamax):
|
||||||
add('Bizarro', '/thefunnies/bizarro/')
|
path = 'babyblues'
|
||||||
add('BleekerTheRechargeableDog', '/thefunnies/bleekertherechargeabledog/')
|
# BallardStreet has a duplicate in GoComics/BallardStreet
|
||||||
add('Blondie', '/thefunnies/blondie/')
|
# BarneyAndClyde has a duplicate in GoComics/BarneyAndClyde
|
||||||
add('Boondocks', '/thefunnies/boondocks/')
|
|
||||||
add('BrilliantMindofEdisonLee', '/thefunnies/brilliantmindofedisonlee/')
|
|
||||||
#add('Candorville', '/thefunnies/candorville/')
|
class BarneyGoogleAndSnuffySmith(_Arcamax):
|
||||||
#add('Cathy', '/thefunnies/cathy/')
|
path = 'barneygoogle'
|
||||||
#add('ChuckleBros', '/thefunnies/chucklebros/')
|
# BC has a duplicate in GoComics/BC
|
||||||
add('Crankshaft', '/thefunnies/crankshaft/')
|
|
||||||
#add('CuldeSac', '/thefunnies/culdesac/')
|
|
||||||
add('Curtis', '/thefunnies/curtis/')
|
class BeetleBailey(_Arcamax):
|
||||||
#add('DaddysHome', '/thefunnies/daddyshome/')
|
path = 'beetlebailey'
|
||||||
add('DeFlocked', '/thefunnies/deflocked/')
|
|
||||||
add('DennistheMenace', '/thefunnies/dennisthemenace/')
|
|
||||||
#add('DiamondLil', '/thefunnies/diamondlil/')
|
class Bizarro(_Arcamax):
|
||||||
#add('Dilbert', '/thefunnies/dilbert/')
|
path = 'bizarro'
|
||||||
add('DinetteSet', '/thefunnies/thedinetteset/')
|
# BleekerTheRechargeableDog has a duplicate in GoComics/BleekerTheRechargeableDog
|
||||||
#add('DogEatDoug', '/thefunnies/dogeatdoug/')
|
|
||||||
#add('DogsofCKennel', '/thefunnies/dogsofckennel/')
|
|
||||||
#add('Doonesbury', '/thefunnies/doonesbury/')
|
class Blondie(_Arcamax):
|
||||||
add('Dustin', '/thefunnies/dustin/')
|
path = 'blondie'
|
||||||
add('FamilyCircus', '/thefunnies/familycircus/')
|
|
||||||
#add('FloAndFriends', '/thefunnies/floandfriends/')
|
|
||||||
#add('ForHeavensSake', '/thefunnies/forheavenssake/')
|
class Boondocks(_Arcamax):
|
||||||
#add('FortKnox', '/thefunnies/fortknox/')
|
path = 'boondocks'
|
||||||
#add('FreeRange', '/thefunnies/freerange/')
|
|
||||||
#add('Garfield', '/thefunnies/garfield/')
|
|
||||||
#add('GetFuzzy', '/thefunnies/getfuzzy/')
|
class BrilliantMindOfEdisonLee(_Arcamax):
|
||||||
#add('Heathcliff', '/thefunnies/heathcliff/')
|
path = 'brilliantmindofedisonlee'
|
||||||
#add('HerbandJamaal', '/thefunnies/herbandjamaal/')
|
# Candorville has a duplicate in GoComics/Candorville
|
||||||
add('HiandLois', '/thefunnies/hiandlois/')
|
|
||||||
#add('HomeAndAway', '/thefunnies/homeandaway/')
|
|
||||||
add('IntelligentLife', '/thefunnies/intelligentlife/')
|
class CarpeDiem(_Arcamax):
|
||||||
add('JerryKingCartoons', '/thefunnies/humorcartoon/')
|
path = 'carpediem'
|
||||||
#add('LittleDogLost', '/thefunnies/littledoglost/')
|
# Cathy has a duplicate in GoComics/Cathy
|
||||||
#add('LongStoryShort', '/thefunnies/longstoryshort/')
|
# ChipBok has a duplicate in GoComics/ChipBok
|
||||||
#add('LooseParts', '/thefunnies/looseparts/')
|
# ChuckleBros has a duplicate in GoComics/ChuckleBros
|
||||||
#add('Luann', '/thefunnies/luann/')
|
# ClayBennett has a duplicate in GoComics/ClayBennett
|
||||||
add('MallardFillmore', '/thefunnies/mallardfillmore/')
|
|
||||||
add('Marvin', '/thefunnies/marvin/')
|
|
||||||
add('MeaningofLila', '/thefunnies/meaningoflila/')
|
class Crankshaft(_Arcamax):
|
||||||
#add('MikeDuJour', '/thefunnies/mikedujour/')
|
path = 'crankshaft'
|
||||||
#add('Momma', '/thefunnies/momma/')
|
# CulDeSac has a duplicate in GoComics/CulDeSac
|
||||||
add('MotherGooseAndGrimm', '/thefunnies/mothergooseandgrimm/')
|
|
||||||
add('Mutts', '/thefunnies/mutts/')
|
|
||||||
#add('NestHeads', '/thefunnies/nestheads/')
|
class Curtis(_Arcamax):
|
||||||
#add('NonSequitur', '/thefunnies/nonsequitur/')
|
path = 'curtis'
|
||||||
#add('OneBigHappy', '/thefunnies/onebighappy/')
|
# DaddysHome has a duplicate in GoComics/DaddysHome
|
||||||
#add('Peanuts', '/thefunnies/peanuts/')
|
# DarrinBell has a duplicate in GoComics/DarrinBell
|
||||||
#add('PearlsBeforeSwine', '/thefunnies/pearlsbeforeswine/')
|
|
||||||
#add('Pickles', '/thefunnies/pickles/')
|
|
||||||
#add('RedandRover', '/thefunnies/redandrover/')
|
class DeFlocked(_Arcamax):
|
||||||
#add('ReplyAll', '/thefunnies/replyall/')
|
path = 'deflocked'
|
||||||
add('RhymeswithOrange', '/thefunnies/rhymeswithorange/')
|
|
||||||
#add('Rubes', '/thefunnies/rubes/')
|
|
||||||
#add('RudyPark', '/thefunnies/rudypark/')
|
class DennisTheMenace(_Arcamax):
|
||||||
#add('Rugrats', '/thefunnies/rugrats/')
|
path = 'dennisthemenace'
|
||||||
#add('ScaryGary', '/thefunnies/scarygary/')
|
# DiamondLil has a duplicate in GoComics/DiamondLil
|
||||||
#add('SpeedBump', '/thefunnies/speedbump/')
|
# Dilbert has a duplicate in Dilbert
|
||||||
#add('StrangeBrew', '/thefunnies/strangebrew/')
|
|
||||||
add('TakeItFromTheTinkersons', '/thefunnies/takeitfromthetinkersons/')
|
|
||||||
#add('TheBarn', '/thefunnies/thebarn/')
|
class DinetteSet(_Arcamax):
|
||||||
add('TheLockhorns', '/thefunnies/thelockhorns/')
|
path = 'thedinetteset'
|
||||||
#add('TheOtherCoast', '/thefunnies/theothercoast/')
|
# DogEatDoug has a duplicate in GoComics/DogEatDoug
|
||||||
add('TinasGroove', '/thefunnies/tinasgroove/')
|
# DogsOfCKennel has a duplicate in GoComics/DogsOfCKennel
|
||||||
#add('WeePals', '/thefunnies/weepals/')
|
# Doonesbury has a duplicate in GoComics/Doonesbury
|
||||||
#add('WizardofId', '/thefunnies/wizardofid/')
|
|
||||||
#add('WorkingitOut', '/thefunnies/workingitout/')
|
|
||||||
#add('Wumo', '/thefunnies/wumo/')
|
class Dustin(_Arcamax):
|
||||||
#add('ZackHill', '/thefunnies/zackhill/')
|
path = 'dustin'
|
||||||
add('Zits', '/thefunnies/zits/')
|
|
||||||
|
|
||||||
|
class FamilyCircus(_Arcamax):
|
||||||
|
path = 'familycircus'
|
||||||
|
# FloAndFriends has a duplicate in GoComics/FloAndFriends
|
||||||
|
# ForBetterOrForWorse has a duplicate in GoComics/ForBetterOrForWorse
|
||||||
|
# ForHeavensSake has a duplicate in GoComics/ForHeavensSake
|
||||||
|
# FortKnox has a duplicate in GoComics/FortKnox
|
||||||
|
# FreeRange has a duplicate in GoComics/FreeRange
|
||||||
|
# Garfield has a duplicate in GoComics/Garfield
|
||||||
|
# GetFuzzy has a duplicate in GoComics/GetFuzzy
|
||||||
|
# HagarTheHorrible has a duplicate in HagarTheHorrible
|
||||||
|
# Heathcliff has a duplicate in GoComics/Heathcliff
|
||||||
|
# HerbAndJamaal has a duplicate in GoComics/HerbAndJamaal
|
||||||
|
|
||||||
|
|
||||||
|
class HiAndLois(_Arcamax):
|
||||||
|
path = 'hiandlois'
|
||||||
|
|
||||||
|
|
||||||
|
class IntelligentLife(_Arcamax):
|
||||||
|
path = 'intelligentlife'
|
||||||
|
|
||||||
|
|
||||||
|
class JerryKingCartoons(_Arcamax):
|
||||||
|
path = 'humorcartoon'
|
||||||
|
# LisaBenson has a duplicate in GoComics/LisaBenson
|
||||||
|
# LittleDogLost has a duplicate in GoComics/LittleDogLost
|
||||||
|
# LongStoryShort has a duplicate in Creators/LongStoryShort
|
||||||
|
# LooseParts has a duplicate in GoComics/LooseParts
|
||||||
|
# Luann has a duplicate in GoComics/Luann
|
||||||
|
|
||||||
|
|
||||||
|
class MallardFillmore(_Arcamax):
|
||||||
|
path = 'mallardfillmore'
|
||||||
|
|
||||||
|
|
||||||
|
class Marvin(_Arcamax):
|
||||||
|
path = 'marvin'
|
||||||
|
|
||||||
|
|
||||||
|
class MasterStrokesGolfTips(_Arcamax):
|
||||||
|
path = 'masterstrokes'
|
||||||
|
|
||||||
|
|
||||||
|
class MeaningOfLila(_Arcamax):
|
||||||
|
path = 'meaningoflila'
|
||||||
|
# MichaelRamirez has a duplicate in GoComics/MichaelRamirez
|
||||||
|
# MikeDuJour has a duplicate in GoComics/MikeDuJour
|
||||||
|
# MikeLester has a duplicate in GoComics/MikeLester
|
||||||
|
# MikeLuckovich has a duplicate in GoComics/MikeLuckovich
|
||||||
|
# Momma has a duplicate in GoComics/Momma
|
||||||
|
|
||||||
|
|
||||||
|
class MotherGooseAndGrimm(_Arcamax):
|
||||||
|
path = 'mothergooseandgrimm'
|
||||||
|
|
||||||
|
|
||||||
|
class Mutts(_Arcamax):
|
||||||
|
path = 'mutts'
|
||||||
|
# NestHeads has a duplicate in GoComics/NestHeads
|
||||||
|
# NickAnderson has a duplicate in GoComics/NickAnderson
|
||||||
|
# NonSequitur has a duplicate in GoComics/NonSequitur
|
||||||
|
# OneBigHappy has a duplicate in GoComics/OneBigHappy
|
||||||
|
# Peanuts has a duplicate in GoComics/Peanuts
|
||||||
|
# PearlsBeforeSwine has a duplicate in GoComics/PearlsBeforeSwine
|
||||||
|
# Pickles has a duplicate in GoComics/Pickles
|
||||||
|
# RedAndRover has a duplicate in GoComics/RedAndRover
|
||||||
|
# ReplyAll has a duplicate in GoComics/ReplyAll
|
||||||
|
|
||||||
|
|
||||||
|
class RhymesWithOrange(_Arcamax):
|
||||||
|
path = 'rhymeswithorange'
|
||||||
|
# Rubes has a duplicate in GoComics/Rubes
|
||||||
|
# RudyPark has a duplicate in GoComics/RudyPark
|
||||||
|
# Rugrats has a duplicate in Creators/Rugrats
|
||||||
|
# ScaryGary has a duplicate in GoComics/ScaryGary
|
||||||
|
# Shoe has a duplicate in GoComics/Shoe
|
||||||
|
# SigneWilkinson has a duplicate in GoComics/SigneWilkinson
|
||||||
|
# SpeedBump has a duplicate in GoComics/SpeedBump
|
||||||
|
# SteveBenson has a duplicate in GoComics/SteveBenson
|
||||||
|
# SteveBreen has a duplicate in GoComics/SteveBreen
|
||||||
|
# StrangeBrew has a duplicate in GoComics/StrangeBrew
|
||||||
|
|
||||||
|
|
||||||
|
class TakeItFromTheTinkersons(_Arcamax):
|
||||||
|
path = 'takeitfromthetinkersons'
|
||||||
|
# TheBarn has a duplicate in GoComics/TheBarn
|
||||||
|
|
||||||
|
|
||||||
|
class TheLockhorns(_Arcamax):
|
||||||
|
path = 'thelockhorns'
|
||||||
|
# TheOtherCoast has a duplicate in GoComics/TheOtherCoast
|
||||||
|
|
||||||
|
|
||||||
|
class TinasGroove(_Arcamax):
|
||||||
|
path = 'tinasgroove'
|
||||||
|
# WeePals has a duplicate in GoComics/WeePals
|
||||||
|
# WizardOfId has a duplicate in GoComics/WizardOfId
|
||||||
|
# WorkingItOut has a duplicate in GoComics/WorkingItOut
|
||||||
|
# Wumo has a duplicate in GoComics/WuMo
|
||||||
|
# ZackHill has a duplicate in GoComics/ZackHill
|
||||||
|
|
||||||
|
|
||||||
|
class Zits(_Arcamax):
|
||||||
|
path = 'zits'
|
||||||
|
|
|
@ -10,21 +10,20 @@ processing.
|
||||||
from __future__ import absolute_import, division, print_function
|
from __future__ import absolute_import, division, print_function
|
||||||
|
|
||||||
import codecs
|
import codecs
|
||||||
import re
|
|
||||||
import sys
|
import sys
|
||||||
import os
|
import os
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
|
from lxml import html
|
||||||
|
|
||||||
sys.path.append(os.path.join(os.path.dirname(__file__), "..")) # noqa
|
sys.path.append(os.path.join(os.path.dirname(__file__), "..")) # noqa
|
||||||
from dosagelib.util import get_page
|
from dosagelib.util import get_page
|
||||||
from dosagelib.scraper import get_scraperclasses
|
from dosagelib.scraper import get_scrapers
|
||||||
from scriptutil import contains_case_insensitive, save_result, load_result, truncate_name, format_name
|
from scriptutil import (contains_case_insensitive, save_result, load_result,
|
||||||
|
truncate_name, format_name)
|
||||||
|
|
||||||
json_file = __file__.replace(".py", ".json")
|
json_file = __file__.replace(".py", ".json")
|
||||||
|
|
||||||
url_matcher = re.compile(r'<li><a href="(/thefunnies/[^"]+)">([^<]+)</a>')
|
|
||||||
|
|
||||||
# names of comics to exclude
|
# names of comics to exclude
|
||||||
exclude_comics = [
|
exclude_comics = [
|
||||||
"HagartheHorrible", # better source available
|
"HagartheHorrible", # better source available
|
||||||
|
@ -35,20 +34,22 @@ def handle_url(url, session, res):
|
||||||
"""Parse one search result page."""
|
"""Parse one search result page."""
|
||||||
print("Parsing", url, file=sys.stderr)
|
print("Parsing", url, file=sys.stderr)
|
||||||
try:
|
try:
|
||||||
data = get_page(url, session).text
|
data = html.document_fromstring(get_page(url, session).text)
|
||||||
|
data.make_links_absolute(url)
|
||||||
except IOError as msg:
|
except IOError as msg:
|
||||||
print("ERROR:", msg, file=sys.stderr)
|
print("ERROR:", msg, file=sys.stderr)
|
||||||
return
|
return
|
||||||
for match in url_matcher.finditer(data):
|
|
||||||
shortname = match.group(1)
|
for comiclink in data.cssselect('a.comic-icon'):
|
||||||
name = format_name(match.group(2))
|
path = comiclink.attrib['href']
|
||||||
|
name = format_name(comiclink.attrib['title'])
|
||||||
if name in exclude_comics:
|
if name in exclude_comics:
|
||||||
continue
|
continue
|
||||||
if contains_case_insensitive(res, name):
|
if contains_case_insensitive(res, name):
|
||||||
# we cannot handle two comics that only differ in case
|
# we cannot handle two comics that only differ in case
|
||||||
print("INFO: skipping possible duplicate", repr(name), file=sys.stderr)
|
print("INFO: skipping possible duplicate", repr(name), file=sys.stderr)
|
||||||
continue
|
continue
|
||||||
res[name] = shortname
|
res[name] = path.rsplit('/', 2)[1]
|
||||||
if not res:
|
if not res:
|
||||||
print("ERROR:", "did not match any comics", file=sys.stderr)
|
print("ERROR:", "did not match any comics", file=sys.stderr)
|
||||||
|
|
||||||
|
@ -62,7 +63,7 @@ def get_results():
|
||||||
save_result(res, json_file)
|
save_result(res, json_file)
|
||||||
|
|
||||||
|
|
||||||
def has_comic(name):
|
def find_dups(name):
|
||||||
"""Check if comic name already exists."""
|
"""Check if comic name already exists."""
|
||||||
names = [
|
names = [
|
||||||
("Creators/%s" % name).lower(),
|
("Creators/%s" % name).lower(),
|
||||||
|
@ -72,26 +73,29 @@ def has_comic(name):
|
||||||
("ComicGenesis/%s" % name).lower(),
|
("ComicGenesis/%s" % name).lower(),
|
||||||
("SmackJeeves/%s" % name).lower(),
|
("SmackJeeves/%s" % name).lower(),
|
||||||
]
|
]
|
||||||
for scraperclass in get_scraperclasses():
|
for scraperobj in get_scrapers():
|
||||||
lname = scraperclass.getName().lower()
|
lname = scraperobj.name.lower()
|
||||||
if lname in names or lname == name.lower():
|
if lname in names or lname == name.lower():
|
||||||
return True
|
return scraperobj.name
|
||||||
return False
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def first_lower(x):
|
||||||
|
return x[0].lower()
|
||||||
|
|
||||||
|
|
||||||
def print_results(args):
|
def print_results(args):
|
||||||
"""Print all comics that have at least the given number of minimum comic strips."""
|
"""Print all comics that have at least the given number of minimum comic strips."""
|
||||||
min_comics, filename = args
|
min_comics, filename = args
|
||||||
with codecs.open(filename, 'a', 'utf-8') as fp:
|
with codecs.open(filename, 'a', 'utf-8') as fp:
|
||||||
for name, shortname in sorted(load_result(json_file).items()):
|
data = load_result(json_file)
|
||||||
if name in exclude_comics:
|
for name, path in sorted(data.items(), key=first_lower):
|
||||||
continue
|
dup = find_dups(name)
|
||||||
if has_comic(name):
|
if dup is not None:
|
||||||
prefix = u'#'
|
fp.write(u"# %s has a duplicate in %s\n" % (name, dup))
|
||||||
else:
|
else:
|
||||||
prefix = u''
|
fp.write(u"\n\nclass %s(_Arcamax):\n path = %r\n" % (
|
||||||
fp.write(u"%sadd(%r, %r)\n" % (prefix, str(truncate_name(name)),
|
truncate_name(name), path))
|
||||||
str(shortname)))
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
|
Loading…
Reference in a new issue