Add comic strips from Arcamax.

This commit is contained in:
Bastian Kleineidam 2013-01-23 19:34:11 +01:00
parent d54d787af1
commit 0e438b864e
9 changed files with 210 additions and 10 deletions

View file

@ -1,15 +1,17 @@
Dosage 1.9 (released xx.xx.xxxx)
Features:
- comics: Added AmazingSuperPowers strip.
- comics: Added PandyLand strip.
- comics: Added AmazingSuperPowers comic strip.
- comics: Added PandyLand comic strip.
- comics: Added all comic strips from Arcamax (e.g. including
Hagar the Horrible).
Changes:
- comics: CyanideAndHappiness image filename now has the strip number prefixed.
Fixes:
- scripts: Ensure the generated comic names do not exceed 100 characters so they do
not cause problems with path length restrictions.
- scripts: Ensure the generated comic names do not exceed 100 characters so
they do not cause problems with path length restrictions.
Dosage 1.8 (released 20.12.2012)

2
dosage
View file

@ -2,7 +2,7 @@
# -*- coding: iso-8859-1 -*-
# Dosage, the webcomic downloader
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012 Bastian Kleineidam
# Copyright (C) 2012-2013 Bastian Kleineidam
from __future__ import print_function
import sys
import os

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2012 Bastian Kleineidam
# Copyright (C) 2012-2013 Bastian Kleineidam
"""
Functions to load plugin modules.
"""

View file

@ -0,0 +1,109 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2013 Bastian Kleineidam
"""
Arcamax comic strips
"""
from re import compile
from ..scraper import make_scraper
from ..util import tagre
# Compiled pattern handed to every Arcamax scraper as ``imageSearch``.
# NOTE(review): presumably tagre() builds a regex for an <a> tag whose href
# is captured by the group below (a /newspics/ path) and which is followed
# by 'zoom' inside the tag -- confirm against dosagelib.util.tagre.
_imageSearch = compile(tagre("a", "href", r'(/newspics/[^"]+)', after='zoom'))
# Compiled pattern handed to every Arcamax scraper as ``prevSearch``.
# NOTE(review): presumably matches an <a> tag with 'prev' before the href
# attribute and captures the site-relative link -- confirm against tagre.
_prevSearch = compile(tagre("a", "href", r'(/[^"]+)', before='prev'))
def add(name, shortname):
    """Create the scraper class for one Arcamax comic and register it.

    The class returned by make_scraper() is stored in this module's
    globals under the name ``Arcamax_<name>``.

    name: comic name, used for the class name and the 'Arcamax/<name>'
        scraper name.
    shortname: URL path of the comic, appended to http://www.arcamax.com.
    """
    comic_url = 'http://www.arcamax.com%s' % shortname
    scraper_name = 'Arcamax_%s' % name
    # Collect the scraper attributes first, then hand them over in one go.
    scraper_attrs = dict(
        name='Arcamax/' + name,
        latestUrl=comic_url,
        stripUrl=comic_url + '%s/',
        imageSearch=_imageSearch,
        prevSearch=_prevSearch,
        help='Index format: none',
    )
    globals()[scraper_name] = make_scraper(scraper_name, **scraper_attrs)
# do not edit anything below since these entries are generated from scripts/update.sh
# DO NOT REMOVE
#add('9ChickweedLane', '/thefunnies/ninechickweedlane/')
#add('Agnes', '/thefunnies/agnes/')
#add('AndyCapp', '/thefunnies/andycapp/')
#add('Archie', '/thefunnies/archie/')
add('ArcticCircle', '/thefunnies/arcticcircle/')
#add('AskShagg', '/thefunnies/askshagg/')
#add('BC', '/thefunnies/bc/')
add('BabyBlues', '/thefunnies/babyblues/')
#add('BallardStreet', '/thefunnies/ballardstreet/')
#add('BarneyAndClyde', '/thefunnies/barneyandclyde/')
add('BarneyGoogleAndSnuffySmith', '/thefunnies/barneygoogle/')
add('BeetleBailey', '/thefunnies/beetlebailey/')
add('Bizarro', '/thefunnies/bizarro/')
add('BleekerTheRechargeableDog', '/thefunnies/bleekertherechargeabledog/')
add('Blondie', '/thefunnies/blondie/')
add('Boondocks', '/thefunnies/boondocks/')
add('BrilliantMindofEdisonLee', '/thefunnies/brilliantmindofedisonlee/')
add('CafC3A9ConLeche', '/thefunnies/cafeconleche/')
#add('Candorville', '/thefunnies/candorville/')
#add('Cathy', '/thefunnies/cathy/')
#add('ChuckleBros', '/thefunnies/chucklebros/')
add('Crankshaft', '/thefunnies/crankshaft/')
#add('CuldeSac', '/thefunnies/culdesac/')
add('Curtis', '/thefunnies/curtis/')
#add('DaddysHome', '/thefunnies/daddyshome/')
add('DeFlocked', '/thefunnies/deflocked/')
add('DennistheMenace', '/thefunnies/dennisthemenace/')
#add('DiamondLil', '/thefunnies/diamondlil/')
add('Dilbert', '/thefunnies/dilbert/')
add('DinetteSet', '/thefunnies/thedinetteset/')
#add('DogEatDoug', '/thefunnies/dogeatdoug/')
#add('DogsofCKennel', '/thefunnies/dogsofckennel/')
#add('Doonesbury', '/thefunnies/doonesbury/')
add('Dustin', '/thefunnies/dustin/')
add('FamilyCircus', '/thefunnies/familycircus/')
#add('FloAndFriends', '/thefunnies/floandfriends/')
#add('ForHeavensSake', '/thefunnies/forheavenssake/')
#add('FortKnox', '/thefunnies/fortknox/')
#add('FreeRange', '/thefunnies/freerange/')
#add('Garfield', '/thefunnies/garfield/')
#add('GetFuzzy', '/thefunnies/getfuzzy/')
add('HagartheHorrible', '/thefunnies/hagarthehorrible/')
#add('Heathcliff', '/thefunnies/heathcliff/')
#add('HerbandJamaal', '/thefunnies/herbandjamaal/')
add('HiandLois', '/thefunnies/hiandlois/')
#add('HomeAndAway', '/thefunnies/homeandaway/')
add('JerryKingCartoons', '/thefunnies/humorcartoon/')
#add('LittleDogLost', '/thefunnies/littledoglost/')
#add('Luann', '/thefunnies/luann/')
add('MallardFillmore', '/thefunnies/mallardfillmore/')
add('Marvin', '/thefunnies/marvin/')
add('MeaningofLila', '/thefunnies/meaningoflila/')
#add('Momma', '/thefunnies/momma/')
add('MotherGooseAndGrimm', '/thefunnies/mothergooseandgrimm/')
add('Mutts', '/thefunnies/mutts/')
#add('NestHeads', '/thefunnies/nestheads/')
#add('NonSequitur', '/thefunnies/nonsequitur/')
#add('OnaClaireDay', '/thefunnies/onaclaireday/')
#add('OneBigHappy', '/thefunnies/onebighappy/')
#add('Peanuts', '/thefunnies/peanuts/')
#add('PearlsBeforeSwine', '/thefunnies/pearlsbeforeswine/')
#add('Pickles', '/thefunnies/pickles/')
#add('RedandRover', '/thefunnies/redandrover/')
#add('ReplyAll', '/thefunnies/replyall/')
add('RhymeswithOrange', '/thefunnies/rhymeswithorange/')
#add('Rubes', '/thefunnies/rubes/')
#add('Rugrats', '/thefunnies/rugrats/')
#add('ScaryGary', '/thefunnies/scarygary/')
#add('SpeedBump', '/thefunnies/speedbump/')
#add('StrangeBrew', '/thefunnies/strangebrew/')
#add('TheBarn', '/thefunnies/thebarn/')
add('TheLockhorns', '/thefunnies/thelockhorns/')
#add('TheOtherCoast', '/thefunnies/theothercoast/')
#add('ThinLines', '/thefunnies/thinlines/')
add('TinasGroove', '/thefunnies/tinasgroove/')
#add('WatchYourHead', '/thefunnies/watchyourhead/')
#add('WeePals', '/thefunnies/weepals/')
#add('WizardofId', '/thefunnies/wizardofid/')
#add('WorkingitOut', '/thefunnies/workingitout/')
#add('ZackHill', '/thefunnies/zackhill/')
add('Zits', '/thefunnies/zits/')

1
scripts/arcamax.json Normal file
View file

@ -0,0 +1 @@
{"9ChickweedLane": "/thefunnies/ninechickweedlane/", "Agnes": "/thefunnies/agnes/", "AndyCapp": "/thefunnies/andycapp/", "Archie": "/thefunnies/archie/", "ArcticCircle": "/thefunnies/arcticcircle/", "AskShagg": "/thefunnies/askshagg/", "BC": "/thefunnies/bc/", "BabyBlues": "/thefunnies/babyblues/", "BallardStreet": "/thefunnies/ballardstreet/", "BarneyAndClyde": "/thefunnies/barneyandclyde/", "BarneyGoogleAndSnuffySmith": "/thefunnies/barneygoogle/", "BeetleBailey": "/thefunnies/beetlebailey/", "Bizarro": "/thefunnies/bizarro/", "BleekerTheRechargeableDog": "/thefunnies/bleekertherechargeabledog/", "Blondie": "/thefunnies/blondie/", "Boondocks": "/thefunnies/boondocks/", "BrilliantMindofEdisonLee": "/thefunnies/brilliantmindofedisonlee/", "CafC3A9ConLeche": "/thefunnies/cafeconleche/", "Candorville": "/thefunnies/candorville/", "Cathy": "/thefunnies/cathy/", "ChuckleBros": "/thefunnies/chucklebros/", "Crankshaft": "/thefunnies/crankshaft/", "CuldeSac": "/thefunnies/culdesac/", "Curtis": "/thefunnies/curtis/", "DaddysHome": "/thefunnies/daddyshome/", "DeFlocked": "/thefunnies/deflocked/", "DennistheMenace": "/thefunnies/dennisthemenace/", "DiamondLil": "/thefunnies/diamondlil/", "Dilbert": "/thefunnies/dilbert/", "DinetteSet": "/thefunnies/thedinetteset/", "DogEatDoug": "/thefunnies/dogeatdoug/", "DogsofCKennel": "/thefunnies/dogsofckennel/", "Doonesbury": "/thefunnies/doonesbury/", "Dustin": "/thefunnies/dustin/", "FamilyCircus": "/thefunnies/familycircus/", "FloAndFriends": "/thefunnies/floandfriends/", "ForHeavensSake": "/thefunnies/forheavenssake/", "FortKnox": "/thefunnies/fortknox/", "FreeRange": "/thefunnies/freerange/", "Garfield": "/thefunnies/garfield/", "GetFuzzy": "/thefunnies/getfuzzy/", "HagartheHorrible": "/thefunnies/hagarthehorrible/", "Heathcliff": "/thefunnies/heathcliff/", "HerbandJamaal": "/thefunnies/herbandjamaal/", "HiandLois": "/thefunnies/hiandlois/", "HomeAndAway": "/thefunnies/homeandaway/", "JerryKingCartoons": 
"/thefunnies/humorcartoon/", "LittleDogLost": "/thefunnies/littledoglost/", "Luann": "/thefunnies/luann/", "MallardFillmore": "/thefunnies/mallardfillmore/", "Marvin": "/thefunnies/marvin/", "MeaningofLila": "/thefunnies/meaningoflila/", "Momma": "/thefunnies/momma/", "MotherGooseAndGrimm": "/thefunnies/mothergooseandgrimm/", "Mutts": "/thefunnies/mutts/", "NestHeads": "/thefunnies/nestheads/", "NonSequitur": "/thefunnies/nonsequitur/", "OnaClaireDay": "/thefunnies/onaclaireday/", "OneBigHappy": "/thefunnies/onebighappy/", "Peanuts": "/thefunnies/peanuts/", "PearlsBeforeSwine": "/thefunnies/pearlsbeforeswine/", "Pickles": "/thefunnies/pickles/", "RedandRover": "/thefunnies/redandrover/", "ReplyAll": "/thefunnies/replyall/", "RhymeswithOrange": "/thefunnies/rhymeswithorange/", "Rubes": "/thefunnies/rubes/", "Rugrats": "/thefunnies/rugrats/", "ScaryGary": "/thefunnies/scarygary/", "SpeedBump": "/thefunnies/speedbump/", "StrangeBrew": "/thefunnies/strangebrew/", "TheBarn": "/thefunnies/thebarn/", "TheLockhorns": "/thefunnies/thelockhorns/", "TheOtherCoast": "/thefunnies/theothercoast/", "ThinLines": "/thefunnies/thinlines/", "TinasGroove": "/thefunnies/tinasgroove/", "WatchYourHead": "/thefunnies/watchyourhead/", "WeePals": "/thefunnies/weepals/", "WizardofId": "/thefunnies/wizardofid/", "WorkingitOut": "/thefunnies/workingitout/", "ZackHill": "/thefunnies/zackhill/", "Zits": "/thefunnies/zits/"}

88
scripts/arcamax.py Executable file
View file

@ -0,0 +1,88 @@
#!/usr/bin/env python
# Copyright (C) 2013 Bastian Kleineidam
"""
Script to get arcamax comics and save the info in a JSON file for further processing.
"""
from __future__ import print_function
import re
import sys
import os
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
from dosagelib.util import getPageContent, asciify, unescape
from dosagelib.scraper import get_scrapers
from scriptutil import contains_case_insensitive, capfirst, save_result, load_result, truncate_name
# Result file stored next to this script: the script path with its extension
# swapped for ".json". os.path.splitext() only touches the final extension,
# so a ".py" elsewhere in the path (directory name, ".pyc" file) is left
# alone, unlike a plain str.replace(".py", ".json").
json_file = os.path.splitext(__file__)[0] + ".json"
# Captures (URL path under /thefunnies/, link text) from one
# '<li><b><a href=...>' entry of the fetched page.
url_matcher = re.compile(r'<li><b><a href="(/thefunnies/[^"]+)">([^<]+)</a>')

# names of comics to exclude
exclude_comics = [
]
def handle_url(url, res):
    """Fetch one page and record every comic link found on it.

    url: address of the page to fetch.
    res: mapping updated in place with {comic name -> URL path}.

    A failed fetch is reported on stderr and leaves res untouched; an
    error is also printed when res is still empty after parsing.
    """
    print("Parsing", url, file=sys.stderr)
    try:
        page, _base_url = getPageContent(url)
    except IOError as err:
        print("ERROR:", err, file=sys.stderr)
        return
    for found in url_matcher.finditer(page):
        path, title = found.group(1, 2)
        # Normalize the link text into the name used as result key.
        comic = unescape(title)
        comic = capfirst(asciify(comic.replace('&', 'And').replace('@', 'At')))
        if comic in exclude_comics:
            continue
        if not contains_case_insensitive(res, comic):
            res[comic] = path
        else:
            # two comics that only differ in case cannot be handled
            print("INFO: skipping possible duplicate", comic, file=sys.stderr)
    if not res:
        print("ERROR:", "did not match any comics", file=sys.stderr)
def get_results():
    """Parse the Arcamax comics page and save the result as JSON."""
    # collected entries, keyed by comic name: {name -> shortname}
    comics = {}
    handle_url('http://www.arcamax.com/comics', comics)
    save_result(comics, json_file)
def has_comic(name):
    """Return True if a registered scraper already carries this comic name.

    The name is compared case-insensitively, prefixed in turn with
    Creators/, DrunkDuck/, GoComics/, KeenSpot/ and SmackJeeves/, against
    the names of all classes returned by get_scrapers().
    """
    patterns = (
        "Creators/%s",
        "DrunkDuck/%s",
        "GoComics/%s",
        "KeenSpot/%s",
        "SmackJeeves/%s",
    )
    candidates = [(pattern % name).lower() for pattern in patterns]
    return any(
        scraperclass.get_name().lower() in candidates
        for scraperclass in get_scrapers()
    )
def print_results(args):
    """Print one add() line for every comic stored in the JSON result file.

    Entries are printed sorted; names listed in exclude_comics are left
    out, and entries for which has_comic() is true are commented out with
    a leading '#'. The args parameter is accepted but not used.
    """
    entries = sorted(load_result(json_file).items())
    for name, shortname in entries:
        if name in exclude_comics:
            continue
        prefix = '#' if has_comic(name) else ''
        line_values = (prefix, str(truncate_name(name)), str(shortname))
        print("%sadd(%r, %r)" % line_values)
if __name__ == '__main__':
    # Any command line argument switches from fetching to printing.
    cli_args = sys.argv[1:]
    if cli_args:
        print_results(cli_args)
    else:
        get_results()

View file

@ -2,7 +2,7 @@
set -u
d=$(dirname $0)
for script in creators gocomics drunkduck universal keenspot smackjeeves; do
for script in creators gocomics drunkduck universal keenspot smackjeeves arcamax; do
echo "Executing ${script}.py"
"${d}/${script}.py"
done

View file

@ -1,5 +1,5 @@
#!/usr/bin/env python
# Copyright (C) 2012 Bastian Kleineidam
# Copyright (C) 2012-2013 Bastian Kleineidam
"""Remove all lines after a given marker line.
"""
from __future__ import print_function

View file

@ -1,11 +1,11 @@
#!/bin/sh -e
# Copyright (C) 2012 Bastian Kleineidam
# Copyright (C) 2012-2013 Bastian Kleineidam
set -u
mincomics=100
d=$(dirname $0)
for script in creators gocomics drunkduck universal keenspot smackjeeves; do
for script in creators gocomics drunkduck universal keenspot smackjeeves arcamax; do
target="${d}/../dosagelib/plugins/${script}.py"
echo "Upating $target"
"${d}/removeafter.py" "$target" "# DO NOT REMOVE"