Add comic scripts, add fixes and other stuff.

This commit is contained in:
Bastian Kleineidam 2012-11-28 18:15:12 +01:00
parent a52e5ae575
commit 451fd982d9
41 changed files with 4130 additions and 4523 deletions

2
.gitignore vendored
View file

@ -11,3 +11,5 @@
/_Dosage_configdata.py
/comics.test
/testresults.txt
/dosage.prof
/test.sh

View file

@ -1,4 +1,6 @@
include MANIFEST.in
include COPYING doc/*.txt doc/*.1 doc/*.html
include Makefile
include requirements.txt
include scripts/*.py scripts/*.sh
recursive-include tests *.py

View file

@ -3,7 +3,7 @@ PYVER:=2.7
PYTHON:=python$(PYVER)
VERSION:=$(shell $(PYTHON) setup.py --version)
ARCHIVE:=dosage-$(VERSION).tar.gz
PY_FILES_DIRS := dosage dosagelib tests *.py
PY_FILES_DIRS := dosage dosagelib scripts tests *.py
PY2APPOPTS ?=
NUMPROCESSORS:=$(shell grep -c processor /proc/cpuinfo)
# Pytest options:
@ -33,8 +33,8 @@ doc/dosage.1.html: doc/dosage.1
release: distclean releasecheck dist
git tag v$(VERSION)
# @echo "Register at Python Package Index..."
# $(PYTHON) setup.py register
@echo "Register at Python Package Index..."
$(PYTHON) setup.py register
# freecode-submit < dosage.freecode
@ -75,7 +75,7 @@ clean:
rm -rf build dist
distclean: clean
rm -rf build dist Dosage.egg-info
rm -rf build dist Dosage.egg-info dosage.prof test.sh testresults.txt
rm -f _Dosage_configdata.py MANIFEST
localbuild:
@ -87,11 +87,8 @@ test: localbuild
deb:
git-buildpackage --git-export-dir=../build-area/ --git-upstream-branch=master --git-debian-branch=debian --git-ignore-new
comics:
./dosage -v @@ > comics.log 2>&1
update-copyright:
update-copyright --holder="Bastian Kleineidam"
.PHONY: update-copyright comics deb test clean distclean count pyflakes
.PHONY: update-copyright deb test clean distclean count pyflakes
.PHONY: doccheck check releasecheck release dist chmod localbuild

View file

@ -21,7 +21,7 @@ you may be infringing upon various copyrights.
Usage
------
List available comics (over 4400 at the moment):
List available comics (over 3500 at the moment):
`$ dosage -l`

View file

@ -12,7 +12,10 @@ Changes:
- comics: Removed the twisted and zope dependencies by adding
an internal plugin search mechanism.
- comics: Remove the disable mechanism.
- testing: Refactored the test comic routine in proper unit tests.
- comics: Add scripts to automate comic listings for Creators, Universal,
KeenSpot, GoComics and DrunkDuck.
- testing: Refactored the test comic routine into fully automatic and complete
test cases for every comic.
- cmdline: Improved terminal feature detection.
Fixes:

15
dosage
View file

@ -202,5 +202,20 @@ def main():
return res
def profile():
"""Profile the loading of all scrapers."""
import cProfile
cProfile.run("scraper.get_scrapers()", "dosage.prof")
def viewprof():
"""View profile stats."""
import pstats
stats = pstats.Stats("dosage.prof")
stats.strip_dirs().sort_stats("cumulative").print_stats(100)
if __name__ == '__main__':
sys.exit(main())
#profile()
#viewprof()

View file

@ -83,7 +83,7 @@ def GetConsoleScreenBufferInfo(stream_id=STDOUT):
"""Get console screen buffer info object."""
handle = handles[stream_id]
csbi = CONSOLE_SCREEN_BUFFER_INFO()
success = windll.kernel32.GetConsoleScreenBufferInfo(
windll.kernel32.GetConsoleScreenBufferInfo(
handle, byref(csbi))
return csbi

View file

@ -8,7 +8,7 @@ import rfc822
import time
from .output import out
from .util import urlopen, saneDataSize, normaliseURL
from .util import urlopen, saneDataSize, normaliseURL, unquote
from .events import getHandler
class FetchComicError(IOError):
@ -54,10 +54,10 @@ class ComicImage(object):
"""Connect to host and get meta information."""
try:
self.urlobj = urlopen(self.url, referrer=self.referrer)
except IOError as he:
raise FetchComicError('Unable to retrieve URL.', self.url, he.code)
except IOError as msg:
raise FetchComicError('Unable to retrieve URL.', self.url, msg)
content_type = self.urlobj.headers.get('content-type')
content_type = unquote(self.urlobj.headers.get('content-type'))
content_type = content_type.split(';', 1)[0]
if '/' in content_type:
maintype, subtype = content_type.split('/', 1)
@ -65,7 +65,7 @@ class ComicImage(object):
maintype = content_type
subtype = None
if maintype != 'image' and content_type not in ('application/octet-stream', 'application/x-shockwave-flash'):
raise FetchComicError('No suitable image found to retrieve.', self.url)
raise FetchComicError('Content type %r is not an image.' % content_type, self.url)
# Always use mime type for file extension if it is sane.
if maintype == 'image':
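
To make the new header handling concrete, here is a minimal sketch of the normalisation path above, using the unquote() helper this commit adds to dosagelib.util; the sample header value is illustrative, not taken from the repo:

    from __future__ import print_function
    from dosagelib.util import unquote

    # Sample percent-encoded content-type value (illustrative).
    content_type = unquote('image%2Fpng; charset=binary')
    content_type = content_type.split(';', 1)[0]    # -> 'image/png'
    maintype, subtype = content_type.split('/', 1)
    print(maintype, subtype)                        # -> image png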

View file

@ -4,78 +4,75 @@
from re import compile
from ..scraper import make_scraper
from ..util import tagre, asciify
from ..util import tagre
def add(name, shortname):
baseUrl = 'http://www.creators.com/comics/'
classname = 'Creators_%s' % asciify(name)
_imageSearch = compile(tagre("img", "src", r'(/comics/\d+/[^"]+)'))
def add(name, path):
baseurl = 'http://www.creators.com'
classname = 'Creators_%s' % name
globals()[classname] = make_scraper(classname,
name = 'Creators/' + name,
latestUrl = baseUrl + shortname + '.html',
stripUrl = baseUrl + shortname + '/%s.html',
imageSearch = compile(tagre("img", "src", r'(/comics/\d+/[^"]+)')),
prevSearch = compile(tagre("a", "href", r'(/comics/%s/\d+\.html)' % shortname) +
latestUrl = baseurl + path + '.html',
stripUrl = baseurl + path + '/%s.html',
imageSearch = _imageSearch,
prevSearch = compile(tagre("a", "href", r'(%s/\d+\.html)' % path) +
tagre("img", "src", r'/img_comics/arrow_l\.gif')),
help = 'Index format: n',
)
# for a complete list see http://www.creators.com/comics/cat-seeall.html
comics = {
'Agnes': 'agnes',
'AndyCapp': 'andy-capp',
'Archie': 'archie',
'AskShagg': 'ask-shagg',
'BallardStreet': 'ballard-street',
'BC': 'bc',
'TheBarn': 'the-barn',
'CafeConLeche': 'cafe-con-leche',
'ChuckleBros': 'chuckle-bros',
'DaddysHome': 'daddys-home',
'DiamondLil': 'diamond-lil',
'TheDinetteSet': 'dinette-set',
'DogEatDoug': 'dog-eat-doug',
'DogsOfCKennel': 'dogs-of-c-kennel',
'DonaldDuck': 'donald-duck',
'FloAndFriends': 'flo-and-friends',
'Flare': 'flare',
'FlightDeck': 'flight-deck',
'ForHeavensSake': 'for-heavens-sake',
'FreeRange': 'free-range',
'GirlsAndSports': 'girls-and-sports',
'Heathcliff': 'heathcliff',
'HerbAndJamaal': 'herb-and-jamaal',
'HopeAndDeath': 'hope-and-death',
'LibertyMeadows': 'liberty-meadows',
'TheMeaningOfLila': 'meaning-of-lila',
'MickeyMouse': 'mickey-mouse',
'Momma': 'momma',
'NestHeads': 'nest-heads',
'OneBigHappy': 'one-big-happy',
'OnAClaireDay': 'on-a-claire-day',
'TheOtherCoast': 'the-other-coast',
'TheQuigmans': 'the-quigmans',
'Rubes': 'rubes',
'Rugrats': 'rugrats',
'ScaryGary': 'scary-gary',
'SpeedBump': 'speed-bump',
'StrangeBrew': 'strange-brew',
'ThinLines': 'thin-lines',
'WeePals': 'wee-pals',
'WizardOfId': 'wizard-of-id',
'WorkingItOut': 'working-it-out',
'ZackHill': 'zack-hill',
'BCSpanish': 'bc-spanish',
'WizardOfIdSpanish': 'wizard-of-id-spanish',
'ArchieSpanish': 'archie-spanish',
'HeathcliffSpanish': 'heathcliff-spanish',
'RugratsSpanish': 'rugrats-spanish',
'LongStoryShort': 'long-story-short',
'Recess': 'recess',
'HomeOffice': 'stay-at-home-dad',
'OffCenter': 'off-center',
'GirlsAndSportsSpanish': 'girls-and-sports-spanish',
}
for name, shortname in comics.items():
add(name, shortname)
# do not edit anything below since these entries are generated from scripts/update.sh
# DO NOT REMOVE
add('Agnes', '/comics/agnes')
add('AndyCapp', '/comics/andy-capp')
add('Archie', '/comics/archie')
add('ArchieinSpanish', '/comics/archie-spanish')
add('AskShagg', '/comics/ask-shagg')
add('BC', '/comics/bc')
add('BCinSpanish', '/comics/bc-spanish')
add('BallardStreet', '/comics/ballard-street')
add('CafeconLeche', '/comics/cafe-con-leche')
add('ChuckleBros', '/comics/chuckle-bros')
add('DaddysHome', '/comics/daddys-home')
add('DiamondLil', '/comics/diamond-lil')
add('DogEatDoug', '/comics/dog-eat-doug')
add('DogsofCKennel', '/comics/dogs-of-c-kennel')
add('DonaldDuck', '/comics/donald-duck')
add('Flare', '/comics/flare')
add('FlightDeck', '/comics/flight-deck')
add('FloandFriends', '/comics/flo-and-friends')
add('ForHeavensSake', '/comics/for-heavens-sake')
add('FreeRange', '/comics/free-range')
add('GirlsAndSports', '/comics/girls-and-sports')
add('GirlsandSportsinSpanish', '/comics/girls-and-sports-spanish')
add('Heathcliff', '/comics/heathcliff')
add('HeathcliffinSpanish', '/comics/heathcliff-spanish')
add('HerbandJamaal', '/comics/herb-and-jamaal')
add('HomeOffice', '/comics/stay-at-home-dad')
add('HopeAndDeath', '/comics/hope-and-death')
add('LibertyMeadows', '/comics/liberty-meadows')
add('LongStoryShort', '/comics/long-story-short')
add('MickeyMouse', '/comics/mickey-mouse')
add('Momma', '/comics/momma')
add('NestHeads', '/comics/nest-heads')
add('OffCenter', '/comics/off-center')
add('OnaClaireDay', '/comics/on-a-claire-day')
add('OneBigHappy', '/comics/one-big-happy')
add('Recess', '/comics/recess')
add('Rubes', '/comics/rubes')
add('Rugrats', '/comics/rugrats')
add('RugratsinSpanish', '/comics/rugrats-spanish')
add('ScaryGary', '/comics/scary-gary')
add('SpeedBump', '/comics/speed-bump')
add('StrangeBrew', '/comics/strange-brew')
add('TheBarn', '/comics/the-barn')
add('TheDinetteSet', '/comics/dinette-set')
add('TheMeaningofLila', '/comics/meaning-of-lila')
add('TheOtherCoast', '/comics/the-other-coast')
add('TheQuigmans', '/comics/the-quigmans')
add('TheWizardofIdinSpanish', '/comics/wizard-of-id-spanish')
add('ThinLines', '/comics/thin-lines')
add('WeePals', '/comics/wee-pals')
add('WizardofId', '/comics/wizard-of-id')
add('WorkingitOut', '/comics/working-it-out')
add('ZackHill', '/comics/zack-hill')

View file

@ -143,4 +143,10 @@ class DresdenCodak(_BasicScraper):
starter = indirectStarter('http://dresdencodak.com/', compile(r'<div id="preview"><a href="http://dresdencodak.com/(\d+/\d+/\d+/.*?)">'))
# XXX dilbert.com
class Dilbert(_BasicScraper):
latestUrl = 'http://dilbert.com/'
stripUrl = latestUrl + '%s/'
prevSearch = compile(tagre("a", "href", r'(/\d+-\d+-\d+/)', after="STR_Prev"))
imageSearch = compile(tagre("img", "src", r'(/dyn/str_strip/[^"]+\.strip\.zoom\.gif)'))
help = 'Index format: yyyy-mm-dd'
# XXX namer
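
As a small aside, stripUrl is a template the scraper fills in with a user-supplied index; for the new Dilbert entry the documented yyyy-mm-dd format expands like this (the date is illustrative):

    from __future__ import print_function

    latestUrl = 'http://dilbert.com/'
    stripUrl = latestUrl + '%s/'
    print(stripUrl % '2012-11-28')   # -> http://dilbert.com/2012-11-28/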

File diff suppressed because it is too large

View file

@ -7,6 +7,9 @@ from ..scraper import make_scraper
from ..util import asciify
_imageSearch = compile(r'SRC="(http://www\.thefallenangel\.co\.uk/\w+comics/.+?)"')
_prevSearch = compile(r' <a href="(http://www\.thefallenangel\.co\.uk/.+?)"><img[^>]+?src="http://www\.thefallenangel\.co\.uk/images/previousday\.jpg"')
def add(name, shortname):
latestUrl = 'http://www.thefallenangel.co.uk/cgi-bin/%sautokeen/autokeenlite.cgi' % shortname
classname = asciify(name)
@ -14,8 +17,8 @@ def add(name, shortname):
latestUrl = latestUrl,
stripUrl = latestUrl + '?date=%s',
name='FallenAngel/' + name,
imageSearch = compile(r'SRC="(http://www.thefallenangel.co.uk/\w+comics/.+?)"'),
prevSearch = compile(r' <a href="(http://www.thefallenangel.co.uk/.+?)"><img[^>]+?src="http://www.thefallenangel.co.uk/images/previousday.jpg"'),
imageSearch = _imageSearch,
prevSearch = _prevSearch,
help = 'Index format: yyyymmdd',
)

File diff suppressed because it is too large

View file

@ -4,7 +4,6 @@
from re import compile, IGNORECASE
from ..scraper import _BasicScraper
from ..util import tagre
class Key(_BasicScraper):

File diff suppressed because it is too large

View file

@ -6,6 +6,9 @@ from re import compile
from ..scraper import make_scraper
from ..util import tagre
_imageSearch = compile(tagre("img", "src", r'(http://www\.nuklearpower\.com/comics/[^"]+)'))
_prevSearch = compile(tagre("a", "href", r'([^"]+)') + "Previous")
def add(name, shortname):
baseUrl = 'http://www.nuklearpower.com/' + shortname + '/'
classname = 'NuklearPower_%s' % name
@ -14,8 +17,8 @@ def add(name, shortname):
name='NuklearPower/' + name,
latestUrl = baseUrl,
stripUrl = baseUrl + '%s',
imageSearch = compile(tagre("img", "src", r'(http://www\.nuklearpower\.com/comics/[^"]+)')),
prevSearch = compile(tagre("a", "href", r'([^"]+)') + "Previous"),
imageSearch = _imageSearch,
prevSearch = _prevSearch,
help = 'Index format: yyyy/mm/dd/name',
)

View file

@ -6,6 +6,8 @@ from ..scraper import make_scraper
from ..helpers import bounceStarter
from ..util import tagre
_imageSearch = compile(tagre("img", "src", r'(http://www\.smackjeeves\.com/images/uploaded/comics/[^"]*)'))
_prevSearch = compile(tagre("a", "href", r'(/comics/\d+/[^"]*)') + '<img[^>]*alt="< Previous"')
def add(name):
classname = 'SmackJeeves/' + name
@ -20,8 +22,8 @@ def add(name):
globals()[classname] = make_scraper(classname,
starter=bounceStarter(baseUrl, compile(tagre("a", "href", r'(/comics/\d+/[^"]*)') + '<img[^>]*alt="Next >"')),
stripUrl = baseUrl,
imageSearch = compile(tagre("img", "src", r'(http://www\.smackjeeves\.com/images/uploaded/comics/[^"]*)')),
prevSearch = compile(tagre("a", "href", r'(/comics/\d+/[^"]*)') + '<img[^>]*alt="< Previous"'),
imageSearch = _imageSearch,
prevSearch = _prevSearch,
help = 'Index format: nnnn (some increasing number)',
namer = namer,
)

View file

@ -5,6 +5,10 @@
from re import compile
from ..scraper import make_scraper
_imageSearch = compile(r'<img src=http://\w+\.snafu-comics\.com/(comics/\d{6}_\w*\.\w{3,4})')
_prevSearch = compile(r'<a href="(\?comic_id=\d+)">Previous</a>')
def add(name, host):
baseUrl = 'http://%s.snafu-comics.com/' % host
classname = 'SnafuComics_%s' % name
@ -13,8 +17,8 @@ def add(name, host):
name='SnafuComics/%s' % name,
latestUrl = baseUrl,
stripUrl = baseUrl + 'index.php?strip_id=%s',
imageSearch = compile(r'<img src=http://\w+\.snafu-comics\.com/(comics/\d{6}_\w*\.\w{3,4})'),
prevSearch = compile(r'<a href="(\?comic_id=\d+)">Previous</a>'),
imageSearch = _imageSearch,
prevSearch = _prevSearch,
help = 'Index format: n (unpadded)',
)

View file

@ -6,7 +6,7 @@ from re import compile
from ..scraper import _BasicScraper
from ..helpers import bounceStarter, indirectStarter
from ..util import getQueryParams, tagre
from ..util import getQueryParams
class Undertow(_BasicScraper):

View file

@ -1,54 +0,0 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012 Bastian Kleineidam
"""
The Universal comics only have some samples, but those samples are always the newest ones.
"""
import datetime
from re import compile, escape
from ..scraper import make_scraper
from ..util import tagre, asciify, getPageContent
def parse_strdate(strdate):
"""Parse date string. XXX this is locale dependant but it should not be."""
return datetime.datetime.strptime(strdate, "%A, %B %d, %Y")
def add(name, category):
shortname = name.replace(' ', '').lower()
latestUrl = 'http://www.universaluclick.com/comics/%s/%s' % (category, shortname)
classname = 'UClick_%s' % asciify(name)
@classmethod
def namer(cls, imageUrl, pageUrl):
"""Parse publish date from page content which looks like:
<img alt="Marmaduke" src="http://assets.amuniversal.com/07e7f270fa08012ff506001dd8b71c47" />
<h4>published: Sunday, November 11, 2012</h4>
"""
data = getPageContent(pageUrl)[0]
ro = compile(tagre("img", "src", escape(imageUrl)) + r'\s+<h4>published: ([^<]+)')
mo = ro.search(data)
if mo:
strdate = mo.group(1)
return parse_strdate(strdate).strftime("%Y%m%d")
globals()[classname] = make_scraper(classname,
name='UClick/' + name,
latestUrl = latestUrl,
stripUrl = latestUrl + '%s/',
imageSearch = compile(tagre("img", "src", r'(http://assets\.amuniversal\.com/[^"]+)') + r'\s+<h4>published'),
multipleImagesPerStrip = True,
prevSearch = None,
help = 'Index format: none',
namer = namer,
)
# http://www.universaluclick.com/comics/list
comics = {
'9 Chickweed Lane': 'strip',
}
for name, category in comics.items():
add(name, category)

View file

@ -0,0 +1,150 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012 Bastian Kleineidam
"""
The Universal comics only have some samples, but those samples are always the newest ones.
"""
import datetime
from re import compile, escape
from ..scraper import make_scraper
from ..util import tagre, getPageContent
def parse_strdate(strdate):
"""Parse date string. XXX this is locale dependant but it should not be."""
return datetime.datetime.strptime(strdate, "%A, %B %d, %Y")
_imageSearch = compile(tagre("img", "src", r'(http://assets\.amuniversal\.com/[^"]+)') + r'\s+<h4>published')
def add(name, shortname):
latestUrl = 'http://www.universaluclick.com%s' % shortname
classname = 'UClick_%s' % name
@classmethod
def namer(cls, imageUrl, pageUrl):
"""Parse publish date from page content which looks like:
<img alt="Marmaduke" src="http://assets.amuniversal.com/07e7f270fa08012ff506001dd8b71c47" />
<h4>published: Sunday, November 11, 2012</h4>
"""
data = getPageContent(pageUrl)[0]
ro = compile(tagre("img", "src", escape(imageUrl)) + r'\s+<h4>published: ([^<]+)')
mo = ro.search(data)
if mo:
strdate = mo.group(1)
return parse_strdate(strdate).strftime("%Y%m%d")
globals()[classname] = make_scraper(classname,
name='UClick/' + name,
latestUrl = latestUrl,
stripUrl = latestUrl + '%s/',
imageSearch = _imageSearch,
multipleImagesPerStrip = True,
prevSearch = None,
help = 'Index format: none',
namer = namer,
)
# do not edit anything below since these entries are generated from scripts/update.sh
# DO NOT REMOVE
#add('9ChickweedLane', '/comics/strip/9chickweedlane')
#add('AdamAtHome', '/comics/strip/adamathome')
#add('AlleyOop', '/comics/strip/alley-oop')
#add('ArloandJanis', '/comics/strip/arloandjanis')
#add('BadReporter', '/comics/badreporter')
#add('Baldo', '/comics/strip/baldo')
#add('Betty', '/comics/strip/betty')
#add('BigNate', '/comics/strip/bignate')
#add('Biographic', '/comics/strip/biographic')
add('Brevitystrip', '/comics/strip/brevity')
add('BusinessAndFinance', '/comics/category/business%20%26%20finance')
#add('CalvinandHobbes', '/comics/strip/calvinandhobbes')
#add('Cathy', '/comics/strip/cathy')
#add('Cleats', '/comics/strip/cleats')
#add('ClosetoHome', '/comics/panel/closetohome')
add('ComicPanel', '/comics/panel')
add('ComicStrip', '/comics/strip')
add('ComicsAZ', '/comics/list')
#add('Cornered', '/comics/panel/cornered')
#add('CowandBoy', '/comics/strip/cowandboy')
#add('CuldeSac', '/comics/strip/culdesac')
#add('Dilbert', '/comics/strip/dilbert')
#add('Doonesbury', '/comics/strip/doonesbury')
#add('Drabble', '/comics/strip/drabble')
add('Espaol', '/comics/category/espanol')
#add('FMinus', '/comics/strip/fminus')
add('Family', '/comics/category/family')
#add('ForBetterorForWorse', '/comics/strip/forbetterorforworse')
add('ForKids', '/comics/category/for%20kids')
#add('FoxTrot', '/comics/strip/foxtrot')
#add('FrankAndErnest', '/comics/strip/frankandernest')
#add('Frazz', '/comics/strip/frazz')
#add('FredBasset', '/comics/strip/fredbasset')
#add('FreshlySqueezed', '/comics/strip/freshlysqueezed')
#add('Garfield', '/comics/strip/garfield')
#add('GetFuzzy', '/comics/strip/getfuzzy')
#add('GingerMeggs', '/comics/strip/gingermeggs')
#add('Graffiti', '/comics/panel/graffiti')
#add('GrandAvenue', '/comics/strip/grand-avenue')
#add('HealthCapsules', '/comics/panel/healthcapsules')
#add('HeartoftheCity', '/comics/strip/heartofthecity')
#add('Herman', '/comics/panel/herman')
#add('InkPen', '/comics/strip/inkpen')
#add('IntheBleachers', '/comics/panel/inthebleachers')
#add('IntheSticks', '/comics/strip/inthesticks')
add('JamesBond', '/comics/strip/jamesbond')
#add('JumpStart', '/comics/strip/jumpstart')
#add('KidCity', '/comics/strip/kidcity')
#add('KidSpot', '/comics/panel/kidspot')
#add('KitNCarlyle', '/comics/panel/kitncarlyle')
#add('LaCucaracha', '/comics/strip/lacucaracha')
#add('Lio', '/comics/strip/lio')
#add('Lola', '/comics/strip/lola')
#add('Luann', '/comics/strip/luann')
#add('MagicinaMinute', '/comics/strip/magicinaminute')
#add('Marmaduke', '/comics/panel/marmaduke')
add('Men', '/comics/category/men')
#add('ModeratelyConfused', '/comics/panel/moderately-confused')
#add('Monty', '/comics/strip/monty')
#add('MrGigiandtheSquid', '/comics/strip/mr-gigi-and-the-squid')
#add('MuttAndJeff', '/comics/strip/muttandjeff')
add('NEA', '/comics/category/nea')
#add('Nancy', '/comics/strip/nancy')
#add('NonSequitur', '/comics/strip/nonsequitur')
add('NonSequiturPanel', '/comics/panel/non-sequitur-panel')
#add('OfftheMark', '/comics/panel/offthemark')
#add('Overboard', '/comics/strip/overboard')
#add('OvertheHedge', '/comics/strip/overthehedge')
#add('Peanuts', '/comics/strip/peanuts')
#add('PearlsBeforeSwine', '/comics/strip/pearlsbeforeswine')
add('Pets', '/comics/category/pets')
#add('PoochCafe', '/comics/strip/poochcafe')
#add('PricklyCity', '/comics/strip/pricklycity')
#add('RealLifeAdventures', '/comics/panel/reallifeadventures')
#add('RealityCheck', '/comics/panel/realitycheck')
#add('RedandRover', '/comics/strip/redandrover')
#add('RipHaywire', '/comics/strip/riphaywire')
#add('RipleysBelieveItorNot', '/comics/panel/ripleysbelieveitornot')
#add('RoseisRose', '/comics/strip/roseisrose')
#add('RudyPark', '/comics/strip/rudypark')
#add('Shortcuts', '/comics/strip/shortcuts')
#add('SouptoNutz', '/comics/strip/soup-to-nutz')
#add('StoneSoup', '/comics/strip/stonesoup')
add('SundayOnly', '/comics/category/sunday%20only')
#add('TankMcNamara', '/comics/strip/tankmcnamara')
#add('Tarzan', '/comics/strip/tarzan')
#add('Thatababy', '/comics/strip/thatababy')
#add('TheArgyleSweater', '/comics/panel/theargylesweater')
#add('TheBornLoser', '/comics/strip/the-born-loser')
#add('TheBuckets', '/comics/strip/thebuckets')
#add('TheDinetteSet', '/comics/panel/dinetteset')
#add('TheDuplex', '/comics/strip/duplex')
#add('TheElderberries', '/comics/strip/theelderberries')
#add('TheFlyingMcCoys', '/comics/panel/theflyingmccoys')
#add('TheFuscoBrothers', '/comics/strip/thefuscobrothers')
#add('TheGrizzwells', '/comics/strip/thegrizzwells')
#add('TheKnightLife', '/comics/strip/theknightlife')
#add('TomtheDancingBug', '/comics/strip/tomthedancingbug')
#add('UncleArtsFunland', '/comics/strip/uncleartsfunland')
add('WebExclusive', '/comics/category/web%20exclusive')
add('Women', '/comics/category/women')
#add('Ziggy', '/comics/panel/ziggy')

View file

@ -33,6 +33,7 @@ class Adventure(_VGCats):
class ViiviJaWagner(_BasicScraper):
latestUrl = 'http://www.hs.fi/viivijawagner/'
imageSearch = compile(tagre("link", "href", r'(http://hs12\.snstatic\.fi/webkuva/oletus/[^"]+)', before="image_src"))
prevSearch = compile(tagre("a", "href", r'(/viivijawagner/\d+)', before="prev-cm"))
stripUrl = None
imageSearch = compile(tagre("link", "href", r'(http://hs\d+\.snstatic\.fi/webkuva/oletus/[^"]+)', before="image_src"))
prevSearch = compile(tagre("a", "href", r'(/viivijawagner/[^"]+)', before="prev-cm"))
help = 'Index format: none'

View file

@ -5,6 +5,9 @@
from re import compile, IGNORECASE, DOTALL
from ..scraper import make_scraper
_imageSearch = compile(r'<a name="strip\d*?">.*?<img[^>]+?src="([^"]*?memberimages/.+?)"', IGNORECASE + DOTALL)
_prevSearch = compile(r'href="([^"]*?whichbutton=prev[^"]*?)"', IGNORECASE)
def add(name, subpath):
baseUrl = 'http://www.webcomicsnation.com/'
@ -13,8 +16,8 @@ def add(name, subpath):
name = 'WebcomicsNation/' + name,
latestUrl = baseUrl + subpath,
stripUrl = baseUrl + '?view=archive&amp;chapter=%s',
imageSearch = compile(r'<a name="strip\d*?">.*?<img[^>]+?src="([^"]*?memberimages/.+?)"', IGNORECASE + DOTALL),
prevSearch = compile(r'href="([^"]*?whichbutton=prev[^"]*?)"', IGNORECASE),
imageSearch = _imageSearch,
prevSearch = _prevSearch,
help = 'Index format: nnnn (non-contiguous)',
)

View file

@ -7,6 +7,11 @@ from ..scraper import make_scraper
from ..helpers import bounceStarter
_imageSearch = compile(r'SRC="(http://www\.wlpcomics\.com/adult/.+?|http://www\.wlpcomics\.com/general/.+?)"', IGNORECASE)
_prevSearch = compile(r'</a> <A HREF="(\w+.html)">Previous Page</a>', IGNORECASE)
_nextSearch = compile(r'</a> <A HREF="(\w+.html)">Next Page</a>', IGNORECASE)
def add(name, path):
baseUrl = 'http://www.wlpcomics.com/' + path
classname = 'WLP_' + name
@ -17,10 +22,10 @@ def add(name, path):
globals()[classname] = make_scraper(classname,
name = 'WLP/' + name,
starter = bounceStarter(baseUrl, compile(r'</a> <A HREF="(\w+.html)">Next Page</a>', IGNORECASE)),
starter = bounceStarter(baseUrl, _nextSearch),
stripUrl = baseUrl + '%s.html',
imageSearch = compile(r'SRC="(http://www.wlpcomics.com/adult/.+?|http://www.wlpcomics.com/general/.+?)"', IGNORECASE),
prevSearch = compile(r'</a> <A HREF="(\w+.html)">Previous Page</a>', IGNORECASE),
imageSearch = _imageSearch,
prevSearch = _prevSearch,
namer = namer,
help = 'Index format: nnn',
)

View file

@ -145,13 +145,13 @@ def get_scrapers():
"""
global _scrapers
if _scrapers is None:
out.write("Loading comic modules...")
out.write("Loading comic modules...", 2)
modules = loader.get_modules()
plugins = loader.get_plugins(modules, _BasicScraper)
_scrapers = list(plugins)
_scrapers.sort(key=lambda s: s.get_name())
check_scrapers()
out.write("... %d modules loaded." % len(_scrapers))
out.write("... %d modules loaded." % len(_scrapers), 2)
return _scrapers

View file

@ -3,7 +3,7 @@
# Copyright (C) 2012 Bastian Kleineidam
from __future__ import division, print_function
import urllib2, urlparse
import urllib, urllib2, urlparse
import requests
import sys
import os
@ -43,15 +43,19 @@ def tagre(tag, attribute, value, quote='"', before="", after=""):
@return: the generated regular expression suitable for re.compile()
@rtype: string
"""
if before:
prefix = r"[^>]*%s[^>]*\s+" % before
else:
prefix = r"(?:[^>]*\s+)?"
attrs = dict(
tag=case_insensitive_re(tag),
attribute=case_insensitive_re(attribute),
value=value,
quote=quote,
before=before,
prefix=prefix,
after=after,
)
return r'<\s*%(tag)s\s+(?:[^>]*%(before)s[^>]*\s+)?%(attribute)s\s*=\s*%(quote)s%(value)s%(quote)s[^>]*%(after)s[^>]*>' % attrs
return r'<\s*%(tag)s\s+%(prefix)s%(attribute)s\s*=\s*%(quote)s%(value)s%(quote)s[^>]*%(after)s[^>]*>' % attrs
def case_insensitive_re(name):
@ -122,7 +126,7 @@ def fetchUrls(url, imageSearch, prevSearch=None):
return imageUrls, None
def _unescape(text):
def unescape(text):
"""
Replace HTML entities and character references.
"""
@ -156,7 +160,7 @@ def normaliseURL(url):
HTML entities and character references.
"""
# XXX: brutal hack
url = _unescape(url)
url = unescape(url)
pu = list(urlparse.urlparse(url))
segments = pu[2].split('/')
@ -321,3 +325,9 @@ def strtimezone():
def asciify(name):
"""Remove non-ascii characters from string."""
return re.sub("[^0-9a-zA-Z_]", "", name)
def unquote(text):
"""Replace percent-encoded characters, repeating until none remain."""
while '%' in text:
text = urllib.unquote(text)
return text
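
A minimal sketch of the reworked tagre() in use, assuming a dosage checkout on the path; the HTML snippets are illustrative:

    from __future__ import print_function
    import re
    from dosagelib.util import tagre

    # Without before=, the new (?:[^>]*\s+)? prefix skips earlier attributes.
    pattern = re.compile(tagre("img", "src", r'(/comics/\d+/[^"]+)'))
    mo = pattern.search('<img class="strip" src="/comics/123/foo.gif">')
    print(mo.group(1))   # -> /comics/123/foo.gif

    # With before=, that marker must occur inside the tag before the attribute.
    prev = re.compile(tagre("a", "href", r'([^"]+)', before="prev-cm"))
    print(bool(prev.search('<a class="prev-cm" href="/viivijawagner/123">')))  # -> True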

1
scripts/creators.json Normal file
View file

@ -0,0 +1 @@
{"Agnes": "/comics/agnes", "AndyCapp": "/comics/andy-capp", "Archie": "/comics/archie", "ArchieinSpanish": "/comics/archie-spanish", "AskShagg": "/comics/ask-shagg", "BC": "/comics/bc", "BCinSpanish": "/comics/bc-spanish", "BallardStreet": "/comics/ballard-street", "CafeconLeche": "/comics/cafe-con-leche", "ChuckleBros": "/comics/chuckle-bros", "DaddysHome": "/comics/daddys-home", "DiamondLil": "/comics/diamond-lil", "DogEatDoug": "/comics/dog-eat-doug", "DogsofCKennel": "/comics/dogs-of-c-kennel", "DonaldDuck": "/comics/donald-duck", "Flare": "/comics/flare", "FlightDeck": "/comics/flight-deck", "FloandFriends": "/comics/flo-and-friends", "ForHeavensSake": "/comics/for-heavens-sake", "FreeRange": "/comics/free-range", "GirlsAndSports": "/comics/girls-and-sports", "GirlsandSportsinSpanish": "/comics/girls-and-sports-spanish", "Heathcliff": "/comics/heathcliff", "HeathcliffinSpanish": "/comics/heathcliff-spanish", "HerbandJamaal": "/comics/herb-and-jamaal", "HomeOffice": "/comics/stay-at-home-dad", "HopeAndDeath": "/comics/hope-and-death", "LibertyMeadows": "/comics/liberty-meadows", "LongStoryShort": "/comics/long-story-short", "MickeyMouse": "/comics/mickey-mouse", "Momma": "/comics/momma", "NestHeads": "/comics/nest-heads", "OffCenter": "/comics/off-center", "OnaClaireDay": "/comics/on-a-claire-day", "OneBigHappy": "/comics/one-big-happy", "Recess": "/comics/recess", "Rubes": "/comics/rubes", "Rugrats": "/comics/rugrats", "RugratsinSpanish": "/comics/rugrats-spanish", "ScaryGary": "/comics/scary-gary", "SpeedBump": "/comics/speed-bump", "StrangeBrew": "/comics/strange-brew", "TheBarn": "/comics/the-barn", "TheDinetteSet": "/comics/dinette-set", "TheMeaningofLila": "/comics/meaning-of-lila", "TheOtherCoast": "/comics/the-other-coast", "TheQuigmans": "/comics/the-quigmans", "TheWizardofIdinSpanish": "/comics/wizard-of-id-spanish", "ThinLines": "/comics/thin-lines", "WeePals": "/comics/wee-pals", "WizardofId": "/comics/wizard-of-id", "WorkingitOut": "/comics/working-it-out", "ZackHill": "/comics/zack-hill"}

71
scripts/creators.py Executable file
View file

@ -0,0 +1,71 @@
#!/usr/bin/env python
# Copyright (C) 2012 Bastian Kleineidam
"""
Script to get creators.com comics and save the info in a JSON file for further processing.
"""
from __future__ import print_function
import re
import sys
import os
import json
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
from dosagelib.util import getPageContent, asciify, unescape, tagre
json_file = __file__.replace(".py", ".json")
# <a href="/comics/agnes.html"><strong>Agnes</strong></a>
url_matcher = re.compile(tagre("a", "href", r'(/comics/[^/]+)\.html') + r'<strong>([^<]+)</strong>')
def contains_case_insensitive(adict, akey):
for key in adict:
if key.lower() == akey.lower():
return True
return False
def handle_url(url, res):
"""Parse one search result page."""
print("Parsing", url, file=sys.stderr)
try:
data, baseUrl = getPageContent(url)
except IOError as msg:
print("ERROR:", msg, file=sys.stderr)
return
for match in url_matcher.finditer(data):
url = match.group(1)
name = unescape(match.group(2))
name = asciify(name.replace('&', 'And').replace('@', 'At'))
if contains_case_insensitive(res, name):
# we cannot handle two comics that only differ in case
print("WARN: skipping possible duplicate", name, file=sys.stderr)
continue
res[name] = url
def save_result(res):
"""Save result to file."""
with open(json_file, 'wb') as f:
json.dump(res, f, sort_keys=True)
def get_results():
"""Parse all search result pages."""
# store info in a dictionary {name -> shortname}
res = {}
handle_url('http://www.creators.com/comics/cat-seeall.html', res)
save_result(res)
def print_results(args):
"""Print comics."""
with open(json_file, "rb") as f:
comics = json.load(f)
for name, url in sorted(comics.items()):
print("add(%r, %r)" % (str(name), str(url)))
if __name__ == '__main__':
if len(sys.argv) > 1:
print_results(sys.argv[1:])
else:
get_results()
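
All five generator scripts share the same round trip: scrape a listing page into a name-to-path dict, dump it as JSON, then print ready-to-paste add() lines. A sketch of that flow with made-up data and file name:

    from __future__ import print_function
    import json

    res = {"Agnes": "/comics/agnes", "AndyCapp": "/comics/andy-capp"}
    with open("demo.json", "wb") as f:    # Python 2: json.dump to a file object
        json.dump(res, f, sort_keys=True)

    with open("demo.json", "rb") as f:
        comics = json.load(f)
    for name, url in sorted(comics.items()):
        print("add(%r, %r)" % (str(name), str(url)))
    # -> add('Agnes', '/comics/agnes')
    # -> add('AndyCapp', '/comics/andy-capp')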

1
scripts/drunkduck.json Normal file

File diff suppressed because one or more lines are too long

82
scripts/drunkduck.py Executable file
View file

@ -0,0 +1,82 @@
#!/usr/bin/env python
# Copyright (C) 2012 Bastian Kleineidam
"""
Script to get drunkduck comics and save the info in a JSON file for further processing.
"""
from __future__ import print_function
import re
import sys
import os
import json
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
from dosagelib.util import tagre, getPageContent
json_file = __file__.replace(".py", ".json")
def contains_case_insensitive(adict, akey):
for key in adict:
if key.lower() == akey.lower():
return True
return False
def handle_url(url, url_matcher, num_matcher, res):
"""Parse one search result page."""
try:
data, baseUrl = getPageContent(url)
except IOError as msg:
print("ERROR:", msg, file=sys.stderr)
return
for match in url_matcher.finditer(data):
comicurl = match.group(1)
name = comicurl[:-1].rsplit('/')[-1]
if contains_case_insensitive(res, name):
# we cannot handle two comics that only differ in case
print("WARN: skipping possible duplicate", name, file=sys.stderr)
continue
# find out how many images this comic has
end = match.end(1)
mo = num_matcher.search(data[end:])
if not mo:
print("ERROR:", repr(data[end:end+300], file=sys.stderr))
continue
num = int(mo.group(1))
res[name] = num
def save_result(res):
"""Save result to file."""
with open(json_file, 'wb') as f:
json.dump(res, f, sort_keys=True)
def get_results():
"""Parse all search result pages."""
base = "http://www.drunkduck.com/search/?page=%d&search=&type=0&type=1&last_update="
href = re.compile(tagre("a", "href", r'(/[^"]+/)', before="size24 yanone blue"))
num = re.compile(r'(\d+) pages?</span>')
# store info in a dictionary {name -> number of comics}
res = {}
# a search for an empty string returned 825 result pages
result_pages = 825
print("Parsing", result_pages, "search result pages...", file=sys.stderr)
for i in range(1, result_pages + 1):
print(i, file=sys.stderr, end=" ")
handle_url(base % i, href, num, res)
save_result(res)
def print_results(min_strips):
"""Print all comics that have at least the given number of minimum comic strips."""
with open(json_file, "rb") as f:
comics = json.load(f)
for name, num in sorted(comics.items()):
if num >= min_strips:
print("add('%s')" % name)
if __name__ == '__main__':
if len(sys.argv) > 1:
print_results(int(sys.argv[1]))
else:
get_results()
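
drunkduck.py differs from the other generators in that it also records how many strips each comic has, so print_results can filter by a minimum; a tiny sketch of the count extraction (the HTML snippet is made up):

    from __future__ import print_function
    import re

    num_matcher = re.compile(r'(\d+) pages?</span>')
    snippet = '<span class="size10">342 pages</span>'
    mo = num_matcher.search(snippet)
    print(int(mo.group(1)))   # -> 342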

9
scripts/generate_json.sh Executable file
View file

@ -0,0 +1,9 @@
#!/bin/sh -e
set -u
d=$(dirname $0)
for script in creators gocomics drunkduck universal keenspot; do
echo "Executing ${script}.py"
"${d}/${script}.py"
done

1
scripts/gocomics.json Normal file

File diff suppressed because one or more lines are too long

87
scripts/gocomics.py Executable file
View file

@ -0,0 +1,87 @@
#!/usr/bin/env python
# Copyright (C) 2012 Bastian Kleineidam
"""
Script to get gocomics and save the info in a JSON file for further processing.
"""
from __future__ import print_function
import re
import sys
import os
import json
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
from dosagelib.util import tagre, getPageContent, asciify, unescape
from dosagelib.scraper import get_scrapers
json_file = __file__.replace(".py", ".json")
#<a href="/shortname" class="alpha_list updated">name</a>
url_matcher = re.compile(tagre("a", "href", r'(/[^"]+)', after="alpha_list") + r"([^<]+)</a>")
def contains_case_insensitive(adict, akey):
for key in adict:
if key.lower() == akey.lower():
return True
return False
def handle_url(url, res):
"""Parse one search result page."""
print("Parsing", url, file=sys.stderr)
try:
data, baseUrl = getPageContent(url)
except IOError as msg:
print("ERROR:", msg, file=sys.stderr)
return
for match in url_matcher.finditer(data):
shortname = match.group(1)
name = unescape(match.group(2))
name = asciify(name.replace('&', 'And').replace('@', 'At'))
if contains_case_insensitive(res, name):
# we cannot handle two comics that only differ in case
print("WARN: skipping possible duplicate", name, file=sys.stderr)
continue
res[name] = shortname
def save_result(res):
"""Save result to file."""
with open(json_file, 'wb') as f:
json.dump(res, f, sort_keys=True)
def get_results():
"""Parse all search result pages."""
# store info in a dictionary {name -> shortname}
res = {}
handle_url('http://www.gocomics.com/features', res)
handle_url('http://www.gocomics.com/explore/editorial_list', res)
handle_url('http://www.gocomics.com/explore/sherpa_list', res)
save_result(res)
def has_creators_comic(name):
cname = "Creators/%s" % name
for scraperclass in get_scrapers():
lname = scraperclass.get_name().lower()
if lname == cname.lower():
return True
return False
def print_results(args):
"""Print all comics that have at least the given number of minimum comic strips."""
with open(json_file, "rb") as f:
comics = json.load(f)
for name, shortname in sorted(comics.items()):
if has_creators_comic(name):
prefix = '#'
else:
prefix = ''
print("%sadd(%r, %r)" % (prefix, str(name), str(shortname)))
if __name__ == '__main__':
if len(sys.argv) > 1:
print_results(sys.argv[1:])
else:
get_results()

1
scripts/keenspot.json Normal file

File diff suppressed because one or more lines are too long

100
scripts/keenspot.py Executable file
View file

@ -0,0 +1,100 @@
#!/usr/bin/env python
# Copyright (C) 2012 Bastian Kleineidam
"""
Script to get keenspot comics and save the info in a JSON file for further processing.
"""
from __future__ import print_function
import re
import sys
import os
import json
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
from dosagelib.util import getPageContent, asciify, unescape, tagre
from dosagelib.scraper import get_scrapers
json_file = __file__.replace(".py", ".json")
# <div class="comictitle"><strong><a target="_blank" onclick="pageTrackerCG._link('http://collegepros.comicgenesis.com'); return false;" href="http://collegepros.comicgenesis.com">Adventures of the College Pros</a>
url_matcher = re.compile(r'<div class="comictitle"><strong>' + tagre("a", "href", r'(http://[^"]+)') + r'([^<]+)</a>')
num_matcher = re.compile(r'Number of Days: (\d+)')
def contains_case_insensitive(adict, akey):
for key in adict:
if key.lower() == akey.lower():
return True
return False
def handle_url(url, res):
"""Parse one search result page."""
print("Parsing", url, file=sys.stderr)
try:
data, baseUrl = getPageContent(url)
except IOError as msg:
print("ERROR:", msg, file=sys.stderr)
return
for match in url_matcher.finditer(data):
url = match.group(1) + '/'
name = unescape(match.group(2))
name = asciify(name.replace('&', 'And').replace('@', 'At'))
if contains_case_insensitive(res, name):
# we cannot handle two comics that only differ in case
print("WARN: skipping possible duplicate", name, file=sys.stderr)
continue
# find out how many images this comic has
end = match.end()
mo = num_matcher.search(data[end:])
if not mo:
print("ERROR:", repr(data[end:end+300], file=sys.stderr))
continue
num = int(mo.group(1))
res[name] = (url, num)
def save_result(res):
"""Save result to file."""
with open(json_file, 'wb') as f:
json.dump(res, f, sort_keys=True)
def get_results():
"""Parse all search result pages."""
# store info in a dictionary {name -> shortname}
res = {}
base = 'http://guide.comicgenesis.com/Keenspace_%s.html'
for c in '0ABCDEFGHIJKLMNOPQRSTUVWXYZ':
handle_url(base % c, res)
save_result(res)
def has_comic(name):
cname = ("Creators/%s" % name).lower()
gname = ("GoComics/%s" % name).lower()
for scraperclass in get_scrapers():
lname = scraperclass.get_name().lower()
if lname == cname or lname == gname:
return True
return False
def print_results(args):
"""Print all comics that have at least the given number of minimum comic strips."""
min_comics = int(args[0])
with open(json_file, "rb") as f:
comics = json.load(f)
for name, entry in sorted(comics.items()):
url, num = entry
if num < min_comics:
continue
if has_comic(name):
prefix = '#'
else:
prefix = ''
print("%sadd(%r, %r)" % (prefix, str(name), str(url)))
if __name__ == '__main__':
if len(sys.argv) > 1:
print_results(sys.argv[1:])
else:
get_results()

93
scripts/mktestpage.py Executable file
View file

@ -0,0 +1,93 @@
#!/usr/bin/env python
from __future__ import print_function
import sys
import os
import stat
import time
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
from dosagelib.scraper import get_scrapers
htmltemplate = """
<!DOCTYPE html>
<head>
<meta charset="utf-8">
<title>Dosage test results</title>
<meta name="description" content="">
<meta name="viewport" content="width=device-width">
<link rel="stylesheet" href="css/normalize.css">
<link rel="stylesheet" href="css/main.css">
</head>
<body>
<p>Dosage test results from %(date)s</p>
<ul>
%(content)s
</ul>
</body>
</html>
"""
def get_mtime (filename):
"""Return modification time of filename."""
return os.stat(filename)[stat.ST_MTIME]
def strdate(t):
return time.strftime("%d.%m.%Y", time.localtime(t))
def get_test_name(line):
classname = line.split('::')[1][4:]
for scraper in get_scrapers():
if scraper.__name__ == classname:
try:
url = scraper.starter()
except Exception:
url = None
return scraper.get_name(), url
raise ValueError("Scraper %r not found" % classname)
def get_test(line):
name, url = get_test_name(line)
if line.startswith(". "):
name += " OK"
else:
name += " FAILED"
return name, url
def get_content(filename):
tests = []
with open(filename, "r") as f:
print("Tests parsed: 0", end=" ", file=sys.stderr)
num_tests = 0
for line in f:
if line.startswith((". ", "F ")) and "test_comics" in line:
num_tests += 1
tests.append(get_test(line))
if num_tests % 5 == 0:
print(num_tests, end=" ", file=sys.stderr)
tests.sort()
res = []
for name, url in tests:
css = name.split()[-1].lower()
if url:
inner = '<a href="%s" class="%s">%s</a>' % (url, css, name)
else:
inner = '<span class="%s">%s</span>' % (css, name)
res.append(' <li>%s</li>' % inner)
return os.linesep.join(res)
def main(args):
filename = "testresults.txt"
modified = get_mtime(filename)
content = get_content(filename)
attrs = {"date": strdate(modified), "content": content}
print(htmltemplate % attrs)
return 0
if __name__ == '__main__':
sys.exit(main(sys.argv[1:]))
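
mktestpage.py assumes testresults.txt holds pytest-style result lines; a hedged sketch of the per-line parsing, with an illustrative sample line:

    from __future__ import print_function

    line = ". tests/test_comics.py::TestDilbert::test_comic"   # illustrative
    classname = line.split('::')[1][4:]          # drop the 'Test' prefix
    status = "OK" if line.startswith(". ") else "FAILED"
    print(classname, status)                     # -> Dilbert OK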

11
scripts/mktestscript.sh Executable file
View file

@ -0,0 +1,11 @@
#!/bin/sh -e
set -u
# generates a convenience test script from failed tests
script=test.sh
rm -f "$script"
echo "#!/bin/sh -e" > "$script"
egrep -v "^\. " testresults.txt | egrep "^F " | cut -b "3-" | awk '{ print "make test TESTS=" $0; }' >> "$script"
chmod 755 "$script"

17
scripts/removeafter.py Executable file
View file

@ -0,0 +1,17 @@
#!/usr/bin/env python
"""Remove all lines after a given marker line.
"""
from __future__ import print_function
import fileinput
import sys
def main(args):
filename = args[0]
marker = args[1]
for line in fileinput.input(filename, inplace=1):
print(line.rstrip())
if line.startswith(marker):
break
if __name__ == '__main__':
main(sys.argv[1:])
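
In practice removeafter.py truncates a generated plugin module after its marker so fresh entries can be appended, as update_plugins does below; an equivalent standalone call, assuming it is run from the repository root:

    import subprocess

    # Truncate everything after the marker line; new add() entries can
    # then be appended. Paths assume the repository root as working dir.
    subprocess.call(["scripts/removeafter.py",
                     "dosagelib/plugins/creators.py", "# DO NOT REMOVE"])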

1
scripts/universal.json Normal file
View file

@ -0,0 +1 @@
{"9ChickweedLane": "/comics/strip/9chickweedlane", "AdamAtHome": "/comics/strip/adamathome", "AlleyOop": "/comics/strip/alley-oop", "ArloandJanis": "/comics/strip/arloandjanis", "BadReporter": "/comics/badreporter", "Baldo": "/comics/strip/baldo", "Betty": "/comics/strip/betty", "BigNate": "/comics/strip/bignate", "Biographic": "/comics/strip/biographic", "Brevitystrip": "/comics/strip/brevity", "BusinessAndFinance": "/comics/category/business%20%26%20finance", "CalvinandHobbes": "/comics/strip/calvinandhobbes", "Cathy": "/comics/strip/cathy", "Cleats": "/comics/strip/cleats", "ClosetoHome": "/comics/panel/closetohome", "ComicPanel": "/comics/panel", "ComicStrip": "/comics/strip", "ComicsAZ": "/comics/list", "Cornered": "/comics/panel/cornered", "CowandBoy": "/comics/strip/cowandboy", "CuldeSac": "/comics/strip/culdesac", "Dilbert": "/comics/strip/dilbert", "Doonesbury": "/comics/strip/doonesbury", "Drabble": "/comics/strip/drabble", "Espaol": "/comics/category/espanol", "FMinus": "/comics/strip/fminus", "Family": "/comics/category/family", "ForBetterorForWorse": "/comics/strip/forbetterorforworse", "ForKids": "/comics/category/for%20kids", "FoxTrot": "/comics/strip/foxtrot", "FrankAndErnest": "/comics/strip/frankandernest", "Frazz": "/comics/strip/frazz", "FredBasset": "/comics/strip/fredbasset", "FreshlySqueezed": "/comics/strip/freshlysqueezed", "Garfield": "/comics/strip/garfield", "GetFuzzy": "/comics/strip/getfuzzy", "GingerMeggs": "/comics/strip/gingermeggs", "Graffiti": "/comics/panel/graffiti", "GrandAvenue": "/comics/strip/grand-avenue", "HealthCapsules": "/comics/panel/healthcapsules", "HeartoftheCity": "/comics/strip/heartofthecity", "Herman": "/comics/panel/herman", "InkPen": "/comics/strip/inkpen", "IntheBleachers": "/comics/panel/inthebleachers", "IntheSticks": "/comics/strip/inthesticks", "JamesBond": "/comics/strip/jamesbond", "JumpStart": "/comics/strip/jumpstart", "KidCity": "/comics/strip/kidcity", "KidSpot": "/comics/panel/kidspot", "KitNCarlyle": "/comics/panel/kitncarlyle", "LaCucaracha": "/comics/strip/lacucaracha", "Lio": "/comics/strip/lio", "Lola": "/comics/strip/lola", "Luann": "/comics/strip/luann", "MagicinaMinute": "/comics/strip/magicinaminute", "Marmaduke": "/comics/panel/marmaduke", "Men": "/comics/category/men", "ModeratelyConfused": "/comics/panel/moderately-confused", "Monty": "/comics/strip/monty", "MrGigiandtheSquid": "/comics/strip/mr-gigi-and-the-squid", "MuttAndJeff": "/comics/strip/muttandjeff", "NEA": "/comics/category/nea", "Nancy": "/comics/strip/nancy", "NonSequitur": "/comics/strip/nonsequitur", "NonSequiturPanel": "/comics/panel/non-sequitur-panel", "OfftheMark": "/comics/panel/offthemark", "Overboard": "/comics/strip/overboard", "OvertheHedge": "/comics/strip/overthehedge", "Peanuts": "/comics/strip/peanuts", "PearlsBeforeSwine": "/comics/strip/pearlsbeforeswine", "Pets": "/comics/category/pets", "PoochCafe": "/comics/strip/poochcafe", "PricklyCity": "/comics/strip/pricklycity", "RealLifeAdventures": "/comics/panel/reallifeadventures", "RealityCheck": "/comics/panel/realitycheck", "RedandRover": "/comics/strip/redandrover", "RipHaywire": "/comics/strip/riphaywire", "RipleysBelieveItorNot": "/comics/panel/ripleysbelieveitornot", "RoseisRose": "/comics/strip/roseisrose", "RudyPark": "/comics/strip/rudypark", "Shortcuts": "/comics/strip/shortcuts", "SouptoNutz": "/comics/strip/soup-to-nutz", "StoneSoup": "/comics/strip/stonesoup", "SundayOnly": "/comics/category/sunday%20only", "TankMcNamara": "/comics/strip/tankmcnamara", "Tarzan": 
"/comics/strip/tarzan", "Thatababy": "/comics/strip/thatababy", "TheArgyleSweater": "/comics/panel/theargylesweater", "TheBornLoser": "/comics/strip/the-born-loser", "TheBuckets": "/comics/strip/thebuckets", "TheDinetteSet": "/comics/panel/dinetteset", "TheDuplex": "/comics/strip/duplex", "TheElderberries": "/comics/strip/theelderberries", "TheFlyingMcCoys": "/comics/panel/theflyingmccoys", "TheFuscoBrothers": "/comics/strip/thefuscobrothers", "TheGrizzwells": "/comics/strip/thegrizzwells", "TheKnightLife": "/comics/strip/theknightlife", "TomtheDancingBug": "/comics/strip/tomthedancingbug", "UncleArtsFunland": "/comics/strip/uncleartsfunland", "WebExclusive": "/comics/category/web%20exclusive", "Women": "/comics/category/women", "Ziggy": "/comics/panel/ziggy"}

86
scripts/universal.py Executable file
View file

@ -0,0 +1,86 @@
#!/usr/bin/env python
# Copyright (C) 2012 Bastian Kleineidam
"""
Script to get universal comics and save the info in a JSON file for further processing.
"""
from __future__ import print_function
import re
import sys
import os
import json
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
from dosagelib.util import getPageContent, asciify, unescape
from dosagelib.scraper import get_scrapers
json_file = __file__.replace(".py", ".json")
#<li><a href="/comics/strip/9chickweedlane">9 Chickweed Lane</a>
url_matcher = re.compile(r'<li><a href="(/comics/[^"]+)">([^<]+)</a>')
def contains_case_insensitive(adict, akey):
for key in adict:
if key.lower() == akey.lower():
return True
return False
def handle_url(url, res):
"""Parse one search result page."""
print("Parsing", url, file=sys.stderr)
try:
data, baseUrl = getPageContent(url)
except IOError as msg:
print("ERROR:", msg, file=sys.stderr)
return
for match in url_matcher.finditer(data):
shortname = match.group(1)
name = unescape(match.group(2))
name = asciify(name.replace('&', 'And').replace('@', 'At'))
if contains_case_insensitive(res, name):
# we cannot handle two comics that only differ in case
print("WARN: skipping possible duplicate", name, file=sys.stderr)
continue
res[name] = shortname
def save_result(res):
"""Save result to file."""
with open(json_file, 'wb') as f:
json.dump(res, f, sort_keys=True)
def get_results():
"""Parse all search result pages."""
# store info in a dictionary {name -> shortname}
res = {}
handle_url('http://www.universaluclick.com/comics/list', res)
save_result(res)
def has_comic(name):
cname = ("Creators/%s" % name).lower()
gname = ("GoComics/%s" % name).lower()
for scraperclass in get_scrapers():
lname = scraperclass.get_name().lower()
if lname == cname or lname == gname:
return True
return False
def print_results(args):
"""Print all comics that have at least the given number of minimum comic strips."""
with open(json_file, "rb") as f:
comics = json.load(f)
for name, shortname in sorted(comics.items()):
if has_comic(name):
prefix = '#'
else:
prefix = ''
print("%sadd(%r, %r)" % (prefix, str(name), str(shortname)))
if __name__ == '__main__':
if len(sys.argv) > 1:
print_results(sys.argv[1:])
else:
get_results()
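
This dedup check explains why many add() lines in the regenerated dosagelib/plugins/universal.py above arrive commented out: anything already covered by a Creators or GoComics scraper is emitted with a '#' prefix. A trivial sketch (names illustrative):

    from __future__ import print_function

    name, shortname = 'Dilbert', '/comics/strip/dilbert'
    covered = True    # pretend has_comic(name) found an existing scraper
    prefix = '#' if covered else ''
    print("%sadd(%r, %r)" % (prefix, str(name), str(shortname)))
    # -> #add('Dilbert', '/comics/strip/dilbert')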

13
scripts/update_plugins.sh Executable file
View file

@ -0,0 +1,13 @@
#!/bin/sh -e
set -u
mincomics=100
d=$(dirname $0)
for script in creators gocomics drunkduck universal keenspot; do
target="${d}/../dosagelib/plugins/${script}.py"
echo "Upating $target"
"${d}/removeafter.py" "$target" "# DO NOT REMOVE"
"${d}/${script}.py" $mincomics >> "$target"
done