Add comic scripts, add fixes and other stuff.
commit 451fd982d9 (parent a52e5ae575)
41 changed files with 4130 additions and 4523 deletions

.gitignore (vendored, 2 changes)

@@ -11,3 +11,5 @@
 /_Dosage_configdata.py
 /comics.test
 /testresults.txt
+/dosage.prof
+/test.sh

MANIFEST.in

@@ -1,4 +1,6 @@
 include MANIFEST.in
 include COPYING doc/*.txt doc/*.1 doc/*.html
 include Makefile
+include requirements.txt
+include scripts/*.py scripts/*.sh
 recursive-include tests *.py

Makefile (13 changes)

@@ -3,7 +3,7 @@ PYVER:=2.7
 PYTHON:=python$(PYVER)
 VERSION:=$(shell $(PYTHON) setup.py --version)
 ARCHIVE:=dosage-$(VERSION).tar.gz
-PY_FILES_DIRS := dosage dosagelib tests *.py
+PY_FILES_DIRS := dosage dosagelib scripts tests *.py
 PY2APPOPTS ?=
 NUMPROCESSORS:=$(shell grep -c processor /proc/cpuinfo)
 # Pytest options:

@@ -33,8 +33,8 @@ doc/dosage.1.html: doc/dosage.1

 release: distclean releasecheck dist
 	git tag v$(VERSION)
-#	@echo "Register at Python Package Index..."
-#	$(PYTHON) setup.py register
+	@echo "Register at Python Package Index..."
+	$(PYTHON) setup.py register
 #	freecode-submit < dosage.freecode


@@ -75,7 +75,7 @@ clean:
 	rm -rf build dist

 distclean: clean
-	rm -rf build dist Dosage.egg-info
+	rm -rf build dist Dosage.egg-info dosage.prof test.sh testresults.txt
 	rm -f _Dosage_configdata.py MANIFEST

 localbuild:

@@ -87,11 +87,8 @@ test: localbuild
 deb:
 	git-buildpackage --git-export-dir=../build-area/ --git-upstream-branch=master --git-debian-branch=debian --git-ignore-new

-comics:
-	./dosage -v @@ > comics.log 2>&1
-
 update-copyright:
 	update-copyright --holder="Bastian Kleineidam"

-.PHONY: update-copyright comics deb test clean distclean count pyflakes
+.PHONY: update-copyright deb test clean distclean count pyflakes
 .PHONY: doccheck check releasecheck release dist chmod localbuild

@@ -21,7 +21,7 @@ you may be infringing upon various copyrights.

 Usage
 ------
-List available comics (over 4400 at the moment):
+List available comics (over 3500 at the moment):

 `$ dosage -l`

@@ -12,7 +12,10 @@ Changes:
 - comics: Removed the twisted and zope dependencies by adding
   an internal plugin search mechanism.
 - comics: Remove the disable mechanism.
-- testing: Refactored the test comic routine in proper unit tests.
+- comics: Add scripts to automate comic listings for Creators, Universal,
+  KeenSpot, GoComics and DrunkDuck.
+- testing: Refactored the test comic routine into fully automatic and complete
+  test cases for every comic.
+- cmdline: Improved terminal feature detection.

 Fixes:

dosage (15 changes)

@@ -202,5 +202,20 @@ def main():
     return res


+def profile():
+    """Profile the loading of all scrapers."""
+    import cProfile
+    cProfile.run("scraper.get_scrapers()", "dosage.prof")
+
+
+def viewprof():
+    """View profile stats."""
+    import pstats
+    stats = pstats.Stats("dosage.prof")
+    stats.strip_dirs().sort_stats("cumulative").print_stats(100)
+
+
 if __name__ == '__main__':
     sys.exit(main())
+    #profile()
+    #viewprof()
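For reference, the two added helpers are meant to replace the main() call temporarily while measuring startup cost; a minimal sketch of that workflow (assuming dosagelib's scraper module is importable, as it is inside the dosage script):

    # Hypothetical debugging session mirroring the commented-out calls above.
    from dosagelib import scraper
    import cProfile, pstats

    cProfile.run("scraper.get_scrapers()", "dosage.prof")  # write profile data
    stats = pstats.Stats("dosage.prof")                     # read it back
    stats.strip_dirs().sort_stats("cumulative").print_stats(10)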

@@ -83,7 +83,7 @@ def GetConsoleScreenBufferInfo(stream_id=STDOUT):
     """Get console screen buffer info object."""
     handle = handles[stream_id]
     csbi = CONSOLE_SCREEN_BUFFER_INFO()
-    success = windll.kernel32.GetConsoleScreenBufferInfo(
+    windll.kernel32.GetConsoleScreenBufferInfo(
         handle, byref(csbi))
     return csbi


@@ -8,7 +8,7 @@ import rfc822
 import time

 from .output import out
-from .util import urlopen, saneDataSize, normaliseURL
+from .util import urlopen, saneDataSize, normaliseURL, unquote
 from .events import getHandler

 class FetchComicError(IOError):

@@ -54,10 +54,10 @@ class ComicImage(object):
         """Connect to host and get meta information."""
         try:
             self.urlobj = urlopen(self.url, referrer=self.referrer)
-        except IOError as he:
-            raise FetchComicError('Unable to retrieve URL.', self.url, he.code)
+        except IOError as msg:
+            raise FetchComicError('Unable to retrieve URL.', self.url, msg)

-        content_type = self.urlobj.headers.get('content-type')
+        content_type = unquote(self.urlobj.headers.get('content-type'))
         content_type = content_type.split(';', 1)[0]
         if '/' in content_type:
             maintype, subtype = content_type.split('/', 1)

@@ -65,7 +65,7 @@ class ComicImage(object):
             maintype = content_type
             subtype = None
         if maintype != 'image' and content_type not in ('application/octet-stream', 'application/x-shockwave-flash'):
-            raise FetchComicError('No suitable image found to retrieve.', self.url)
+            raise FetchComicError('Content type %r is not an image.' % content_type, self.url)

         # Always use mime type for file extension if it is sane.
         if maintype == 'image':
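The new unquote() call guards against servers that send percent-encoded content types; a quick sketch of the effect, using the helper exactly as added to util.py below (the header values are made-up examples):

    import urllib

    def unquote(text):
        # repeat until no escapes remain, as in the new util.unquote
        while '%' in text:
            text = urllib.unquote(text)
        return text

    print(unquote('image%2Fpng'))    # -> image/png
    print(unquote('image%252Fpng'))  # -> image/png (double-encoded)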

@@ -4,78 +4,75 @@

 from re import compile
 from ..scraper import make_scraper
-from ..util import tagre, asciify
+from ..util import tagre

-def add(name, shortname):
-    baseUrl = 'http://www.creators.com/comics/'
-    classname = 'Creators_%s' % asciify(name)
+_imageSearch = compile(tagre("img", "src", r'(/comics/\d+/[^"]+)'))
+
+def add(name, path):
+    baseurl = 'http://www.creators.com'
+    classname = 'Creators_%s' % name
     globals()[classname] = make_scraper(classname,
         name = 'Creators/' + name,
-        latestUrl = baseUrl + shortname + '.html',
-        stripUrl = baseUrl + shortname + '/%s.html',
-        imageSearch = compile(tagre("img", "src", r'(/comics/\d+/[^"]+)')),
-        prevSearch = compile(tagre("a", "href", r'(/comics/%s/\d+\.html)' % shortname) +
+        latestUrl = baseurl + path + '.html',
+        stripUrl = baseurl + path + '/%s.html',
+        imageSearch = _imageSearch,
+        prevSearch = compile(tagre("a", "href", r'(%s/\d+\.html)' % path) +
             tagre("img", "src", r'/img_comics/arrow_l\.gif')),
         help = 'Index format: n',
     )


 # for a complete list see http://www.creators.com/comics/cat-seeall.html
-comics = {
-    'Agnes': 'agnes',
-    'AndyCapp': 'andy-capp',
-    'Archie': 'archie',
-    'AskShagg': 'ask-shagg',
-    'BallardStreet': 'ballard-street',
-    'BC': 'bc',
-    'TheBarn': 'the-barn',
-    'CafeConLeche': 'cafe-con-leche',
-    'ChuckleBros': 'chuckle-bros',
-    'DaddysHome': 'daddys-home',
-    'DiamondLil': 'diamond-lil',
-    'TheDinetteSet': 'dinette-set',
-    'DogEatDoug': 'dog-eat-doug',
-    'DogsOfCKennel': 'dogs-of-c-kennel',
-    'DonaldDuck': 'donald-duck',
-    'FloAndFriends': 'flo-and-friends',
-    'Flare': 'flare',
-    'FlightDeck': 'flight-deck',
-    'ForHeavensSake': 'for-heavens-sake',
-    'FreeRange': 'free-range',
-    'GirlsAndSports': 'girls-and-sports',
-    'Heathcliff': 'heathcliff',
-    'HerbAndJamaal': 'herb-and-jamaal',
-    'HopeAndDeath': 'hope-and-death',
-    'LibertyMeadows': 'liberty-meadows',
-    'TheMeaningOfLila': 'meaning-of-lila',
-    'MickeyMouse': 'mickey-mouse',
-    'Momma': 'momma',
-    'NestHeads': 'nest-heads',
-    'OneBigHappy': 'one-big-happy',
-    'OnAClaireDay': 'on-a-claire-day',
-    'TheOtherCoast': 'the-other-coast',
-    'TheQuigmans': 'the-quigmans',
-    'Rubes': 'rubes',
-    'Rugrats': 'rugrats',
-    'ScaryGary': 'scary-gary',
-    'SpeedBump': 'speed-bump',
-    'StrangeBrew': 'strange-brew',
-    'ThinLines': 'thin-lines',
-    'WeePals': 'wee-pals',
-    'WizardOfId': 'wizard-of-id',
-    'WorkingItOut': 'working-it-out',
-    'ZackHill': 'zack-hill',
-    'BCSpanish': 'bc-spanish',
-    'WizardOfIdSpanish': 'wizard-of-id-spanish',
-    'ArchieSpanish': 'archie-spanish',
-    'HeathcliffSpanish': 'heathcliff-spanish',
-    'RugratsSpanish': 'rugrats-spanish',
-    'LongStoryShort': 'long-story-short',
-    'Recess': 'recess',
-    'HomeOffice': 'stay-at-home-dad',
-    'OffCenter': 'off-center',
-    'GirlsAndSportsSpanish': 'girls-and-sports-spanish',
-}
-
-for name, shortname in comics.items():
-    add(name, shortname)
+# do not edit anything below since these entries are generated from scripts/update.sh
+# DO NOT REMOVE
+add('Agnes', '/comics/agnes')
+add('AndyCapp', '/comics/andy-capp')
+add('Archie', '/comics/archie')
+add('ArchieinSpanish', '/comics/archie-spanish')
+add('AskShagg', '/comics/ask-shagg')
+add('BC', '/comics/bc')
+add('BCinSpanish', '/comics/bc-spanish')
+add('BallardStreet', '/comics/ballard-street')
+add('CafeconLeche', '/comics/cafe-con-leche')
+add('ChuckleBros', '/comics/chuckle-bros')
+add('DaddysHome', '/comics/daddys-home')
+add('DiamondLil', '/comics/diamond-lil')
+add('DogEatDoug', '/comics/dog-eat-doug')
+add('DogsofCKennel', '/comics/dogs-of-c-kennel')
+add('DonaldDuck', '/comics/donald-duck')
+add('Flare', '/comics/flare')
+add('FlightDeck', '/comics/flight-deck')
+add('FloandFriends', '/comics/flo-and-friends')
+add('ForHeavensSake', '/comics/for-heavens-sake')
+add('FreeRange', '/comics/free-range')
+add('GirlsAndSports', '/comics/girls-and-sports')
+add('GirlsandSportsinSpanish', '/comics/girls-and-sports-spanish')
+add('Heathcliff', '/comics/heathcliff')
+add('HeathcliffinSpanish', '/comics/heathcliff-spanish')
+add('HerbandJamaal', '/comics/herb-and-jamaal')
+add('HomeOffice', '/comics/stay-at-home-dad')
+add('HopeAndDeath', '/comics/hope-and-death')
+add('LibertyMeadows', '/comics/liberty-meadows')
+add('LongStoryShort', '/comics/long-story-short')
+add('MickeyMouse', '/comics/mickey-mouse')
+add('Momma', '/comics/momma')
+add('NestHeads', '/comics/nest-heads')
+add('OffCenter', '/comics/off-center')
+add('OnaClaireDay', '/comics/on-a-claire-day')
+add('OneBigHappy', '/comics/one-big-happy')
+add('Recess', '/comics/recess')
+add('Rubes', '/comics/rubes')
+add('Rugrats', '/comics/rugrats')
+add('RugratsinSpanish', '/comics/rugrats-spanish')
+add('ScaryGary', '/comics/scary-gary')
+add('SpeedBump', '/comics/speed-bump')
+add('StrangeBrew', '/comics/strange-brew')
+add('TheBarn', '/comics/the-barn')
+add('TheDinetteSet', '/comics/dinette-set')
+add('TheMeaningofLila', '/comics/meaning-of-lila')
+add('TheOtherCoast', '/comics/the-other-coast')
+add('TheQuigmans', '/comics/the-quigmans')
+add('TheWizardofIdinSpanish', '/comics/wizard-of-id-spanish')
+add('ThinLines', '/comics/thin-lines')
+add('WeePals', '/comics/wee-pals')
+add('WizardofId', '/comics/wizard-of-id')
+add('WorkingitOut', '/comics/working-it-out')
+add('ZackHill', '/comics/zack-hill')
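The module now builds one scraper class per add() call and registers it through globals(); a minimal sketch of that pattern (make_scraper here is a stand-in that simply builds a class from keyword attributes, which matches how it is used above):

    def make_scraper(classname, **attrs):
        # dosagelib.scraper.make_scraper builds a scraper class; a plain
        # type() call is enough to illustrate the registration trick.
        return type(classname, (object,), attrs)

    def add(name, path):
        classname = 'Creators_%s' % name
        globals()[classname] = make_scraper(classname,
            name='Creators/' + name,
            latestUrl='http://www.creators.com' + path + '.html')

    add('Agnes', '/comics/agnes')
    # the class was injected into globals(), so it resolves by name at runtime
    print(Creators_Agnes.latestUrl)  # -> http://www.creators.com/comics/agnes.html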

@@ -143,4 +143,10 @@ class DresdenCodak(_BasicScraper):
     starter = indirectStarter('http://dresdencodak.com/', compile(r'<div id="preview"><a href="http://dresdencodak.com/(\d+/\d+/\d+/.*?)">'))


+# XXX dilbert.com
+class Dilbert(_BasicScraper):
+    latestUrl = 'http://dilbert.com/'
+    stripUrl = latestUrl + '%s/'
+    prevSearch = compile(tagre("a", "href", r'(/\d+-\d+-\d+/)', after="STR_Prev"))
+    imageSearch = compile(tagre("img", "src", r'(/dyn/str_strip/[^"]+\.strip\.zoom\.gif)'))
+    help = 'Index format: yyyy-mm-dd'
+    # XXX namer

(File diff suppressed because it is too large.)

@@ -7,6 +7,9 @@ from ..scraper import make_scraper
 from ..util import asciify


+_imageSearch = compile(r'SRC="(http://www\.thefallenangel\.co\.uk/\w+comics/.+?)"')
+_prevSearch = compile(r' <a href="(http://www\.thefallenangel\.co\.uk/.+?)"><img[^>]+?src="http://www\.thefallenangel\.co\.uk/images/previousday\.jpg"')
+
 def add(name, shortname):
     latestUrl = 'http://www.thefallenangel.co.uk/cgi-bin/%sautokeen/autokeenlite.cgi' % shortname
     classname = asciify(name)

@@ -14,8 +17,8 @@ def add(name, shortname):
         latestUrl = latestUrl,
         stripUrl = latestUrl + '?date=%s',
         name='FallenAngel/' + name,
-        imageSearch = compile(r'SRC="(http://www.thefallenangel.co.uk/\w+comics/.+?)"'),
-        prevSearch = compile(r' <a href="(http://www.thefallenangel.co.uk/.+?)"><img[^>]+?src="http://www.thefallenangel.co.uk/images/previousday.jpg"'),
+        imageSearch = _imageSearch,
+        prevSearch = _prevSearch,
         help = 'Index format: yyyymmdd',
     )

(File diff suppressed because it is too large.)

@@ -4,7 +4,6 @@

 from re import compile, IGNORECASE
 from ..scraper import _BasicScraper
-from ..util import tagre


 class Key(_BasicScraper):

(File diff suppressed because it is too large.)

@@ -6,6 +6,9 @@ from re import compile
 from ..scraper import make_scraper
 from ..util import tagre

+_imageSearch = compile(tagre("img", "src", r'(http://www\.nuklearpower\.com/comics/[^"]+)'))
+_prevSearch = compile(tagre("a", "href", r'([^"]+)') + "Previous")
+
 def add(name, shortname):
     baseUrl = 'http://www.nuklearpower.com/' + shortname + '/'
     classname = 'NuklearPower_%s' % name

@@ -14,8 +17,8 @@ def add(name, shortname):
         name='NuklearPower/' + name,
         latestUrl = baseUrl,
         stripUrl = baseUrl + '%s',
-        imageSearch = compile(tagre("img", "src", r'(http://www\.nuklearpower\.com/comics/[^"]+)')),
-        prevSearch = compile(tagre("a", "href", r'([^"]+)') + "Previous"),
+        imageSearch = _imageSearch,
+        prevSearch = _prevSearch,
         help = 'Index format: yyyy/mm/dd/name',
     )


@@ -6,6 +6,8 @@ from ..scraper import make_scraper
 from ..helpers import bounceStarter
 from ..util import tagre

+_imageSearch = compile(tagre("img", "src", r'(http://www\.smackjeeves\.com/images/uploaded/comics/[^"]*)'))
+_prevSearch = compile(tagre("a", "href", r'(/comics/\d+/[^"]*)') + '<img[^>]*alt="< Previous"')

 def add(name):
     classname = 'SmackJeeves/' + name

@@ -20,8 +22,8 @@ def add(name):
     globals()[classname] = make_scraper(classname,
         starter=bounceStarter(baseUrl, compile(tagre("a", "href", r'(/comics/\d+/[^"]*)') + '<img[^>]*alt="Next >"')),
         stripUrl = baseUrl,
-        imageSearch = compile(tagre("img", "src", r'(http://www\.smackjeeves\.com/images/uploaded/comics/[^"]*)')),
-        prevSearch = compile(tagre("a", "href", r'(/comics/\d+/[^"]*)') + '<img[^>]*alt="< Previous"'),
+        imageSearch = _imageSearch,
+        prevSearch = _prevSearch,
         help = 'Index format: nnnn (some increasing number)',
         namer = namer,
     )


@@ -5,6 +5,10 @@
 from re import compile
 from ..scraper import make_scraper

+_imageSearch = compile(r'<img src=http://\w+\.snafu-comics\.com/(comics/\d{6}_\w*\.\w{3,4})')
+_prevSearch = compile(r'<a href="(\?comic_id=\d+)">Previous</a>')
+
+
 def add(name, host):
     baseUrl = 'http://%s.snafu-comics.com/' % host
     classname = 'SnafuComics_%s' % name

@@ -13,8 +17,8 @@ def add(name, host):
         name='SnafuComics/%s' % name,
         latestUrl = baseUrl,
         stripUrl = baseUrl + 'index.php?strip_id=%s',
-        imageSearch = compile(r'<img src=http://\w+\.snafu-comics\.com/(comics/\d{6}_\w*\.\w{3,4})'),
-        prevSearch = compile(r'<a href="(\?comic_id=\d+)">Previous</a>'),
+        imageSearch = _imageSearch,
+        prevSearch = _prevSearch,
         help = 'Index format: n (unpadded)',
     )

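Several plugin modules in this commit share the same refactoring: search patterns move from per-call compile() expressions inside add() to module-level constants, so every generated scraper class shares one compiled regex. A minimal sketch of the difference (the plugin names are illustrative):

    from re import compile

    # Compiled once at import time, shared by all classes this module generates.
    _prevSearch = compile(r'<a href="(\?comic_id=\d+)">Previous</a>')

    def add(name):
        # Each add() call reuses the same pattern object instead of rebuilding it.
        return {'name': name, 'prevSearch': _prevSearch}

    a = add('Grim')
    b = add('Tin')
    assert a['prevSearch'] is b['prevSearch']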

@@ -6,7 +6,7 @@ from re import compile

 from ..scraper import _BasicScraper
 from ..helpers import bounceStarter, indirectStarter
-from ..util import getQueryParams, tagre
+from ..util import getQueryParams


 class Undertow(_BasicScraper):


@@ -1,54 +0,0 @@ (file removed)
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012 Bastian Kleineidam
"""
The Universal comics only have some samples, but those samples are always the newest ones.
"""
import datetime
from re import compile, escape
from ..scraper import make_scraper
from ..util import tagre, asciify, getPageContent


def parse_strdate(strdate):
    """Parse date string. XXX this is locale dependant but it should not be."""
    return datetime.datetime.strptime(strdate, "%A, %B %d, %Y")


def add(name, category):
    shortname = name.replace(' ', '').lower()
    latestUrl = 'http://www.universaluclick.com/comics/%s/%s' % (category, shortname)
    classname = 'UClick_%s' % asciify(name)

    @classmethod
    def namer(cls, imageUrl, pageUrl):
        """Parse publish date from page content which looks like:
        <img alt="Marmaduke" src="http://assets.amuniversal.com/07e7f270fa08012ff506001dd8b71c47" />
        <h4>published: Sunday, November 11, 2012</h4>
        """
        data = getPageContent(pageUrl)[0]
        ro = compile(tagre("img", "src", escape(imageUrl)) + r'\s+<h4>published: ([^<]+)')
        mo = ro.search(data)
        if mo:
            strdate = mo.group(1)
            return parse_strdate(strdate).strftime("%Y%m%d")

    globals()[classname] = make_scraper(classname,
        name='UClick/' + name,
        latestUrl = latestUrl,
        stripUrl = latestUrl + '%s/',
        imageSearch = compile(tagre("img", "src", r'(http://assets\.amuniversal\.com/[^"]+)') + r'\s+<h4>published'),
        multipleImagesPerStrip = True,
        prevSearch = None,
        help = 'Index format: none',
        namer = namer,
    )


# http://www.universaluclick.com/comics/list
comics = {
    '9 Chickweed Lane': 'strip',
}

for name, category in comics.items():
    add(name, category)

dosagelib/plugins/universal.py (new file, 150 lines)

@@ -0,0 +1,150 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012 Bastian Kleineidam
"""
The Universal comics only have some samples, but those samples are always the newest ones.
"""
import datetime
from re import compile, escape
from ..scraper import make_scraper
from ..util import tagre, getPageContent


def parse_strdate(strdate):
    """Parse date string. XXX this is locale dependant but it should not be."""
    return datetime.datetime.strptime(strdate, "%A, %B %d, %Y")


_imageSearch = compile(tagre("img", "src", r'(http://assets\.amuniversal\.com/[^"]+)') + r'\s+<h4>published')

def add(name, shortname):
    latestUrl = 'http://www.universaluclick.com%s' % shortname
    classname = 'UClick_%s' % name

    @classmethod
    def namer(cls, imageUrl, pageUrl):
        """Parse publish date from page content which looks like:
        <img alt="Marmaduke" src="http://assets.amuniversal.com/07e7f270fa08012ff506001dd8b71c47" />
        <h4>published: Sunday, November 11, 2012</h4>
        """
        data = getPageContent(pageUrl)[0]
        ro = compile(tagre("img", "src", escape(imageUrl)) + r'\s+<h4>published: ([^<]+)')
        mo = ro.search(data)
        if mo:
            strdate = mo.group(1)
            return parse_strdate(strdate).strftime("%Y%m%d")

    globals()[classname] = make_scraper(classname,
        name='UClick/' + name,
        latestUrl = latestUrl,
        stripUrl = latestUrl + '%s/',
        imageSearch = _imageSearch,
        multipleImagesPerStrip = True,
        prevSearch = None,
        help = 'Index format: none',
        namer = namer,
    )

# do not edit anything below since these entries are generated from scripts/update.sh
# DO NOT REMOVE
#add('9ChickweedLane', '/comics/strip/9chickweedlane')
#add('AdamAtHome', '/comics/strip/adamathome')
#add('AlleyOop', '/comics/strip/alley-oop')
#add('ArloandJanis', '/comics/strip/arloandjanis')
#add('BadReporter', '/comics/badreporter')
#add('Baldo', '/comics/strip/baldo')
#add('Betty', '/comics/strip/betty')
#add('BigNate', '/comics/strip/bignate')
#add('Biographic', '/comics/strip/biographic')
add('Brevitystrip', '/comics/strip/brevity')
add('BusinessAndFinance', '/comics/category/business%20%26%20finance')
#add('CalvinandHobbes', '/comics/strip/calvinandhobbes')
#add('Cathy', '/comics/strip/cathy')
#add('Cleats', '/comics/strip/cleats')
#add('ClosetoHome', '/comics/panel/closetohome')
add('ComicPanel', '/comics/panel')
add('ComicStrip', '/comics/strip')
add('ComicsAZ', '/comics/list')
#add('Cornered', '/comics/panel/cornered')
#add('CowandBoy', '/comics/strip/cowandboy')
#add('CuldeSac', '/comics/strip/culdesac')
#add('Dilbert', '/comics/strip/dilbert')
#add('Doonesbury', '/comics/strip/doonesbury')
#add('Drabble', '/comics/strip/drabble')
add('Espaol', '/comics/category/espanol')
#add('FMinus', '/comics/strip/fminus')
add('Family', '/comics/category/family')
#add('ForBetterorForWorse', '/comics/strip/forbetterorforworse')
add('ForKids', '/comics/category/for%20kids')
#add('FoxTrot', '/comics/strip/foxtrot')
#add('FrankAndErnest', '/comics/strip/frankandernest')
#add('Frazz', '/comics/strip/frazz')
#add('FredBasset', '/comics/strip/fredbasset')
#add('FreshlySqueezed', '/comics/strip/freshlysqueezed')
#add('Garfield', '/comics/strip/garfield')
#add('GetFuzzy', '/comics/strip/getfuzzy')
#add('GingerMeggs', '/comics/strip/gingermeggs')
#add('Graffiti', '/comics/panel/graffiti')
#add('GrandAvenue', '/comics/strip/grand-avenue')
#add('HealthCapsules', '/comics/panel/healthcapsules')
#add('HeartoftheCity', '/comics/strip/heartofthecity')
#add('Herman', '/comics/panel/herman')
#add('InkPen', '/comics/strip/inkpen')
#add('IntheBleachers', '/comics/panel/inthebleachers')
#add('IntheSticks', '/comics/strip/inthesticks')
add('JamesBond', '/comics/strip/jamesbond')
#add('JumpStart', '/comics/strip/jumpstart')
#add('KidCity', '/comics/strip/kidcity')
#add('KidSpot', '/comics/panel/kidspot')
#add('KitNCarlyle', '/comics/panel/kitncarlyle')
#add('LaCucaracha', '/comics/strip/lacucaracha')
#add('Lio', '/comics/strip/lio')
#add('Lola', '/comics/strip/lola')
#add('Luann', '/comics/strip/luann')
#add('MagicinaMinute', '/comics/strip/magicinaminute')
#add('Marmaduke', '/comics/panel/marmaduke')
add('Men', '/comics/category/men')
#add('ModeratelyConfused', '/comics/panel/moderately-confused')
#add('Monty', '/comics/strip/monty')
#add('MrGigiandtheSquid', '/comics/strip/mr-gigi-and-the-squid')
#add('MuttAndJeff', '/comics/strip/muttandjeff')
add('NEA', '/comics/category/nea')
#add('Nancy', '/comics/strip/nancy')
#add('NonSequitur', '/comics/strip/nonsequitur')
add('NonSequiturPanel', '/comics/panel/non-sequitur-panel')
#add('OfftheMark', '/comics/panel/offthemark')
#add('Overboard', '/comics/strip/overboard')
#add('OvertheHedge', '/comics/strip/overthehedge')
#add('Peanuts', '/comics/strip/peanuts')
#add('PearlsBeforeSwine', '/comics/strip/pearlsbeforeswine')
add('Pets', '/comics/category/pets')
#add('PoochCafe', '/comics/strip/poochcafe')
#add('PricklyCity', '/comics/strip/pricklycity')
#add('RealLifeAdventures', '/comics/panel/reallifeadventures')
#add('RealityCheck', '/comics/panel/realitycheck')
#add('RedandRover', '/comics/strip/redandrover')
#add('RipHaywire', '/comics/strip/riphaywire')
#add('RipleysBelieveItorNot', '/comics/panel/ripleysbelieveitornot')
#add('RoseisRose', '/comics/strip/roseisrose')
#add('RudyPark', '/comics/strip/rudypark')
#add('Shortcuts', '/comics/strip/shortcuts')
#add('SouptoNutz', '/comics/strip/soup-to-nutz')
#add('StoneSoup', '/comics/strip/stonesoup')
add('SundayOnly', '/comics/category/sunday%20only')
#add('TankMcNamara', '/comics/strip/tankmcnamara')
#add('Tarzan', '/comics/strip/tarzan')
#add('Thatababy', '/comics/strip/thatababy')
#add('TheArgyleSweater', '/comics/panel/theargylesweater')
#add('TheBornLoser', '/comics/strip/the-born-loser')
#add('TheBuckets', '/comics/strip/thebuckets')
#add('TheDinetteSet', '/comics/panel/dinetteset')
#add('TheDuplex', '/comics/strip/duplex')
#add('TheElderberries', '/comics/strip/theelderberries')
#add('TheFlyingMcCoys', '/comics/panel/theflyingmccoys')
#add('TheFuscoBrothers', '/comics/strip/thefuscobrothers')
#add('TheGrizzwells', '/comics/strip/thegrizzwells')
#add('TheKnightLife', '/comics/strip/theknightlife')
#add('TomtheDancingBug', '/comics/strip/tomthedancingbug')
#add('UncleArtsFunland', '/comics/strip/uncleartsfunland')
add('WebExclusive', '/comics/category/web%20exclusive')
add('Women', '/comics/category/women')
#add('Ziggy', '/comics/panel/ziggy')
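The namer turns each strip's "published:" line into a yyyymmdd filename stamp; a quick check of the conversion, reusing parse_strdate from above with the date string from the docstring example:

    import datetime

    def parse_strdate(strdate):
        """Parse date string. XXX this is locale dependant but it should not be."""
        return datetime.datetime.strptime(strdate, "%A, %B %d, %Y")

    print(parse_strdate("Sunday, November 11, 2012").strftime("%Y%m%d"))  # -> 20121111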

@@ -33,6 +33,7 @@ class Adventure(_VGCats):

 class ViiviJaWagner(_BasicScraper):
     latestUrl = 'http://www.hs.fi/viivijawagner/'
-    imageSearch = compile(tagre("link", "href", r'(http://hs12\.snstatic\.fi/webkuva/oletus/[^"]+)', before="image_src"))
-    prevSearch = compile(tagre("a", "href", r'(/viivijawagner/\d+)', before="prev-cm"))
+    stripUrl = None
+    imageSearch = compile(tagre("link", "href", r'(http://hs\d+\.snstatic\.fi/webkuva/oletus/[^"]+)', before="image_src"))
+    prevSearch = compile(tagre("a", "href", r'(/viivijawagner/[^"]+)', before="prev-cm"))
     help = 'Index format: none'


@@ -5,6 +5,9 @@
 from re import compile, IGNORECASE, DOTALL
 from ..scraper import make_scraper

+_imageSearch = compile(r'<a name="strip\d*?">.*?<img[^>]+?src="([^"]*?memberimages/.+?)"', IGNORECASE + DOTALL)
+_prevSearch = compile(r'href="([^"]*?whichbutton=prev[^"]*?)"', IGNORECASE)
+
 def add(name, subpath):
     baseUrl = 'http://www.webcomicsnation.com/'

@@ -13,8 +16,8 @@ def add(name, subpath):
         name = 'WebcomicsNation/' + name,
         latestUrl = baseUrl + subpath,
         stripUrl = baseUrl + '?view=archive&chapter=%s',
-        imageSearch = compile(r'<a name="strip\d*?">.*?<img[^>]+?src="([^"]*?memberimages/.+?)"', IGNORECASE + DOTALL),
-        prevSearch = compile(r'href="([^"]*?whichbutton=prev[^"]*?)"', IGNORECASE),
+        imageSearch = _imageSearch,
+        prevSearch = _prevSearch,
         help = 'Index format: nnnn (non-contiguous)',
     )


@@ -7,6 +7,11 @@ from ..scraper import make_scraper
 from ..helpers import bounceStarter


+_imageSearch = compile(r'SRC="(http://www\.wlpcomics\.com/adult/.+?|http://www\.wlpcomics\.com/general/.+?)"', IGNORECASE)
+_prevSearch = compile(r'</a> <A HREF="(\w+.html)">Previous Page</a>', IGNORECASE)
+_nextSearch = compile(r'</a> <A HREF="(\w+.html)">Next Page</a>', IGNORECASE)
+
+
 def add(name, path):
     baseUrl = 'http://www.wlpcomics.com/' + path
     classname = 'WLP_' + name

@@ -17,10 +22,10 @@ def add(name, path):

     globals()[classname] = make_scraper(classname,
         name = 'WLP/' + name,
-        starter = bounceStarter(baseUrl, compile(r'</a> <A HREF="(\w+.html)">Next Page</a>', IGNORECASE)),
+        starter = bounceStarter(baseUrl, _nextSearch),
         stripUrl = baseUrl + '%s.html',
-        imageSearch = compile(r'SRC="(http://www.wlpcomics.com/adult/.+?|http://www.wlpcomics.com/general/.+?)"', IGNORECASE),
-        prevSearch = compile(r'</a> <A HREF="(\w+.html)">Previous Page</a>', IGNORECASE),
+        imageSearch = _imageSearch,
+        prevSearch = _prevSearch,
         namer = namer,
         help = 'Index format: nnn',
     )


@@ -145,13 +145,13 @@ def get_scrapers():
     """
     global _scrapers
     if _scrapers is None:
-        out.write("Loading comic modules...")
+        out.write("Loading comic modules...", 2)
         modules = loader.get_modules()
         plugins = loader.get_plugins(modules, _BasicScraper)
        _scrapers = list(plugins)
         _scrapers.sort(key=lambda s: s.get_name())
         check_scrapers()
-        out.write("... %d modules loaded." % len(_scrapers))
+        out.write("... %d modules loaded." % len(_scrapers), 2)
     return _scrapers

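Both messages now carry an explicit output level, so module loading is silent at default verbosity. A minimal sketch of the gating this assumes (the Output class is a stand-in for dosagelib's out object, whose write() is assumed to take an optional level argument):

    class Output(object):
        def __init__(self, verbosity=0):
            self.verbosity = verbosity

        def write(self, text, level=0):
            # only print messages at or below the configured verbosity
            if level <= self.verbosity:
                print(text)

    out = Output(verbosity=0)
    out.write("Loading comic modules...", 2)  # silent unless verbosity is raised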

@@ -3,7 +3,7 @@
 # Copyright (C) 2012 Bastian Kleineidam
 from __future__ import division, print_function

-import urllib2, urlparse
+import urllib, urllib2, urlparse
 import requests
 import sys
 import os

@@ -43,15 +43,19 @@ def tagre(tag, attribute, value, quote='"', before="", after=""):
     @return: the generated regular expression suitable for re.compile()
     @rtype: string
     """
+    if before:
+        prefix = r"[^>]*%s[^>]*\s+" % before
+    else:
+        prefix = r"(?:[^>]*\s+)?"
     attrs = dict(
         tag=case_insensitive_re(tag),
         attribute=case_insensitive_re(attribute),
         value=value,
         quote=quote,
-        before=before,
+        prefix=prefix,
         after=after,
     )
-    return r'<\s*%(tag)s\s+(?:[^>]*%(before)s[^>]*\s+)?%(attribute)s\s*=\s*%(quote)s%(value)s%(quote)s[^>]*%(after)s[^>]*>' % attrs
+    return r'<\s*%(tag)s\s+%(prefix)s%(attribute)s\s*=\s*%(quote)s%(value)s%(quote)s[^>]*%(after)s[^>]*>' % attrs


 def case_insensitive_re(name):

@@ -122,7 +126,7 @@ def fetchUrls(url, imageSearch, prevSearch=None):
         return imageUrls, None


-def _unescape(text):
+def unescape(text):
     """
     Replace HTML entities and character references.
     """

@@ -156,7 +160,7 @@ def normaliseURL(url):
     HTML entities and character references.
     """
     # XXX: brutal hack
-    url = _unescape(url)
+    url = unescape(url)

     pu = list(urlparse.urlparse(url))
     segments = pu[2].split('/')

@@ -321,3 +325,9 @@ def strtimezone():
 def asciify(name):
     """Remove non-ascii characters from string."""
     return re.sub("[^0-9a-zA-Z_]", "", name)
+
+
+def unquote(text):
+    while '%' in text:
+        text = urllib.unquote(text)
+    return text
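The tagre() rewrite changes the meaning of the before argument: previously the before text was always optional in the generated pattern, now it must actually occur in the tag when given. A small self-contained sketch of the new behavior (case_insensitive_re is assumed to build [aA]-style character classes, consistent with its use here; the HTML snippet is a made-up example):

    import re

    def case_insensitive_re(name):
        return "".join("[%s%s]" % (c.lower(), c.upper()) for c in name)

    def tagre(tag, attribute, value, quote='"', before="", after=""):
        if before:
            prefix = r"[^>]*%s[^>]*\s+" % before   # 'before' must be present
        else:
            prefix = r"(?:[^>]*\s+)?"              # other attributes optional
        return r'<\s*%s\s+%s%s\s*=\s*%s%s%s[^>]*%s[^>]*>' % (
            case_insensitive_re(tag), prefix, case_insensitive_re(attribute),
            quote, value, quote, after)

    html = '<a class="prev-cm" href="/viivijawagner/123">'
    mo = re.search(tagre("a", "href", r'(/viivijawagner/[^"]+)', before="prev-cm"), html)
    print(mo.group(1))  # -> /viivijawagner/123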

scripts/creators.json (new file, 1 line)

@@ -0,0 +1 @@
{"Agnes": "/comics/agnes", "AndyCapp": "/comics/andy-capp", "Archie": "/comics/archie", "ArchieinSpanish": "/comics/archie-spanish", "AskShagg": "/comics/ask-shagg", "BC": "/comics/bc", "BCinSpanish": "/comics/bc-spanish", "BallardStreet": "/comics/ballard-street", "CafeconLeche": "/comics/cafe-con-leche", "ChuckleBros": "/comics/chuckle-bros", "DaddysHome": "/comics/daddys-home", "DiamondLil": "/comics/diamond-lil", "DogEatDoug": "/comics/dog-eat-doug", "DogsofCKennel": "/comics/dogs-of-c-kennel", "DonaldDuck": "/comics/donald-duck", "Flare": "/comics/flare", "FlightDeck": "/comics/flight-deck", "FloandFriends": "/comics/flo-and-friends", "ForHeavensSake": "/comics/for-heavens-sake", "FreeRange": "/comics/free-range", "GirlsAndSports": "/comics/girls-and-sports", "GirlsandSportsinSpanish": "/comics/girls-and-sports-spanish", "Heathcliff": "/comics/heathcliff", "HeathcliffinSpanish": "/comics/heathcliff-spanish", "HerbandJamaal": "/comics/herb-and-jamaal", "HomeOffice": "/comics/stay-at-home-dad", "HopeAndDeath": "/comics/hope-and-death", "LibertyMeadows": "/comics/liberty-meadows", "LongStoryShort": "/comics/long-story-short", "MickeyMouse": "/comics/mickey-mouse", "Momma": "/comics/momma", "NestHeads": "/comics/nest-heads", "OffCenter": "/comics/off-center", "OnaClaireDay": "/comics/on-a-claire-day", "OneBigHappy": "/comics/one-big-happy", "Recess": "/comics/recess", "Rubes": "/comics/rubes", "Rugrats": "/comics/rugrats", "RugratsinSpanish": "/comics/rugrats-spanish", "ScaryGary": "/comics/scary-gary", "SpeedBump": "/comics/speed-bump", "StrangeBrew": "/comics/strange-brew", "TheBarn": "/comics/the-barn", "TheDinetteSet": "/comics/dinette-set", "TheMeaningofLila": "/comics/meaning-of-lila", "TheOtherCoast": "/comics/the-other-coast", "TheQuigmans": "/comics/the-quigmans", "TheWizardofIdinSpanish": "/comics/wizard-of-id-spanish", "ThinLines": "/comics/thin-lines", "WeePals": "/comics/wee-pals", "WizardofId": "/comics/wizard-of-id", "WorkingitOut": "/comics/working-it-out", "ZackHill": "/comics/zack-hill"}
scripts/creators.py (new executable file, 71 lines)

@@ -0,0 +1,71 @@
#!/usr/bin/env python
# Copyright (C) 2012 Bastian Kleineidam
"""
Script to get creators comics and save the info in a JSON file for further processing.
"""
from __future__ import print_function
import re
import sys
import os
import json
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
from dosagelib.util import getPageContent, asciify, unescape, tagre

json_file = __file__.replace(".py", ".json")

# <a href="/comics/agnes.html"><strong>Agnes</strong></a>
url_matcher = re.compile(tagre("a", "href", r'(/comics/[^/]+)\.html') + r'<strong>([^<]+)</strong>')

def contains_case_insensitive(adict, akey):
    for key in adict:
        if key.lower() == akey.lower():
            return True
    return False


def handle_url(url, res):
    """Parse one search result page."""
    print("Parsing", url, file=sys.stderr)
    try:
        data, baseUrl = getPageContent(url)
    except IOError as msg:
        print("ERROR:", msg, file=sys.stderr)
        return
    for match in url_matcher.finditer(data):
        url = match.group(1)
        name = unescape(match.group(2))
        name = asciify(name.replace('&', 'And').replace('@', 'At'))
        if contains_case_insensitive(res, name):
            # we cannot handle two comics that only differ in case
            print("WARN: skipping possible duplicate", name, file=sys.stderr)
            continue
        res[name] = url


def save_result(res):
    """Save result to file."""
    with open(json_file, 'wb') as f:
        json.dump(res, f, sort_keys=True)


def get_results():
    """Parse all search result pages."""
    # store info in a dictionary {name -> shortname}
    res = {}
    handle_url('http://www.creators.com/comics/cat-seeall.html', res)
    save_result(res)


def print_results(args):
    """Print comics."""
    with open(json_file, "rb") as f:
        comics = json.load(f)
    for name, url in sorted(comics.items()):
        print("add(%r, %r)" % (str(name), str(url)))


if __name__ == '__main__':
    if len(sys.argv) > 1:
        print_results(sys.argv[1:])
    else:
        get_results()
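Each of these scripts follows the same two-phase pattern: run without arguments to scrape and cache the listing as JSON, run with an argument to print ready-made add() lines for the plugin module. A miniature of the second phase (the dict stands in for the cached scripts/creators.json):

    # Hypothetical miniature of the JSON produced by get_results().
    comics = {"Agnes": "/comics/agnes", "AndyCapp": "/comics/andy-capp"}
    for name, url in sorted(comics.items()):
        print("add(%r, %r)" % (str(name), str(url)))
    # prints:
    #   add('Agnes', '/comics/agnes')
    #   add('AndyCapp', '/comics/andy-capp')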
scripts/drunkduck.json (new file, 1 line)
(File diff suppressed because one or more lines are too long.)
scripts/drunkduck.py (new executable file, 82 lines)

@@ -0,0 +1,82 @@
#!/usr/bin/env python
# Copyright (C) 2012 Bastian Kleineidam
"""
Script to get drunkduck comics and save the info in a JSON file for further processing.
"""
from __future__ import print_function
import re
import sys
import os
import json
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
from dosagelib.util import tagre, getPageContent

json_file = __file__.replace(".py", ".json")

def contains_case_insensitive(adict, akey):
    for key in adict:
        if key.lower() == akey.lower():
            return True
    return False


def handle_url(url, url_matcher, num_matcher, res):
    """Parse one search result page."""
    try:
        data, baseUrl = getPageContent(url)
    except IOError as msg:
        print("ERROR:", msg, file=sys.stderr)
        return
    for match in url_matcher.finditer(data):
        comicurl = match.group(1)
        name = comicurl[:-1].rsplit('/')[-1]
        if contains_case_insensitive(res, name):
            # we cannot handle two comics that only differ in case
            print("WARN: skipping possible duplicate", name, file=sys.stderr)
            continue
        # find out how many images this comic has
        end = match.end(1)
        mo = num_matcher.search(data[end:])
        if not mo:
            print("ERROR:", repr(data[end:end+300]), file=sys.stderr)
            continue
        num = int(mo.group(1))
        res[name] = num


def save_result(res):
    """Save result to file."""
    with open(json_file, 'wb') as f:
        json.dump(res, f, sort_keys=True)


def get_results():
    """Parse all search result pages."""
    base = "http://www.drunkduck.com/search/?page=%d&search=&type=0&type=1&last_update="
    href = re.compile(tagre("a", "href", r'(/[^"]+/)', before="size24 yanone blue"))
    num = re.compile(r'(\d+) pages?</span>')
    # store info in a dictionary {name -> number of comics}
    res = {}
    # a search for an empty string returned 825 result pages
    result_pages = 825
    print("Parsing", result_pages, "search result pages...", file=sys.stderr)
    for i in range(1, result_pages + 1):
        print(i, file=sys.stderr, end=" ")
        handle_url(base % i, href, num, res)
    save_result(res)


def print_results(min_strips):
    """Print all comics that have at least the given number of minimum comic strips."""
    with open(json_file, "rb") as f:
        comics = json.load(f)
    for name, num in sorted(comics.items()):
        if num >= min_strips:
            print("add('%s')" % name)


if __name__ == '__main__':
    if len(sys.argv) > 1:
        print_results(int(sys.argv[1]))
    else:
        get_results()
scripts/generate_json.sh (new executable file, 9 lines)

@@ -0,0 +1,9 @@
#!/bin/sh -e
set -u

d=$(dirname $0)
for script in creators gocomics drunkduck universal keenspot; do
    echo "Executing ${script}.py"
    "${d}/${script}.py"
done
scripts/gocomics.json (new file, 1 line)
(File diff suppressed because one or more lines are too long.)
scripts/gocomics.py (new executable file, 87 lines)

@@ -0,0 +1,87 @@
#!/usr/bin/env python
# Copyright (C) 2012 Bastian Kleineidam
"""
Script to get gocomics and save the info in a JSON file for further processing.
"""
from __future__ import print_function
import re
import sys
import os
import json
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
from dosagelib.util import tagre, getPageContent, asciify, unescape
from dosagelib.scraper import get_scrapers

json_file = __file__.replace(".py", ".json")

#<a href="/shortname" class="alpha_list updated">name</a>
url_matcher = re.compile(tagre("a", "href", r'(/[^"]+)', after="alpha_list") + r"([^<]+)</a>")

def contains_case_insensitive(adict, akey):
    for key in adict:
        if key.lower() == akey.lower():
            return True
    return False


def handle_url(url, res):
    """Parse one search result page."""
    print("Parsing", url, file=sys.stderr)
    try:
        data, baseUrl = getPageContent(url)
    except IOError as msg:
        print("ERROR:", msg, file=sys.stderr)
        return
    for match in url_matcher.finditer(data):
        shortname = match.group(1)
        name = unescape(match.group(2))
        name = asciify(name.replace('&', 'And').replace('@', 'At'))
        if contains_case_insensitive(res, name):
            # we cannot handle two comics that only differ in case
            print("WARN: skipping possible duplicate", name, file=sys.stderr)
            continue
        res[name] = shortname


def save_result(res):
    """Save result to file."""
    with open(json_file, 'wb') as f:
        json.dump(res, f, sort_keys=True)


def get_results():
    """Parse all search result pages."""
    # store info in a dictionary {name -> shortname}
    res = {}
    handle_url('http://www.gocomics.com/features', res)
    handle_url('http://www.gocomics.com/explore/editorial_list', res)
    handle_url('http://www.gocomics.com/explore/sherpa_list', res)
    save_result(res)


def has_creators_comic(name):
    cname = "Creators/%s" % name
    for scraperclass in get_scrapers():
        lname = scraperclass.get_name().lower()
        if lname == cname.lower():
            return True
    return False


def print_results(args):
    """Print all comics that have at least the given number of minimum comic strips."""
    with open(json_file, "rb") as f:
        comics = json.load(f)
    for name, shortname in sorted(comics.items()):
        if has_creators_comic(name):
            prefix = '#'
        else:
            prefix = ''
        print("%sadd(%r, %r)" % (prefix, str(name), str(shortname)))


if __name__ == '__main__':
    if len(sys.argv) > 1:
        print_results(sys.argv[1:])
    else:
        get_results()
scripts/keenspot.json (new file, 1 line)
(File diff suppressed because one or more lines are too long.)
scripts/keenspot.py (new executable file, 100 lines)

@@ -0,0 +1,100 @@
#!/usr/bin/env python
# Copyright (C) 2012 Bastian Kleineidam
"""
Script to get keenspot comics and save the info in a JSON file for further processing.
"""
from __future__ import print_function
import re
import sys
import os
import json
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
from dosagelib.util import getPageContent, asciify, unescape, tagre
from dosagelib.scraper import get_scrapers

json_file = __file__.replace(".py", ".json")

# <div class="comictitle"><strong><a target="_blank" onclick="pageTrackerCG._link('http://collegepros.comicgenesis.com'); return false;" href="http://collegepros.comicgenesis.com">Adventures of the College Pros</a>
url_matcher = re.compile(r'<div class="comictitle"><strong>' + tagre("a", "href", r'(http://[^"]+)') + r'([^<]+)</a>')
num_matcher = re.compile(r'Number of Days: (\d+)')

def contains_case_insensitive(adict, akey):
    for key in adict:
        if key.lower() == akey.lower():
            return True
    return False


def handle_url(url, res):
    """Parse one search result page."""
    print("Parsing", url, file=sys.stderr)
    try:
        data, baseUrl = getPageContent(url)
    except IOError as msg:
        print("ERROR:", msg, file=sys.stderr)
        return
    for match in url_matcher.finditer(data):
        url = match.group(1) + '/'
        name = unescape(match.group(2))
        name = asciify(name.replace('&', 'And').replace('@', 'At'))
        if contains_case_insensitive(res, name):
            # we cannot handle two comics that only differ in case
            print("WARN: skipping possible duplicate", name, file=sys.stderr)
            continue
        # find out how many images this comic has
        end = match.end()
        mo = num_matcher.search(data[end:])
        if not mo:
            print("ERROR:", repr(data[end:end+300]), file=sys.stderr)
            continue
        num = int(mo.group(1))
        res[name] = (url, num)


def save_result(res):
    """Save result to file."""
    with open(json_file, 'wb') as f:
        json.dump(res, f, sort_keys=True)


def get_results():
    """Parse all search result pages."""
    # store info in a dictionary {name -> shortname}
    res = {}
    base = 'http://guide.comicgenesis.com/Keenspace_%s.html'
    for c in '0ABCDEFGHIJKLMNOPQRSTUVWXYZ':
        handle_url(base % c, res)
    save_result(res)


def has_comic(name):
    cname = ("Creators/%s" % name).lower()
    gname = ("GoComics/%s" % name).lower()
    for scraperclass in get_scrapers():
        lname = scraperclass.get_name().lower()
        if lname == cname or lname == gname:
            return True
    return False


def print_results(args):
    """Print all comics that have at least the given number of minimum comic strips."""
    min_comics = int(args[0])
    with open(json_file, "rb") as f:
        comics = json.load(f)
    for name, entry in sorted(comics.items()):
        url, num = entry
        if num < min_comics:
            continue
        if has_comic(name):
            prefix = '#'
        else:
            prefix = ''
        print("%sadd(%r, %r)" % (prefix, str(name), str(url)))


if __name__ == '__main__':
    if len(sys.argv) > 1:
        print_results(sys.argv[1:])
    else:
        get_results()
scripts/mktestpage.py (new executable file, 93 lines)

@@ -0,0 +1,93 @@
#!/usr/bin/env python
from __future__ import print_function
import sys
import os
import stat
import time
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
from dosagelib.scraper import get_scrapers

htmltemplate = """
<!DOCTYPE html>
<head>
<meta charset="utf-8">
<title>Dosage test results</title>
<meta name="description" content="">
<meta name="viewport" content="width=device-width">
<link rel="stylesheet" href="css/normalize.css">
<link rel="stylesheet" href="css/main.css">
</head>
<body>
<p>Dosage test results from %(date)s</p>
<ul>
%(content)s
</ul>
</body>
</html>
"""


def get_mtime(filename):
    """Return modification time of filename."""
    return os.stat(filename)[stat.ST_MTIME]


def strdate(t):
    return time.strftime("%d.%m.%Y", time.localtime(t))


def get_test_name(line):
    classname = line.split('::')[1][4:]
    for scraper in get_scrapers():
        if scraper.__name__ == classname:
            try:
                url = scraper.starter()
            except Exception:
                url = None
            return scraper.get_name(), url
    raise ValueError("Scraper %r not found" % classname)


def get_test(line):
    name, url = get_test_name(line)
    if line.startswith(". "):
        name += " OK"
    else:
        name += " FAILED"
    return name, url


def get_content(filename):
    tests = []
    with open(filename, "r") as f:
        print("Tests parsed: 0", end=" ", file=sys.stderr)
        num_tests = 0
        for line in f:
            if line.startswith((". ", "F ")) and "test_comics" in line:
                num_tests += 1
                tests.append(get_test(line))
                if num_tests % 5 == 0:
                    print(num_tests, end=" ", file=sys.stderr)
    tests.sort()
    res = []
    for name, url in tests:
        css = name.split()[-1].lower()
        if url:
            inner = '<a href="%s" class="%s">%s</a>' % (url, css, name)
        else:
            inner = '<span class="%s">%s</span>' % (css, name)
        res.append('  <li>%s</li>' % inner)
    return os.linesep.join(res)


def main(args):
    filename = "testresults.txt"
    modified = get_mtime(filename)
    content = get_content(filename)
    attrs = {"date": strdate(modified), "content": content}
    print(htmltemplate % attrs)
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv[1:]))
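get_test_name() recovers the scraper class from a pytest result line by taking the second ::-separated field and dropping the leading "Test"; a quick sketch on a made-up result line:

    # Hypothetical line as written into testresults.txt by the test run.
    line = ". tests/test_comics.py::TestCreators_Agnes::test_comic"
    classname = line.split('::')[1][4:]  # second field, minus the "Test" prefix
    print(classname)  # -> Creators_Agnes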
scripts/mktestscript.sh (new executable file, 11 lines)

@@ -0,0 +1,11 @@
#!/bin/sh -e
set -u
# generates a convenience test script from failed tests

script=test.sh

rm -f "$script"
echo "#!/bin/sh -e" > "$script"
egrep -v "^\. " testresults.txt | egrep "^F " | cut -b "3-" | awk '{ print "make test TESTS=" $0; }' >> "$script"
chmod 755 "$script"
scripts/removeafter.py (new executable file, 17 lines)

@@ -0,0 +1,17 @@
#!/usr/bin/env python
"""Remove all lines after a given marker line.
"""
from __future__ import print_function
import fileinput
import sys

def main(args):
    filename = args[0]
    marker = args[1]
    for line in fileinput.input(filename, inplace=1):
        print(line.rstrip())
        if line.startswith(marker):
            break

if __name__ == '__main__':
    main(sys.argv[1:])
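With inplace=1, fileinput redirects stdout into the edited file, so breaking out of the loop right after the marker truncates everything below it. A sketch of the intended use on a generated plugin (the path is an example):

    from __future__ import print_function
    import fileinput

    def remove_after(filename, marker):
        """Keep everything up to and including the first marker line."""
        for line in fileinput.input(filename, inplace=1):
            print(line.rstrip())
            if line.startswith(marker):
                break  # remaining lines are never written back

    remove_after("dosagelib/plugins/creators.py", "# DO NOT REMOVE")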
scripts/universal.json (new file, 1 line)

@@ -0,0 +1 @@
{"9ChickweedLane": "/comics/strip/9chickweedlane", "AdamAtHome": "/comics/strip/adamathome", "AlleyOop": "/comics/strip/alley-oop", "ArloandJanis": "/comics/strip/arloandjanis", "BadReporter": "/comics/badreporter", "Baldo": "/comics/strip/baldo", "Betty": "/comics/strip/betty", "BigNate": "/comics/strip/bignate", "Biographic": "/comics/strip/biographic", "Brevitystrip": "/comics/strip/brevity", "BusinessAndFinance": "/comics/category/business%20%26%20finance", "CalvinandHobbes": "/comics/strip/calvinandhobbes", "Cathy": "/comics/strip/cathy", "Cleats": "/comics/strip/cleats", "ClosetoHome": "/comics/panel/closetohome", "ComicPanel": "/comics/panel", "ComicStrip": "/comics/strip", "ComicsAZ": "/comics/list", "Cornered": "/comics/panel/cornered", "CowandBoy": "/comics/strip/cowandboy", "CuldeSac": "/comics/strip/culdesac", "Dilbert": "/comics/strip/dilbert", "Doonesbury": "/comics/strip/doonesbury", "Drabble": "/comics/strip/drabble", "Espaol": "/comics/category/espanol", "FMinus": "/comics/strip/fminus", "Family": "/comics/category/family", "ForBetterorForWorse": "/comics/strip/forbetterorforworse", "ForKids": "/comics/category/for%20kids", "FoxTrot": "/comics/strip/foxtrot", "FrankAndErnest": "/comics/strip/frankandernest", "Frazz": "/comics/strip/frazz", "FredBasset": "/comics/strip/fredbasset", "FreshlySqueezed": "/comics/strip/freshlysqueezed", "Garfield": "/comics/strip/garfield", "GetFuzzy": "/comics/strip/getfuzzy", "GingerMeggs": "/comics/strip/gingermeggs", "Graffiti": "/comics/panel/graffiti", "GrandAvenue": "/comics/strip/grand-avenue", "HealthCapsules": "/comics/panel/healthcapsules", "HeartoftheCity": "/comics/strip/heartofthecity", "Herman": "/comics/panel/herman", "InkPen": "/comics/strip/inkpen", "IntheBleachers": "/comics/panel/inthebleachers", "IntheSticks": "/comics/strip/inthesticks", "JamesBond": "/comics/strip/jamesbond", "JumpStart": "/comics/strip/jumpstart", "KidCity": "/comics/strip/kidcity", "KidSpot": "/comics/panel/kidspot", "KitNCarlyle": "/comics/panel/kitncarlyle", "LaCucaracha": "/comics/strip/lacucaracha", "Lio": "/comics/strip/lio", "Lola": "/comics/strip/lola", "Luann": "/comics/strip/luann", "MagicinaMinute": "/comics/strip/magicinaminute", "Marmaduke": "/comics/panel/marmaduke", "Men": "/comics/category/men", "ModeratelyConfused": "/comics/panel/moderately-confused", "Monty": "/comics/strip/monty", "MrGigiandtheSquid": "/comics/strip/mr-gigi-and-the-squid", "MuttAndJeff": "/comics/strip/muttandjeff", "NEA": "/comics/category/nea", "Nancy": "/comics/strip/nancy", "NonSequitur": "/comics/strip/nonsequitur", "NonSequiturPanel": "/comics/panel/non-sequitur-panel", "OfftheMark": "/comics/panel/offthemark", "Overboard": "/comics/strip/overboard", "OvertheHedge": "/comics/strip/overthehedge", "Peanuts": "/comics/strip/peanuts", "PearlsBeforeSwine": "/comics/strip/pearlsbeforeswine", "Pets": "/comics/category/pets", "PoochCafe": "/comics/strip/poochcafe", "PricklyCity": "/comics/strip/pricklycity", "RealLifeAdventures": "/comics/panel/reallifeadventures", "RealityCheck": "/comics/panel/realitycheck", "RedandRover": "/comics/strip/redandrover", "RipHaywire": "/comics/strip/riphaywire", "RipleysBelieveItorNot": "/comics/panel/ripleysbelieveitornot", "RoseisRose": "/comics/strip/roseisrose", "RudyPark": "/comics/strip/rudypark", "Shortcuts": "/comics/strip/shortcuts", "SouptoNutz": "/comics/strip/soup-to-nutz", "StoneSoup": "/comics/strip/stonesoup", "SundayOnly": "/comics/category/sunday%20only", "TankMcNamara": "/comics/strip/tankmcnamara", "Tarzan": "/comics/strip/tarzan", "Thatababy": "/comics/strip/thatababy", "TheArgyleSweater": "/comics/panel/theargylesweater", "TheBornLoser": "/comics/strip/the-born-loser", "TheBuckets": "/comics/strip/thebuckets", "TheDinetteSet": "/comics/panel/dinetteset", "TheDuplex": "/comics/strip/duplex", "TheElderberries": "/comics/strip/theelderberries", "TheFlyingMcCoys": "/comics/panel/theflyingmccoys", "TheFuscoBrothers": "/comics/strip/thefuscobrothers", "TheGrizzwells": "/comics/strip/thegrizzwells", "TheKnightLife": "/comics/strip/theknightlife", "TomtheDancingBug": "/comics/strip/tomthedancingbug", "UncleArtsFunland": "/comics/strip/uncleartsfunland", "WebExclusive": "/comics/category/web%20exclusive", "Women": "/comics/category/women", "Ziggy": "/comics/panel/ziggy"}
scripts/universal.py (new executable file, 86 lines)

@@ -0,0 +1,86 @@
#!/usr/bin/env python
# Copyright (C) 2012 Bastian Kleineidam
"""
Script to get universal comics and save the info in a JSON file for further processing.
"""
from __future__ import print_function
import re
import sys
import os
import json
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
from dosagelib.util import getPageContent, asciify, unescape
from dosagelib.scraper import get_scrapers

json_file = __file__.replace(".py", ".json")

#<li><a href="/comics/strip/9chickweedlane">9 Chickweed Lane</a>
url_matcher = re.compile(r'<li><a href="(/comics/[^"]+)">([^<]+)</a>')

def contains_case_insensitive(adict, akey):
    for key in adict:
        if key.lower() == akey.lower():
            return True
    return False


def handle_url(url, res):
    """Parse one search result page."""
    print("Parsing", url, file=sys.stderr)
    try:
        data, baseUrl = getPageContent(url)
    except IOError as msg:
        print("ERROR:", msg, file=sys.stderr)
        return
    for match in url_matcher.finditer(data):
        shortname = match.group(1)
        name = unescape(match.group(2))
        name = asciify(name.replace('&', 'And').replace('@', 'At'))
        if contains_case_insensitive(res, name):
            # we cannot handle two comics that only differ in case
            print("WARN: skipping possible duplicate", name, file=sys.stderr)
            continue
        res[name] = shortname


def save_result(res):
    """Save result to file."""
    with open(json_file, 'wb') as f:
        json.dump(res, f, sort_keys=True)


def get_results():
    """Parse all search result pages."""
    # store info in a dictionary {name -> shortname}
    res = {}
    handle_url('http://www.universaluclick.com/comics/list', res)
    save_result(res)


def has_comic(name):
    cname = ("Creators/%s" % name).lower()
    gname = ("GoComics/%s" % name).lower()
    for scraperclass in get_scrapers():
        lname = scraperclass.get_name().lower()
        if lname == cname or lname == gname:
            return True
    return False


def print_results(args):
    """Print all comics that have at least the given number of minimum comic strips."""
    with open(json_file, "rb") as f:
        comics = json.load(f)
    for name, shortname in sorted(comics.items()):
        if has_comic(name):
            prefix = '#'
        else:
            prefix = ''
        print("%sadd(%r, %r)" % (prefix, str(name), str(shortname)))


if __name__ == '__main__':
    if len(sys.argv) > 1:
        print_results(sys.argv[1:])
    else:
        get_results()
scripts/update_plugins.py (new executable file, 13 lines)

@@ -0,0 +1,13 @@
#!/bin/sh -e
set -u

mincomics=100
d=$(dirname $0)

for script in creators gocomics drunkduck universal keenspot; do
    target="${d}/../dosagelib/plugins/${script}.py"
    echo "Updating $target"
    "${d}/removeafter.py" "$target" "# DO NOT REMOVE"
    "${d}/${script}.py" $mincomics >> "$target"
done