Fix comics, improve tests, use python-requests.

This commit is contained in:
Bastian Kleineidam 2012-11-26 18:44:31 +01:00
parent d4eee7719d
commit 0556ffd30a
16 changed files with 191 additions and 403 deletions

View file

@ -40,10 +40,11 @@ manual page.
Dependencies Dependencies
------------- -------------
Dosage requires Python version 2.7 or higher, which can be downloaded Python version 2.7 or higher, which can be downloaded
from http://www.python.org. from http://www.python.org/
No external Python modules are required - only the Python Standard Library
that gets installed with Python. Also the python-requests module must be installed, which can be downloaded
from http://docs.python-requests.org/en/latest/
Installation Installation
------------- -------------
@ -59,7 +60,7 @@ or if you do not have root permissions:
Technical Description Technical Description
---------------------- ----------------------
Dosage is written entirely in Python and relies on regular expressions to Dosage is written in Python and relies on regular expressions to
do most of the grunt work. do most of the grunt work.
For each webcomic Dosage has a plugin module, found in the "plugins" For each webcomic Dosage has a plugin module, found in the "plugins"

View file

@ -4,6 +4,7 @@ Features:
- cmdline: Added proper return codes for error conditions. - cmdline: Added proper return codes for error conditions.
- comics: Added more robust regular expressions for HTML tags. - comics: Added more robust regular expressions for HTML tags.
They match case insensitive and ignore whitespaces now. They match case insensitive and ignore whitespaces now.
- comics: Use the python-requests module for HTTP requests.
Changes: Changes:
- installation: Added support for dynamic configuration values. - installation: Added support for dynamic configuration values.

View file

@ -2,7 +2,6 @@
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012 Bastian Kleineidam # Copyright (C) 2012 Bastian Kleineidam
import urllib2
import os import os
import locale import locale
import rfc822 import rfc822
@ -55,18 +54,24 @@ class ComicImage(object):
"""Connect to host and get meta information.""" """Connect to host and get meta information."""
try: try:
self.urlobj = urlopen(self.url, referrer=self.referrer) self.urlobj = urlopen(self.url, referrer=self.referrer)
except urllib2.HTTPError as he: except IOError as he:
raise FetchComicError('Unable to retrieve URL.', self.url, he.code) raise FetchComicError('Unable to retrieve URL.', self.url, he.code)
if self.urlobj.info().getmaintype() != 'image' and \ content_type = self.urlobj.headers.get('content-type')
self.urlobj.info().gettype() not in ('application/octet-stream', 'application/x-shockwave-flash'): content_type = content_type.split(';', 1)[0]
if '/' in content_type:
maintype, subtype = content_type.split('/', 1)
else:
maintype = content_type
subtype = None
if maintype != 'image' and content_type not in ('application/octet-stream', 'application/x-shockwave-flash'):
raise FetchComicError('No suitable image found to retrieve.', self.url) raise FetchComicError('No suitable image found to retrieve.', self.url)
# Always use mime type for file extension if it is sane. # Always use mime type for file extension if it is sane.
if self.urlobj.info().getmaintype() == 'image': if maintype == 'image':
self.ext = '.' + self.urlobj.info().getsubtype().replace('jpeg', 'jpg') self.ext = '.' + subtype.replace('jpeg', 'jpg')
self.contentLength = int(self.urlobj.info().get('content-length', 0)) self.contentLength = int(self.urlobj.headers.get('content-length', 0))
self.lastModified = self.urlobj.info().get('last-modified') self.lastModified = self.urlobj.headers.get('last-modified')
out.write('... filename = %r, ext = %r, contentLength = %d' % (self.filename, self.ext, self.contentLength), 2) out.write('... filename = %r, ext = %r, contentLength = %d' % (self.filename, self.ext, self.contentLength), 2)
def touch(self, filename): def touch(self, filename):
@ -88,7 +93,6 @@ class ComicImage(object):
fn = os.path.join(comicDir, filename) fn = os.path.join(comicDir, filename)
if os.path.isfile(fn) and os.path.getsize(fn) >= comicSize: if os.path.isfile(fn) and os.path.getsize(fn) >= comicSize:
self.urlobj.close()
self.touch(fn) self.touch(fn)
out.write('Skipping existing file "%s".' % (fn,), 1) out.write('Skipping existing file "%s".' % (fn,), 1)
return fn, False return fn, False
@ -97,7 +101,7 @@ class ComicImage(object):
out.write('Writing comic to file %s...' % (fn,), 3) out.write('Writing comic to file %s...' % (fn,), 3)
with open(fn, 'wb') as comicOut: with open(fn, 'wb') as comicOut:
startTime = time.time() startTime = time.time()
comicOut.write(self.urlobj.read()) comicOut.write(self.urlobj.content)
endTime = time.time() endTime = time.time()
self.touch(fn) self.touch(fn)
except: except:
@ -114,7 +118,5 @@ class ComicImage(object):
attrs = dict(fn=fn, bytes=bytes, speed=speed) attrs = dict(fn=fn, bytes=bytes, speed=speed)
out.write('Saved "%(fn)s" (%(bytes)s bytes, %(speed)s/sec).' % attrs, 1) out.write('Saved "%(fn)s" (%(bytes)s bytes, %(speed)s/sec).' % attrs, 1)
getHandler().comicDownloaded(self.name, fn) getHandler().comicDownloaded(self.name, fn)
finally:
self.urlobj.close()
return fn, True return fn, True

View file

@ -13,7 +13,7 @@ from ..util import tagre, getQueryParams
class DMFA(_BasicScraper): class DMFA(_BasicScraper):
latestUrl = 'http://www.missmab.com/' latestUrl = 'http://www.missmab.com/'
stripUrl = latestUrl + 'Comics/Vol_%s.php' stripUrl = latestUrl + 'Comics/Vol_%s.php'
imageSearch = compile(tagre("img", "src", r'(Comics/|Vol)[^"]+)')) imageSearch = compile(tagre("img", "src", r'((?:Comics/|Vol)[^"]+)'))
prevSearch = compile(tagre("a", "href", r'([^"])+')+ prevSearch = compile(tagre("a", "href", r'([^"])+')+
tagre("img", "src", r'(?:../)?Images/comicprev.gif')) tagre("img", "src", r'(?:../)?Images/comicprev.gif'))
help = 'Index format: nnn (normally, some specials)' help = 'Index format: nnn (normally, some specials)'

View file

@ -4,22 +4,29 @@
from re import compile from re import compile
from ..scraper import make_scraper from ..scraper import make_scraper
from ..helpers import bounceStarter, queryNamer from ..helpers import bounceStarter
from ..util import tagre from ..util import tagre
def add(name): def add(name):
classname = 'DrunkDuck_%s' % name classname = 'DrunkDuck_%s' % name
url = 'http://www.drunkduck.com/%s/' % name url = 'http://www.drunkduck.com/%s/' % name
linkSearch = tagre("a", "href", r"(/[^/]*/index\.php\?p=\d+)", quote="'", after="The %s page") linkSearch = tagre("a", "href", r"(/%s/\d+/)" % name)
@classmethod
def namer(cls, imageUrl, pageUrl):
index = int(pageUrl.rstrip('/').split('/')[-1])
ext = imageUrl.rsplit('.')[-1]
return '%d.%s' % (index, ext)
globals()[classname] = make_scraper(classname, globals()[classname] = make_scraper(classname,
name = 'DrunkDuck/' + name, name = 'DrunkDuck/' + name,
starter = bounceStarter(url, compile(linkSearch % 'next')), starter = bounceStarter(url, compile(linkSearch + tagre("img", "class", "arrow_next"))),
stripUrl = url + 'index.php?p=%s' % name, stripUrl = url + '%s/',
imageSearch = compile(tagre("img", "src", r"(http://[a-z0-9]*\.drunkduck\.com/[^/]*/pages/[^'/]+)", quote="'")), imageSearch = compile(tagre("img", "src", r'(http://media\.drunkduck\.com\.s3\.amazonaws\.com:80/[^"]+)', before="page-image")),
prevSearch= compile(linkSearch % 'previous'), prevSearch= compile(linkSearch + tagre("img", "class", "arrow_prev")),
help = 'Index format: n (unpadded)', help = 'Index format: n (unpadded)',
namer = queryNamer('p', usePageUrl=True), namer = namer,
) )
comics = ( comics = (

View file

@ -1,47 +1,26 @@
# -*- coding: iso-8859-1 -*- # -*- coding: iso-8859-1 -*-
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012 Bastian Kleineidam # Copyright (C) 2012 Bastian Kleineidam
from ..scraper import _BasicScraper
from re import compile
from ..scraper import make_scraper
from ..util import asciify
def fallenangel(name, shortname): def add(name, shortname):
pass # XXX latestUrl = 'http://www.thefallenangel.co.uk/cgi-bin/%sautokeen/autokeenlite.cgi' % shortname
classname = asciify(name)
class _TheFallenAngel(_BasicScraper): globals()[classname] = make_scraper(classname,
imageSearch = compile(r'SRC="(http://www.thefallenangel.co.uk/\w+comics/.+?)"') latestUrl = latestUrl,
prevSearch = compile(r' <a href="(http://www.thefallenangel.co.uk/.+?)"><img[^>]+?src="http://www.thefallenangel.co.uk/images/previousday.jpg"') stripUrl = latestUrl + '?date=%s',
help = 'Index format: yyyymmdd' name='FallenAngel/' + name,
imageSearch = compile(r'SRC="(http://www.thefallenangel.co.uk/\w+comics/.+?)"'),
@property prevSearch = compile(r' <a href="(http://www.thefallenangel.co.uk/.+?)"><img[^>]+?src="http://www.thefallenangel.co.uk/images/previousday.jpg"'),
def baseUrl(self): help = 'Index format: yyyymmdd',
return 'http://www.thefallenangel.co.uk/cgi-bin/%sautokeen/autokeenlite.cgi' % (self.shortName,) )
@property
def stripUrl(self):
return self.baseUrl + '?date=%s'
def starter(self):
return self.baseUrl
class HighMaintenance(_TheFallenAngel):
name = 'TheFallenAngel/HighMaintenance'
shortName = 'hm'
class FAWK(_TheFallenAngel):
name = 'TheFallenAngel/FAWK'
shortName = 'fawk'
class MalloryChan(_TheFallenAngel):
name = 'TheFallenAngel/MalloryChan'
shortName = 'mallorychan'
add('HighMaintenance', 'hm')
add('FAWK', 'fawk')
add('MalloryChan', 'mallorychan')

View file

@ -29,30 +29,30 @@ def add(name, repl=''):
# http://www.gocomics.com/features # http://www.gocomics.com/features
# note that comics from creators.com are not repeated here # Duplicate comics from creators.com are commented out
add('2 Cows and a Chicken') add('2 Cows and a Chicken')
add('9 Chickweed Lane') add('9 Chickweed Lane')
add('9 to 5') add('9 to 5')
add('The Academia Waltz') add('The Academia Waltz')
add('Adam at Home') add('Adam at Home')
add('Agnes') #add('Agnes')
add('Alley Oop', repl='-') add('Alley Oop', repl='-')
add('Andertoons') add('Andertoons')
add('Andy Capp') #add('Andy Capp')
add('Angry Little Girls', repl='-') add('Angry Little Girls', repl='-')
add('Animal Crackers') add('Animal Crackers')
add('Annie') add('Annie')
add('The Argyle Sweater') add('The Argyle Sweater')
add('Arlo and Janis') add('Arlo and Janis')
add('Ask Shagg') #add('Ask Shagg')
add('BC') #add('BC')
add('Back in the Day') add('Back in the Day')
add('Bad Reporter') add('Bad Reporter')
add('Baldo') add('Baldo')
add('Ballard Street') #add('Ballard Street')
add('Banana Triangle', repl='-') add('Banana Triangle', repl='-')
add('Barkeater Lake') add('Barkeater Lake')
add('The Barn') #add('The Barn')
add('Barney and Clyde') add('Barney and Clyde')
add('Basic Instructions') add('Basic Instructions')
add('Beardo') add('Beardo')
@ -81,13 +81,13 @@ add('Brewster Rockit')
add('Broom Hilda') add('Broom Hilda')
add('The Buckets') add('The Buckets')
add('Buni') add('Buni')
add('Cafe con Leche') #add('Cafe con Leche')
add('Calvin and Hobbes') add('Calvin and Hobbes')
add('Candorville') add('Candorville')
add('Cathy') add('Cathy')
add('Cest la Vie') add('Cest la Vie')
add('Cheap Thrills Cuisine', repl='-') add('Cheap Thrills Cuisine', repl='-')
add('Chuckle Bros') #add('Chuckle Bros')
add('Citizen Dog') add('Citizen Dog')
add('The City') add('The City')
add('Cleats') add('Cleats')
@ -99,15 +99,15 @@ add('Cow and Boy')
add('CowTown') add('CowTown')
add('Crumb') add('Crumb')
add('Cul de Sac') add('Cul de Sac')
add('Daddys Home') #add('Daddys Home')
add('Dark Side of the Horse') add('Dark Side of the Horse')
add('Deep Cover') add('Deep Cover')
add('Diamond Lil') #add('Diamond Lil')
add('Dick Tracy') add('Dick Tracy')
add('The Dinette Set') #add('The Dinette Set')
add('Dixie Drive', repl='-') add('Dixie Drive', repl='-')
add('Dog Eat Doug') #add('Dog Eat Doug')
add('Dogs of C Kennel') #add('Dogs of C Kennel')
add('Domestic Abuse') add('Domestic Abuse')
add('Doonesbury') add('Doonesbury')
add('The Doozies') add('The Doozies')
@ -122,18 +122,18 @@ add('F Minus')
add('Family Tree') add('Family Tree')
add('Farcus') add('Farcus')
add('Fat Cats', repl='-') add('Fat Cats', repl='-')
add('Flo and Friends') #add('Flo and Friends')
add('The Flying McCoys') add('The Flying McCoys')
add('Foolish Mortals', repl='-') add('Foolish Mortals', repl='-')
add('For Better or For Worse') add('For Better or For Worse')
add('For Heavens Sake') #add('For Heavens Sake')
add('Fort Knox') add('Fort Knox')
add('FoxTrot') add('FoxTrot')
add('FoxTrot Classics') add('FoxTrot Classics')
add('Frank and Ernest') add('Frank and Ernest')
add('Frazz') add('Frazz')
add('Fred Basset') add('Fred Basset')
add('Free Range') #add('Free Range')
add('Freshly Squeezed') add('Freshly Squeezed')
add('Frog Applause') add('Frog Applause')
add('The Fusco Brothers') add('The Fusco Brothers')
@ -154,9 +154,9 @@ add('Haiku Ewe')
add('Ham Shears') add('Ham Shears')
add('Health Capsules') add('Health Capsules')
add('Heart of the City') add('Heart of the City')
add('Heathcliff') #add('Heathcliff')
add('Heavenly Nostrils') add('Heavenly Nostrils')
add('Herb and Jamaal') #add('Herb and Jamaal')
add('Herman') add('Herman')
add('Home and Away') add('Home and Away')
add('HUBRIS!') add('HUBRIS!')
@ -184,7 +184,7 @@ add('La Cucaracha')
add('Last Kiss') add('Last Kiss')
add('The LeftyBosco Picture Show') add('The LeftyBosco Picture Show')
add('Legend of Bill') add('Legend of Bill')
add('Liberty Meadows') #add('Liberty Meadows')
add('Lil Abner') add('Lil Abner')
add('Lio') add('Lio')
add('Little Dog Lost') add('Little Dog Lost')
@ -201,7 +201,7 @@ add('Maintaining')
add('Marias Day') add('Marias Day')
add('Marmaduke') add('Marmaduke')
add('McArroni') add('McArroni')
add('The Meaning of Lila') #add('The Meaning of Lila')
add('Medium Large') add('Medium Large')
add('Meg Classics') add('Meg Classics')
add('The Middletons') add('The Middletons')
@ -209,7 +209,7 @@ add('Mike du Jour')
add('Minimum Security') add('Minimum Security')
add('Moderately Confused') add('Moderately Confused')
add('Molly and the Bear') add('Molly and the Bear')
add('Momma') #add('Momma')
add('Monty') add('Monty')
add('Motley Classics') add('Motley Classics')
add('Mr. Gigi and the Squid') add('Mr. Gigi and the Squid')
@ -217,7 +217,7 @@ add('Mutt and Jeff')
add('My Cage') add('My Cage')
add('MythTickle') add('MythTickle')
add('Nancy') add('Nancy')
add('Nest Heads') #add('Nest Heads')
add('NEUROTICA') add('NEUROTICA')
add('New Adventures of Queen Victoria') add('New Adventures of Queen Victoria')
add('Non Sequitur') add('Non Sequitur')
@ -225,10 +225,10 @@ add('The Norm Classics')
add('Nothing is Not Something') add('Nothing is Not Something')
add('Off the Mark') add('Off the Mark')
add('Ollie and Quentin') add('Ollie and Quentin')
add('On A Claire Day') #add('On A Claire Day')
add('One Big Happy') #add('One Big Happy')
add('Ordinary Bill') add('Ordinary Bill')
add('The Other Coast') #add('The Other Coast')
add('Out of the Gene Pool Re-Runs') add('Out of the Gene Pool Re-Runs')
add('Over the Hedge') add('Over the Hedge')
add('Overboard') add('Overboard')
@ -254,10 +254,10 @@ add('Reply All')
add('Rip Haywire') add('Rip Haywire')
add('Ripleys Believe It or Not') add('Ripleys Believe It or Not')
add('Rose is Rose') add('Rose is Rose')
add('Rubes') #add('Rubes')
add('Rudy Park') add('Rudy Park')
add('Savage Chickens') add('Savage Chickens')
add('Scary Gary') #add('Scary Gary')
add('Shirley and Son Classics') add('Shirley and Son Classics')
add('Shoe') add('Shoe')
add('Shoecabbage') add('Shoecabbage')
@ -266,11 +266,11 @@ add('Skin Horse')
add('Skippy') add('Skippy')
add('Slowpoke') add('Slowpoke')
add('Soup to Nutz') add('Soup to Nutz')
add('Speed Bump') #add('Speed Bump')
add('Spot the Frog') add('Spot the Frog')
add('Starslip') add('Starslip')
add('Stone Soup') add('Stone Soup')
add('Strange Brew') #add('Strange Brew')
add('The Sunshine Club') add('The Sunshine Club')
add('Sylvia') add('Sylvia')
add('Tank McNamara') add('Tank McNamara')
@ -280,7 +280,7 @@ add('Tales of TerraTopia')
add('That is Priceless') add('That is Priceless')
add('Thats Life') add('Thats Life')
add('Thatababy') add('Thatababy')
add('Thin Lines') #add('Thin Lines')
add('Tiny Sepuku') add('Tiny Sepuku')
add('TOBY') add('TOBY')
add('Todays Dogg') add('Todays Dogg')
@ -293,12 +293,12 @@ add('Unstrange Phenomena')
add('U.S. Acres') add('U.S. Acres')
add('Viivi and Wagner') add('Viivi and Wagner')
add('Watch Your Head') add('Watch Your Head')
add('Wee Pals') #add('Wee Pals')
add('Wizard of Id') #add('Wizard of Id')
add('Working Daze') add('Working Daze')
add('Working It Out') #add('Working It Out')
add('W.T. Duck') add('W.T. Duck')
add('Zack Hill') #add('Zack Hill')
add('Ziggy') add('Ziggy')
# http://www.gocomics.com/explore/editorial_list # http://www.gocomics.com/explore/editorial_list

View file

@ -18,9 +18,9 @@ def add(name, urls):
name='KeenSpot/' + name, name='KeenSpot/' + name,
latestUrl=latestUrl, latestUrl=latestUrl,
stripUrl=baseUrl + 'd/%s.html', stripUrl=baseUrl + 'd/%s.html',
imageSearch = compile(tagre("img", "src", r'([^"]*comics/[^"]+)')), imageSearch = compile(tagre("img", "src", r'([^"]*/comics/[^"]+)')),
prevSearch = compile(tagre("a", "href", r'"([^"]*d/\d{8}\.html)') + prevSearch = compile(tagre("a", "href", r'([^"]*/d/\d{8}\.html)') +
'(?:<img[^>]+?(?:name="previous_day"|alt="Previous"|src="[^"]*back[^"]*")|Previous comic)'), '(?:Previous comic|'+tagre("img", "alt", "Previous comic")+')'),
help = 'Index format: yyyymmdd', help = 'Index format: yyyymmdd',
) )

View file

@ -11,6 +11,7 @@ from ..scraper import _BasicScraper
class NineteenNinetySeven(_BasicScraper): class NineteenNinetySeven(_BasicScraper):
name = '1997' name = '1997'
latestUrl = 'http://www.1977thecomic.com/' latestUrl = 'http://www.1977thecomic.com/'
stripUrl = latestUrl + '%s'
imageSearch = compile(tagre("img", "src", r'(http://www\.1977thecomic\.com/comics-1977/[^"]+)')) imageSearch = compile(tagre("img", "src", r'(http://www\.1977thecomic\.com/comics-1977/[^"]+)'))
prevSearch = compile(tagre("a", "href", r'([^"]+)')+"Previous") prevSearch = compile(tagre("a", "href", r'([^"]+)')+"Previous")
help = 'Index format: yyyy/mm/dd/strip-name' help = 'Index format: yyyy/mm/dd/strip-name'

View file

@ -61,7 +61,7 @@ class Sheldon(_BasicScraper):
latestUrl = 'http://www.sheldoncomics.com/' latestUrl = 'http://www.sheldoncomics.com/'
stripUrl = latestUrl + 'archive/%s.html' stripUrl = latestUrl + 'archive/%s.html'
imageSearch = compile(tagre("img", "src", r'(/strips/[^"]+)')) imageSearch = compile(tagre("img", "src", r'(/strips/[^"]+)'))
prevSearch = compile(tagre("a", "href", r'/archive/\d+\.html)', after="sidenav-prev")) prevSearch = compile(tagre("a", "href", r'(/archive/\d+\.html)', after="sidenav-prev"))
help = 'Index format: yymmdd' help = 'Index format: yymmdd'

View file

@ -1,280 +1,54 @@
# -*- coding: iso-8859-1 -*- # -*- coding: iso-8859-1 -*-
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012 Bastian Kleineidam # Copyright (C) 2012 Bastian Kleineidam
"""
from re import compile, sub The Universal comics only have some samples, but those samples are always the newest ones.
"""
import datetime
from re import compile, escape
from ..scraper import make_scraper from ..scraper import make_scraper
from ..util import fetchUrl, tagre from ..util import tagre, asciify, getPageContent
def add(name, shortName): def parse_strdate(strdate):
homepage = 'http://content.uclick.com/a2z.html' """Parse date string. XXX this is locale dependant but it should not be."""
baseUrl = 'http://www.uclick.com/client/zzz/%s/' return datetime.datetime.strptime(strdate, "%A, %B %d, %Y")
latestUrl = baseUrl % shortName
classname = 'UClick_%s' % name
def add(name, category):
shortname = name.replace(' ', '').lower()
latestUrl = 'http://www.universaluclick.com/comics/%s/%s' % (category, shortname)
classname = 'UClick_%s' % asciify(name)
@classmethod @classmethod
def fetchSubmodules(cls): def namer(cls, imageUrl, pageUrl):
exclusions = ('index',) """Parse publish date from page content which looks like:
# XXX refactor this mess <img alt="Marmaduke" src="http://assets.amuniversal.com/07e7f270fa08012ff506001dd8b71c47" />
submoduleSearch = compile(tagre("a", "href", r'(http://content\.uclick\.com/content/\w+\.html)')) <h4>published: Sunday, November 11, 2012</h4>
partsMatch = compile(tagre("a", "href", r'http://content\.uclick\.com/content/(\w+?)\.html')) """
matches = fetchManyMatches(cls.homepage, (submoduleSearch,))[0] data = getPageContent(pageUrl)[0]
possibles = [partsMatch.match(match).groups() for match in matches] ro = compile(tagre("img", "src", escape(imageUrl)) + r'\s+<h4>published: ([^<]+)')
mo = ro.search(data)
def normalizeName(name): if mo:
name = sub(r'&(.)acute;', r'\1', name).title() strdate = mo.group(1)
return ''.join([c for c in name if c.isalnum()]) return parse_strdate(strdate).strftime("%Y%m%d")
def fetchSubmodule(module):
try:
return fetchUrl(cls.baseUrl % module, cls.imageSearch)
except Exception:
# XXX log error
return False
return [normalizeName(name) for part, name in possibles if part not in exclusions and fetchSubmodule(part)]
globals()[classname] = make_scraper(classname, globals()[classname] = make_scraper(classname,
name='UClick/' + name, name='UClick/' + name,
latestUrl = latestUrl, latestUrl = latestUrl,
stripUrl = latestUrl + '%s/', stripUrl = latestUrl + '%s/',
imageSearch = compile(tagre("img", "src", r'(http://synd\.imgsrv\.uclick\.com/comics/\w+/\d{4}/[^"]+\.gif)')), imageSearch = compile(tagre("img", "src", r'(http://assets\.amuniversal\.com/[^"]+)') + r'\s+<h4>published'),
prevSearch = compile(tagre("a", "href", r'(/client/zzz/\w+/\d{4}/\d{2}/\d{2}/)') + 'Previous date'), multipleImagesPerStrip = True,
help = 'Index format: yyyy/mm/dd', prevSearch = None,
help = 'Index format: none',
namer = namer,
) )
# List is from http://www.universaluclick.com/comics/list
comics = { comics = {
'5thWave': 'fw', '9 Chickweed Lane': 'strip',
'9To5': 'tmntf',
'AdamHome': 'ad',
'Agnes': 'cragn',
'AlcarazLalo': 'la',
'AlcarazLaloSpanish': 'spla',
'AndersonNick': 'wpnan',
'AndyCapp': 'crcap',
'AnimalCrackers': 'tmani',
'Annie': 'tmann',
'AsayChuck': 'crcas',
'AskShagg': 'crask',
'AuthTony': 'ta',
'BadReporter': 'bad',
'Baldo': 'ba',
'BaldoSpanish': 'be',
'BallardStreet': 'crbal',
'BarkEaterLake': 'bark',
'BarstowDonna': 'dba',
'BC': 'crbc',
'BCSpanish': 'crbcs',
'BeattieBruce': 'crbbe',
'BennetClay': 'wpcbe',
'BensonLisa': 'wplbe',
'BensonSteve': 'crsbe',
'BigTop': 'bt',
'Biographic': 'biov',
'Bleeker': 'blk',
'BobTheSquirrel': 'bob',
'BoilingPoint': 'boil',
'BokChip': 'crcbo',
'BoNanas': 'bon',
'Boomerangs': 'boom',
'BoondocksThe': 'bo',
'BottomLiners': 'tmbot',
'BoundAndGagged': 'tmbou',
'Brainwaves': 'bwv',
'BreenSteve': 'crsbr',
'BrendaStarr': 'tmbre',
'BrewsterRockit': 'tmrkt',
'BrittChris': 'crcbr',
'BroomHilda': 'tmbro',
'Candorville': 'cand',
'CarlsonStuart': 'sc',
'CatalinoKen': 'crkca',
'Cathy': 'ca',
'CathySpanish': 'spca',
'CEstLaVie': 'clv',
'CityThe': 'derf',
'ClearBlueWater': 'cbw',
'Cleats': 'cle',
'CloseToHome': 'cl',
'CombsPaul': 'tmcmb',
'CompuToon': 'tmcom',
'Condorito': 'cond',
'ConradPaul': 'tmpco',
'Cornered': 'co',
'CulDeSac': 'cds',
'DanzigerJeff': 'jd',
'DaviesMatt': 'tmmda',
'DeepCover': 'deep',
'DeeringJohn': 'crjde',
'DickTracy': 'tmdic',
'DinetteSetThe': 'crdin',
'DogEatDoug': 'crdog',
'DonWright': 'tmdow',
'Doodles': 'tmdoo',
'Doonesbury': 'db',
'DuplexThe': 'dp',
'Eek': 'eek',
'ElderberriesThe': 'eld',
'FacesInTheNews': 'kw',
'FlightDeck': 'crfd',
'FloAndFriends': 'crflo',
'FlyingMccoysThe': 'fmc',
'ForBetterOrForWorse': 'fb',
'ForHeavenSSake': 'crfhs',
'FoxtrotClassics': 'ftcl',
'Foxtrot': 'ft',
'FoxtrotSpanish': 'spft',
'FrankAndErnest': 'fa',
'FredBassetSpanish': 'spfba',
'FredBasset': 'tmfba',
'FrogApplause': 'frog',
'FuscoBrothersThe': 'fu',
'Garfield': 'ga',
'GarfieldSpanish': 'gh',
'GasolineAlley': 'tmgas',
'GaturroSpanish': 'spgat',
'GilThorp': 'tmgil',
'GingerMeggs': 'gin',
'GingerMeggsSpanish': 'spgin',
'GirlsAndSports': 'crgis',
'GorrellBob': 'crbgo',
'GoTeamBob': 'gtb',
'HammondBruce': 'hb',
'HandelsmanWalt': 'tmwha',
'HeartOfTheCity': 'hc',
'Heathcliff': 'crhea',
'HeathcliffSpanish': 'crhes',
'HerbAndJamaal': 'crher',
'HigginsJack': 'jh',
'HomeAndAway': 'wphaa',
'HorseyDavid': 'tmdho',
'Housebroken': 'tmhou',
'HubertAndAbby': 'haa',
'IdiotBox': 'ibox',
'ImagineThis': 'imt',
'InkPen': 'ink',
'InTheBleachers': 'bl',
'ItsAllAboutYou': 'wpiay',
'JamesBondSpanish': 'spjb',
'JonesClay': 'crcjo',
'KallaugherKevin': 'cwkal',
'KChroniclesThe': 'kk',
'KelleySteve': 'crske',
'Kudzu': 'tmkud',
'LaCucaracha': 'lc',
'LegendOfBill': 'lob',
'LibertyMeadows': 'crlib',
'Lio': 'lio',
'LittleDogLost': 'wpldl',
'LocherDick': 'tmdlo',
'LooseParts': 'tmloo',
'LostSheep': 'lost',
'LoweChan': 'tmclo',
'LuckovichMike': 'crmlu',
'LuckyCow': 'luc',
'MarkstienGary': 'crgma',
'MarletteDoug': 'tmdma',
'MccoyGlenn': 'gm',
'MeaningOfLilaThe': 'crlil',
'MeehanStreak': 'tmmee',
'MiddletonsThe': 'tmmid',
'MinimumSecurity': 'ms',
'ModestyBlaiseSpanish': 'spmb',
'Momma': 'crmom',
'MorinJim': 'cwjmo',
'MuttJeffSpanish': 'spmut',
'MythTickle': 'myth',
'NAoQV': 'naqv',
'NaturalSelection': 'crns',
'NestHeads': 'cpnst',
'Neurotica': 'neu',
'NonSequitur': 'nq',
'OhmanJack': 'tmjoh',
'OliphantPat': 'po',
'OnAClaireDay': 'crocd',
'OneBigHappy': 'crobh',
'OtherCoastThe': 'crtoc',
'OutOfTheGenePool': 'wpgen',
'Overboard': 'ob',
'OverboardSpanish': 'spob',
'PepeSpanish': 'sppep',
'PettJoel': 'jp',
'Pibgorn': 'pib',
'Pickles': 'wppic',
'Pluggers': 'tmplu',
'PoochCafe': 'poc',
'PoochCafeSpanish': 'sppoc',
'PopCulture': 'pop',
'PowellDwane': 'crdpo',
'Preteena': 'pr',
'PricklyCity': 'prc',
'QuigmansThe': 'tmqui',
'RallComic': 'tr',
'RamirezMicheal': 'crmrm',
'RamseyMarshall': 'crmra',
'RealLifeAdventures': 'rl',
'RedAndRover': 'wpred',
'RedMeat': 'red',
'ReynoldsUnwrapped': 'rw',
'RonaldinhoGaucho': 'ron',
'RonaldinhoGauchoSpanish': 'spron',
'Rubes': 'crrub',
'SackSteve': 'tmssa',
'SargentBen': 'bs',
'SargentBenSpanish': 'spbs',
'SendHelp': 'send',
'ShenemanDrew': 'tmdsh',
'SherffiusDrew': 'crjsh',
'Shoecabbage': 'shcab',
'Shoe': 'tmsho',
'SigmundSpanish': 'spsig',
'Slowpoke': 'slow',
'SmallWorld': 'small',
'SpaceIsThePlace': 'sitp',
'SpeedBump': 'crspe',
'StanisScott': 'crsst',
'StateOfTheUnion': 'crsou',
'StayskalWayne': 'tmwst',
'StoneSoup': 'ss',
'StrangeBrew': 'crstr',
'SummersDana': 'tmdsu',
'SuttonImpact': 'stn',
'Sylvia': 'tmsyl',
'SzepPaul': 'crpsz',
'TankMcnamara': 'tm',
'TeenageMutantNinjaTurtles': 'tmnt',
'TelnaesAnn': 'tmate',
'TheArgyleSweater': 'tas',
'ThePinkPanther': 'tmpnk',
'TheWizardOfId': 'crwiz',
'TheWizardOfIdSpanish': 'crwis',
'ThInk': 'think',
'ThompsonMike': 'crmth',
'ThroughThickAndThin': 'cpthk',
'TinySepuku': 'tiny',
'Toby': 'toby',
'TolesTom': 'tt',
'TomTheDancingBug': 'td',
'TooMuchCoffeeMan': 'tmcm',
'Trevor': 'trev',
'TutelandiaSpanish': 'sptut',
'VarvelGary': 'crgva',
'WassermanDan': 'tmdwa',
'WatchYourHead': 'wpwyh',
'Waylay': 'min',
'WeePals': 'crwee',
'WinnieThePooh': 'crwin',
'WitOfTheWorld': 'cwwit',
'WorkingItOut': 'crwio',
'WriteDon': 'tmdow',
'YennySpanish': 'spyen',
'Yenny': 'yen',
'ZackHill': 'crzhi',
'ZiggySpanish': 'spzi',
'Ziggy': 'zi',
} }
for name, shortname in comics.items(): for name, category in comics.items():
add(name, shortname) add(name, category)

View file

@ -2,6 +2,7 @@
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012 Bastian Kleineidam # Copyright (C) 2012 Bastian Kleineidam
from re import compile
from ..scraper import _BasicScraper from ..scraper import _BasicScraper
from ..util import tagre from ..util import tagre

View file

@ -22,6 +22,9 @@ class _BasicScraper(object):
@cvar prevSearch: A compiled regex that will locate the URL for the @cvar prevSearch: A compiled regex that will locate the URL for the
previous strip when applied to a strip page. previous strip when applied to a strip page.
''' '''
# if more than one image per URL is expected
multipleImagesPerStrip = False
# usually the index format help
help = 'Sorry, no help for this comic yet.' help = 'Sorry, no help for this comic yet.'
def __init__(self, indexes=None): def __init__(self, indexes=None):
@ -44,7 +47,9 @@ class _BasicScraper(object):
def getStrip(self, url): def getStrip(self, url):
"""Get comic strip for given URL.""" """Get comic strip for given URL."""
imageUrls = fetchUrls(url, self.imageSearch) imageUrls = fetchUrls(url, self.imageSearch)[0]
if len(imageUrls) > 1 and not self.multipleImagesPerStrip:
raise ValueError("found %d images with %s" % (len(imageUrls), self.imageSearch.pattern))
return self.getComicStrip(url, imageUrls) return self.getComicStrip(url, imageUrls)
def getComicStrip(self, url, imageUrls): def getComicStrip(self, url, imageUrls):
@ -140,11 +145,13 @@ def get_scrapers():
""" """
global _scrapers global _scrapers
if _scrapers is None: if _scrapers is None:
out.write("Loading comic modules...")
modules = loader.get_modules() modules = loader.get_modules()
plugins = loader.get_plugins(modules, _BasicScraper) plugins = loader.get_plugins(modules, _BasicScraper)
_scrapers = list(plugins) _scrapers = list(plugins)
_scrapers.sort(key=lambda s: s.get_name()) _scrapers.sort(key=lambda s: s.get_name())
check_scrapers() check_scrapers()
out.write("... %d modules loaded." % len(_scrapers))
return _scrapers return _scrapers

View file

@ -4,6 +4,7 @@
from __future__ import division, print_function from __future__ import division, print_function
import urllib2, urlparse import urllib2, urlparse
import requests
import sys import sys
import os import os
import cgi import cgi
@ -42,10 +43,6 @@ def tagre(tag, attribute, value, quote='"', before="", after=""):
@return: the generated regular expression suitable for re.compile() @return: the generated regular expression suitable for re.compile()
@rtype: string @rtype: string
""" """
if before:
before += "[^>]*"
if after:
after += "[^>]*"
attrs = dict( attrs = dict(
tag=case_insensitive_re(tag), tag=case_insensitive_re(tag),
attribute=case_insensitive_re(attribute), attribute=case_insensitive_re(attribute),
@ -54,7 +51,7 @@ def tagre(tag, attribute, value, quote='"', before="", after=""):
before=before, before=before,
after=after, after=after,
) )
return r'<\s*%(tag)s\s+(?:[^>]*%(before)s\s+)?%(attribute)s\s*=\s*%(quote)s%(value)s%(quote)s[^>]*%(after)s>' % attrs return r'<\s*%(tag)s\s+(?:[^>]*%(before)s[^>]*\s+)?%(attribute)s\s*=\s*%(quote)s%(value)s%(quote)s[^>]*%(after)s[^>]*>' % attrs
def case_insensitive_re(name): def case_insensitive_re(name):
@ -74,7 +71,7 @@ baseSearch = re.compile(tagre("base", "href", '([^"]*)'))
def getPageContent(url): def getPageContent(url):
# read page data # read page data
page = urlopen(url) page = urlopen(url)
data = page.read(MAX_FILESIZE) data = page.text
# determine base URL # determine base URL
baseUrl = None baseUrl = None
match = baseSearch.search(data) match = baseSearch.search(data)
@ -105,7 +102,7 @@ def fetchUrls(url, imageSearch, prevSearch=None):
imageUrl = match.group(1) imageUrl = match.group(1)
if not imageUrl: if not imageUrl:
raise ValueError("Match empty image URL at %s with pattern %s" % (url, imageSearch.pattern)) raise ValueError("Match empty image URL at %s with pattern %s" % (url, imageSearch.pattern))
out.write('matched image URL %r' % imageUrl, 2) out.write('matched image URL %r with pattern %s' % (imageUrl, imageSearch.pattern), 2)
imageUrls.add(normaliseURL(urlparse.urljoin(baseUrl, imageUrl))) imageUrls.add(normaliseURL(urlparse.urljoin(baseUrl, imageUrl)))
if not imageUrls: if not imageUrls:
out.write("warning: no images found at %s with pattern %s" % (url, imageSearch.pattern)) out.write("warning: no images found at %s with pattern %s" % (url, imageSearch.pattern))
@ -178,22 +175,18 @@ def urlopen(url, referrer=None, retries=3, retry_wait_seconds=5):
out.write('Open URL %s' % url, 2) out.write('Open URL %s' % url, 2)
assert retries >= 0, 'invalid retry value %r' % retries assert retries >= 0, 'invalid retry value %r' % retries
assert retry_wait_seconds > 0, 'invalid retry seconds value %r' % retry_wait_seconds assert retry_wait_seconds > 0, 'invalid retry seconds value %r' % retry_wait_seconds
req = urllib2.Request(url) headers = {'User-Agent': UserAgent}
config = {"max_retries": retries}
if referrer: if referrer:
req.add_header('Referer', referrer) headers['Referer'] = referrer
req.add_header('User-Agent', UserAgent) try:
tries = 0 req = requests.get(url, headers=headers, config=config)
while True: req.raise_for_status()
try: return req
return urllib2.urlopen(req) except requests.exceptions.RequestException as err:
except IOError as err: msg = 'URL retrieval of %s failed: %s' % (url, err)
msg = 'URL retrieval of %s failed: %s' % (url, err) out.write(msg)
out.write(msg) raise IOError(msg)
out.write('waiting %d seconds and retrying (%d)' % (retry_wait_seconds, tries), 2)
time.sleep(retry_wait_seconds)
tries += 1
if tries >= retries:
raise IOError(msg)
def get_columns (fp): def get_columns (fp):
@ -259,11 +252,9 @@ def internal_error(out=sys.stderr, etype=None, evalue=None, tb=None):
print("""********** Oops, I did it again. ************* print("""********** Oops, I did it again. *************
You have found an internal error in %(app)s. Please write a bug report You have found an internal error in %(app)s. Please write a bug report
at %(url)s and include the following information: at %(url)s and include at least the information below:
- your commandline arguments and any configuration file in ~/.dosage/
- the system information below
Not disclosing some of the information above due to privacy reasons is ok. Not disclosing some of the information below due to privacy reasons is ok.
I will try to help you nonetheless, but you have to give me something I will try to help you nonetheless, but you have to give me something
I can work with ;) . I can work with ;) .
""" % dict(app=AppName, url=SupportUrl), file=out) """ % dict(app=AppName, url=SupportUrl), file=out)
@ -308,6 +299,7 @@ def print_app_info(out=sys.stderr):
{"version": sys.version, "platform": sys.platform}, file=out) {"version": sys.version, "platform": sys.platform}, file=out)
stime = strtime(time.time()) stime = strtime(time.time())
print("Local time:", stime, file=out) print("Local time:", stime, file=out)
print("sys.argv", sys.argv, file=out)
def strtime(t): def strtime(t):

2
requirements.txt Normal file
View file

@ -0,0 +1,2 @@
requests

View file

@ -4,6 +4,7 @@
import tempfile import tempfile
import shutil import shutil
import re import re
import os
from itertools import islice from itertools import islice
from unittest import TestCase from unittest import TestCase
from dosagelib import scraper from dosagelib import scraper
@ -16,6 +17,16 @@ class _ComicTester(TestCase):
def setUp(self): def setUp(self):
self.name = self.scraperclass.get_name() self.name = self.scraperclass.get_name()
self.url = self.scraperclass.starter() self.url = self.scraperclass.starter()
# create a temporary directory for images
self.tmpdir = tempfile.mkdtemp()
def tearDown(self):
shutil.rmtree(self.tmpdir)
def get_saved_images(self):
"""Get saved images."""
dirs = tuple(self.name.split('/'))
return os.listdir(os.path.join(self.tmpdir, *dirs))
def test_comic(self): def test_comic(self):
# Test a scraper. It must be able to traverse backward for # Test a scraper. It must be able to traverse backward for
@ -23,7 +34,8 @@ class _ComicTester(TestCase):
# on at least 4 pages. # on at least 4 pages.
scraperobj = self.scraperclass() scraperobj = self.scraperclass()
num = empty = 0 num = empty = 0
for strip in islice(scraperobj.getAllStrips(), 0, 5): max_strips = 5
for strip in islice(scraperobj.getAllStrips(), 0, max_strips):
images = 0 images = 0
for image in strip.getImages(): for image in strip.getImages():
images += 1 images += 1
@ -35,6 +47,15 @@ class _ComicTester(TestCase):
num += 1 num += 1
if self.scraperclass.prevSearch: if self.scraperclass.prevSearch:
self.check(num >= 4, 'traversal failed after %d strips, check the prevSearch pattern.' % num) self.check(num >= 4, 'traversal failed after %d strips, check the prevSearch pattern.' % num)
# check that at exactly or for multiple pages at least 5 images are saved
saved_images = self.get_saved_images()
num_images = len(saved_images)
if self.scraperclass.multipleImagesPerStrip:
self.check(num_images >= max_strips,
'saved %d %s instead of at least %d images in %s' % (num_images, saved_images, max_strips, self.tmpdir))
else:
self.check(num_images == max_strips,
'saved %d %s instead of %d images in %s' % (num_images, saved_images, max_strips, self.tmpdir))
self.check(empty == 0, 'failed to find images on %d pages, check the imageSearch pattern.' % empty) self.check(empty == 0, 'failed to find images on %d pages, check the imageSearch pattern.' % empty)
def check_stripurl(self, strip): def check_stripurl(self, strip):
@ -50,28 +71,28 @@ class _ComicTester(TestCase):
self.check(mo is not None, 'strip URL %r does not match stripUrl pattern %s' % (strip.stripUrl, urlmatch)) self.check(mo is not None, 'strip URL %r does not match stripUrl pattern %s' % (strip.stripUrl, urlmatch))
def save(self, image): def save(self, image):
# create a temporary directory
tmpdir = tempfile.mkdtemp()
try: try:
image.save(tmpdir) image.save(self.tmpdir)
except Exception as msg: except Exception as msg:
self.check(False, 'could not save %s to %s: %s' % (image.url, tmpdir, msg)) self.check(False, 'could not save %s to %s: %s' % (image.url, self.tmpdir, msg))
finally:
shutil.rmtree(tmpdir)
def check(self, condition, msg): def check(self, condition, msg):
self.assertTrue(condition, "%s %s %s" % (self.name, self.url, msg)) self.assertTrue(condition, "%s %s %s" % (self.name, self.url, msg))
def make_comic_tester(name, **kwargs):
"""Create and return a _ComicTester class with given name and attributes."""
return type(name, (_ComicTester,), kwargs)
def generate_comic_testers(): def generate_comic_testers():
"""For each comic scraper, create a test class.""" """For each comic scraper, create a test class."""
g = globals()
# Limit number of scraper tests for now # Limit number of scraper tests for now
max_scrapers = 100 max_scrapers = 10000
for scraperclass in islice(scraper.get_scrapers(), 0, max_scrapers): for scraperclass in islice(scraper.get_scrapers(), 0, max_scrapers):
name = 'Test'+scraperclass.__name__ name = 'Test'+scraperclass.__name__
globals()[name] = type(name, g[name] = make_comic_tester(name, scraperclass=scraperclass)
(_ComicTester,),
dict(scraperclass=scraperclass)
)
generate_comic_testers() generate_comic_testers()