Fix comics, improve tests, use python-requests.
parent d4eee7719d
commit 0556ffd30a
16 changed files with 191 additions and 403 deletions
@@ -40,10 +40,11 @@ manual page.
 Dependencies
 -------------
-Dosage requires Python version 2.7 or higher, which can be downloaded
-from http://www.python.org.
-No external Python modules are required - only the Python Standard Library
-that gets installed with Python.
+Python version 2.7 or higher, which can be downloaded
+from http://www.python.org/
+
+Also the python-requests module must be installed, which can be downloaded
+from http://docs.python-requests.org/en/latest/
 
 Installation
 -------------
@@ -59,7 +60,7 @@ or if you do not have root permissions:
 
 Technical Description
 ----------------------
-Dosage is written entirely in Python and relies on regular expressions to
+Dosage is written in Python and relies on regular expressions to
 do most of the grunt work.
 
 For each webcomic Dosage has a plugin module, found in the "plugins"
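A quick way to verify the new dependency from a Python prompt (a hypothetical snippet, not part of the README itself):

    import requests  # raises ImportError if python-requests is missing
    print("python-requests %s found" % requests.__version__)

This commit also adds a requirements.txt listing requests (see below), so pip users can install the dependency from there.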
@@ -4,6 +4,7 @@ Features:
 - cmdline: Added proper return codes for error conditions.
 - comics: Added more robust regular expressions for HTML tags.
   They match case insensitive and ignore whitespaces now.
+- comics: Use the python-requests module for HTTP requests.
 
 Changes:
 - installation: Added support for dynamic configuration values.
@@ -2,7 +2,6 @@
 # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
 # Copyright (C) 2012 Bastian Kleineidam
 
-import urllib2
 import os
 import locale
 import rfc822
@@ -55,18 +54,24 @@ class ComicImage(object):
         """Connect to host and get meta information."""
         try:
             self.urlobj = urlopen(self.url, referrer=self.referrer)
-        except urllib2.HTTPError as he:
+        except IOError as he:
             raise FetchComicError('Unable to retrieve URL.', self.url, he.code)
 
-        if self.urlobj.info().getmaintype() != 'image' and \
-           self.urlobj.info().gettype() not in ('application/octet-stream', 'application/x-shockwave-flash'):
+        content_type = self.urlobj.headers.get('content-type')
+        content_type = content_type.split(';', 1)[0]
+        if '/' in content_type:
+            maintype, subtype = content_type.split('/', 1)
+        else:
+            maintype = content_type
+            subtype = None
+        if maintype != 'image' and content_type not in ('application/octet-stream', 'application/x-shockwave-flash'):
             raise FetchComicError('No suitable image found to retrieve.', self.url)
 
         # Always use mime type for file extension if it is sane.
-        if self.urlobj.info().getmaintype() == 'image':
-            self.ext = '.' + self.urlobj.info().getsubtype().replace('jpeg', 'jpg')
-        self.contentLength = int(self.urlobj.info().get('content-length', 0))
-        self.lastModified = self.urlobj.info().get('last-modified')
+        if maintype == 'image':
+            self.ext = '.' + subtype.replace('jpeg', 'jpg')
+        self.contentLength = int(self.urlobj.headers.get('content-length', 0))
+        self.lastModified = self.urlobj.headers.get('last-modified')
         out.write('... filename = %r, ext = %r, contentLength = %d' % (self.filename, self.ext, self.contentLength), 2)
 
     def touch(self, filename):
@@ -88,7 +93,6 @@ class ComicImage(object):
 
         fn = os.path.join(comicDir, filename)
         if os.path.isfile(fn) and os.path.getsize(fn) >= comicSize:
-            self.urlobj.close()
             self.touch(fn)
             out.write('Skipping existing file "%s".' % (fn,), 1)
             return fn, False
@@ -97,7 +101,7 @@ class ComicImage(object):
             out.write('Writing comic to file %s...' % (fn,), 3)
             with open(fn, 'wb') as comicOut:
                 startTime = time.time()
-                comicOut.write(self.urlobj.read())
+                comicOut.write(self.urlobj.content)
                 endTime = time.time()
             self.touch(fn)
         except:
@@ -114,7 +118,5 @@ class ComicImage(object):
         attrs = dict(fn=fn, bytes=bytes, speed=speed)
         out.write('Saved "%(fn)s" (%(bytes)s bytes, %(speed)s/sec).' % attrs, 1)
         getHandler().comicDownloaded(self.name, fn)
-        finally:
-            self.urlobj.close()
 
         return fn, True
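Since a requests response does not provide urllib2's getmaintype()/gettype() helpers, connect() above now parses the Content-Type header by hand. A standalone sketch of that parsing (illustration only, mirroring the logic in the hunk):

    def split_content_type(content_type):
        # drop parameters such as "; charset=utf-8", then split main/sub type
        content_type = content_type.split(';', 1)[0]
        if '/' in content_type:
            maintype, subtype = content_type.split('/', 1)
        else:
            maintype, subtype = content_type, None
        return maintype, subtype

    print(split_content_type('image/jpeg; charset=utf-8'))  # ('image', 'jpeg')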
@@ -13,7 +13,7 @@ from ..util import tagre, getQueryParams
 class DMFA(_BasicScraper):
     latestUrl = 'http://www.missmab.com/'
     stripUrl = latestUrl + 'Comics/Vol_%s.php'
-    imageSearch = compile(tagre("img", "src", r'(Comics/|Vol)[^"]+)'))
+    imageSearch = compile(tagre("img", "src", r'((?:Comics/|Vol)[^"]+)'))
     prevSearch = compile(tagre("a", "href", r'([^"])+')+
                          tagre("img", "src", r'(?:../)?Images/comicprev.gif'))
     help = 'Index format: nnn (normally, some specials)'
@@ -4,22 +4,29 @@
 
 from re import compile
 from ..scraper import make_scraper
-from ..helpers import bounceStarter, queryNamer
+from ..helpers import bounceStarter
 from ..util import tagre
 
+
 def add(name):
     classname = 'DrunkDuck_%s' % name
     url = 'http://www.drunkduck.com/%s/' % name
-    linkSearch = tagre("a", "href", r"(/[^/]*/index\.php\?p=\d+)", quote="'", after="The %s page")
+    linkSearch = tagre("a", "href", r"(/%s/\d+/)" % name)
+
+    @classmethod
+    def namer(cls, imageUrl, pageUrl):
+        index = int(pageUrl.rstrip('/').split('/')[-1])
+        ext = imageUrl.rsplit('.')[-1]
+        return '%d.%s' % (index, ext)
 
     globals()[classname] = make_scraper(classname,
         name = 'DrunkDuck/' + name,
-        starter = bounceStarter(url, compile(linkSearch % 'next')),
-        stripUrl = url + 'index.php?p=%s' % name,
-        imageSearch = compile(tagre("img", "src", r"(http://[a-z0-9]*\.drunkduck\.com/[^/]*/pages/[^'/]+)", quote="'")),
-        prevSearch= compile(linkSearch % 'previous'),
+        starter = bounceStarter(url, compile(linkSearch + tagre("img", "class", "arrow_next"))),
+        stripUrl = url + '%s/',
+        imageSearch = compile(tagre("img", "src", r'(http://media\.drunkduck\.com\.s3\.amazonaws\.com:80/[^"]+)', before="page-image")),
+        prevSearch= compile(linkSearch + tagre("img", "class", "arrow_prev")),
         help = 'Index format: n (unpadded)',
-        namer = queryNamer('p', usePageUrl=True),
+        namer = namer,
     )
 
 comics = (
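The new namer derives the saved filename from the trailing page index in the URL instead of the old ?p= query parameter. The same logic as a standalone function (illustration only, without the make_scraper machinery; the URLs are made up):

    def namer(imageUrl, pageUrl):
        index = int(pageUrl.rstrip('/').split('/')[-1])  # ".../Example/123/" -> 123
        ext = imageUrl.rsplit('.')[-1]                   # "...strip.jpg" -> "jpg"
        return '%d.%s' % (index, ext)

    print(namer('http://media.example.com/pages/strip.jpg',
                'http://www.drunkduck.com/Example/123/'))  # 123.jpg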
@@ -1,47 +1,26 @@
 # -*- coding: iso-8859-1 -*-
 # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
 # Copyright (C) 2012 Bastian Kleineidam
-from ..scraper import _BasicScraper
 
+from re import compile
+from ..scraper import make_scraper
+from ..util import asciify
 
 
-def fallenangel(name, shortname):
-    pass # XXX
-
-
-class _TheFallenAngel(_BasicScraper):
-    imageSearch = compile(r'SRC="(http://www.thefallenangel.co.uk/\w+comics/.+?)"')
-    prevSearch = compile(r' <a href="(http://www.thefallenangel.co.uk/.+?)"><img[^>]+?src="http://www.thefallenangel.co.uk/images/previousday.jpg"')
-    help = 'Index format: yyyymmdd'
-
-    @property
-    def baseUrl(self):
-        return 'http://www.thefallenangel.co.uk/cgi-bin/%sautokeen/autokeenlite.cgi' % (self.shortName,)
-
-    @property
-    def stripUrl(self):
-        return self.baseUrl + '?date=%s'
-
-    def starter(self):
-        return self.baseUrl
-
-
-class HighMaintenance(_TheFallenAngel):
-    name = 'TheFallenAngel/HighMaintenance'
-    shortName = 'hm'
-
-
-class FAWK(_TheFallenAngel):
-    name = 'TheFallenAngel/FAWK'
-    shortName = 'fawk'
-
-
-class MalloryChan(_TheFallenAngel):
-    name = 'TheFallenAngel/MalloryChan'
-    shortName = 'mallorychan'
+def add(name, shortname):
+    latestUrl = 'http://www.thefallenangel.co.uk/cgi-bin/%sautokeen/autokeenlite.cgi' % shortname
+    classname = asciify(name)
+    globals()[classname] = make_scraper(classname,
+        latestUrl = latestUrl,
+        stripUrl = latestUrl + '?date=%s',
+        name='FallenAngel/' + name,
+        imageSearch = compile(r'SRC="(http://www.thefallenangel.co.uk/\w+comics/.+?)"'),
+        prevSearch = compile(r' <a href="(http://www.thefallenangel.co.uk/.+?)"><img[^>]+?src="http://www.thefallenangel.co.uk/images/previousday.jpg"'),
+        help = 'Index format: yyyymmdd',
+    )
+
+
+add('HighMaintenance', 'hm')
+add('FAWK', 'fawk')
+add('MalloryChan', 'mallorychan')
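Several plugin files in this commit replace hand-written subclasses with make_scraper() plus module-level add() calls. A minimal sketch of the registration pattern, assuming make_scraper() builds a class object with type() (the diff does not show its implementation):

    def make_scraper(classname, **attrs):
        # hypothetical stand-in for dosagelib.scraper.make_scraper()
        return type(classname, (object,), attrs)

    def add(name, shortname):
        globals()[name] = make_scraper(name,
            name='FallenAngel/' + name, shortName=shortname)

    add('HighMaintenance', 'hm')
    print(HighMaintenance.name)  # FallenAngel/HighMaintenance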
@@ -29,30 +29,30 @@ def add(name, repl=''):
 
 
 # http://www.gocomics.com/features
-# note that comics from creators.com are not repeated here
+# Duplicate comics from creators.com are commented out
 add('2 Cows and a Chicken')
 add('9 Chickweed Lane')
 add('9 to 5')
 add('The Academia Waltz')
 add('Adam at Home')
-add('Agnes')
+#add('Agnes')
 add('Alley Oop', repl='-')
 add('Andertoons')
-add('Andy Capp')
+#add('Andy Capp')
 add('Angry Little Girls', repl='-')
 add('Animal Crackers')
 add('Annie')
 add('The Argyle Sweater')
 add('Arlo and Janis')
-add('Ask Shagg')
-add('BC')
+#add('Ask Shagg')
+#add('BC')
 add('Back in the Day')
 add('Bad Reporter')
 add('Baldo')
-add('Ballard Street')
+#add('Ballard Street')
 add('Banana Triangle', repl='-')
 add('Barkeater Lake')
-add('The Barn')
+#add('The Barn')
 add('Barney and Clyde')
 add('Basic Instructions')
 add('Beardo')
@@ -81,13 +81,13 @@ add('Brewster Rockit')
 add('Broom Hilda')
 add('The Buckets')
 add('Buni')
-add('Cafe con Leche')
+#add('Cafe con Leche')
 add('Calvin and Hobbes')
 add('Candorville')
 add('Cathy')
 add('Cest la Vie')
 add('Cheap Thrills Cuisine', repl='-')
-add('Chuckle Bros')
+#add('Chuckle Bros')
 add('Citizen Dog')
 add('The City')
 add('Cleats')
@@ -99,15 +99,15 @@ add('Cow and Boy')
 add('CowTown')
 add('Crumb')
 add('Cul de Sac')
-add('Daddys Home')
+#add('Daddys Home')
 add('Dark Side of the Horse')
 add('Deep Cover')
-add('Diamond Lil')
+#add('Diamond Lil')
 add('Dick Tracy')
-add('The Dinette Set')
+#add('The Dinette Set')
 add('Dixie Drive', repl='-')
-add('Dog Eat Doug')
-add('Dogs of C Kennel')
+#add('Dog Eat Doug')
+#add('Dogs of C Kennel')
 add('Domestic Abuse')
 add('Doonesbury')
 add('The Doozies')
@@ -122,18 +122,18 @@ add('F Minus')
 add('Family Tree')
 add('Farcus')
 add('Fat Cats', repl='-')
-add('Flo and Friends')
+#add('Flo and Friends')
 add('The Flying McCoys')
 add('Foolish Mortals', repl='-')
 add('For Better or For Worse')
-add('For Heavens Sake')
+#add('For Heavens Sake')
 add('Fort Knox')
 add('FoxTrot')
 add('FoxTrot Classics')
 add('Frank and Ernest')
 add('Frazz')
 add('Fred Basset')
-add('Free Range')
+#add('Free Range')
 add('Freshly Squeezed')
 add('Frog Applause')
 add('The Fusco Brothers')
@@ -154,9 +154,9 @@ add('Haiku Ewe')
 add('Ham Shears')
 add('Health Capsules')
 add('Heart of the City')
-add('Heathcliff')
+#add('Heathcliff')
 add('Heavenly Nostrils')
-add('Herb and Jamaal')
+#add('Herb and Jamaal')
 add('Herman')
 add('Home and Away')
 add('HUBRIS!')
@@ -184,7 +184,7 @@ add('La Cucaracha')
 add('Last Kiss')
 add('The LeftyBosco Picture Show')
 add('Legend of Bill')
-add('Liberty Meadows')
+#add('Liberty Meadows')
 add('Lil Abner')
 add('Lio')
 add('Little Dog Lost')
@@ -201,7 +201,7 @@ add('Maintaining')
 add('Marias Day')
 add('Marmaduke')
 add('McArroni')
-add('The Meaning of Lila')
+#add('The Meaning of Lila')
 add('Medium Large')
 add('Meg Classics')
 add('The Middletons')
@@ -209,7 +209,7 @@ add('Mike du Jour')
 add('Minimum Security')
 add('Moderately Confused')
 add('Molly and the Bear')
-add('Momma')
+#add('Momma')
 add('Monty')
 add('Motley Classics')
 add('Mr. Gigi and the Squid')
@@ -217,7 +217,7 @@ add('Mutt and Jeff')
 add('My Cage')
 add('MythTickle')
 add('Nancy')
-add('Nest Heads')
+#add('Nest Heads')
 add('NEUROTICA')
 add('New Adventures of Queen Victoria')
 add('Non Sequitur')
@@ -225,10 +225,10 @@ add('The Norm Classics')
 add('Nothing is Not Something')
 add('Off the Mark')
 add('Ollie and Quentin')
-add('On A Claire Day')
-add('One Big Happy')
+#add('On A Claire Day')
+#add('One Big Happy')
 add('Ordinary Bill')
-add('The Other Coast')
+#add('The Other Coast')
 add('Out of the Gene Pool Re-Runs')
 add('Over the Hedge')
 add('Overboard')
@@ -254,10 +254,10 @@ add('Reply All')
 add('Rip Haywire')
 add('Ripleys Believe It or Not')
 add('Rose is Rose')
-add('Rubes')
+#add('Rubes')
 add('Rudy Park')
 add('Savage Chickens')
-add('Scary Gary')
+#add('Scary Gary')
 add('Shirley and Son Classics')
 add('Shoe')
 add('Shoecabbage')
@@ -266,11 +266,11 @@ add('Skin Horse')
 add('Skippy')
 add('Slowpoke')
 add('Soup to Nutz')
-add('Speed Bump')
+#add('Speed Bump')
 add('Spot the Frog')
 add('Starslip')
 add('Stone Soup')
-add('Strange Brew')
+#add('Strange Brew')
 add('The Sunshine Club')
 add('Sylvia')
 add('Tank McNamara')
@@ -280,7 +280,7 @@ add('Tales of TerraTopia')
 add('That is Priceless')
 add('Thats Life')
 add('Thatababy')
-add('Thin Lines')
+#add('Thin Lines')
 add('Tiny Sepuku')
 add('TOBY')
 add('Todays Dogg')
@@ -293,12 +293,12 @@ add('Unstrange Phenomena')
 add('U.S. Acres')
 add('Viivi and Wagner')
 add('Watch Your Head')
-add('Wee Pals')
-add('Wizard of Id')
+#add('Wee Pals')
+#add('Wizard of Id')
 add('Working Daze')
-add('Working It Out')
+#add('Working It Out')
 add('W.T. Duck')
-add('Zack Hill')
+#add('Zack Hill')
 add('Ziggy')
 
 # http://www.gocomics.com/explore/editorial_list
@@ -18,9 +18,9 @@ def add(name, urls):
         name='KeenSpot/' + name,
         latestUrl=latestUrl,
         stripUrl=baseUrl + 'd/%s.html',
-        imageSearch = compile(tagre("img", "src", r'([^"]*comics/[^"]+)')),
-        prevSearch = compile(tagre("a", "href", r'"([^"]*d/\d{8}\.html)') +
-            '(?:<img[^>]+?(?:name="previous_day"|alt="Previous"|src="[^"]*back[^"]*")|Previous comic)'),
+        imageSearch = compile(tagre("img", "src", r'([^"]*/comics/[^"]+)')),
+        prevSearch = compile(tagre("a", "href", r'([^"]*/d/\d{8}\.html)') +
+            '(?:Previous comic|'+tagre("img", "alt", "Previous comic")+')'),
         help = 'Index format: yyyymmdd',
     )
@@ -11,6 +11,7 @@ from ..scraper import _BasicScraper
 class NineteenNinetySeven(_BasicScraper):
+    name = '1997'
     latestUrl = 'http://www.1977thecomic.com/'
     stripUrl = latestUrl + '%s'
     imageSearch = compile(tagre("img", "src", r'(http://www\.1977thecomic\.com/comics-1977/[^"]+)'))
     prevSearch = compile(tagre("a", "href", r'([^"]+)')+"Previous")
     help = 'Index format: yyyy/mm/dd/strip-name'
@@ -61,7 +61,7 @@ class Sheldon(_BasicScraper):
     latestUrl = 'http://www.sheldoncomics.com/'
     stripUrl = latestUrl + 'archive/%s.html'
     imageSearch = compile(tagre("img", "src", r'(/strips/[^"]+)'))
-    prevSearch = compile(tagre("a", "href", r'/archive/\d+\.html)', after="sidenav-prev"))
+    prevSearch = compile(tagre("a", "href", r'(/archive/\d+\.html)', after="sidenav-prev"))
     help = 'Index format: yymmdd'
@@ -1,280 +1,54 @@
 # -*- coding: iso-8859-1 -*-
 # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
 # Copyright (C) 2012 Bastian Kleineidam
 
-from re import compile, sub
+"""
+The Universal comics only have some samples, but those samples are always the newest ones.
+"""
+import datetime
+from re import compile, escape
 from ..scraper import make_scraper
-from ..util import fetchUrl, tagre
+from ..util import tagre, asciify, getPageContent
 
 
-def add(name, shortName):
-    homepage = 'http://content.uclick.com/a2z.html'
-    baseUrl = 'http://www.uclick.com/client/zzz/%s/'
-    latestUrl = baseUrl % shortName
-    classname = 'UClick_%s' % name
+def parse_strdate(strdate):
+    """Parse date string. XXX this is locale dependant but it should not be."""
+    return datetime.datetime.strptime(strdate, "%A, %B %d, %Y")
+
+
+def add(name, category):
+    shortname = name.replace(' ', '').lower()
+    latestUrl = 'http://www.universaluclick.com/comics/%s/%s' % (category, shortname)
+    classname = 'UClick_%s' % asciify(name)
 
     @classmethod
-    def fetchSubmodules(cls):
-        exclusions = ('index',)
-        # XXX refactor this mess
-        submoduleSearch = compile(tagre("a", "href", r'(http://content\.uclick\.com/content/\w+\.html)'))
-        partsMatch = compile(tagre("a", "href", r'http://content\.uclick\.com/content/(\w+?)\.html'))
-        matches = fetchManyMatches(cls.homepage, (submoduleSearch,))[0]
-        possibles = [partsMatch.match(match).groups() for match in matches]
-
-        def normalizeName(name):
-            name = sub(r'&(.)acute;', r'\1', name).title()
-            return ''.join([c for c in name if c.isalnum()])
-
-        def fetchSubmodule(module):
-            try:
-                return fetchUrl(cls.baseUrl % module, cls.imageSearch)
-            except Exception:
-                # XXX log error
-                return False
-
-        return [normalizeName(name) for part, name in possibles if part not in exclusions and fetchSubmodule(part)]
+    def namer(cls, imageUrl, pageUrl):
+        """Parse publish date from page content which looks like:
+        <img alt="Marmaduke" src="http://assets.amuniversal.com/07e7f270fa08012ff506001dd8b71c47" />
+        <h4>published: Sunday, November 11, 2012</h4>
+        """
+        data = getPageContent(pageUrl)[0]
+        ro = compile(tagre("img", "src", escape(imageUrl)) + r'\s+<h4>published: ([^<]+)')
+        mo = ro.search(data)
+        if mo:
+            strdate = mo.group(1)
+            return parse_strdate(strdate).strftime("%Y%m%d")
 
     globals()[classname] = make_scraper(classname,
         name='UClick/' + name,
         latestUrl = latestUrl,
-        stripUrl = latestUrl + '%s/',
-        imageSearch = compile(tagre("img", "src", r'(http://synd\.imgsrv\.uclick\.com/comics/\w+/\d{4}/[^"]+\.gif)')),
-        prevSearch = compile(tagre("a", "href", r'(/client/zzz/\w+/\d{4}/\d{2}/\d{2}/)') + 'Previous date'),
-        help = 'Index format: yyyy/mm/dd',
+        imageSearch = compile(tagre("img", "src", r'(http://assets\.amuniversal\.com/[^"]+)') + r'\s+<h4>published'),
+        multipleImagesPerStrip = True,
+        prevSearch = None,
+        help = 'Index format: none',
+        namer = namer,
     )
 
 
+# List is from http://www.universaluclick.com/comics/list
 comics = {
-    '5thWave': 'fw',
-    '9To5': 'tmntf',
-    'AdamHome': 'ad',
-    'Agnes': 'cragn',
-    'AlcarazLalo': 'la',
-    'AlcarazLaloSpanish': 'spla',
-    'AndersonNick': 'wpnan',
-    'AndyCapp': 'crcap',
-    'AnimalCrackers': 'tmani',
-    'Annie': 'tmann',
-    'AsayChuck': 'crcas',
-    'AskShagg': 'crask',
-    'AuthTony': 'ta',
-    'BadReporter': 'bad',
-    'Baldo': 'ba',
-    'BaldoSpanish': 'be',
-    'BallardStreet': 'crbal',
-    'BarkEaterLake': 'bark',
-    'BarstowDonna': 'dba',
-    'BC': 'crbc',
-    'BCSpanish': 'crbcs',
-    'BeattieBruce': 'crbbe',
-    'BennetClay': 'wpcbe',
-    'BensonLisa': 'wplbe',
-    'BensonSteve': 'crsbe',
-    'BigTop': 'bt',
-    'Biographic': 'biov',
-    'Bleeker': 'blk',
-    'BobTheSquirrel': 'bob',
-    'BoilingPoint': 'boil',
-    'BokChip': 'crcbo',
-    'BoNanas': 'bon',
-    'Boomerangs': 'boom',
-    'BoondocksThe': 'bo',
-    'BottomLiners': 'tmbot',
-    'BoundAndGagged': 'tmbou',
-    'Brainwaves': 'bwv',
-    'BreenSteve': 'crsbr',
-    'BrendaStarr': 'tmbre',
-    'BrewsterRockit': 'tmrkt',
-    'BrittChris': 'crcbr',
-    'BroomHilda': 'tmbro',
-    'Candorville': 'cand',
-    'CarlsonStuart': 'sc',
-    'CatalinoKen': 'crkca',
-    'Cathy': 'ca',
-    'CathySpanish': 'spca',
-    'CEstLaVie': 'clv',
-    'CityThe': 'derf',
-    'ClearBlueWater': 'cbw',
-    'Cleats': 'cle',
-    'CloseToHome': 'cl',
-    'CombsPaul': 'tmcmb',
-    'CompuToon': 'tmcom',
-    'Condorito': 'cond',
-    'ConradPaul': 'tmpco',
-    'Cornered': 'co',
-    'CulDeSac': 'cds',
-    'DanzigerJeff': 'jd',
-    'DaviesMatt': 'tmmda',
-    'DeepCover': 'deep',
-    'DeeringJohn': 'crjde',
-    'DickTracy': 'tmdic',
-    'DinetteSetThe': 'crdin',
-    'DogEatDoug': 'crdog',
-    'DonWright': 'tmdow',
-    'Doodles': 'tmdoo',
-    'Doonesbury': 'db',
-    'DuplexThe': 'dp',
-    'Eek': 'eek',
-    'ElderberriesThe': 'eld',
-    'FacesInTheNews': 'kw',
-    'FlightDeck': 'crfd',
-    'FloAndFriends': 'crflo',
-    'FlyingMccoysThe': 'fmc',
-    'ForBetterOrForWorse': 'fb',
-    'ForHeavenSSake': 'crfhs',
-    'FoxtrotClassics': 'ftcl',
-    'Foxtrot': 'ft',
-    'FoxtrotSpanish': 'spft',
-    'FrankAndErnest': 'fa',
-    'FredBassetSpanish': 'spfba',
-    'FredBasset': 'tmfba',
-    'FrogApplause': 'frog',
-    'FuscoBrothersThe': 'fu',
-    'Garfield': 'ga',
-    'GarfieldSpanish': 'gh',
-    'GasolineAlley': 'tmgas',
-    'GaturroSpanish': 'spgat',
-    'GilThorp': 'tmgil',
-    'GingerMeggs': 'gin',
-    'GingerMeggsSpanish': 'spgin',
-    'GirlsAndSports': 'crgis',
-    'GorrellBob': 'crbgo',
-    'GoTeamBob': 'gtb',
-    'HammondBruce': 'hb',
-    'HandelsmanWalt': 'tmwha',
-    'HeartOfTheCity': 'hc',
-    'Heathcliff': 'crhea',
-    'HeathcliffSpanish': 'crhes',
-    'HerbAndJamaal': 'crher',
-    'HigginsJack': 'jh',
-    'HomeAndAway': 'wphaa',
-    'HorseyDavid': 'tmdho',
-    'Housebroken': 'tmhou',
-    'HubertAndAbby': 'haa',
-    'IdiotBox': 'ibox',
-    'ImagineThis': 'imt',
-    'InkPen': 'ink',
-    'InTheBleachers': 'bl',
-    'ItsAllAboutYou': 'wpiay',
-    'JamesBondSpanish': 'spjb',
-    'JonesClay': 'crcjo',
-    'KallaugherKevin': 'cwkal',
-    'KChroniclesThe': 'kk',
-    'KelleySteve': 'crske',
-    'Kudzu': 'tmkud',
-    'LaCucaracha': 'lc',
-    'LegendOfBill': 'lob',
-    'LibertyMeadows': 'crlib',
-    'Lio': 'lio',
-    'LittleDogLost': 'wpldl',
-    'LocherDick': 'tmdlo',
-    'LooseParts': 'tmloo',
-    'LostSheep': 'lost',
-    'LoweChan': 'tmclo',
-    'LuckovichMike': 'crmlu',
-    'LuckyCow': 'luc',
-    'MarkstienGary': 'crgma',
-    'MarletteDoug': 'tmdma',
-    'MccoyGlenn': 'gm',
-    'MeaningOfLilaThe': 'crlil',
-    'MeehanStreak': 'tmmee',
-    'MiddletonsThe': 'tmmid',
-    'MinimumSecurity': 'ms',
-    'ModestyBlaiseSpanish': 'spmb',
-    'Momma': 'crmom',
-    'MorinJim': 'cwjmo',
-    'MuttJeffSpanish': 'spmut',
-    'MythTickle': 'myth',
-    'NAoQV': 'naqv',
-    'NaturalSelection': 'crns',
-    'NestHeads': 'cpnst',
-    'Neurotica': 'neu',
-    'NonSequitur': 'nq',
-    'OhmanJack': 'tmjoh',
-    'OliphantPat': 'po',
-    'OnAClaireDay': 'crocd',
-    'OneBigHappy': 'crobh',
-    'OtherCoastThe': 'crtoc',
-    'OutOfTheGenePool': 'wpgen',
-    'Overboard': 'ob',
-    'OverboardSpanish': 'spob',
-    'PepeSpanish': 'sppep',
-    'PettJoel': 'jp',
-    'Pibgorn': 'pib',
-    'Pickles': 'wppic',
-    'Pluggers': 'tmplu',
-    'PoochCafe': 'poc',
-    'PoochCafeSpanish': 'sppoc',
-    'PopCulture': 'pop',
-    'PowellDwane': 'crdpo',
-    'Preteena': 'pr',
-    'PricklyCity': 'prc',
-    'QuigmansThe': 'tmqui',
-    'RallComic': 'tr',
-    'RamirezMicheal': 'crmrm',
-    'RamseyMarshall': 'crmra',
-    'RealLifeAdventures': 'rl',
-    'RedAndRover': 'wpred',
-    'RedMeat': 'red',
-    'ReynoldsUnwrapped': 'rw',
-    'RonaldinhoGaucho': 'ron',
-    'RonaldinhoGauchoSpanish': 'spron',
-    'Rubes': 'crrub',
-    'SackSteve': 'tmssa',
-    'SargentBen': 'bs',
-    'SargentBenSpanish': 'spbs',
-    'SendHelp': 'send',
-    'ShenemanDrew': 'tmdsh',
-    'SherffiusDrew': 'crjsh',
-    'Shoecabbage': 'shcab',
-    'Shoe': 'tmsho',
-    'SigmundSpanish': 'spsig',
-    'Slowpoke': 'slow',
-    'SmallWorld': 'small',
-    'SpaceIsThePlace': 'sitp',
-    'SpeedBump': 'crspe',
-    'StanisScott': 'crsst',
-    'StateOfTheUnion': 'crsou',
-    'StayskalWayne': 'tmwst',
-    'StoneSoup': 'ss',
-    'StrangeBrew': 'crstr',
-    'SummersDana': 'tmdsu',
-    'SuttonImpact': 'stn',
-    'Sylvia': 'tmsyl',
-    'SzepPaul': 'crpsz',
-    'TankMcnamara': 'tm',
-    'TeenageMutantNinjaTurtles': 'tmnt',
-    'TelnaesAnn': 'tmate',
-    'TheArgyleSweater': 'tas',
-    'ThePinkPanther': 'tmpnk',
-    'TheWizardOfId': 'crwiz',
-    'TheWizardOfIdSpanish': 'crwis',
-    'ThInk': 'think',
-    'ThompsonMike': 'crmth',
-    'ThroughThickAndThin': 'cpthk',
-    'TinySepuku': 'tiny',
-    'Toby': 'toby',
-    'TolesTom': 'tt',
-    'TomTheDancingBug': 'td',
-    'TooMuchCoffeeMan': 'tmcm',
-    'Trevor': 'trev',
-    'TutelandiaSpanish': 'sptut',
-    'VarvelGary': 'crgva',
-    'WassermanDan': 'tmdwa',
-    'WatchYourHead': 'wpwyh',
-    'Waylay': 'min',
-    'WeePals': 'crwee',
-    'WinnieThePooh': 'crwin',
-    'WitOfTheWorld': 'cwwit',
-    'WorkingItOut': 'crwio',
-    'WriteDon': 'tmdow',
-    'YennySpanish': 'spyen',
-    'Yenny': 'yen',
-    'ZackHill': 'crzhi',
-    'ZiggySpanish': 'spzi',
-    'Ziggy': 'zi',
+    '9 Chickweed Lane': 'strip',
 }
 
-for name, shortname in comics.items():
-    add(name, shortname)
+for name, category in comics.items():
+    add(name, category)
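The namer above recovers the publish date from the page text and reformats it with parse_strdate(). Round-tripping the example date from the docstring (standalone check of the code shown in the hunk):

    import datetime

    def parse_strdate(strdate):
        # parse e.g. "Sunday, November 11, 2012"; locale dependent, as the diff notes
        return datetime.datetime.strptime(strdate, "%A, %B %d, %Y")

    print(parse_strdate("Sunday, November 11, 2012").strftime("%Y%m%d"))  # 20121111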
@@ -2,6 +2,7 @@
 # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
 # Copyright (C) 2012 Bastian Kleineidam
 
 from re import compile
 from ..scraper import _BasicScraper
+from ..util import tagre
 
 
@@ -22,6 +22,9 @@ class _BasicScraper(object):
     @cvar prevSearch: A compiled regex that will locate the URL for the
     previous strip when applied to a strip page.
     '''
+    # if more than one image per URL is expected
+    multipleImagesPerStrip = False
+    # usually the index format help
     help = 'Sorry, no help for this comic yet.'
 
     def __init__(self, indexes=None):
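For context, a complete plugin using these class attributes looks like the sketch below (a hypothetical comic site, modeled on the Sheldon plugin above; URLs and patterns are examples only):

    from re import compile
    from dosagelib.scraper import _BasicScraper
    from dosagelib.util import tagre

    class ExampleComic(_BasicScraper):
        latestUrl = 'http://www.example.com/'
        stripUrl = latestUrl + 'archive/%s.html'
        imageSearch = compile(tagre("img", "src", r'(/strips/[^"]+)'))
        prevSearch = compile(tagre("a", "href", r'(/archive/\d+\.html)', after="prev"))
        multipleImagesPerStrip = False
        help = 'Index format: yymmdd'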
@@ -44,7 +47,9 @@ class _BasicScraper(object):
 
     def getStrip(self, url):
         """Get comic strip for given URL."""
-        imageUrls = fetchUrls(url, self.imageSearch)
+        imageUrls = fetchUrls(url, self.imageSearch)[0]
+        if len(imageUrls) > 1 and not self.multipleImagesPerStrip:
+            raise ValueError("found %d images with %s" % (len(imageUrls), self.imageSearch.pattern))
         return self.getComicStrip(url, imageUrls)
 
     def getComicStrip(self, url, imageUrls):
@@ -140,11 +145,13 @@ def get_scrapers():
     """
     global _scrapers
     if _scrapers is None:
         out.write("Loading comic modules...")
         modules = loader.get_modules()
         plugins = loader.get_plugins(modules, _BasicScraper)
         _scrapers = list(plugins)
         _scrapers.sort(key=lambda s: s.get_name())
+        check_scrapers()
+        out.write("... %d modules loaded." % len(_scrapers))
     return _scrapers
 
 
@@ -4,6 +4,7 @@
 from __future__ import division, print_function
 
 import urllib2, urlparse
+import requests
 import sys
 import os
 import cgi
@@ -42,10 +43,6 @@ def tagre(tag, attribute, value, quote='"', before="", after=""):
     @return: the generated regular expression suitable for re.compile()
     @rtype: string
     """
-    if before:
-        before += "[^>]*"
-    if after:
-        after += "[^>]*"
     attrs = dict(
         tag=case_insensitive_re(tag),
         attribute=case_insensitive_re(attribute),
@@ -54,7 +51,7 @@ def tagre(tag, attribute, value, quote='"', before="", after=""):
         before=before,
         after=after,
     )
-    return r'<\s*%(tag)s\s+(?:[^>]*%(before)s\s+)?%(attribute)s\s*=\s*%(quote)s%(value)s%(quote)s[^>]*%(after)s>' % attrs
+    return r'<\s*%(tag)s\s+(?:[^>]*%(before)s[^>]*\s+)?%(attribute)s\s*=\s*%(quote)s%(value)s%(quote)s[^>]*%(after)s[^>]*>' % attrs
 
 
 def case_insensitive_re(name):
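To see what the widened template buys, here is a self-contained sketch of tagre(); case_insensitive_re() is not shown in the diff, so its re-implementation here is an assumption:

    import re

    def case_insensitive_re(name):
        # assumed behaviour: "img" -> "[iI][mM][gG]"
        return "".join("[%s%s]" % (c.lower(), c.upper()) if c.isalpha() else c
                       for c in name)

    def tagre(tag, attribute, value, quote='"', before="", after=""):
        attrs = dict(tag=case_insensitive_re(tag),
                     attribute=case_insensitive_re(attribute),
                     value=value, quote=quote, before=before, after=after)
        return (r'<\s*%(tag)s\s+(?:[^>]*%(before)s[^>]*\s+)?'
                r'%(attribute)s\s*=\s*%(quote)s%(value)s%(quote)s'
                r'[^>]*%(after)s[^>]*>' % attrs)

    search = re.compile(tagre("img", "src", r'([^"]+)'))
    mo = search.search('<IMG class="comic" SRC="/strips/20121111.png">')
    print(mo.group(1))  # /strips/20121111.png

The extra [^>]* around before and after lets those markers sit anywhere among the tag's other attributes, which is what the changelog means by more robust, case-insensitive HTML tag expressions.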
@@ -74,7 +71,7 @@ baseSearch = re.compile(tagre("base", "href", '([^"]*)'))
 def getPageContent(url):
     # read page data
     page = urlopen(url)
-    data = page.read(MAX_FILESIZE)
+    data = page.text
     # determine base URL
     baseUrl = None
     match = baseSearch.search(data)
@@ -105,7 +102,7 @@ def fetchUrls(url, imageSearch, prevSearch=None):
         imageUrl = match.group(1)
         if not imageUrl:
             raise ValueError("Match empty image URL at %s with pattern %s" % (url, imageSearch.pattern))
-        out.write('matched image URL %r' % imageUrl, 2)
+        out.write('matched image URL %r with pattern %s' % (imageUrl, imageSearch.pattern), 2)
         imageUrls.add(normaliseURL(urlparse.urljoin(baseUrl, imageUrl)))
     if not imageUrls:
         out.write("warning: no images found at %s with pattern %s" % (url, imageSearch.pattern))
@@ -178,21 +175,17 @@ def urlopen(url, referrer=None, retries=3, retry_wait_seconds=5):
     out.write('Open URL %s' % url, 2)
     assert retries >= 0, 'invalid retry value %r' % retries
     assert retry_wait_seconds > 0, 'invalid retry seconds value %r' % retry_wait_seconds
-    req = urllib2.Request(url)
+    headers = {'User-Agent': UserAgent}
+    config = {"max_retries": retries}
     if referrer:
-        req.add_header('Referer', referrer)
-    req.add_header('User-Agent', UserAgent)
-    tries = 0
-    while True:
+        headers['Referer'] = referrer
     try:
-        return urllib2.urlopen(req)
-    except IOError as err:
+        req = requests.get(url, headers=headers, config=config)
+        req.raise_for_status()
+        return req
+    except requests.exceptions.RequestException as err:
         msg = 'URL retrieval of %s failed: %s' % (url, err)
         out.write(msg)
-        out.write('waiting %d seconds and retrying (%d)' % (retry_wait_seconds, tries), 2)
-        time.sleep(retry_wait_seconds)
-        tries += 1
-        if tries >= retries:
-            raise IOError(msg)
+        raise IOError(msg)
 
 
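The rewritten urlopen() returns a requests Response rather than a urllib2 file object, which is why getPageContent() now reads .text and comic.py writes .content. A minimal usage sketch (plain requests.get(); the config={"max_retries": ...} keyword in the diff belongs to the requests 0.x API of the time):

    import requests

    headers = {'User-Agent': 'Dosage', 'Referer': 'http://www.example.com/'}  # example values
    req = requests.get('http://www.example.com/', headers=headers)
    req.raise_for_status()  # turn HTTP error codes into exceptions
    html = req.text         # decoded text, as used by getPageContent()
    raw = req.content       # raw bytes, as written to the image file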
@@ -259,11 +252,9 @@ def internal_error(out=sys.stderr, etype=None, evalue=None, tb=None):
     print("""********** Oops, I did it again. *************
 
 You have found an internal error in %(app)s. Please write a bug report
-at %(url)s and include the following information:
-- your commandline arguments and any configuration file in ~/.dosage/
-- the system information below
+at %(url)s and include at least the information below:
 
-Not disclosing some of the information above due to privacy reasons is ok.
+Not disclosing some of the information below due to privacy reasons is ok.
 I will try to help you nonetheless, but you have to give me something
 I can work with ;) .
 """ % dict(app=AppName, url=SupportUrl), file=out)
|
@ -308,6 +299,7 @@ def print_app_info(out=sys.stderr):
|
|||
{"version": sys.version, "platform": sys.platform}, file=out)
|
||||
stime = strtime(time.time())
|
||||
print("Local time:", stime, file=out)
|
||||
print("sys.argv", sys.argv, file=out)
|
||||
|
||||
|
||||
def strtime(t):
|
||||
|
|
requirements.txt (new file)
@@ -0,0 +1,2 @@
+requests
+
@@ -4,6 +4,7 @@
 import tempfile
 import shutil
 import re
+import os
 from itertools import islice
 from unittest import TestCase
 from dosagelib import scraper
@@ -16,6 +17,16 @@ class _ComicTester(TestCase):
     def setUp(self):
         self.name = self.scraperclass.get_name()
         self.url = self.scraperclass.starter()
+        # create a temporary directory for images
+        self.tmpdir = tempfile.mkdtemp()
+
+    def tearDown(self):
+        shutil.rmtree(self.tmpdir)
+
+    def get_saved_images(self):
+        """Get saved images."""
+        dirs = tuple(self.name.split('/'))
+        return os.listdir(os.path.join(self.tmpdir, *dirs))
 
     def test_comic(self):
         # Test a scraper. It must be able to traverse backward for
@@ -23,7 +34,8 @@ class _ComicTester(TestCase):
         # on at least 4 pages.
         scraperobj = self.scraperclass()
         num = empty = 0
-        for strip in islice(scraperobj.getAllStrips(), 0, 5):
+        max_strips = 5
+        for strip in islice(scraperobj.getAllStrips(), 0, max_strips):
             images = 0
             for image in strip.getImages():
                 images += 1
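islice() caps the otherwise open-ended strip traversal at max_strips pages. The same pattern standalone (a hypothetical generator standing in for getAllStrips()):

    from itertools import islice

    def strips():
        n = 0
        while True:       # unbounded, like walking a comic archive backwards
            n += 1
            yield 'strip-%d' % n

    max_strips = 5
    print(list(islice(strips(), 0, max_strips)))
    # ['strip-1', 'strip-2', 'strip-3', 'strip-4', 'strip-5']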
@@ -35,6 +47,15 @@ class _ComicTester(TestCase):
             num += 1
         if self.scraperclass.prevSearch:
             self.check(num >= 4, 'traversal failed after %d strips, check the prevSearch pattern.' % num)
+            # check that exactly max_strips images are saved, or at least that
+            # many when a strip can contain multiple images
+            saved_images = self.get_saved_images()
+            num_images = len(saved_images)
+            if self.scraperclass.multipleImagesPerStrip:
+                self.check(num_images >= max_strips,
+                    'saved %d %s instead of at least %d images in %s' % (num_images, saved_images, max_strips, self.tmpdir))
+            else:
+                self.check(num_images == max_strips,
+                    'saved %d %s instead of %d images in %s' % (num_images, saved_images, max_strips, self.tmpdir))
         self.check(empty == 0, 'failed to find images on %d pages, check the imageSearch pattern.' % empty)
 
     def check_stripurl(self, strip):
@@ -50,28 +71,28 @@ class _ComicTester(TestCase):
         self.check(mo is not None, 'strip URL %r does not match stripUrl pattern %s' % (strip.stripUrl, urlmatch))
 
     def save(self, image):
-        # create a temporary directory
-        tmpdir = tempfile.mkdtemp()
         try:
-            image.save(tmpdir)
+            image.save(self.tmpdir)
         except Exception as msg:
-            self.check(False, 'could not save %s to %s: %s' % (image.url, tmpdir, msg))
-        finally:
-            shutil.rmtree(tmpdir)
+            self.check(False, 'could not save %s to %s: %s' % (image.url, self.tmpdir, msg))
 
     def check(self, condition, msg):
         self.assertTrue(condition, "%s %s %s" % (self.name, self.url, msg))
 
 
+def make_comic_tester(name, **kwargs):
+    """Create and return a _ComicTester class with given name and attributes."""
+    return type(name, (_ComicTester,), kwargs)
+
+
 def generate_comic_testers():
     """For each comic scraper, create a test class."""
+    g = globals()
     # Limit number of scraper tests for now
-    max_scrapers = 100
+    max_scrapers = 10000
     for scraperclass in islice(scraper.get_scrapers(), 0, max_scrapers):
         name = 'Test'+scraperclass.__name__
-        globals()[name] = type(name,
-            (_ComicTester,),
-            dict(scraperclass=scraperclass)
-        )
+        g[name] = make_comic_tester(name, scraperclass=scraperclass)
 
 
 generate_comic_testers()
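make_comic_tester() manufactures one unittest class per scraper at import time via type(). The pattern in isolation (generic TestCase instead of _ComicTester; names are examples):

    from unittest import TestCase

    def make_tester(name, **kwargs):
        # build a TestCase subclass named `name` with the given attributes
        return type(name, (TestCase,), kwargs)

    TestExample = make_tester('TestExample', scrapername='Example')
    print(TestExample.__name__, TestExample.scrapername)  # TestExample Example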