Fix comics, improve tests, use python-requests.

parent d4eee7719d
commit 0556ffd30a

16 changed files with 191 additions and 403 deletions
@@ -40,10 +40,11 @@ manual page.
 Dependencies
 -------------
-Dosage requires Python version 2.7 or higher, which can be downloaded
-from http://www.python.org.
-No external Python modules are required - only the Python Standard Library
-that gets installed with Python.
+Python version 2.7 or higher, which can be downloaded
+from http://www.python.org/
+Also the python-requests module must be installed, which can be downloaded
+from http://docs.python-requests.org/en/latest/
 
 Installation
 -------------
@@ -59,7 +60,7 @@ or if you do not have root permissions:
 
 Technical Description
 ----------------------
-Dosage is written entirely in Python and relies on regular expressions to
+Dosage is written in Python and relies on regular expressions to
 do most of the grunt work.
 
 For each webcomic Dosage has a plugin module, found in the "plugins"
@@ -4,6 +4,7 @@ Features:
 - cmdline: Added proper return codes for error conditions.
 - comics: Added more robust regular expressions for HTML tags.
   They match case insensitive and ignore whitespaces now.
+- comics: Use the python-requests module for HTTP requests.
 
 Changes:
 - installation: Added support for dynamic configuration values.
@@ -2,7 +2,6 @@
 # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
 # Copyright (C) 2012 Bastian Kleineidam
 
-import urllib2
 import os
 import locale
 import rfc822
@@ -55,18 +54,24 @@ class ComicImage(object):
         """Connect to host and get meta information."""
         try:
             self.urlobj = urlopen(self.url, referrer=self.referrer)
-        except urllib2.HTTPError as he:
+        except IOError as he:
             raise FetchComicError('Unable to retrieve URL.', self.url, he.code)
 
-        if self.urlobj.info().getmaintype() != 'image' and \
-           self.urlobj.info().gettype() not in ('application/octet-stream', 'application/x-shockwave-flash'):
+        content_type = self.urlobj.headers.get('content-type')
+        content_type = content_type.split(';', 1)[0]
+        if '/' in content_type:
+            maintype, subtype = content_type.split('/', 1)
+        else:
+            maintype = content_type
+            subtype = None
+        if maintype != 'image' and content_type not in ('application/octet-stream', 'application/x-shockwave-flash'):
             raise FetchComicError('No suitable image found to retrieve.', self.url)
 
         # Always use mime type for file extension if it is sane.
-        if self.urlobj.info().getmaintype() == 'image':
-            self.ext = '.' + self.urlobj.info().getsubtype().replace('jpeg', 'jpg')
+        if maintype == 'image':
+            self.ext = '.' + subtype.replace('jpeg', 'jpg')
-        self.contentLength = int(self.urlobj.info().get('content-length', 0))
+        self.contentLength = int(self.urlobj.headers.get('content-length', 0))
-        self.lastModified = self.urlobj.info().get('last-modified')
+        self.lastModified = self.urlobj.headers.get('last-modified')
         out.write('... filename = %r, ext = %r, contentLength = %d' % (self.filename, self.ext, self.contentLength), 2)
 
     def touch(self, filename):
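For illustration, the new content-type handling as a self-contained sketch (the sample header values are invented; this mirrors the committed logic rather than importing it):

    def classify(content_type):
        # Strip parameters such as '; charset=...' and split maintype/subtype.
        content_type = content_type.split(';', 1)[0]
        if '/' in content_type:
            maintype, subtype = content_type.split('/', 1)
        else:
            maintype, subtype = content_type, None
        return maintype, subtype

    # The downloader accepts maintype 'image' plus two whitelisted binary types.
    assert classify('image/jpeg; charset=utf-8') == ('image', 'jpeg')
    assert classify('application/octet-stream') == ('application', 'octet-stream')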
@@ -88,7 +93,6 @@ class ComicImage(object):
 
         fn = os.path.join(comicDir, filename)
         if os.path.isfile(fn) and os.path.getsize(fn) >= comicSize:
-            self.urlobj.close()
             self.touch(fn)
             out.write('Skipping existing file "%s".' % (fn,), 1)
             return fn, False
@@ -97,7 +101,7 @@ class ComicImage(object):
             out.write('Writing comic to file %s...' % (fn,), 3)
             with open(fn, 'wb') as comicOut:
                 startTime = time.time()
-                comicOut.write(self.urlobj.read())
+                comicOut.write(self.urlobj.content)
                 endTime = time.time()
             self.touch(fn)
         except:
@@ -114,7 +118,5 @@ class ComicImage(object):
             attrs = dict(fn=fn, bytes=bytes, speed=speed)
             out.write('Saved "%(fn)s" (%(bytes)s bytes, %(speed)s/sec).' % attrs, 1)
             getHandler().comicDownloaded(self.name, fn)
-        finally:
-            self.urlobj.close()
 
         return fn, True
@@ -13,7 +13,7 @@ from ..util import tagre, getQueryParams
 class DMFA(_BasicScraper):
     latestUrl = 'http://www.missmab.com/'
     stripUrl = latestUrl + 'Comics/Vol_%s.php'
-    imageSearch = compile(tagre("img", "src", r'(Comics/|Vol)[^"]+)'))
+    imageSearch = compile(tagre("img", "src", r'((?:Comics/|Vol)[^"]+)'))
     prevSearch = compile(tagre("a", "href", r'([^"])+')+
                          tagre("img", "src", r'(?:../)?Images/comicprev.gif'))
     help = 'Index format: nnn (normally, some specials)'
@@ -4,22 +4,29 @@
 
 from re import compile
 from ..scraper import make_scraper
-from ..helpers import bounceStarter, queryNamer
+from ..helpers import bounceStarter
 from ..util import tagre
 
 
 def add(name):
     classname = 'DrunkDuck_%s' % name
     url = 'http://www.drunkduck.com/%s/' % name
-    linkSearch = tagre("a", "href", r"(/[^/]*/index\.php\?p=\d+)", quote="'", after="The %s page")
+    linkSearch = tagre("a", "href", r"(/%s/\d+/)" % name)
 
+    @classmethod
+    def namer(cls, imageUrl, pageUrl):
+        index = int(pageUrl.rstrip('/').split('/')[-1])
+        ext = imageUrl.rsplit('.')[-1]
+        return '%d.%s' % (index, ext)
+
     globals()[classname] = make_scraper(classname,
         name = 'DrunkDuck/' + name,
-        starter = bounceStarter(url, compile(linkSearch % 'next')),
+        starter = bounceStarter(url, compile(linkSearch + tagre("img", "class", "arrow_next"))),
-        stripUrl = url + 'index.php?p=%s' % name,
+        stripUrl = url + '%s/',
-        imageSearch = compile(tagre("img", "src", r"(http://[a-z0-9]*\.drunkduck\.com/[^/]*/pages/[^'/]+)", quote="'")),
+        imageSearch = compile(tagre("img", "src", r'(http://media\.drunkduck\.com\.s3\.amazonaws\.com:80/[^"]+)', before="page-image")),
-        prevSearch= compile(linkSearch % 'previous'),
+        prevSearch= compile(linkSearch + tagre("img", "class", "arrow_prev")),
         help = 'Index format: n (unpadded)',
-        namer = queryNamer('p', usePageUrl=True),
+        namer = namer,
     )
 
 comics = (
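For illustration, how the new DrunkDuck namer derives a filename from the site's /comicname/number/ URL scheme (the comic name and URLs below are invented):

    pageUrl = 'http://www.drunkduck.com/Example_Comic/123/'
    imageUrl = 'http://media.drunkduck.com.s3.amazonaws.com:80/users/pages/strip.jpg'
    index = int(pageUrl.rstrip('/').split('/')[-1])  # -> 123
    ext = imageUrl.rsplit('.')[-1]                   # -> 'jpg'
    print('%d.%s' % (index, ext))                    # prints 123.jpg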
@@ -1,47 +1,26 @@
 # -*- coding: iso-8859-1 -*-
 # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
 # Copyright (C) 2012 Bastian Kleineidam
-from ..scraper import _BasicScraper
+from re import compile
+from ..scraper import make_scraper
+from ..util import asciify
 
 
-def fallenangel(name, shortname):
-    pass # XXX
-
-class _TheFallenAngel(_BasicScraper):
-    imageSearch = compile(r'SRC="(http://www.thefallenangel.co.uk/\w+comics/.+?)"')
-    prevSearch = compile(r' <a href="(http://www.thefallenangel.co.uk/.+?)"><img[^>]+?src="http://www.thefallenangel.co.uk/images/previousday.jpg"')
-    help = 'Index format: yyyymmdd'
-
-    @property
-    def baseUrl(self):
-        return 'http://www.thefallenangel.co.uk/cgi-bin/%sautokeen/autokeenlite.cgi' % (self.shortName,)
-
-    @property
-    def stripUrl(self):
-        return self.baseUrl + '?date=%s'
-
-    def starter(self):
-        return self.baseUrl
-
-
-class HighMaintenance(_TheFallenAngel):
-    name = 'TheFallenAngel/HighMaintenance'
-    shortName = 'hm'
-
-
-class FAWK(_TheFallenAngel):
-    name = 'TheFallenAngel/FAWK'
-    shortName = 'fawk'
-
-
-class MalloryChan(_TheFallenAngel):
-    name = 'TheFallenAngel/MalloryChan'
-    shortName = 'mallorychan'
+def add(name, shortname):
+    latestUrl = 'http://www.thefallenangel.co.uk/cgi-bin/%sautokeen/autokeenlite.cgi' % shortname
+    classname = asciify(name)
+    globals()[classname] = make_scraper(classname,
+        latestUrl = latestUrl,
+        stripUrl = latestUrl + '?date=%s',
+        name='FallenAngel/' + name,
+        imageSearch = compile(r'SRC="(http://www.thefallenangel.co.uk/\w+comics/.+?)"'),
+        prevSearch = compile(r' <a href="(http://www.thefallenangel.co.uk/.+?)"><img[^>]+?src="http://www.thefallenangel.co.uk/images/previousday.jpg"'),
+        help = 'Index format: yyyymmdd',
+    )
+
+
+add('HighMaintenance', 'hm')
+add('FAWK', 'fawk')
+add('MalloryChan', 'mallorychan')
@@ -29,30 +29,30 @@ def add(name, repl=''):
 
 
 # http://www.gocomics.com/features
-# note that comics from creators.com are not repeated here
+# Duplicate comics from creators.com are commented out
 add('2 Cows and a Chicken')
 add('9 Chickweed Lane')
 add('9 to 5')
 add('The Academia Waltz')
 add('Adam at Home')
-add('Agnes')
+#add('Agnes')
 add('Alley Oop', repl='-')
 add('Andertoons')
-add('Andy Capp')
+#add('Andy Capp')
 add('Angry Little Girls', repl='-')
 add('Animal Crackers')
 add('Annie')
 add('The Argyle Sweater')
 add('Arlo and Janis')
-add('Ask Shagg')
+#add('Ask Shagg')
-add('BC')
+#add('BC')
 add('Back in the Day')
 add('Bad Reporter')
 add('Baldo')
-add('Ballard Street')
+#add('Ballard Street')
 add('Banana Triangle', repl='-')
 add('Barkeater Lake')
-add('The Barn')
+#add('The Barn')
 add('Barney and Clyde')
 add('Basic Instructions')
 add('Beardo')

@@ -81,13 +81,13 @@ add('Brewster Rockit')
 add('Broom Hilda')
 add('The Buckets')
 add('Buni')
-add('Cafe con Leche')
+#add('Cafe con Leche')
 add('Calvin and Hobbes')
 add('Candorville')
 add('Cathy')
 add('Cest la Vie')
 add('Cheap Thrills Cuisine', repl='-')
-add('Chuckle Bros')
+#add('Chuckle Bros')
 add('Citizen Dog')
 add('The City')
 add('Cleats')

@@ -99,15 +99,15 @@ add('Cow and Boy')
 add('CowTown')
 add('Crumb')
 add('Cul de Sac')
-add('Daddys Home')
+#add('Daddys Home')
 add('Dark Side of the Horse')
 add('Deep Cover')
-add('Diamond Lil')
+#add('Diamond Lil')
 add('Dick Tracy')
-add('The Dinette Set')
+#add('The Dinette Set')
 add('Dixie Drive', repl='-')
-add('Dog Eat Doug')
+#add('Dog Eat Doug')
-add('Dogs of C Kennel')
+#add('Dogs of C Kennel')
 add('Domestic Abuse')
 add('Doonesbury')
 add('The Doozies')

@@ -122,18 +122,18 @@ add('F Minus')
 add('Family Tree')
 add('Farcus')
 add('Fat Cats', repl='-')
-add('Flo and Friends')
+#add('Flo and Friends')
 add('The Flying McCoys')
 add('Foolish Mortals', repl='-')
 add('For Better or For Worse')
-add('For Heavens Sake')
+#add('For Heavens Sake')
 add('Fort Knox')
 add('FoxTrot')
 add('FoxTrot Classics')
 add('Frank and Ernest')
 add('Frazz')
 add('Fred Basset')
-add('Free Range')
+#add('Free Range')
 add('Freshly Squeezed')
 add('Frog Applause')
 add('The Fusco Brothers')

@@ -154,9 +154,9 @@ add('Haiku Ewe')
 add('Ham Shears')
 add('Health Capsules')
 add('Heart of the City')
-add('Heathcliff')
+#add('Heathcliff')
 add('Heavenly Nostrils')
-add('Herb and Jamaal')
+#add('Herb and Jamaal')
 add('Herman')
 add('Home and Away')
 add('HUBRIS!')

@@ -184,7 +184,7 @@ add('La Cucaracha')
 add('Last Kiss')
 add('The LeftyBosco Picture Show')
 add('Legend of Bill')
-add('Liberty Meadows')
+#add('Liberty Meadows')
 add('Lil Abner')
 add('Lio')
 add('Little Dog Lost')

@@ -201,7 +201,7 @@ add('Maintaining')
 add('Marias Day')
 add('Marmaduke')
 add('McArroni')
-add('The Meaning of Lila')
+#add('The Meaning of Lila')
 add('Medium Large')
 add('Meg Classics')
 add('The Middletons')

@@ -209,7 +209,7 @@ add('Mike du Jour')
 add('Minimum Security')
 add('Moderately Confused')
 add('Molly and the Bear')
-add('Momma')
+#add('Momma')
 add('Monty')
 add('Motley Classics')
 add('Mr. Gigi and the Squid')

@@ -217,7 +217,7 @@ add('Mutt and Jeff')
 add('My Cage')
 add('MythTickle')
 add('Nancy')
-add('Nest Heads')
+#add('Nest Heads')
 add('NEUROTICA')
 add('New Adventures of Queen Victoria')
 add('Non Sequitur')

@@ -225,10 +225,10 @@ add('The Norm Classics')
 add('Nothing is Not Something')
 add('Off the Mark')
 add('Ollie and Quentin')
-add('On A Claire Day')
+#add('On A Claire Day')
-add('One Big Happy')
+#add('One Big Happy')
 add('Ordinary Bill')
-add('The Other Coast')
+#add('The Other Coast')
 add('Out of the Gene Pool Re-Runs')
 add('Over the Hedge')
 add('Overboard')

@@ -254,10 +254,10 @@ add('Reply All')
 add('Rip Haywire')
 add('Ripleys Believe It or Not')
 add('Rose is Rose')
-add('Rubes')
+#add('Rubes')
 add('Rudy Park')
 add('Savage Chickens')
-add('Scary Gary')
+#add('Scary Gary')
 add('Shirley and Son Classics')
 add('Shoe')
 add('Shoecabbage')

@@ -266,11 +266,11 @@ add('Skin Horse')
 add('Skippy')
 add('Slowpoke')
 add('Soup to Nutz')
-add('Speed Bump')
+#add('Speed Bump')
 add('Spot the Frog')
 add('Starslip')
 add('Stone Soup')
-add('Strange Brew')
+#add('Strange Brew')
 add('The Sunshine Club')
 add('Sylvia')
 add('Tank McNamara')

@@ -280,7 +280,7 @@ add('Tales of TerraTopia')
 add('That is Priceless')
 add('Thats Life')
 add('Thatababy')
-add('Thin Lines')
+#add('Thin Lines')
 add('Tiny Sepuku')
 add('TOBY')
 add('Todays Dogg')

@@ -293,12 +293,12 @@ add('Unstrange Phenomena')
 add('U.S. Acres')
 add('Viivi and Wagner')
 add('Watch Your Head')
-add('Wee Pals')
+#add('Wee Pals')
-add('Wizard of Id')
+#add('Wizard of Id')
 add('Working Daze')
-add('Working It Out')
+#add('Working It Out')
 add('W.T. Duck')
-add('Zack Hill')
+#add('Zack Hill')
 add('Ziggy')
 
 # http://www.gocomics.com/explore/editorial_list
@@ -18,9 +18,9 @@ def add(name, urls):
         name='KeenSpot/' + name,
         latestUrl=latestUrl,
         stripUrl=baseUrl + 'd/%s.html',
-        imageSearch = compile(tagre("img", "src", r'([^"]*comics/[^"]+)')),
+        imageSearch = compile(tagre("img", "src", r'([^"]*/comics/[^"]+)')),
-        prevSearch = compile(tagre("a", "href", r'"([^"]*d/\d{8}\.html)') +
-                             '(?:<img[^>]+?(?:name="previous_day"|alt="Previous"|src="[^"]*back[^"]*")|Previous comic)'),
+        prevSearch = compile(tagre("a", "href", r'([^"]*/d/\d{8}\.html)') +
+                             '(?:Previous comic|'+tagre("img", "alt", "Previous comic")+')'),
         help = 'Index format: yyyymmdd',
     )
@@ -11,6 +11,7 @@ from ..scraper import _BasicScraper
 class NineteenNinetySeven(_BasicScraper):
     name = '1997'
     latestUrl = 'http://www.1977thecomic.com/'
+    stripUrl = latestUrl + '%s'
     imageSearch = compile(tagre("img", "src", r'(http://www\.1977thecomic\.com/comics-1977/[^"]+)'))
     prevSearch = compile(tagre("a", "href", r'([^"]+)')+"Previous")
     help = 'Index format: yyyy/mm/dd/strip-name'
@@ -61,7 +61,7 @@ class Sheldon(_BasicScraper):
     latestUrl = 'http://www.sheldoncomics.com/'
     stripUrl = latestUrl + 'archive/%s.html'
     imageSearch = compile(tagre("img", "src", r'(/strips/[^"]+)'))
-    prevSearch = compile(tagre("a", "href", r'/archive/\d+\.html)', after="sidenav-prev"))
+    prevSearch = compile(tagre("a", "href", r'(/archive/\d+\.html)', after="sidenav-prev"))
     help = 'Index format: yymmdd'
@@ -1,280 +1,54 @@
 # -*- coding: iso-8859-1 -*-
 # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
 # Copyright (C) 2012 Bastian Kleineidam
-from re import compile, sub
+"""
+The Universal comics only have some samples, but those samples are always the newest ones.
+"""
+import datetime
+from re import compile, escape
 from ..scraper import make_scraper
-from ..util import fetchUrl, tagre
+from ..util import tagre, asciify, getPageContent
 
 
-def add(name, shortName):
-    homepage = 'http://content.uclick.com/a2z.html'
-    baseUrl = 'http://www.uclick.com/client/zzz/%s/'
-    latestUrl = baseUrl % shortName
-    classname = 'UClick_%s' % name
+def parse_strdate(strdate):
+    """Parse date string. XXX this is locale dependant but it should not be."""
+    return datetime.datetime.strptime(strdate, "%A, %B %d, %Y")
+
+
+def add(name, category):
+    shortname = name.replace(' ', '').lower()
+    latestUrl = 'http://www.universaluclick.com/comics/%s/%s' % (category, shortname)
+    classname = 'UClick_%s' % asciify(name)
 
     @classmethod
-    def fetchSubmodules(cls):
-        exclusions = ('index',)
-        # XXX refactor this mess
-        submoduleSearch = compile(tagre("a", "href", r'(http://content\.uclick\.com/content/\w+\.html)'))
-        partsMatch = compile(tagre("a", "href", r'http://content\.uclick\.com/content/(\w+?)\.html'))
-        matches = fetchManyMatches(cls.homepage, (submoduleSearch,))[0]
-        possibles = [partsMatch.match(match).groups() for match in matches]
-
-        def normalizeName(name):
-            name = sub(r'&(.)acute;', r'\1', name).title()
-            return ''.join([c for c in name if c.isalnum()])
-
-        def fetchSubmodule(module):
-            try:
-                return fetchUrl(cls.baseUrl % module, cls.imageSearch)
-            except Exception:
-                # XXX log error
-                return False
-
-        return [normalizeName(name) for part, name in possibles if part not in exclusions and fetchSubmodule(part)]
+    def namer(cls, imageUrl, pageUrl):
+        """Parse publish date from page content which looks like:
+        <img alt="Marmaduke" src="http://assets.amuniversal.com/07e7f270fa08012ff506001dd8b71c47" />
+        <h4>published: Sunday, November 11, 2012</h4>
+        """
+        data = getPageContent(pageUrl)[0]
+        ro = compile(tagre("img", "src", escape(imageUrl)) + r'\s+<h4>published: ([^<]+)')
+        mo = ro.search(data)
+        if mo:
+            strdate = mo.group(1)
+            return parse_strdate(strdate).strftime("%Y%m%d")
 
     globals()[classname] = make_scraper(classname,
         name='UClick/' + name,
         latestUrl = latestUrl,
         stripUrl = latestUrl + '%s/',
-        imageSearch = compile(tagre("img", "src", r'(http://synd\.imgsrv\.uclick\.com/comics/\w+/\d{4}/[^"]+\.gif)')),
-        prevSearch = compile(tagre("a", "href", r'(/client/zzz/\w+/\d{4}/\d{2}/\d{2}/)') + 'Previous date'),
-        help = 'Index format: yyyy/mm/dd',
+        imageSearch = compile(tagre("img", "src", r'(http://assets\.amuniversal\.com/[^"]+)') + r'\s+<h4>published'),
+        multipleImagesPerStrip = True,
+        prevSearch = None,
+        help = 'Index format: none',
+        namer = namer,
     )
 
 
+# List is from http://www.universaluclick.com/comics/list
 comics = {
-    '5thWave': 'fw',
+    '9 Chickweed Lane': 'strip',
-    '9To5': 'tmntf',
-    'AdamHome': 'ad',
-    'Agnes': 'cragn',
-    'AlcarazLalo': 'la',
-    'AlcarazLaloSpanish': 'spla',
-    'AndersonNick': 'wpnan',
-    'AndyCapp': 'crcap',
-    'AnimalCrackers': 'tmani',
-    'Annie': 'tmann',
-    'AsayChuck': 'crcas',
-    'AskShagg': 'crask',
-    'AuthTony': 'ta',
-    'BadReporter': 'bad',
-    'Baldo': 'ba',
-    'BaldoSpanish': 'be',
-    'BallardStreet': 'crbal',
-    'BarkEaterLake': 'bark',
-    'BarstowDonna': 'dba',
-    'BC': 'crbc',
-    'BCSpanish': 'crbcs',
-    'BeattieBruce': 'crbbe',
-    'BennetClay': 'wpcbe',
-    'BensonLisa': 'wplbe',
-    'BensonSteve': 'crsbe',
-    'BigTop': 'bt',
-    'Biographic': 'biov',
-    'Bleeker': 'blk',
-    'BobTheSquirrel': 'bob',
-    'BoilingPoint': 'boil',
-    'BokChip': 'crcbo',
-    'BoNanas': 'bon',
-    'Boomerangs': 'boom',
-    'BoondocksThe': 'bo',
-    'BottomLiners': 'tmbot',
-    'BoundAndGagged': 'tmbou',
-    'Brainwaves': 'bwv',
-    'BreenSteve': 'crsbr',
-    'BrendaStarr': 'tmbre',
-    'BrewsterRockit': 'tmrkt',
-    'BrittChris': 'crcbr',
-    'BroomHilda': 'tmbro',
-    'Candorville': 'cand',
-    'CarlsonStuart': 'sc',
-    'CatalinoKen': 'crkca',
-    'Cathy': 'ca',
-    'CathySpanish': 'spca',
-    'CEstLaVie': 'clv',
-    'CityThe': 'derf',
-    'ClearBlueWater': 'cbw',
-    'Cleats': 'cle',
-    'CloseToHome': 'cl',
-    'CombsPaul': 'tmcmb',
-    'CompuToon': 'tmcom',
-    'Condorito': 'cond',
-    'ConradPaul': 'tmpco',
-    'Cornered': 'co',
-    'CulDeSac': 'cds',
-    'DanzigerJeff': 'jd',
-    'DaviesMatt': 'tmmda',
-    'DeepCover': 'deep',
-    'DeeringJohn': 'crjde',
-    'DickTracy': 'tmdic',
-    'DinetteSetThe': 'crdin',
-    'DogEatDoug': 'crdog',
-    'DonWright': 'tmdow',
-    'Doodles': 'tmdoo',
-    'Doonesbury': 'db',
-    'DuplexThe': 'dp',
-    'Eek': 'eek',
-    'ElderberriesThe': 'eld',
-    'FacesInTheNews': 'kw',
-    'FlightDeck': 'crfd',
-    'FloAndFriends': 'crflo',
-    'FlyingMccoysThe': 'fmc',
-    'ForBetterOrForWorse': 'fb',
-    'ForHeavenSSake': 'crfhs',
-    'FoxtrotClassics': 'ftcl',
-    'Foxtrot': 'ft',
-    'FoxtrotSpanish': 'spft',
-    'FrankAndErnest': 'fa',
-    'FredBassetSpanish': 'spfba',
-    'FredBasset': 'tmfba',
-    'FrogApplause': 'frog',
-    'FuscoBrothersThe': 'fu',
-    'Garfield': 'ga',
-    'GarfieldSpanish': 'gh',
-    'GasolineAlley': 'tmgas',
-    'GaturroSpanish': 'spgat',
-    'GilThorp': 'tmgil',
-    'GingerMeggs': 'gin',
-    'GingerMeggsSpanish': 'spgin',
-    'GirlsAndSports': 'crgis',
-    'GorrellBob': 'crbgo',
-    'GoTeamBob': 'gtb',
-    'HammondBruce': 'hb',
-    'HandelsmanWalt': 'tmwha',
-    'HeartOfTheCity': 'hc',
-    'Heathcliff': 'crhea',
-    'HeathcliffSpanish': 'crhes',
-    'HerbAndJamaal': 'crher',
-    'HigginsJack': 'jh',
-    'HomeAndAway': 'wphaa',
-    'HorseyDavid': 'tmdho',
-    'Housebroken': 'tmhou',
-    'HubertAndAbby': 'haa',
-    'IdiotBox': 'ibox',
-    'ImagineThis': 'imt',
-    'InkPen': 'ink',
-    'InTheBleachers': 'bl',
-    'ItsAllAboutYou': 'wpiay',
-    'JamesBondSpanish': 'spjb',
-    'JonesClay': 'crcjo',
-    'KallaugherKevin': 'cwkal',
-    'KChroniclesThe': 'kk',
-    'KelleySteve': 'crske',
-    'Kudzu': 'tmkud',
-    'LaCucaracha': 'lc',
-    'LegendOfBill': 'lob',
-    'LibertyMeadows': 'crlib',
-    'Lio': 'lio',
-    'LittleDogLost': 'wpldl',
-    'LocherDick': 'tmdlo',
-    'LooseParts': 'tmloo',
-    'LostSheep': 'lost',
-    'LoweChan': 'tmclo',
-    'LuckovichMike': 'crmlu',
-    'LuckyCow': 'luc',
-    'MarkstienGary': 'crgma',
-    'MarletteDoug': 'tmdma',
-    'MccoyGlenn': 'gm',
-    'MeaningOfLilaThe': 'crlil',
-    'MeehanStreak': 'tmmee',
-    'MiddletonsThe': 'tmmid',
-    'MinimumSecurity': 'ms',
-    'ModestyBlaiseSpanish': 'spmb',
-    'Momma': 'crmom',
-    'MorinJim': 'cwjmo',
-    'MuttJeffSpanish': 'spmut',
-    'MythTickle': 'myth',
-    'NAoQV': 'naqv',
-    'NaturalSelection': 'crns',
-    'NestHeads': 'cpnst',
-    'Neurotica': 'neu',
-    'NonSequitur': 'nq',
-    'OhmanJack': 'tmjoh',
-    'OliphantPat': 'po',
-    'OnAClaireDay': 'crocd',
-    'OneBigHappy': 'crobh',
-    'OtherCoastThe': 'crtoc',
-    'OutOfTheGenePool': 'wpgen',
-    'Overboard': 'ob',
-    'OverboardSpanish': 'spob',
-    'PepeSpanish': 'sppep',
-    'PettJoel': 'jp',
-    'Pibgorn': 'pib',
-    'Pickles': 'wppic',
-    'Pluggers': 'tmplu',
-    'PoochCafe': 'poc',
-    'PoochCafeSpanish': 'sppoc',
-    'PopCulture': 'pop',
-    'PowellDwane': 'crdpo',
-    'Preteena': 'pr',
-    'PricklyCity': 'prc',
-    'QuigmansThe': 'tmqui',
-    'RallComic': 'tr',
-    'RamirezMicheal': 'crmrm',
-    'RamseyMarshall': 'crmra',
-    'RealLifeAdventures': 'rl',
-    'RedAndRover': 'wpred',
-    'RedMeat': 'red',
-    'ReynoldsUnwrapped': 'rw',
-    'RonaldinhoGaucho': 'ron',
-    'RonaldinhoGauchoSpanish': 'spron',
-    'Rubes': 'crrub',
-    'SackSteve': 'tmssa',
-    'SargentBen': 'bs',
-    'SargentBenSpanish': 'spbs',
-    'SendHelp': 'send',
-    'ShenemanDrew': 'tmdsh',
-    'SherffiusDrew': 'crjsh',
-    'Shoecabbage': 'shcab',
-    'Shoe': 'tmsho',
-    'SigmundSpanish': 'spsig',
-    'Slowpoke': 'slow',
-    'SmallWorld': 'small',
-    'SpaceIsThePlace': 'sitp',
-    'SpeedBump': 'crspe',
-    'StanisScott': 'crsst',
-    'StateOfTheUnion': 'crsou',
-    'StayskalWayne': 'tmwst',
-    'StoneSoup': 'ss',
-    'StrangeBrew': 'crstr',
-    'SummersDana': 'tmdsu',
-    'SuttonImpact': 'stn',
-    'Sylvia': 'tmsyl',
-    'SzepPaul': 'crpsz',
-    'TankMcnamara': 'tm',
-    'TeenageMutantNinjaTurtles': 'tmnt',
-    'TelnaesAnn': 'tmate',
-    'TheArgyleSweater': 'tas',
-    'ThePinkPanther': 'tmpnk',
-    'TheWizardOfId': 'crwiz',
-    'TheWizardOfIdSpanish': 'crwis',
-    'ThInk': 'think',
-    'ThompsonMike': 'crmth',
-    'ThroughThickAndThin': 'cpthk',
-    'TinySepuku': 'tiny',
-    'Toby': 'toby',
-    'TolesTom': 'tt',
-    'TomTheDancingBug': 'td',
-    'TooMuchCoffeeMan': 'tmcm',
-    'Trevor': 'trev',
-    'TutelandiaSpanish': 'sptut',
-    'VarvelGary': 'crgva',
-    'WassermanDan': 'tmdwa',
-    'WatchYourHead': 'wpwyh',
-    'Waylay': 'min',
-    'WeePals': 'crwee',
-    'WinnieThePooh': 'crwin',
-    'WitOfTheWorld': 'cwwit',
-    'WorkingItOut': 'crwio',
-    'WriteDon': 'tmdow',
-    'YennySpanish': 'spyen',
-    'Yenny': 'yen',
-    'ZackHill': 'crzhi',
-    'ZiggySpanish': 'spzi',
-    'Ziggy': 'zi',
 }
 
-for name, shortname in comics.items():
-    add(name, shortname)
+for name, category in comics.items():
+    add(name, category)
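A quick check of the new date parsing (the sample string follows the page format quoted in the namer docstring; as the XXX comment warns, %A and %B are locale dependent):

    import datetime

    def parse_strdate(strdate):
        # Same logic as the committed parse_strdate.
        return datetime.datetime.strptime(strdate, "%A, %B %d, %Y")

    print(parse_strdate("Sunday, November 11, 2012").strftime("%Y%m%d"))  # 20121111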
@@ -2,6 +2,7 @@
 # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
 # Copyright (C) 2012 Bastian Kleineidam
 
+from re import compile
 from ..scraper import _BasicScraper
 from ..util import tagre
@@ -22,6 +22,9 @@ class _BasicScraper(object):
     @cvar prevSearch: A compiled regex that will locate the URL for the
     previous strip when applied to a strip page.
     '''
+    # if more than one image per URL is expected
+    multipleImagesPerStrip = False
+    # usually the index format help
     help = 'Sorry, no help for this comic yet.'
 
     def __init__(self, indexes=None):
@@ -44,7 +47,9 @@ class _BasicScraper(object):
 
     def getStrip(self, url):
         """Get comic strip for given URL."""
-        imageUrls = fetchUrls(url, self.imageSearch)
+        imageUrls = fetchUrls(url, self.imageSearch)[0]
+        if len(imageUrls) > 1 and not self.multipleImagesPerStrip:
+            raise ValueError("found %d images with %s" % (len(imageUrls), self.imageSearch.pattern))
         return self.getComicStrip(url, imageUrls)
 
     def getComicStrip(self, url, imageUrls):
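The effect of the new guard, sketched standalone (the scraper class and URLs are invented): a scraper that does not opt in via multipleImagesPerStrip now fails loudly when a page yields several image matches.

    import re

    class FakeScraper(object):
        multipleImagesPerStrip = False
        imageSearch = re.compile(r'(img\d+\.png)')

    def check_images(scraper, imageUrls):
        # Mirrors the diff: more than one match is an error unless allowed.
        if len(imageUrls) > 1 and not scraper.multipleImagesPerStrip:
            raise ValueError("found %d images with %s"
                             % (len(imageUrls), scraper.imageSearch.pattern))

    check_images(FakeScraper(), ['img1.png'])  # passes silently
    try:
        check_images(FakeScraper(), ['img1.png', 'img2.png'])
    except ValueError as err:
        print(err)  # found 2 images with (img\d+\.png)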
@@ -140,11 +145,13 @@ def get_scrapers():
     """
     global _scrapers
     if _scrapers is None:
+        out.write("Loading comic modules...")
         modules = loader.get_modules()
         plugins = loader.get_plugins(modules, _BasicScraper)
         _scrapers = list(plugins)
         _scrapers.sort(key=lambda s: s.get_name())
         check_scrapers()
+        out.write("... %d modules loaded." % len(_scrapers))
     return _scrapers
@@ -4,6 +4,7 @@
 from __future__ import division, print_function
 
 import urllib2, urlparse
+import requests
 import sys
 import os
 import cgi
@@ -42,10 +43,6 @@ def tagre(tag, attribute, value, quote='"', before="", after=""):
     @return: the generated regular expression suitable for re.compile()
     @rtype: string
     """
-    if before:
-        before += "[^>]*"
-    if after:
-        after += "[^>]*"
     attrs = dict(
         tag=case_insensitive_re(tag),
         attribute=case_insensitive_re(attribute),
@@ -54,7 +51,7 @@ def tagre(tag, attribute, value, quote='"', before="", after=""):
         before=before,
         after=after,
     )
-    return r'<\s*%(tag)s\s+(?:[^>]*%(before)s\s+)?%(attribute)s\s*=\s*%(quote)s%(value)s%(quote)s[^>]*%(after)s>' % attrs
+    return r'<\s*%(tag)s\s+(?:[^>]*%(before)s[^>]*\s+)?%(attribute)s\s*=\s*%(quote)s%(value)s%(quote)s[^>]*%(after)s[^>]*>' % attrs
 
 
 def case_insensitive_re(name):
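Behavior-wise the two tagre hunks above are mostly a simplification: the [^>]* wildcards formerly appended to non-empty before/after arguments are now baked into the template unconditionally. A standalone sketch of the resulting pattern shape for an img tag with after="page-image" (the HTML snippet is invented and the pattern is hand-expanded rather than generated by tagre):

    import re

    pat = re.compile(r'<\s*img\s+(?:[^>]*\s+)?src\s*=\s*"([^"]+)"[^>]*page-image[^>]*>', re.I)
    html = '<img src="http://example.com/strip.png" class="page-image" width="600">'
    mo = pat.search(html)
    # Other attributes may surround the "page-image" marker.
    assert mo.group(1) == 'http://example.com/strip.png'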
@@ -74,7 +71,7 @@ baseSearch = re.compile(tagre("base", "href", '([^"]*)'))
 def getPageContent(url):
     # read page data
     page = urlopen(url)
-    data = page.read(MAX_FILESIZE)
+    data = page.text
     # determine base URL
     baseUrl = None
     match = baseSearch.search(data)
@@ -105,7 +102,7 @@ def fetchUrls(url, imageSearch, prevSearch=None):
         imageUrl = match.group(1)
         if not imageUrl:
             raise ValueError("Match empty image URL at %s with pattern %s" % (url, imageSearch.pattern))
-        out.write('matched image URL %r' % imageUrl, 2)
+        out.write('matched image URL %r with pattern %s' % (imageUrl, imageSearch.pattern), 2)
         imageUrls.add(normaliseURL(urlparse.urljoin(baseUrl, imageUrl)))
     if not imageUrls:
         out.write("warning: no images found at %s with pattern %s" % (url, imageSearch.pattern))
@@ -178,22 +175,18 @@ def urlopen(url, referrer=None, retries=3, retry_wait_seconds=5):
     out.write('Open URL %s' % url, 2)
     assert retries >= 0, 'invalid retry value %r' % retries
     assert retry_wait_seconds > 0, 'invalid retry seconds value %r' % retry_wait_seconds
-    req = urllib2.Request(url)
+    headers = {'User-Agent': UserAgent}
+    config = {"max_retries": retries}
     if referrer:
-        req.add_header('Referer', referrer)
+        headers['Referer'] = referrer
-    req.add_header('User-Agent', UserAgent)
-    tries = 0
-    while True:
-        try:
-            return urllib2.urlopen(req)
-        except IOError as err:
-            msg = 'URL retrieval of %s failed: %s' % (url, err)
-            out.write(msg)
-            out.write('waiting %d seconds and retrying (%d)' % (retry_wait_seconds, tries), 2)
-            time.sleep(retry_wait_seconds)
-            tries += 1
-            if tries >= retries:
-                raise IOError(msg)
+    try:
+        req = requests.get(url, headers=headers, config=config)
+        req.raise_for_status()
+        return req
+    except requests.exceptions.RequestException as err:
+        msg = 'URL retrieval of %s failed: %s' % (url, err)
+        out.write(msg)
+        raise IOError(msg)
 
 
 def get_columns (fp):
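Note that the config={"max_retries": ...} keyword belongs to the 0.x-era python-requests API targeted by this commit. As a hedged sketch only: on requests 1.x and later the rough equivalent mounts an HTTPAdapter on a session ('Dosage' stands in for the UserAgent constant defined in this module; max_retries there covers connection-level retries):

    import requests
    from requests.adapters import HTTPAdapter

    def urlopen_modern(url, referrer=None, retries=3):
        headers = {'User-Agent': 'Dosage'}
        if referrer:
            headers['Referer'] = referrer
        session = requests.Session()
        session.mount('http://', HTTPAdapter(max_retries=retries))
        session.mount('https://', HTTPAdapter(max_retries=retries))
        req = session.get(url, headers=headers)
        req.raise_for_status()
        return req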
@@ -259,11 +252,9 @@ def internal_error(out=sys.stderr, etype=None, evalue=None, tb=None):
     print("""********** Oops, I did it again. *************
 
 You have found an internal error in %(app)s. Please write a bug report
-at %(url)s and include the following information:
-- your commandline arguments and any configuration file in ~/.dosage/
-- the system information below
+at %(url)s and include at least the information below:
 
-Not disclosing some of the information above due to privacy reasons is ok.
+Not disclosing some of the information below due to privacy reasons is ok.
 I will try to help you nonetheless, but you have to give me something
 I can work with ;) .
 """ % dict(app=AppName, url=SupportUrl), file=out)
@@ -308,6 +299,7 @@ def print_app_info(out=sys.stderr):
           {"version": sys.version, "platform": sys.platform}, file=out)
     stime = strtime(time.time())
     print("Local time:", stime, file=out)
+    print("sys.argv", sys.argv, file=out)
 
 
 def strtime(t):
requirements.txt (new file, 2 lines)

@@ -0,0 +1,2 @@
+requests
+
@@ -4,6 +4,7 @@
 import tempfile
 import shutil
 import re
+import os
 from itertools import islice
 from unittest import TestCase
 from dosagelib import scraper
@@ -16,6 +17,16 @@ class _ComicTester(TestCase):
     def setUp(self):
         self.name = self.scraperclass.get_name()
         self.url = self.scraperclass.starter()
+        # create a temporary directory for images
+        self.tmpdir = tempfile.mkdtemp()
+
+    def tearDown(self):
+        shutil.rmtree(self.tmpdir)
+
+    def get_saved_images(self):
+        """Get saved images."""
+        dirs = tuple(self.name.split('/'))
+        return os.listdir(os.path.join(self.tmpdir, *dirs))
 
     def test_comic(self):
         # Test a scraper. It must be able to traverse backward for
@@ -23,7 +34,8 @@ class _ComicTester(TestCase):
         # on at least 4 pages.
         scraperobj = self.scraperclass()
         num = empty = 0
-        for strip in islice(scraperobj.getAllStrips(), 0, 5):
+        max_strips = 5
+        for strip in islice(scraperobj.getAllStrips(), 0, max_strips):
             images = 0
             for image in strip.getImages():
                 images += 1
@@ -35,6 +47,15 @@ class _ComicTester(TestCase):
             num += 1
         if self.scraperclass.prevSearch:
             self.check(num >= 4, 'traversal failed after %d strips, check the prevSearch pattern.' % num)
+            # check that exactly (or, for multiple images per page, at least) 5 images are saved
+            saved_images = self.get_saved_images()
+            num_images = len(saved_images)
+            if self.scraperclass.multipleImagesPerStrip:
+                self.check(num_images >= max_strips,
+                    'saved %d %s instead of at least %d images in %s' % (num_images, saved_images, max_strips, self.tmpdir))
+            else:
+                self.check(num_images == max_strips,
+                    'saved %d %s instead of %d images in %s' % (num_images, saved_images, max_strips, self.tmpdir))
         self.check(empty == 0, 'failed to find images on %d pages, check the imageSearch pattern.' % empty)
 
     def check_stripurl(self, strip):
@@ -50,28 +71,28 @@ class _ComicTester(TestCase):
         self.check(mo is not None, 'strip URL %r does not match stripUrl pattern %s' % (strip.stripUrl, urlmatch))
 
     def save(self, image):
-        # create a temporary directory
-        tmpdir = tempfile.mkdtemp()
         try:
-            image.save(tmpdir)
+            image.save(self.tmpdir)
         except Exception as msg:
-            self.check(False, 'could not save %s to %s: %s' % (image.url, tmpdir, msg))
+            self.check(False, 'could not save %s to %s: %s' % (image.url, self.tmpdir, msg))
-        finally:
-            shutil.rmtree(tmpdir)
 
     def check(self, condition, msg):
         self.assertTrue(condition, "%s %s %s" % (self.name, self.url, msg))
 
 
+def make_comic_tester(name, **kwargs):
+    """Create and return a _ComicTester class with given name and attributes."""
+    return type(name, (_ComicTester,), kwargs)
+
+
 def generate_comic_testers():
     """For each comic scraper, create a test class."""
+    g = globals()
     # Limit number of scraper tests for now
-    max_scrapers = 100
+    max_scrapers = 10000
     for scraperclass in islice(scraper.get_scrapers(), 0, max_scrapers):
         name = 'Test'+scraperclass.__name__
-        globals()[name] = type(name,
-            (_ComicTester,),
-            dict(scraperclass=scraperclass)
-        )
+        g[name] = make_comic_tester(name, scraperclass=scraperclass)
 
 generate_comic_testers()
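make_comic_tester leans on the three-argument form of type(), which builds a class at runtime; a minimal standalone sketch (names invented):

    from unittest import TestCase

    class _Base(TestCase):
        scraperclass = None
        def test_smoke(self):
            self.assertIsNotNone(self.scraperclass)

    # type(name, bases, attrs) stamps out one TestCase subclass per scraper,
    # and the test runner discovers each generated class by its Test* name.
    TestExample = type('TestExample', (_Base,), dict(scraperclass=object))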