Fix comics, improve tests, use python-requests.

This commit is contained in:
Bastian Kleineidam 2012-11-26 18:44:31 +01:00
parent d4eee7719d
commit 0556ffd30a
16 changed files with 191 additions and 403 deletions

View file

@@ -40,10 +40,11 @@ manual page.
Dependencies
-------------
Dosage requires Python version 2.7 or higher, which can be downloaded
from http://www.python.org.
No external Python modules are required - only the Python Standard Library
that gets installed with Python.
Python version 2.7 or higher, which can be downloaded
from http://www.python.org/
Also the python-requests module must be installed, which can be downloaded
from http://docs.python-requests.org/en/latest/
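A quick way to check that the dependency is available (a minimal sketch;
any reasonably recent python-requests release should do):

    import requests  # raises ImportError if python-requests is missing
    print(requests.__version__)
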
Installation
-------------
@@ -59,7 +60,7 @@ or if you do not have root permissions:
Technical Description
----------------------
Dosage is written entirely in Python and relies on regular expressions to
Dosage is written in Python and relies on regular expressions to
do most of the grunt work.
For each webcomic Dosage has a plugin module, found in the "plugins"
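For illustration, a minimal plugin sketch (hypothetical comic name and
URLs; real plugins such as Sheldon below use exactly this
_BasicScraper/tagre machinery):

    from re import compile
    from ..scraper import _BasicScraper
    from ..util import tagre

    class ExampleComic(_BasicScraper):
        latestUrl = 'http://www.example.com/'
        stripUrl = latestUrl + 'archive/%s.html'
        imageSearch = compile(tagre("img", "src", r'(/strips/[^"]+)'))
        prevSearch = compile(tagre("a", "href", r'(/archive/\d+\.html)', after="prev"))
        help = 'Index format: yymmdd'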

View file

@@ -4,6 +4,7 @@ Features:
- cmdline: Added proper return codes for error conditions.
- comics: Added more robust regular expressions for HTML tags.
They now match case-insensitively and ignore whitespace.
- comics: Use the python-requests module for HTTP requests.
Changes:
- installation: Added support for dynamic configuration values.

View file

@@ -2,7 +2,6 @@
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012 Bastian Kleineidam
import urllib2
import os
import locale
import rfc822
@@ -55,18 +54,24 @@ class ComicImage(object):
"""Connect to host and get meta information."""
try:
self.urlobj = urlopen(self.url, referrer=self.referrer)
except urllib2.HTTPError as he:
except IOError as he:
raise FetchComicError('Unable to retrieve URL.', self.url, he.code)
if self.urlobj.info().getmaintype() != 'image' and \
self.urlobj.info().gettype() not in ('application/octet-stream', 'application/x-shockwave-flash'):
content_type = self.urlobj.headers.get('content-type')
content_type = content_type.split(';', 1)[0]
if '/' in content_type:
maintype, subtype = content_type.split('/', 1)
else:
maintype = content_type
subtype = None
if maintype != 'image' and content_type not in ('application/octet-stream', 'application/x-shockwave-flash'):
raise FetchComicError('No suitable image found to retrieve.', self.url)
# Always use mime type for file extension if it is sane.
if self.urlobj.info().getmaintype() == 'image':
self.ext = '.' + self.urlobj.info().getsubtype().replace('jpeg', 'jpg')
self.contentLength = int(self.urlobj.info().get('content-length', 0))
self.lastModified = self.urlobj.info().get('last-modified')
if maintype == 'image':
self.ext = '.' + subtype.replace('jpeg', 'jpg')
self.contentLength = int(self.urlobj.headers.get('content-length', 0))
self.lastModified = self.urlobj.headers.get('last-modified')
out.write('... filename = %r, ext = %r, contentLength = %d' % (self.filename, self.ext, self.contentLength), 2)
def touch(self, filename):
@@ -88,7 +93,6 @@ class ComicImage(object):
fn = os.path.join(comicDir, filename)
if os.path.isfile(fn) and os.path.getsize(fn) >= comicSize:
self.urlobj.close()
self.touch(fn)
out.write('Skipping existing file "%s".' % (fn,), 1)
return fn, False
@@ -97,7 +101,7 @@ class ComicImage(object):
out.write('Writing comic to file %s...' % (fn,), 3)
with open(fn, 'wb') as comicOut:
startTime = time.time()
comicOut.write(self.urlobj.read())
comicOut.write(self.urlobj.content)
endTime = time.time()
self.touch(fn)
except:
@@ -114,7 +118,5 @@ class ComicImage(object):
attrs = dict(fn=fn, bytes=bytes, speed=speed)
out.write('Saved "%(fn)s" (%(bytes)s bytes, %(speed)s/sec).' % attrs, 1)
getHandler().comicDownloaded(self.name, fn)
finally:
self.urlobj.close()
return fn, True
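
The content-type handling above boils down to a small helper; a
standalone sketch of the same parsing logic:

    def split_content_type(content_type):
        # Strip parameters: "image/jpeg; charset=utf-8" -> "image/jpeg"
        content_type = content_type.split(';', 1)[0]
        if '/' in content_type:
            maintype, subtype = content_type.split('/', 1)
        else:
            maintype, subtype = content_type, None
        return maintype, subtype

    # split_content_type('image/jpeg; charset=utf-8') -> ('image', 'jpeg')
    # split_content_type('application/octet-stream')  -> ('application', 'octet-stream')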

View file

@@ -13,7 +13,7 @@ from ..util import tagre, getQueryParams
class DMFA(_BasicScraper):
latestUrl = 'http://www.missmab.com/'
stripUrl = latestUrl + 'Comics/Vol_%s.php'
imageSearch = compile(tagre("img", "src", r'(Comics/|Vol)[^"]+)'))
imageSearch = compile(tagre("img", "src", r'((?:Comics/|Vol)[^"]+)'))
prevSearch = compile(tagre("a", "href", r'([^"])+')+
tagre("img", "src", r'(?:../)?Images/comicprev.gif'))
help = 'Index format: nnn (normally, some specials)'

View file

@@ -4,22 +4,29 @@
from re import compile
from ..scraper import make_scraper
from ..helpers import bounceStarter, queryNamer
from ..helpers import bounceStarter
from ..util import tagre
def add(name):
classname = 'DrunkDuck_%s' % name
url = 'http://www.drunkduck.com/%s/' % name
linkSearch = tagre("a", "href", r"(/[^/]*/index\.php\?p=\d+)", quote="'", after="The %s page")
linkSearch = tagre("a", "href", r"(/%s/\d+/)" % name)
@classmethod
def namer(cls, imageUrl, pageUrl):
index = int(pageUrl.rstrip('/').split('/')[-1])
ext = imageUrl.rsplit('.')[-1]
return '%d.%s' % (index, ext)
globals()[classname] = make_scraper(classname,
name = 'DrunkDuck/' + name,
starter = bounceStarter(url, compile(linkSearch % 'next')),
stripUrl = url + 'index.php?p=%s' % name,
imageSearch = compile(tagre("img", "src", r"(http://[a-z0-9]*\.drunkduck\.com/[^/]*/pages/[^'/]+)", quote="'")),
prevSearch= compile(linkSearch % 'previous'),
starter = bounceStarter(url, compile(linkSearch + tagre("img", "class", "arrow_next"))),
stripUrl = url + '%s/',
imageSearch = compile(tagre("img", "src", r'(http://media\.drunkduck\.com\.s3\.amazonaws\.com:80/[^"]+)', before="page-image")),
prevSearch= compile(linkSearch + tagre("img", "class", "arrow_prev")),
help = 'Index format: n (unpadded)',
namer = queryNamer('p', usePageUrl=True),
namer = namer,
)
comics = (
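
A quick trace of the namer above, with a made-up strip URL in the new
DrunkDuck URL scheme:

    pageUrl = 'http://www.drunkduck.com/Example/5212107/'  # hypothetical
    index = int(pageUrl.rstrip('/').split('/')[-1])        # -> 5212107
    # an image URL ending in '.jpg' is then saved as '5212107.jpg'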

View file

@@ -1,47 +1,26 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012 Bastian Kleineidam
from ..scraper import _BasicScraper
from re import compile
from ..scraper import make_scraper
from ..util import asciify
def fallenangel(name, shortname):
pass # XXX
class _TheFallenAngel(_BasicScraper):
imageSearch = compile(r'SRC="(http://www.thefallenangel.co.uk/\w+comics/.+?)"')
prevSearch = compile(r' <a href="(http://www.thefallenangel.co.uk/.+?)"><img[^>]+?src="http://www.thefallenangel.co.uk/images/previousday.jpg"')
help = 'Index format: yyyymmdd'
@property
def baseUrl(self):
return 'http://www.thefallenangel.co.uk/cgi-bin/%sautokeen/autokeenlite.cgi' % (self.shortName,)
@property
def stripUrl(self):
return self.baseUrl + '?date=%s'
def starter(self):
return self.baseUrl
class HighMaintenance(_TheFallenAngel):
name = 'TheFallenAngel/HighMaintenance'
shortName = 'hm'
class FAWK(_TheFallenAngel):
name = 'TheFallenAngel/FAWK'
shortName = 'fawk'
class MalloryChan(_TheFallenAngel):
name = 'TheFallenAngel/MalloryChan'
shortName = 'mallorychan'
def add(name, shortname):
latestUrl = 'http://www.thefallenangel.co.uk/cgi-bin/%sautokeen/autokeenlite.cgi' % shortname
classname = asciify(name)
globals()[classname] = make_scraper(classname,
latestUrl = latestUrl,
stripUrl = latestUrl + '?date=%s',
name='FallenAngel/' + name,
imageSearch = compile(r'SRC="(http://www.thefallenangel.co.uk/\w+comics/.+?)"'),
prevSearch = compile(r' <a href="(http://www.thefallenangel.co.uk/.+?)"><img[^>]+?src="http://www.thefallenangel.co.uk/images/previousday.jpg"'),
help = 'Index format: yyyymmdd',
)
add('HighMaintenance', 'hm')
add('FAWK', 'fawk')
add('MalloryChan', 'mallorychan')

View file

@@ -29,30 +29,30 @@ def add(name, repl=''):
# http://www.gocomics.com/features
# note that comics from creators.com are not repeated here
# Duplicate comics from creators.com are commented out
add('2 Cows and a Chicken')
add('9 Chickweed Lane')
add('9 to 5')
add('The Academia Waltz')
add('Adam at Home')
add('Agnes')
#add('Agnes')
add('Alley Oop', repl='-')
add('Andertoons')
add('Andy Capp')
#add('Andy Capp')
add('Angry Little Girls', repl='-')
add('Animal Crackers')
add('Annie')
add('The Argyle Sweater')
add('Arlo and Janis')
add('Ask Shagg')
add('BC')
#add('Ask Shagg')
#add('BC')
add('Back in the Day')
add('Bad Reporter')
add('Baldo')
add('Ballard Street')
#add('Ballard Street')
add('Banana Triangle', repl='-')
add('Barkeater Lake')
add('The Barn')
#add('The Barn')
add('Barney and Clyde')
add('Basic Instructions')
add('Beardo')
@@ -81,13 +81,13 @@ add('Brewster Rockit')
add('Broom Hilda')
add('The Buckets')
add('Buni')
add('Cafe con Leche')
#add('Cafe con Leche')
add('Calvin and Hobbes')
add('Candorville')
add('Cathy')
add('Cest la Vie')
add('Cheap Thrills Cuisine', repl='-')
add('Chuckle Bros')
#add('Chuckle Bros')
add('Citizen Dog')
add('The City')
add('Cleats')
@@ -99,15 +99,15 @@ add('Cow and Boy')
add('CowTown')
add('Crumb')
add('Cul de Sac')
add('Daddys Home')
#add('Daddys Home')
add('Dark Side of the Horse')
add('Deep Cover')
add('Diamond Lil')
#add('Diamond Lil')
add('Dick Tracy')
add('The Dinette Set')
#add('The Dinette Set')
add('Dixie Drive', repl='-')
add('Dog Eat Doug')
add('Dogs of C Kennel')
#add('Dog Eat Doug')
#add('Dogs of C Kennel')
add('Domestic Abuse')
add('Doonesbury')
add('The Doozies')
@@ -122,18 +122,18 @@ add('F Minus')
add('Family Tree')
add('Farcus')
add('Fat Cats', repl='-')
add('Flo and Friends')
#add('Flo and Friends')
add('The Flying McCoys')
add('Foolish Mortals', repl='-')
add('For Better or For Worse')
add('For Heavens Sake')
#add('For Heavens Sake')
add('Fort Knox')
add('FoxTrot')
add('FoxTrot Classics')
add('Frank and Ernest')
add('Frazz')
add('Fred Basset')
add('Free Range')
#add('Free Range')
add('Freshly Squeezed')
add('Frog Applause')
add('The Fusco Brothers')
@@ -154,9 +154,9 @@ add('Haiku Ewe')
add('Ham Shears')
add('Health Capsules')
add('Heart of the City')
add('Heathcliff')
#add('Heathcliff')
add('Heavenly Nostrils')
add('Herb and Jamaal')
#add('Herb and Jamaal')
add('Herman')
add('Home and Away')
add('HUBRIS!')
@@ -184,7 +184,7 @@ add('La Cucaracha')
add('Last Kiss')
add('The LeftyBosco Picture Show')
add('Legend of Bill')
add('Liberty Meadows')
#add('Liberty Meadows')
add('Lil Abner')
add('Lio')
add('Little Dog Lost')
@@ -201,7 +201,7 @@ add('Maintaining')
add('Marias Day')
add('Marmaduke')
add('McArroni')
add('The Meaning of Lila')
#add('The Meaning of Lila')
add('Medium Large')
add('Meg Classics')
add('The Middletons')
@@ -209,7 +209,7 @@ add('Mike du Jour')
add('Minimum Security')
add('Moderately Confused')
add('Molly and the Bear')
add('Momma')
#add('Momma')
add('Monty')
add('Motley Classics')
add('Mr. Gigi and the Squid')
@@ -217,7 +217,7 @@ add('Mutt and Jeff')
add('My Cage')
add('MythTickle')
add('Nancy')
add('Nest Heads')
#add('Nest Heads')
add('NEUROTICA')
add('New Adventures of Queen Victoria')
add('Non Sequitur')
@@ -225,10 +225,10 @@ add('The Norm Classics')
add('Nothing is Not Something')
add('Off the Mark')
add('Ollie and Quentin')
add('On A Claire Day')
add('One Big Happy')
#add('On A Claire Day')
#add('One Big Happy')
add('Ordinary Bill')
add('The Other Coast')
#add('The Other Coast')
add('Out of the Gene Pool Re-Runs')
add('Over the Hedge')
add('Overboard')
@@ -254,10 +254,10 @@ add('Reply All')
add('Rip Haywire')
add('Ripleys Believe It or Not')
add('Rose is Rose')
add('Rubes')
#add('Rubes')
add('Rudy Park')
add('Savage Chickens')
add('Scary Gary')
#add('Scary Gary')
add('Shirley and Son Classics')
add('Shoe')
add('Shoecabbage')
@@ -266,11 +266,11 @@ add('Skin Horse')
add('Skippy')
add('Slowpoke')
add('Soup to Nutz')
add('Speed Bump')
#add('Speed Bump')
add('Spot the Frog')
add('Starslip')
add('Stone Soup')
add('Strange Brew')
#add('Strange Brew')
add('The Sunshine Club')
add('Sylvia')
add('Tank McNamara')
@@ -280,7 +280,7 @@ add('Tales of TerraTopia')
add('That is Priceless')
add('Thats Life')
add('Thatababy')
add('Thin Lines')
#add('Thin Lines')
add('Tiny Sepuku')
add('TOBY')
add('Todays Dogg')
@@ -293,12 +293,12 @@ add('Unstrange Phenomena')
add('U.S. Acres')
add('Viivi and Wagner')
add('Watch Your Head')
add('Wee Pals')
add('Wizard of Id')
#add('Wee Pals')
#add('Wizard of Id')
add('Working Daze')
add('Working It Out')
#add('Working It Out')
add('W.T. Duck')
add('Zack Hill')
#add('Zack Hill')
add('Ziggy')
# http://www.gocomics.com/explore/editorial_list

View file

@@ -18,9 +18,9 @@ def add(name, urls):
name='KeenSpot/' + name,
latestUrl=latestUrl,
stripUrl=baseUrl + 'd/%s.html',
imageSearch = compile(tagre("img", "src", r'([^"]*comics/[^"]+)')),
prevSearch = compile(tagre("a", "href", r'"([^"]*d/\d{8}\.html)') +
'(?:<img[^>]+?(?:name="previous_day"|alt="Previous"|src="[^"]*back[^"]*")|Previous comic)'),
imageSearch = compile(tagre("img", "src", r'([^"]*/comics/[^"]+)')),
prevSearch = compile(tagre("a", "href", r'([^"]*/d/\d{8}\.html)') +
'(?:Previous comic|'+tagre("img", "alt", "Previous comic")+')'),
help = 'Index format: yyyymmdd',
)

View file

@@ -11,6 +11,7 @@ from ..scraper import _BasicScraper
class NineteenNinetySeven(_BasicScraper):
name = '1997'
latestUrl = 'http://www.1977thecomic.com/'
stripUrl = latestUrl + '%s'
imageSearch = compile(tagre("img", "src", r'(http://www\.1977thecomic\.com/comics-1977/[^"]+)'))
prevSearch = compile(tagre("a", "href", r'([^"]+)')+"Previous")
help = 'Index format: yyyy/mm/dd/strip-name'

View file

@@ -61,7 +61,7 @@ class Sheldon(_BasicScraper):
latestUrl = 'http://www.sheldoncomics.com/'
stripUrl = latestUrl + 'archive/%s.html'
imageSearch = compile(tagre("img", "src", r'(/strips/[^"]+)'))
prevSearch = compile(tagre("a", "href", r'/archive/\d+\.html)', after="sidenav-prev"))
prevSearch = compile(tagre("a", "href", r'(/archive/\d+\.html)', after="sidenav-prev"))
help = 'Index format: yymmdd'

View file

@@ -1,280 +1,54 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012 Bastian Kleineidam
from re import compile, sub
"""
The Universal comics only have some samples, but those samples are always the newest ones.
"""
import datetime
from re import compile, escape
from ..scraper import make_scraper
from ..util import fetchUrl, tagre
from ..util import tagre, asciify, getPageContent
def add(name, shortName):
homepage = 'http://content.uclick.com/a2z.html'
baseUrl = 'http://www.uclick.com/client/zzz/%s/'
latestUrl = baseUrl % shortName
classname = 'UClick_%s' % name
def parse_strdate(strdate):
"""Parse date string. XXX this is locale dependant but it should not be."""
return datetime.datetime.strptime(strdate, "%A, %B %d, %Y")
def add(name, category):
shortname = name.replace(' ', '').lower()
latestUrl = 'http://www.universaluclick.com/comics/%s/%s' % (category, shortname)
classname = 'UClick_%s' % asciify(name)
@classmethod
def fetchSubmodules(cls):
exclusions = ('index',)
# XXX refactor this mess
submoduleSearch = compile(tagre("a", "href", r'(http://content\.uclick\.com/content/\w+\.html)'))
partsMatch = compile(tagre("a", "href", r'http://content\.uclick\.com/content/(\w+?)\.html'))
matches = fetchManyMatches(cls.homepage, (submoduleSearch,))[0]
possibles = [partsMatch.match(match).groups() for match in matches]
def normalizeName(name):
name = sub(r'&(.)acute;', r'\1', name).title()
return ''.join([c for c in name if c.isalnum()])
def fetchSubmodule(module):
try:
return fetchUrl(cls.baseUrl % module, cls.imageSearch)
except Exception:
# XXX log error
return False
return [normalizeName(name) for part, name in possibles if part not in exclusions and fetchSubmodule(part)]
def namer(cls, imageUrl, pageUrl):
"""Parse publish date from page content which looks like:
<img alt="Marmaduke" src="http://assets.amuniversal.com/07e7f270fa08012ff506001dd8b71c47" />
<h4>published: Sunday, November 11, 2012</h4>
"""
data = getPageContent(pageUrl)[0]
ro = compile(tagre("img", "src", escape(imageUrl)) + r'\s+<h4>published: ([^<]+)')
mo = ro.search(data)
if mo:
strdate = mo.group(1)
return parse_strdate(strdate).strftime("%Y%m%d")
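
A round trip of the date handling above (assuming an English locale, as
the XXX note warns):

    >>> parse_strdate('Sunday, November 11, 2012')
    datetime.datetime(2012, 11, 11, 0, 0)
    >>> parse_strdate('Sunday, November 11, 2012').strftime('%Y%m%d')
    '20121111'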
globals()[classname] = make_scraper(classname,
name='UClick/' + name,
latestUrl = latestUrl,
stripUrl = latestUrl + '%s/',
imageSearch = compile(tagre("img", "src", r'(http://synd\.imgsrv\.uclick\.com/comics/\w+/\d{4}/[^"]+\.gif)')),
prevSearch = compile(tagre("a", "href", r'(/client/zzz/\w+/\d{4}/\d{2}/\d{2}/)') + 'Previous date'),
help = 'Index format: yyyy/mm/dd',
imageSearch = compile(tagre("img", "src", r'(http://assets\.amuniversal\.com/[^"]+)') + r'\s+<h4>published'),
multipleImagesPerStrip = True,
prevSearch = None,
help = 'Index format: none',
namer = namer,
)
# List is from http://www.universaluclick.com/comics/list
comics = {
'5thWave': 'fw',
'9To5': 'tmntf',
'AdamHome': 'ad',
'Agnes': 'cragn',
'AlcarazLalo': 'la',
'AlcarazLaloSpanish': 'spla',
'AndersonNick': 'wpnan',
'AndyCapp': 'crcap',
'AnimalCrackers': 'tmani',
'Annie': 'tmann',
'AsayChuck': 'crcas',
'AskShagg': 'crask',
'AuthTony': 'ta',
'BadReporter': 'bad',
'Baldo': 'ba',
'BaldoSpanish': 'be',
'BallardStreet': 'crbal',
'BarkEaterLake': 'bark',
'BarstowDonna': 'dba',
'BC': 'crbc',
'BCSpanish': 'crbcs',
'BeattieBruce': 'crbbe',
'BennetClay': 'wpcbe',
'BensonLisa': 'wplbe',
'BensonSteve': 'crsbe',
'BigTop': 'bt',
'Biographic': 'biov',
'Bleeker': 'blk',
'BobTheSquirrel': 'bob',
'BoilingPoint': 'boil',
'BokChip': 'crcbo',
'BoNanas': 'bon',
'Boomerangs': 'boom',
'BoondocksThe': 'bo',
'BottomLiners': 'tmbot',
'BoundAndGagged': 'tmbou',
'Brainwaves': 'bwv',
'BreenSteve': 'crsbr',
'BrendaStarr': 'tmbre',
'BrewsterRockit': 'tmrkt',
'BrittChris': 'crcbr',
'BroomHilda': 'tmbro',
'Candorville': 'cand',
'CarlsonStuart': 'sc',
'CatalinoKen': 'crkca',
'Cathy': 'ca',
'CathySpanish': 'spca',
'CEstLaVie': 'clv',
'CityThe': 'derf',
'ClearBlueWater': 'cbw',
'Cleats': 'cle',
'CloseToHome': 'cl',
'CombsPaul': 'tmcmb',
'CompuToon': 'tmcom',
'Condorito': 'cond',
'ConradPaul': 'tmpco',
'Cornered': 'co',
'CulDeSac': 'cds',
'DanzigerJeff': 'jd',
'DaviesMatt': 'tmmda',
'DeepCover': 'deep',
'DeeringJohn': 'crjde',
'DickTracy': 'tmdic',
'DinetteSetThe': 'crdin',
'DogEatDoug': 'crdog',
'DonWright': 'tmdow',
'Doodles': 'tmdoo',
'Doonesbury': 'db',
'DuplexThe': 'dp',
'Eek': 'eek',
'ElderberriesThe': 'eld',
'FacesInTheNews': 'kw',
'FlightDeck': 'crfd',
'FloAndFriends': 'crflo',
'FlyingMccoysThe': 'fmc',
'ForBetterOrForWorse': 'fb',
'ForHeavenSSake': 'crfhs',
'FoxtrotClassics': 'ftcl',
'Foxtrot': 'ft',
'FoxtrotSpanish': 'spft',
'FrankAndErnest': 'fa',
'FredBassetSpanish': 'spfba',
'FredBasset': 'tmfba',
'FrogApplause': 'frog',
'FuscoBrothersThe': 'fu',
'Garfield': 'ga',
'GarfieldSpanish': 'gh',
'GasolineAlley': 'tmgas',
'GaturroSpanish': 'spgat',
'GilThorp': 'tmgil',
'GingerMeggs': 'gin',
'GingerMeggsSpanish': 'spgin',
'GirlsAndSports': 'crgis',
'GorrellBob': 'crbgo',
'GoTeamBob': 'gtb',
'HammondBruce': 'hb',
'HandelsmanWalt': 'tmwha',
'HeartOfTheCity': 'hc',
'Heathcliff': 'crhea',
'HeathcliffSpanish': 'crhes',
'HerbAndJamaal': 'crher',
'HigginsJack': 'jh',
'HomeAndAway': 'wphaa',
'HorseyDavid': 'tmdho',
'Housebroken': 'tmhou',
'HubertAndAbby': 'haa',
'IdiotBox': 'ibox',
'ImagineThis': 'imt',
'InkPen': 'ink',
'InTheBleachers': 'bl',
'ItsAllAboutYou': 'wpiay',
'JamesBondSpanish': 'spjb',
'JonesClay': 'crcjo',
'KallaugherKevin': 'cwkal',
'KChroniclesThe': 'kk',
'KelleySteve': 'crske',
'Kudzu': 'tmkud',
'LaCucaracha': 'lc',
'LegendOfBill': 'lob',
'LibertyMeadows': 'crlib',
'Lio': 'lio',
'LittleDogLost': 'wpldl',
'LocherDick': 'tmdlo',
'LooseParts': 'tmloo',
'LostSheep': 'lost',
'LoweChan': 'tmclo',
'LuckovichMike': 'crmlu',
'LuckyCow': 'luc',
'MarkstienGary': 'crgma',
'MarletteDoug': 'tmdma',
'MccoyGlenn': 'gm',
'MeaningOfLilaThe': 'crlil',
'MeehanStreak': 'tmmee',
'MiddletonsThe': 'tmmid',
'MinimumSecurity': 'ms',
'ModestyBlaiseSpanish': 'spmb',
'Momma': 'crmom',
'MorinJim': 'cwjmo',
'MuttJeffSpanish': 'spmut',
'MythTickle': 'myth',
'NAoQV': 'naqv',
'NaturalSelection': 'crns',
'NestHeads': 'cpnst',
'Neurotica': 'neu',
'NonSequitur': 'nq',
'OhmanJack': 'tmjoh',
'OliphantPat': 'po',
'OnAClaireDay': 'crocd',
'OneBigHappy': 'crobh',
'OtherCoastThe': 'crtoc',
'OutOfTheGenePool': 'wpgen',
'Overboard': 'ob',
'OverboardSpanish': 'spob',
'PepeSpanish': 'sppep',
'PettJoel': 'jp',
'Pibgorn': 'pib',
'Pickles': 'wppic',
'Pluggers': 'tmplu',
'PoochCafe': 'poc',
'PoochCafeSpanish': 'sppoc',
'PopCulture': 'pop',
'PowellDwane': 'crdpo',
'Preteena': 'pr',
'PricklyCity': 'prc',
'QuigmansThe': 'tmqui',
'RallComic': 'tr',
'RamirezMicheal': 'crmrm',
'RamseyMarshall': 'crmra',
'RealLifeAdventures': 'rl',
'RedAndRover': 'wpred',
'RedMeat': 'red',
'ReynoldsUnwrapped': 'rw',
'RonaldinhoGaucho': 'ron',
'RonaldinhoGauchoSpanish': 'spron',
'Rubes': 'crrub',
'SackSteve': 'tmssa',
'SargentBen': 'bs',
'SargentBenSpanish': 'spbs',
'SendHelp': 'send',
'ShenemanDrew': 'tmdsh',
'SherffiusDrew': 'crjsh',
'Shoecabbage': 'shcab',
'Shoe': 'tmsho',
'SigmundSpanish': 'spsig',
'Slowpoke': 'slow',
'SmallWorld': 'small',
'SpaceIsThePlace': 'sitp',
'SpeedBump': 'crspe',
'StanisScott': 'crsst',
'StateOfTheUnion': 'crsou',
'StayskalWayne': 'tmwst',
'StoneSoup': 'ss',
'StrangeBrew': 'crstr',
'SummersDana': 'tmdsu',
'SuttonImpact': 'stn',
'Sylvia': 'tmsyl',
'SzepPaul': 'crpsz',
'TankMcnamara': 'tm',
'TeenageMutantNinjaTurtles': 'tmnt',
'TelnaesAnn': 'tmate',
'TheArgyleSweater': 'tas',
'ThePinkPanther': 'tmpnk',
'TheWizardOfId': 'crwiz',
'TheWizardOfIdSpanish': 'crwis',
'ThInk': 'think',
'ThompsonMike': 'crmth',
'ThroughThickAndThin': 'cpthk',
'TinySepuku': 'tiny',
'Toby': 'toby',
'TolesTom': 'tt',
'TomTheDancingBug': 'td',
'TooMuchCoffeeMan': 'tmcm',
'Trevor': 'trev',
'TutelandiaSpanish': 'sptut',
'VarvelGary': 'crgva',
'WassermanDan': 'tmdwa',
'WatchYourHead': 'wpwyh',
'Waylay': 'min',
'WeePals': 'crwee',
'WinnieThePooh': 'crwin',
'WitOfTheWorld': 'cwwit',
'WorkingItOut': 'crwio',
'WriteDon': 'tmdow',
'YennySpanish': 'spyen',
'Yenny': 'yen',
'ZackHill': 'crzhi',
'ZiggySpanish': 'spzi',
'Ziggy': 'zi',
'9 Chickweed Lane': 'strip',
}
for name, shortname in comics.items():
add(name, shortname)
for name, category in comics.items():
add(name, category)

View file

@@ -2,6 +2,7 @@
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012 Bastian Kleineidam
from re import compile
from ..scraper import _BasicScraper
from ..util import tagre

View file

@@ -22,6 +22,9 @@ class _BasicScraper(object):
@cvar prevSearch: A compiled regex that will locate the URL for the
previous strip when applied to a strip page.
'''
# if more than one image per URL is expected
multipleImagesPerStrip = False
# usually the index format help
help = 'Sorry, no help for this comic yet.'
def __init__(self, indexes=None):
@@ -44,7 +47,9 @@ class _BasicScraper(object):
def getStrip(self, url):
"""Get comic strip for given URL."""
imageUrls = fetchUrls(url, self.imageSearch)
imageUrls = fetchUrls(url, self.imageSearch)[0]
if len(imageUrls) > 1 and not self.multipleImagesPerStrip:
raise ValueError("found %d images with %s" % (len(imageUrls), self.imageSearch.pattern))
return self.getComicStrip(url, imageUrls)
def getComicStrip(self, url, imageUrls):
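
For illustration, a scraper that legitimately yields several images per
page opts out of the new single-image check like this (hypothetical
class; the uclick module above does the same via make_scraper):

    class ExampleGallery(_BasicScraper):
        name = 'Example/Gallery'
        multipleImagesPerStrip = True  # more than one imageSearch match is fine
        imageSearch = compile(tagre("img", "src", r'(/strips/[^"]+)'))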
@@ -140,11 +145,13 @@ def get_scrapers():
"""
global _scrapers
if _scrapers is None:
out.write("Loading comic modules...")
modules = loader.get_modules()
plugins = loader.get_plugins(modules, _BasicScraper)
_scrapers = list(plugins)
_scrapers.sort(key=lambda s: s.get_name())
check_scrapers()
out.write("... %d modules loaded." % len(_scrapers))
return _scrapers

View file

@@ -4,6 +4,7 @@
from __future__ import division, print_function
import urllib2, urlparse
import requests
import sys
import os
import cgi
@@ -42,10 +43,6 @@ def tagre(tag, attribute, value, quote='"', before="", after=""):
@return: the generated regular expression suitable for re.compile()
@rtype: string
"""
if before:
before += "[^>]*"
if after:
after += "[^>]*"
attrs = dict(
tag=case_insensitive_re(tag),
attribute=case_insensitive_re(attribute),
@@ -54,7 +51,7 @@ def tagre(tag, attribute, value, quote='"', before="", after=""):
before=before,
after=after,
)
return r'<\s*%(tag)s\s+(?:[^>]*%(before)s\s+)?%(attribute)s\s*=\s*%(quote)s%(value)s%(quote)s[^>]*%(after)s>' % attrs
return r'<\s*%(tag)s\s+(?:[^>]*%(before)s[^>]*\s+)?%(attribute)s\s*=\s*%(quote)s%(value)s%(quote)s[^>]*%(after)s[^>]*>' % attrs
def case_insensitive_re(name):
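
A self-contained sketch of the widened pattern (simplified: it leaves
out the case_insensitive_re() wrapping of the tag and attribute names):

    import re

    def tagre_sketch(tag, attribute, value, quote='"', before="", after=""):
        attrs = dict(tag=tag, attribute=attribute, value=value,
                     quote=quote, before=before, after=after)
        return (r'<\s*%(tag)s\s+(?:[^>]*%(before)s[^>]*\s+)?'
                r'%(attribute)s\s*=\s*%(quote)s%(value)s%(quote)s'
                r'[^>]*%(after)s[^>]*>' % attrs)

    ro = re.compile(tagre_sketch("img", "src", r'([^"]+)', before="page-image"))
    mo = ro.search('<img class="page-image" alt="x" src="http://example.com/a.png">')
    print(mo.group(1))  # http://example.com/a.png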
@@ -74,7 +71,7 @@ baseSearch = re.compile(tagre("base", "href", '([^"]*)'))
def getPageContent(url):
# read page data
page = urlopen(url)
data = page.read(MAX_FILESIZE)
data = page.text
# determine base URL
baseUrl = None
match = baseSearch.search(data)
@@ -105,7 +102,7 @@ def fetchUrls(url, imageSearch, prevSearch=None):
imageUrl = match.group(1)
if not imageUrl:
raise ValueError("Match empty image URL at %s with pattern %s" % (url, imageSearch.pattern))
out.write('matched image URL %r' % imageUrl, 2)
out.write('matched image URL %r with pattern %s' % (imageUrl, imageSearch.pattern), 2)
imageUrls.add(normaliseURL(urlparse.urljoin(baseUrl, imageUrl)))
if not imageUrls:
out.write("warning: no images found at %s with pattern %s" % (url, imageSearch.pattern))
@@ -178,21 +175,17 @@ def urlopen(url, referrer=None, retries=3, retry_wait_seconds=5):
out.write('Open URL %s' % url, 2)
assert retries >= 0, 'invalid retry value %r' % retries
assert retry_wait_seconds > 0, 'invalid retry seconds value %r' % retry_wait_seconds
req = urllib2.Request(url)
headers = {'User-Agent': UserAgent}
config = {"max_retries": retries}
if referrer:
req.add_header('Referer', referrer)
req.add_header('User-Agent', UserAgent)
tries = 0
while True:
headers['Referer'] = referrer
try:
return urllib2.urlopen(req)
except IOError as err:
req = requests.get(url, headers=headers, config=config)
req.raise_for_status()
return req
except requests.exceptions.RequestException as err:
msg = 'URL retrieval of %s failed: %s' % (url, err)
out.write(msg)
out.write('waiting %d seconds and retrying (%d)' % (retry_wait_seconds, tries), 2)
time.sleep(retry_wait_seconds)
tries += 1
if tries >= retries:
raise IOError(msg)
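
The replacement code path boils down to the following (a sketch against
the pre-1.0 requests API, where get() still accepted a config dict;
requests 1.0 later moved max_retries to transport adapters):

    import requests

    req = requests.get('http://www.example.com/',  # hypothetical URL
                       headers={'User-Agent': 'Dosage'},
                       config={"max_retries": 3})
    req.raise_for_status()   # turn HTTP error codes into exceptions
    page_text = req.text     # decoded text, used by getPageContent()
    raw_bytes = req.content  # raw bytes, written out when saving a comic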
@@ -259,11 +252,9 @@ def internal_error(out=sys.stderr, etype=None, evalue=None, tb=None):
print("""********** Oops, I did it again. *************
You have found an internal error in %(app)s. Please write a bug report
at %(url)s and include the following information:
- your commandline arguments and any configuration file in ~/.dosage/
- the system information below
at %(url)s and include at least the information below:
Not disclosing some of the information above due to privacy reasons is ok.
Not disclosing some of the information below due to privacy reasons is ok.
I will try to help you nonetheless, but you have to give me something
I can work with ;) .
""" % dict(app=AppName, url=SupportUrl), file=out)
@@ -308,6 +299,7 @@ def print_app_info(out=sys.stderr):
{"version": sys.version, "platform": sys.platform}, file=out)
stime = strtime(time.time())
print("Local time:", stime, file=out)
print("sys.argv", sys.argv, file=out)
def strtime(t):

requirements.txt Normal file
View file

@@ -0,0 +1,2 @@
requests

View file

@@ -4,6 +4,7 @@
import tempfile
import shutil
import re
import os
from itertools import islice
from unittest import TestCase
from dosagelib import scraper
@@ -16,6 +17,16 @@ class _ComicTester(TestCase):
def setUp(self):
self.name = self.scraperclass.get_name()
self.url = self.scraperclass.starter()
# create a temporary directory for images
self.tmpdir = tempfile.mkdtemp()
def tearDown(self):
shutil.rmtree(self.tmpdir)
def get_saved_images(self):
"""Get saved images."""
dirs = tuple(self.name.split('/'))
return os.listdir(os.path.join(self.tmpdir, *dirs))
def test_comic(self):
# Test a scraper. It must be able to traverse backward for
@@ -23,7 +34,8 @@ class _ComicTester(TestCase):
# on at least 4 pages.
scraperobj = self.scraperclass()
num = empty = 0
for strip in islice(scraperobj.getAllStrips(), 0, 5):
max_strips = 5
for strip in islice(scraperobj.getAllStrips(), 0, max_strips):
images = 0
for image in strip.getImages():
images += 1
@@ -35,6 +47,15 @@ class _ComicTester(TestCase):
num += 1
if self.scraperclass.prevSearch:
self.check(num >= 4, 'traversal failed after %d strips, check the prevSearch pattern.' % num)
# Check that exactly max_strips images were saved, or at least max_strips when a strip may contain multiple images.
saved_images = self.get_saved_images()
num_images = len(saved_images)
if self.scraperclass.multipleImagesPerStrip:
self.check(num_images >= max_strips,
'saved %d %s instead of at least %d images in %s' % (num_images, saved_images, max_strips, self.tmpdir))
else:
self.check(num_images == max_strips,
'saved %d %s instead of %d images in %s' % (num_images, saved_images, max_strips, self.tmpdir))
self.check(empty == 0, 'failed to find images on %d pages, check the imageSearch pattern.' % empty)
def check_stripurl(self, strip):
@@ -50,28 +71,28 @@
self.check(mo is not None, 'strip URL %r does not match stripUrl pattern %s' % (strip.stripUrl, urlmatch))
def save(self, image):
# create a temporary directory
tmpdir = tempfile.mkdtemp()
try:
image.save(tmpdir)
image.save(self.tmpdir)
except Exception as msg:
self.check(False, 'could not save %s to %s: %s' % (image.url, tmpdir, msg))
finally:
shutil.rmtree(tmpdir)
self.check(False, 'could not save %s to %s: %s' % (image.url, self.tmpdir, msg))
def check(self, condition, msg):
self.assertTrue(condition, "%s %s %s" % (self.name, self.url, msg))
def make_comic_tester(name, **kwargs):
"""Create and return a _ComicTester class with given name and attributes."""
return type(name, (_ComicTester,), kwargs)
def generate_comic_testers():
"""For each comic scraper, create a test class."""
g = globals()
# Limit number of scraper tests for now
max_scrapers = 100
max_scrapers = 10000
for scraperclass in islice(scraper.get_scrapers(), 0, max_scrapers):
name = 'Test'+scraperclass.__name__
globals()[name] = type(name,
(_ComicTester,),
dict(scraperclass=scraperclass)
)
g[name] = make_comic_tester(name, scraperclass=scraperclass)
generate_comic_testers()
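
The net effect of the generator, spelled out for one hypothetical
scraper class:

    # TestXkcd = make_comic_tester('TestXkcd', scraperclass=Xkcd)
    # i.e. one TestCase subclass per comic scraper, found by test discovery.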