diff --git a/doc/README.txt b/doc/README.txt
index 6e48f11c5..5041c56f2 100644
--- a/doc/README.txt
+++ b/doc/README.txt
@@ -40,10 +40,11 @@ manual page.
Dependencies
-------------
-Dosage requires Python version 2.7 or higher, which can be downloaded
-from http://www.python.org.
-No external Python modules are required - only the Python Standard Library
-that gets installed with Python.
+Dosage requires Python version 2.7 or higher, which can be downloaded
+from http://www.python.org/
+
+Additionally, the python-requests module must be installed; it can be downloaded
+from http://docs.python-requests.org/en/latest/
Installation
-------------
@@ -59,7 +60,7 @@ or if you do not have root permissions:
Technical Description
----------------------
-Dosage is written entirely in Python and relies on regular expressions to
+Dosage is written in Python and relies on regular expressions to
do most of the grunt work.
For each webcomic Dosage has a plugin module, found in the "plugins"
diff --git a/doc/changelog.txt b/doc/changelog.txt
index 5cb01c80a..81b1608e2 100644
--- a/doc/changelog.txt
+++ b/doc/changelog.txt
@@ -4,6 +4,7 @@ Features:
- cmdline: Added proper return codes for error conditions.
- comics: Added more robust regular expressions for HTML tags.
They match case insensitive and ignore whitespaces now.
+- comics: Use the python-requests module for HTTP requests.
Changes:
- installation: Added support for dynamic configuration values.
diff --git a/dosagelib/comic.py b/dosagelib/comic.py
index 70e12fec4..b2e21a892 100644
--- a/dosagelib/comic.py
+++ b/dosagelib/comic.py
@@ -2,7 +2,6 @@
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012 Bastian Kleineidam
-import urllib2
import os
import locale
import rfc822
@@ -55,18 +54,24 @@ class ComicImage(object):
"""Connect to host and get meta information."""
try:
self.urlobj = urlopen(self.url, referrer=self.referrer)
- except urllib2.HTTPError as he:
+ except IOError as he:
raise FetchComicError('Unable to retrieve URL.', self.url, he.code)
- if self.urlobj.info().getmaintype() != 'image' and \
- self.urlobj.info().gettype() not in ('application/octet-stream', 'application/x-shockwave-flash'):
+ content_type = self.urlobj.headers.get('content-type')
+ content_type = content_type.split(';', 1)[0]
+ if '/' in content_type:
+ maintype, subtype = content_type.split('/', 1)
+ else:
+ maintype = content_type
+ subtype = None
+ if maintype != 'image' and content_type not in ('application/octet-stream', 'application/x-shockwave-flash'):
raise FetchComicError('No suitable image found to retrieve.', self.url)
# Always use mime type for file extension if it is sane.
- if self.urlobj.info().getmaintype() == 'image':
- self.ext = '.' + self.urlobj.info().getsubtype().replace('jpeg', 'jpg')
- self.contentLength = int(self.urlobj.info().get('content-length', 0))
- self.lastModified = self.urlobj.info().get('last-modified')
+ if maintype == 'image':
+ self.ext = '.' + subtype.replace('jpeg', 'jpg')
+ self.contentLength = int(self.urlobj.headers.get('content-length', 0))
+ self.lastModified = self.urlobj.headers.get('last-modified')
out.write('... filename = %r, ext = %r, contentLength = %d' % (self.filename, self.ext, self.contentLength), 2)
def touch(self, filename):
@@ -88,7 +93,6 @@ class ComicImage(object):
fn = os.path.join(comicDir, filename)
if os.path.isfile(fn) and os.path.getsize(fn) >= comicSize:
- self.urlobj.close()
self.touch(fn)
out.write('Skipping existing file "%s".' % (fn,), 1)
return fn, False
@@ -97,7 +101,7 @@ class ComicImage(object):
out.write('Writing comic to file %s...' % (fn,), 3)
with open(fn, 'wb') as comicOut:
startTime = time.time()
- comicOut.write(self.urlobj.read())
+ comicOut.write(self.urlobj.content)
endTime = time.time()
self.touch(fn)
except:
@@ -114,7 +118,5 @@ class ComicImage(object):
attrs = dict(fn=fn, bytes=bytes, speed=speed)
out.write('Saved "%(fn)s" (%(bytes)s bytes, %(speed)s/sec).' % attrs, 1)
getHandler().comicDownloaded(self.name, fn)
- finally:
- self.urlobj.close()
return fn, True
diff --git a/dosagelib/plugins/d.py b/dosagelib/plugins/d.py
index 175c9b0e3..685a6dde9 100644
--- a/dosagelib/plugins/d.py
+++ b/dosagelib/plugins/d.py
@@ -13,7 +13,7 @@ from ..util import tagre, getQueryParams
class DMFA(_BasicScraper):
latestUrl = 'http://www.missmab.com/'
stripUrl = latestUrl + 'Comics/Vol_%s.php'
- imageSearch = compile(tagre("img", "src", r'(Comics/|Vol)[^"]+)'))
+ imageSearch = compile(tagre("img", "src", r'((?:Comics/|Vol)[^"]+)'))
prevSearch = compile(tagre("a", "href", r'([^"])+')+
tagre("img", "src", r'(?:../)?Images/comicprev.gif'))
help = 'Index format: nnn (normally, some specials)'
diff --git a/dosagelib/plugins/drunkduck.py b/dosagelib/plugins/drunkduck.py
index a8ca15c49..492da01bd 100644
--- a/dosagelib/plugins/drunkduck.py
+++ b/dosagelib/plugins/drunkduck.py
@@ -4,22 +4,29 @@
from re import compile
from ..scraper import make_scraper
-from ..helpers import bounceStarter, queryNamer
+from ..helpers import bounceStarter
from ..util import tagre
def add(name):
classname = 'DrunkDuck_%s' % name
url = 'http://www.drunkduck.com/%s/' % name
- linkSearch = tagre("a", "href", r"(/[^/]*/index\.php\?p=\d+)", quote="'", after="The %s page")
+ linkSearch = tagre("a", "href", r"(/%s/\d+/)" % name)
+
+ @classmethod
+ def namer(cls, imageUrl, pageUrl):
+ index = int(pageUrl.rstrip('/').split('/')[-1])
+ ext = imageUrl.rsplit('.')[-1]
+ return '%d.%s' % (index, ext)
+
globals()[classname] = make_scraper(classname,
name = 'DrunkDuck/' + name,
- starter = bounceStarter(url, compile(linkSearch % 'next')),
- stripUrl = url + 'index.php?p=%s' % name,
- imageSearch = compile(tagre("img", "src", r"(http://[a-z0-9]*\.drunkduck\.com/[^/]*/pages/[^'/]+)", quote="'")),
- prevSearch= compile(linkSearch % 'previous'),
+ starter = bounceStarter(url, compile(linkSearch + tagre("img", "class", "arrow_next"))),
+ stripUrl = url + '%s/',
+ imageSearch = compile(tagre("img", "src", r'(http://media\.drunkduck\.com\.s3\.amazonaws\.com:80/[^"]+)', before="page-image")),
+ prevSearch= compile(linkSearch + tagre("img", "class", "arrow_prev")),
help = 'Index format: n (unpadded)',
- namer = queryNamer('p', usePageUrl=True),
+ namer = namer,
)
comics = (
diff --git a/dosagelib/plugins/fallenangel.py b/dosagelib/plugins/fallenangel.py
index d0fceea5f..b9d3fb933 100644
--- a/dosagelib/plugins/fallenangel.py
+++ b/dosagelib/plugins/fallenangel.py
@@ -1,47 +1,26 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012 Bastian Kleineidam
-from ..scraper import _BasicScraper
+
+from re import compile
+from ..scraper import make_scraper
+from ..util import asciify
-def fallenangel(name, shortname):
- pass # XXX
-
-class _TheFallenAngel(_BasicScraper):
- imageSearch = compile(r'SRC="(http://www.thefallenangel.co.uk/\w+comics/.+?)"')
- prevSearch = compile(r' ]+?src="http://www.thefallenangel.co.uk/images/previousday.jpg"')
- help = 'Index format: yyyymmdd'
-
- @property
- def baseUrl(self):
- return 'http://www.thefallenangel.co.uk/cgi-bin/%sautokeen/autokeenlite.cgi' % (self.shortName,)
-
-
- @property
- def stripUrl(self):
- return self.baseUrl + '?date=%s'
-
-
- def starter(self):
- return self.baseUrl
-
-
-
-class HighMaintenance(_TheFallenAngel):
- name = 'TheFallenAngel/HighMaintenance'
- shortName = 'hm'
-
-
-
-class FAWK(_TheFallenAngel):
- name = 'TheFallenAngel/FAWK'
- shortName = 'fawk'
-
-
-
-class MalloryChan(_TheFallenAngel):
- name = 'TheFallenAngel/MalloryChan'
- shortName = 'mallorychan'
+def add(name, shortname):
+ latestUrl = 'http://www.thefallenangel.co.uk/cgi-bin/%sautokeen/autokeenlite.cgi' % shortname
+ classname = asciify(name)
+ globals()[classname] = make_scraper(classname,
+ latestUrl = latestUrl,
+ stripUrl = latestUrl + '?date=%s',
+ name='FallenAngel/' + name,
+ imageSearch = compile(r'SRC="(http://www.thefallenangel.co.uk/\w+comics/.+?)"'),
+ prevSearch = compile(r' ]+?src="http://www.thefallenangel.co.uk/images/previousday.jpg"'),
+ help = 'Index format: yyyymmdd',
+ )
+add('HighMaintenance', 'hm')
+add('FAWK', 'fawk')
+add('MalloryChan', 'mallorychan')
diff --git a/dosagelib/plugins/gocomics.py b/dosagelib/plugins/gocomics.py
index 7a861e309..dd6dacee2 100644
--- a/dosagelib/plugins/gocomics.py
+++ b/dosagelib/plugins/gocomics.py
@@ -29,30 +29,30 @@ def add(name, repl=''):
# http://www.gocomics.com/features
-# note that comics from creators.com are not repeated here
+# Duplicate comics from creators.com are commented out
add('2 Cows and a Chicken')
add('9 Chickweed Lane')
add('9 to 5')
add('The Academia Waltz')
add('Adam at Home')
-add('Agnes')
+#add('Agnes')
add('Alley Oop', repl='-')
add('Andertoons')
-add('Andy Capp')
+#add('Andy Capp')
add('Angry Little Girls', repl='-')
add('Animal Crackers')
add('Annie')
add('The Argyle Sweater')
add('Arlo and Janis')
-add('Ask Shagg')
-add('BC')
+#add('Ask Shagg')
+#add('BC')
add('Back in the Day')
add('Bad Reporter')
add('Baldo')
-add('Ballard Street')
+#add('Ballard Street')
add('Banana Triangle', repl='-')
add('Barkeater Lake')
-add('The Barn')
+#add('The Barn')
add('Barney and Clyde')
add('Basic Instructions')
add('Beardo')
@@ -81,13 +81,13 @@ add('Brewster Rockit')
add('Broom Hilda')
add('The Buckets')
add('Buni')
-add('Cafe con Leche')
+#add('Cafe con Leche')
add('Calvin and Hobbes')
add('Candorville')
add('Cathy')
add('Cest la Vie')
add('Cheap Thrills Cuisine', repl='-')
-add('Chuckle Bros')
+#add('Chuckle Bros')
add('Citizen Dog')
add('The City')
add('Cleats')
@@ -99,15 +99,15 @@ add('Cow and Boy')
add('CowTown')
add('Crumb')
add('Cul de Sac')
-add('Daddys Home')
+#add('Daddys Home')
add('Dark Side of the Horse')
add('Deep Cover')
-add('Diamond Lil')
+#add('Diamond Lil')
add('Dick Tracy')
-add('The Dinette Set')
+#add('The Dinette Set')
add('Dixie Drive', repl='-')
-add('Dog Eat Doug')
-add('Dogs of C Kennel')
+#add('Dog Eat Doug')
+#add('Dogs of C Kennel')
add('Domestic Abuse')
add('Doonesbury')
add('The Doozies')
@@ -122,18 +122,18 @@ add('F Minus')
add('Family Tree')
add('Farcus')
add('Fat Cats', repl='-')
-add('Flo and Friends')
+#add('Flo and Friends')
add('The Flying McCoys')
add('Foolish Mortals', repl='-')
add('For Better or For Worse')
-add('For Heavens Sake')
+#add('For Heavens Sake')
add('Fort Knox')
add('FoxTrot')
add('FoxTrot Classics')
add('Frank and Ernest')
add('Frazz')
add('Fred Basset')
-add('Free Range')
+#add('Free Range')
add('Freshly Squeezed')
add('Frog Applause')
add('The Fusco Brothers')
@@ -154,9 +154,9 @@ add('Haiku Ewe')
add('Ham Shears')
add('Health Capsules')
add('Heart of the City')
-add('Heathcliff')
+#add('Heathcliff')
add('Heavenly Nostrils')
-add('Herb and Jamaal')
+#add('Herb and Jamaal')
add('Herman')
add('Home and Away')
add('HUBRIS!')
@@ -184,7 +184,7 @@ add('La Cucaracha')
add('Last Kiss')
add('The LeftyBosco Picture Show')
add('Legend of Bill')
-add('Liberty Meadows')
+#add('Liberty Meadows')
add('Lil Abner')
add('Lio')
add('Little Dog Lost')
@@ -201,7 +201,7 @@ add('Maintaining')
add('Marias Day')
add('Marmaduke')
add('McArroni')
-add('The Meaning of Lila')
+#add('The Meaning of Lila')
add('Medium Large')
add('Meg Classics')
add('The Middletons')
@@ -209,7 +209,7 @@ add('Mike du Jour')
add('Minimum Security')
add('Moderately Confused')
add('Molly and the Bear')
-add('Momma')
+#add('Momma')
add('Monty')
add('Motley Classics')
add('Mr. Gigi and the Squid')
@@ -217,7 +217,7 @@ add('Mutt and Jeff')
add('My Cage')
add('MythTickle')
add('Nancy')
-add('Nest Heads')
+#add('Nest Heads')
add('NEUROTICA')
add('New Adventures of Queen Victoria')
add('Non Sequitur')
@@ -225,10 +225,10 @@ add('The Norm Classics')
add('Nothing is Not Something')
add('Off the Mark')
add('Ollie and Quentin')
-add('On A Claire Day')
-add('One Big Happy')
+#add('On A Claire Day')
+#add('One Big Happy')
add('Ordinary Bill')
-add('The Other Coast')
+#add('The Other Coast')
add('Out of the Gene Pool Re-Runs')
add('Over the Hedge')
add('Overboard')
@@ -254,10 +254,10 @@ add('Reply All')
add('Rip Haywire')
add('Ripleys Believe It or Not')
add('Rose is Rose')
-add('Rubes')
+#add('Rubes')
add('Rudy Park')
add('Savage Chickens')
-add('Scary Gary')
+#add('Scary Gary')
add('Shirley and Son Classics')
add('Shoe')
add('Shoecabbage')
@@ -266,11 +266,11 @@ add('Skin Horse')
add('Skippy')
add('Slowpoke')
add('Soup to Nutz')
-add('Speed Bump')
+#add('Speed Bump')
add('Spot the Frog')
add('Starslip')
add('Stone Soup')
-add('Strange Brew')
+#add('Strange Brew')
add('The Sunshine Club')
add('Sylvia')
add('Tank McNamara')
@@ -280,7 +280,7 @@ add('Tales of TerraTopia')
add('That is Priceless')
add('Thats Life')
add('Thatababy')
-add('Thin Lines')
+#add('Thin Lines')
add('Tiny Sepuku')
add('TOBY')
add('Todays Dogg')
@@ -293,12 +293,12 @@ add('Unstrange Phenomena')
add('U.S. Acres')
add('Viivi and Wagner')
add('Watch Your Head')
-add('Wee Pals')
-add('Wizard of Id')
+#add('Wee Pals')
+#add('Wizard of Id')
add('Working Daze')
-add('Working It Out')
+#add('Working It Out')
add('W.T. Duck')
-add('Zack Hill')
+#add('Zack Hill')
add('Ziggy')
# http://www.gocomics.com/explore/editorial_list
diff --git a/dosagelib/plugins/keenspot.py b/dosagelib/plugins/keenspot.py
index 7bb18117a..e0e8686b8 100644
--- a/dosagelib/plugins/keenspot.py
+++ b/dosagelib/plugins/keenspot.py
@@ -18,9 +18,9 @@ def add(name, urls):
name='KeenSpot/' + name,
latestUrl=latestUrl,
stripUrl=baseUrl + 'd/%s.html',
- imageSearch = compile(tagre("img", "src", r'([^"]*comics/[^"]+)')),
- prevSearch = compile(tagre("a", "href", r'"([^"]*d/\d{8}\.html)') +
- '(?:]+?(?:name="previous_day"|alt="Previous"|src="[^"]*back[^"]*")|Previous comic)'),
+ imageSearch = compile(tagre("img", "src", r'([^"]*/comics/[^"]+)')),
+ prevSearch = compile(tagre("a", "href", r'([^"]*/d/\d{8}\.html)') +
+ '(?:Previous comic|'+tagre("img", "alt", "Previous comic")+')'),
help = 'Index format: yyyymmdd',
)
diff --git a/dosagelib/plugins/num.py b/dosagelib/plugins/num.py
index 4adb53102..7504bad25 100644
--- a/dosagelib/plugins/num.py
+++ b/dosagelib/plugins/num.py
@@ -11,6 +11,7 @@ from ..scraper import _BasicScraper
class NineteenNinetySeven(_BasicScraper):
name = '1997'
latestUrl = 'http://www.1977thecomic.com/'
+ stripUrl = latestUrl + '%s'
imageSearch = compile(tagre("img", "src", r'(http://www\.1977thecomic\.com/comics-1977/[^"]+)'))
prevSearch = compile(tagre("a", "href", r'([^"]+)')+"Previous")
help = 'Index format: yyyy/mm/dd/strip-name'
diff --git a/dosagelib/plugins/s.py b/dosagelib/plugins/s.py
index fbf5235a3..0cbd3f107 100644
--- a/dosagelib/plugins/s.py
+++ b/dosagelib/plugins/s.py
@@ -61,7 +61,7 @@ class Sheldon(_BasicScraper):
latestUrl = 'http://www.sheldoncomics.com/'
stripUrl = latestUrl + 'archive/%s.html'
imageSearch = compile(tagre("img", "src", r'(/strips/[^"]+)'))
- prevSearch = compile(tagre("a", "href", r'/archive/\d+\.html)', after="sidenav-prev"))
+ prevSearch = compile(tagre("a", "href", r'(/archive/\d+\.html)', after="sidenav-prev"))
help = 'Index format: yymmdd'
diff --git a/dosagelib/plugins/uc.py b/dosagelib/plugins/uc.py
index 00d1ec553..22c16b243 100644
--- a/dosagelib/plugins/uc.py
+++ b/dosagelib/plugins/uc.py
@@ -1,280 +1,54 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012 Bastian Kleineidam
-
-from re import compile, sub
+"""
+The Universal comics only have some samples, but those samples are always the newest ones.
+"""
+import datetime
+from re import compile, escape
from ..scraper import make_scraper
-from ..util import fetchUrl, tagre
+from ..util import tagre, asciify, getPageContent
-def add(name, shortName):
- homepage = 'http://content.uclick.com/a2z.html'
- baseUrl = 'http://www.uclick.com/client/zzz/%s/'
- latestUrl = baseUrl % shortName
- classname = 'UClick_%s' % name
+def parse_strdate(strdate):
+    """Parse date string. XXX this is locale dependent, but it should not be."""
+ return datetime.datetime.strptime(strdate, "%A, %B %d, %Y")
+
+
+def add(name, category):
+ shortname = name.replace(' ', '').lower()
+ latestUrl = 'http://www.universaluclick.com/comics/%s/%s' % (category, shortname)
+ classname = 'UClick_%s' % asciify(name)
@classmethod
- def fetchSubmodules(cls):
- exclusions = ('index',)
- # XXX refactor this mess
- submoduleSearch = compile(tagre("a", "href", r'(http://content\.uclick\.com/content/\w+\.html)'))
- partsMatch = compile(tagre("a", "href", r'http://content\.uclick\.com/content/(\w+?)\.html'))
- matches = fetchManyMatches(cls.homepage, (submoduleSearch,))[0]
- possibles = [partsMatch.match(match).groups() for match in matches]
-
- def normalizeName(name):
- name = sub(r'&(.)acute;', r'\1', name).title()
- return ''.join([c for c in name if c.isalnum()])
-
- def fetchSubmodule(module):
- try:
- return fetchUrl(cls.baseUrl % module, cls.imageSearch)
- except Exception:
- # XXX log error
- return False
-
- return [normalizeName(name) for part, name in possibles if part not in exclusions and fetchSubmodule(part)]
+ def namer(cls, imageUrl, pageUrl):
+ """Parse publish date from page content which looks like:
+
+ published: Sunday, November 11, 2012
+ """
+ data = getPageContent(pageUrl)[0]
+ ro = compile(tagre("img", "src", escape(imageUrl)) + r'\s+published: ([^<]+)')
+ mo = ro.search(data)
+ if mo:
+ strdate = mo.group(1)
+ return parse_strdate(strdate).strftime("%Y%m%d")
globals()[classname] = make_scraper(classname,
name='UClick/' + name,
latestUrl = latestUrl,
stripUrl = latestUrl + '%s/',
- imageSearch = compile(tagre("img", "src", r'(http://synd\.imgsrv\.uclick\.com/comics/\w+/\d{4}/[^"]+\.gif)')),
- prevSearch = compile(tagre("a", "href", r'(/client/zzz/\w+/\d{4}/\d{2}/\d{2}/)') + 'Previous date'),
- help = 'Index format: yyyy/mm/dd',
+ imageSearch = compile(tagre("img", "src", r'(http://assets\.amuniversal\.com/[^"]+)') + r'\s+published'),
+ multipleImagesPerStrip = True,
+ prevSearch = None,
+ help = 'Index format: none',
+ namer = namer,
)
+# List is from http://www.universaluclick.com/comics/list
comics = {
- '5thWave': 'fw',
- '9To5': 'tmntf',
- 'AdamHome': 'ad',
- 'Agnes': 'cragn',
- 'AlcarazLalo': 'la',
- 'AlcarazLaloSpanish': 'spla',
- 'AndersonNick': 'wpnan',
- 'AndyCapp': 'crcap',
- 'AnimalCrackers': 'tmani',
- 'Annie': 'tmann',
- 'AsayChuck': 'crcas',
- 'AskShagg': 'crask',
- 'AuthTony': 'ta',
- 'BadReporter': 'bad',
- 'Baldo': 'ba',
- 'BaldoSpanish': 'be',
- 'BallardStreet': 'crbal',
- 'BarkEaterLake': 'bark',
- 'BarstowDonna': 'dba',
- 'BC': 'crbc',
- 'BCSpanish': 'crbcs',
- 'BeattieBruce': 'crbbe',
- 'BennetClay': 'wpcbe',
- 'BensonLisa': 'wplbe',
- 'BensonSteve': 'crsbe',
- 'BigTop': 'bt',
- 'Biographic': 'biov',
- 'Bleeker': 'blk',
- 'BobTheSquirrel': 'bob',
- 'BoilingPoint': 'boil',
- 'BokChip': 'crcbo',
- 'BoNanas': 'bon',
- 'Boomerangs': 'boom',
- 'BoondocksThe': 'bo',
- 'BottomLiners': 'tmbot',
- 'BoundAndGagged': 'tmbou',
- 'Brainwaves': 'bwv',
- 'BreenSteve': 'crsbr',
- 'BrendaStarr': 'tmbre',
- 'BrewsterRockit': 'tmrkt',
- 'BrittChris': 'crcbr',
- 'BroomHilda': 'tmbro',
- 'Candorville': 'cand',
- 'CarlsonStuart': 'sc',
- 'CatalinoKen': 'crkca',
- 'Cathy': 'ca',
- 'CathySpanish': 'spca',
- 'CEstLaVie': 'clv',
- 'CityThe': 'derf',
- 'ClearBlueWater': 'cbw',
- 'Cleats': 'cle',
- 'CloseToHome': 'cl',
- 'CombsPaul': 'tmcmb',
- 'CompuToon': 'tmcom',
- 'Condorito': 'cond',
- 'ConradPaul': 'tmpco',
- 'Cornered': 'co',
- 'CulDeSac': 'cds',
- 'DanzigerJeff': 'jd',
- 'DaviesMatt': 'tmmda',
- 'DeepCover': 'deep',
- 'DeeringJohn': 'crjde',
- 'DickTracy': 'tmdic',
- 'DinetteSetThe': 'crdin',
- 'DogEatDoug': 'crdog',
- 'DonWright': 'tmdow',
- 'Doodles': 'tmdoo',
- 'Doonesbury': 'db',
- 'DuplexThe': 'dp',
- 'Eek': 'eek',
- 'ElderberriesThe': 'eld',
- 'FacesInTheNews': 'kw',
- 'FlightDeck': 'crfd',
- 'FloAndFriends': 'crflo',
- 'FlyingMccoysThe': 'fmc',
- 'ForBetterOrForWorse': 'fb',
- 'ForHeavenSSake': 'crfhs',
- 'FoxtrotClassics': 'ftcl',
- 'Foxtrot': 'ft',
- 'FoxtrotSpanish': 'spft',
- 'FrankAndErnest': 'fa',
- 'FredBassetSpanish': 'spfba',
- 'FredBasset': 'tmfba',
- 'FrogApplause': 'frog',
- 'FuscoBrothersThe': 'fu',
- 'Garfield': 'ga',
- 'GarfieldSpanish': 'gh',
- 'GasolineAlley': 'tmgas',
- 'GaturroSpanish': 'spgat',
- 'GilThorp': 'tmgil',
- 'GingerMeggs': 'gin',
- 'GingerMeggsSpanish': 'spgin',
- 'GirlsAndSports': 'crgis',
- 'GorrellBob': 'crbgo',
- 'GoTeamBob': 'gtb',
- 'HammondBruce': 'hb',
- 'HandelsmanWalt': 'tmwha',
- 'HeartOfTheCity': 'hc',
- 'Heathcliff': 'crhea',
- 'HeathcliffSpanish': 'crhes',
- 'HerbAndJamaal': 'crher',
- 'HigginsJack': 'jh',
- 'HomeAndAway': 'wphaa',
- 'HorseyDavid': 'tmdho',
- 'Housebroken': 'tmhou',
- 'HubertAndAbby': 'haa',
- 'IdiotBox': 'ibox',
- 'ImagineThis': 'imt',
- 'InkPen': 'ink',
- 'InTheBleachers': 'bl',
- 'ItsAllAboutYou': 'wpiay',
- 'JamesBondSpanish': 'spjb',
- 'JonesClay': 'crcjo',
- 'KallaugherKevin': 'cwkal',
- 'KChroniclesThe': 'kk',
- 'KelleySteve': 'crske',
- 'Kudzu': 'tmkud',
- 'LaCucaracha': 'lc',
- 'LegendOfBill': 'lob',
- 'LibertyMeadows': 'crlib',
- 'Lio': 'lio',
- 'LittleDogLost': 'wpldl',
- 'LocherDick': 'tmdlo',
- 'LooseParts': 'tmloo',
- 'LostSheep': 'lost',
- 'LoweChan': 'tmclo',
- 'LuckovichMike': 'crmlu',
- 'LuckyCow': 'luc',
- 'MarkstienGary': 'crgma',
- 'MarletteDoug': 'tmdma',
- 'MccoyGlenn': 'gm',
- 'MeaningOfLilaThe': 'crlil',
- 'MeehanStreak': 'tmmee',
- 'MiddletonsThe': 'tmmid',
- 'MinimumSecurity': 'ms',
- 'ModestyBlaiseSpanish': 'spmb',
- 'Momma': 'crmom',
- 'MorinJim': 'cwjmo',
- 'MuttJeffSpanish': 'spmut',
- 'MythTickle': 'myth',
- 'NAoQV': 'naqv',
- 'NaturalSelection': 'crns',
- 'NestHeads': 'cpnst',
- 'Neurotica': 'neu',
- 'NonSequitur': 'nq',
- 'OhmanJack': 'tmjoh',
- 'OliphantPat': 'po',
- 'OnAClaireDay': 'crocd',
- 'OneBigHappy': 'crobh',
- 'OtherCoastThe': 'crtoc',
- 'OutOfTheGenePool': 'wpgen',
- 'Overboard': 'ob',
- 'OverboardSpanish': 'spob',
- 'PepeSpanish': 'sppep',
- 'PettJoel': 'jp',
- 'Pibgorn': 'pib',
- 'Pickles': 'wppic',
- 'Pluggers': 'tmplu',
- 'PoochCafe': 'poc',
- 'PoochCafeSpanish': 'sppoc',
- 'PopCulture': 'pop',
- 'PowellDwane': 'crdpo',
- 'Preteena': 'pr',
- 'PricklyCity': 'prc',
- 'QuigmansThe': 'tmqui',
- 'RallComic': 'tr',
- 'RamirezMicheal': 'crmrm',
- 'RamseyMarshall': 'crmra',
- 'RealLifeAdventures': 'rl',
- 'RedAndRover': 'wpred',
- 'RedMeat': 'red',
- 'ReynoldsUnwrapped': 'rw',
- 'RonaldinhoGaucho': 'ron',
- 'RonaldinhoGauchoSpanish': 'spron',
- 'Rubes': 'crrub',
- 'SackSteve': 'tmssa',
- 'SargentBen': 'bs',
- 'SargentBenSpanish': 'spbs',
- 'SendHelp': 'send',
- 'ShenemanDrew': 'tmdsh',
- 'SherffiusDrew': 'crjsh',
- 'Shoecabbage': 'shcab',
- 'Shoe': 'tmsho',
- 'SigmundSpanish': 'spsig',
- 'Slowpoke': 'slow',
- 'SmallWorld': 'small',
- 'SpaceIsThePlace': 'sitp',
- 'SpeedBump': 'crspe',
- 'StanisScott': 'crsst',
- 'StateOfTheUnion': 'crsou',
- 'StayskalWayne': 'tmwst',
- 'StoneSoup': 'ss',
- 'StrangeBrew': 'crstr',
- 'SummersDana': 'tmdsu',
- 'SuttonImpact': 'stn',
- 'Sylvia': 'tmsyl',
- 'SzepPaul': 'crpsz',
- 'TankMcnamara': 'tm',
- 'TeenageMutantNinjaTurtles': 'tmnt',
- 'TelnaesAnn': 'tmate',
- 'TheArgyleSweater': 'tas',
- 'ThePinkPanther': 'tmpnk',
- 'TheWizardOfId': 'crwiz',
- 'TheWizardOfIdSpanish': 'crwis',
- 'ThInk': 'think',
- 'ThompsonMike': 'crmth',
- 'ThroughThickAndThin': 'cpthk',
- 'TinySepuku': 'tiny',
- 'Toby': 'toby',
- 'TolesTom': 'tt',
- 'TomTheDancingBug': 'td',
- 'TooMuchCoffeeMan': 'tmcm',
- 'Trevor': 'trev',
- 'TutelandiaSpanish': 'sptut',
- 'VarvelGary': 'crgva',
- 'WassermanDan': 'tmdwa',
- 'WatchYourHead': 'wpwyh',
- 'Waylay': 'min',
- 'WeePals': 'crwee',
- 'WinnieThePooh': 'crwin',
- 'WitOfTheWorld': 'cwwit',
- 'WorkingItOut': 'crwio',
- 'WriteDon': 'tmdow',
- 'YennySpanish': 'spyen',
- 'Yenny': 'yen',
- 'ZackHill': 'crzhi',
- 'ZiggySpanish': 'spzi',
- 'Ziggy': 'zi',
+ '9 Chickweed Lane': 'strip',
}
-for name, shortname in comics.items():
- add(name, shortname)
+for name, category in comics.items():
+ add(name, category)
diff --git a/dosagelib/plugins/y.py b/dosagelib/plugins/y.py
index 4f7833d61..d00ee05ab 100644
--- a/dosagelib/plugins/y.py
+++ b/dosagelib/plugins/y.py
@@ -2,6 +2,7 @@
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012 Bastian Kleineidam
+from re import compile
from ..scraper import _BasicScraper
from ..util import tagre
diff --git a/dosagelib/scraper.py b/dosagelib/scraper.py
index e02bb9963..144a861f2 100644
--- a/dosagelib/scraper.py
+++ b/dosagelib/scraper.py
@@ -22,6 +22,9 @@ class _BasicScraper(object):
@cvar prevSearch: A compiled regex that will locate the URL for the
previous strip when applied to a strip page.
'''
+ # if more than one image per URL is expected
+ multipleImagesPerStrip = False
+ # usually the index format help
help = 'Sorry, no help for this comic yet.'
def __init__(self, indexes=None):
@@ -44,7 +47,9 @@ class _BasicScraper(object):
def getStrip(self, url):
"""Get comic strip for given URL."""
- imageUrls = fetchUrls(url, self.imageSearch)
+ imageUrls = fetchUrls(url, self.imageSearch)[0]
+ if len(imageUrls) > 1 and not self.multipleImagesPerStrip:
+ raise ValueError("found %d images with %s" % (len(imageUrls), self.imageSearch.pattern))
return self.getComicStrip(url, imageUrls)
def getComicStrip(self, url, imageUrls):
@@ -140,11 +145,13 @@ def get_scrapers():
"""
global _scrapers
if _scrapers is None:
+ out.write("Loading comic modules...")
modules = loader.get_modules()
plugins = loader.get_plugins(modules, _BasicScraper)
_scrapers = list(plugins)
_scrapers.sort(key=lambda s: s.get_name())
check_scrapers()
+ out.write("... %d modules loaded." % len(_scrapers))
return _scrapers
diff --git a/dosagelib/util.py b/dosagelib/util.py
index 49d8f5803..c12dd0c00 100644
--- a/dosagelib/util.py
+++ b/dosagelib/util.py
@@ -4,6 +4,7 @@
from __future__ import division, print_function
import urllib2, urlparse
+import requests
import sys
import os
import cgi
@@ -42,10 +43,6 @@ def tagre(tag, attribute, value, quote='"', before="", after=""):
@return: the generated regular expression suitable for re.compile()
@rtype: string
"""
- if before:
- before += "[^>]*"
- if after:
- after += "[^>]*"
attrs = dict(
tag=case_insensitive_re(tag),
attribute=case_insensitive_re(attribute),
@@ -54,7 +51,7 @@ def tagre(tag, attribute, value, quote='"', before="", after=""):
before=before,
after=after,
)
- return r'<\s*%(tag)s\s+(?:[^>]*%(before)s\s+)?%(attribute)s\s*=\s*%(quote)s%(value)s%(quote)s[^>]*%(after)s>' % attrs
+ return r'<\s*%(tag)s\s+(?:[^>]*%(before)s[^>]*\s+)?%(attribute)s\s*=\s*%(quote)s%(value)s%(quote)s[^>]*%(after)s[^>]*>' % attrs
def case_insensitive_re(name):
@@ -74,7 +71,7 @@ baseSearch = re.compile(tagre("base", "href", '([^"]*)'))
def getPageContent(url):
# read page data
page = urlopen(url)
- data = page.read(MAX_FILESIZE)
+ data = page.text
# determine base URL
baseUrl = None
match = baseSearch.search(data)
@@ -105,7 +102,7 @@ def fetchUrls(url, imageSearch, prevSearch=None):
imageUrl = match.group(1)
if not imageUrl:
raise ValueError("Match empty image URL at %s with pattern %s" % (url, imageSearch.pattern))
- out.write('matched image URL %r' % imageUrl, 2)
+ out.write('matched image URL %r with pattern %s' % (imageUrl, imageSearch.pattern), 2)
imageUrls.add(normaliseURL(urlparse.urljoin(baseUrl, imageUrl)))
if not imageUrls:
out.write("warning: no images found at %s with pattern %s" % (url, imageSearch.pattern))
@@ -178,22 +175,18 @@ def urlopen(url, referrer=None, retries=3, retry_wait_seconds=5):
out.write('Open URL %s' % url, 2)
assert retries >= 0, 'invalid retry value %r' % retries
assert retry_wait_seconds > 0, 'invalid retry seconds value %r' % retry_wait_seconds
- req = urllib2.Request(url)
+ headers = {'User-Agent': UserAgent}
+ config = {"max_retries": retries}
if referrer:
- req.add_header('Referer', referrer)
- req.add_header('User-Agent', UserAgent)
- tries = 0
- while True:
- try:
- return urllib2.urlopen(req)
- except IOError as err:
- msg = 'URL retrieval of %s failed: %s' % (url, err)
- out.write(msg)
- out.write('waiting %d seconds and retrying (%d)' % (retry_wait_seconds, tries), 2)
- time.sleep(retry_wait_seconds)
- tries += 1
- if tries >= retries:
- raise IOError(msg)
+ headers['Referer'] = referrer
+ try:
+ req = requests.get(url, headers=headers, config=config)
+ req.raise_for_status()
+ return req
+ except requests.exceptions.RequestException as err:
+ msg = 'URL retrieval of %s failed: %s' % (url, err)
+ out.write(msg)
+ raise IOError(msg)
def get_columns (fp):
@@ -259,11 +252,9 @@ def internal_error(out=sys.stderr, etype=None, evalue=None, tb=None):
print("""********** Oops, I did it again. *************
You have found an internal error in %(app)s. Please write a bug report
-at %(url)s and include the following information:
-- your commandline arguments and any configuration file in ~/.dosage/
-- the system information below
+at %(url)s and include at least the information below:
-Not disclosing some of the information above due to privacy reasons is ok.
+Not disclosing some of the information below due to privacy reasons is ok.
I will try to help you nonetheless, but you have to give me something
I can work with ;) .
""" % dict(app=AppName, url=SupportUrl), file=out)
@@ -308,6 +299,7 @@ def print_app_info(out=sys.stderr):
{"version": sys.version, "platform": sys.platform}, file=out)
stime = strtime(time.time())
print("Local time:", stime, file=out)
+ print("sys.argv", sys.argv, file=out)
def strtime(t):
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 000000000..3288e9274
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,2 @@
+requests
+
diff --git a/tests/test_comics.py b/tests/test_comics.py
index 8b2f353d9..c50bcf60c 100644
--- a/tests/test_comics.py
+++ b/tests/test_comics.py
@@ -4,6 +4,7 @@
import tempfile
import shutil
import re
+import os
from itertools import islice
from unittest import TestCase
from dosagelib import scraper
@@ -16,6 +17,16 @@ class _ComicTester(TestCase):
def setUp(self):
self.name = self.scraperclass.get_name()
self.url = self.scraperclass.starter()
+ # create a temporary directory for images
+ self.tmpdir = tempfile.mkdtemp()
+
+ def tearDown(self):
+ shutil.rmtree(self.tmpdir)
+
+ def get_saved_images(self):
+ """Get saved images."""
+ dirs = tuple(self.name.split('/'))
+ return os.listdir(os.path.join(self.tmpdir, *dirs))
def test_comic(self):
# Test a scraper. It must be able to traverse backward for
@@ -23,7 +34,8 @@ class _ComicTester(TestCase):
# on at least 4 pages.
scraperobj = self.scraperclass()
num = empty = 0
- for strip in islice(scraperobj.getAllStrips(), 0, 5):
+ max_strips = 5
+ for strip in islice(scraperobj.getAllStrips(), 0, max_strips):
images = 0
for image in strip.getImages():
images += 1
@@ -35,6 +47,15 @@ class _ComicTester(TestCase):
num += 1
if self.scraperclass.prevSearch:
self.check(num >= 4, 'traversal failed after %d strips, check the prevSearch pattern.' % num)
+        # check that exactly max_strips images were saved (or at least that many
+        # for comics with multiple images per strip)
+ saved_images = self.get_saved_images()
+ num_images = len(saved_images)
+ if self.scraperclass.multipleImagesPerStrip:
+ self.check(num_images >= max_strips,
+ 'saved %d %s instead of at least %d images in %s' % (num_images, saved_images, max_strips, self.tmpdir))
+ else:
+ self.check(num_images == max_strips,
+ 'saved %d %s instead of %d images in %s' % (num_images, saved_images, max_strips, self.tmpdir))
self.check(empty == 0, 'failed to find images on %d pages, check the imageSearch pattern.' % empty)
def check_stripurl(self, strip):
@@ -50,28 +71,28 @@ class _ComicTester(TestCase):
self.check(mo is not None, 'strip URL %r does not match stripUrl pattern %s' % (strip.stripUrl, urlmatch))
def save(self, image):
- # create a temporary directory
- tmpdir = tempfile.mkdtemp()
try:
- image.save(tmpdir)
+ image.save(self.tmpdir)
except Exception as msg:
- self.check(False, 'could not save %s to %s: %s' % (image.url, tmpdir, msg))
- finally:
- shutil.rmtree(tmpdir)
+ self.check(False, 'could not save %s to %s: %s' % (image.url, self.tmpdir, msg))
def check(self, condition, msg):
self.assertTrue(condition, "%s %s %s" % (self.name, self.url, msg))
+def make_comic_tester(name, **kwargs):
+ """Create and return a _ComicTester class with given name and attributes."""
+ return type(name, (_ComicTester,), kwargs)
+
+
def generate_comic_testers():
"""For each comic scraper, create a test class."""
+ g = globals()
# Limit number of scraper tests for now
- max_scrapers = 100
+ max_scrapers = 10000
for scraperclass in islice(scraper.get_scrapers(), 0, max_scrapers):
name = 'Test'+scraperclass.__name__
- globals()[name] = type(name,
- (_ComicTester,),
- dict(scraperclass=scraperclass)
- )
+ g[name] = make_comic_tester(name, scraperclass=scraperclass)
+
generate_comic_testers()