From 0556ffd30a6616c1ee515b18fece9d9d9bfedb59 Mon Sep 17 00:00:00 2001 From: Bastian Kleineidam Date: Mon, 26 Nov 2012 18:44:31 +0100 Subject: [PATCH] Fix comics, improve tests, use python-requests. --- doc/README.txt | 11 +- doc/changelog.txt | 1 + dosagelib/comic.py | 26 +-- dosagelib/plugins/d.py | 2 +- dosagelib/plugins/drunkduck.py | 21 ++- dosagelib/plugins/fallenangel.py | 57 ++---- dosagelib/plugins/gocomics.py | 70 ++++---- dosagelib/plugins/keenspot.py | 6 +- dosagelib/plugins/num.py | 1 + dosagelib/plugins/s.py | 2 +- dosagelib/plugins/uc.py | 296 ++++--------------------------- dosagelib/plugins/y.py | 1 + dosagelib/scraper.py | 9 +- dosagelib/util.py | 44 ++--- requirements.txt | 2 + tests/test_comics.py | 45 +++-- 16 files changed, 191 insertions(+), 403 deletions(-) create mode 100644 requirements.txt diff --git a/doc/README.txt b/doc/README.txt index 6e48f11c5..5041c56f2 100644 --- a/doc/README.txt +++ b/doc/README.txt @@ -40,10 +40,11 @@ manual page. Dependencies ------------- -Dosage requires Python version 2.7 or higher, which can be downloaded -from http://www.python.org. -No external Python modules are required - only the Python Standard Library -that gets installed with Python. +Python version 2.7 or higher, which can be downloaded +from http://www.python.org/ + +Also the python-requests module must be installed, which can be downloaded +from http://docs.python-requests.org/en/latest/ Installation ------------- @@ -59,7 +60,7 @@ or if you do not have root permissions: Technical Description ---------------------- -Dosage is written entirely in Python and relies on regular expressions to +Dosage is written in Python and relies on regular expressions to do most of the grunt work. For each webcomic Dosage has a plugin module, found in the "plugins" diff --git a/doc/changelog.txt b/doc/changelog.txt index 5cb01c80a..81b1608e2 100644 --- a/doc/changelog.txt +++ b/doc/changelog.txt @@ -4,6 +4,7 @@ Features: - cmdline: Added proper return codes for error conditions. - comics: Added more robust regular expressions for HTML tags. They match case insensitive and ignore whitespaces now. +- comics: Use the python-requests module for HTTP requests. Changes: - installation: Added support for dynamic configuration values. diff --git a/dosagelib/comic.py b/dosagelib/comic.py index 70e12fec4..b2e21a892 100644 --- a/dosagelib/comic.py +++ b/dosagelib/comic.py @@ -2,7 +2,6 @@ # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2012 Bastian Kleineidam -import urllib2 import os import locale import rfc822 @@ -55,18 +54,24 @@ class ComicImage(object): """Connect to host and get meta information.""" try: self.urlobj = urlopen(self.url, referrer=self.referrer) - except urllib2.HTTPError as he: + except IOError as he: raise FetchComicError('Unable to retrieve URL.', self.url, he.code) - if self.urlobj.info().getmaintype() != 'image' and \ - self.urlobj.info().gettype() not in ('application/octet-stream', 'application/x-shockwave-flash'): + content_type = self.urlobj.headers.get('content-type') + content_type = content_type.split(';', 1)[0] + if '/' in content_type: + maintype, subtype = content_type.split('/', 1) + else: + maintype = content_type + subtype = None + if maintype != 'image' and content_type not in ('application/octet-stream', 'application/x-shockwave-flash'): raise FetchComicError('No suitable image found to retrieve.', self.url) # Always use mime type for file extension if it is sane. - if self.urlobj.info().getmaintype() == 'image': - self.ext = '.' + self.urlobj.info().getsubtype().replace('jpeg', 'jpg') - self.contentLength = int(self.urlobj.info().get('content-length', 0)) - self.lastModified = self.urlobj.info().get('last-modified') + if maintype == 'image': + self.ext = '.' + subtype.replace('jpeg', 'jpg') + self.contentLength = int(self.urlobj.headers.get('content-length', 0)) + self.lastModified = self.urlobj.headers.get('last-modified') out.write('... filename = %r, ext = %r, contentLength = %d' % (self.filename, self.ext, self.contentLength), 2) def touch(self, filename): @@ -88,7 +93,6 @@ class ComicImage(object): fn = os.path.join(comicDir, filename) if os.path.isfile(fn) and os.path.getsize(fn) >= comicSize: - self.urlobj.close() self.touch(fn) out.write('Skipping existing file "%s".' % (fn,), 1) return fn, False @@ -97,7 +101,7 @@ class ComicImage(object): out.write('Writing comic to file %s...' % (fn,), 3) with open(fn, 'wb') as comicOut: startTime = time.time() - comicOut.write(self.urlobj.read()) + comicOut.write(self.urlobj.content) endTime = time.time() self.touch(fn) except: @@ -114,7 +118,5 @@ class ComicImage(object): attrs = dict(fn=fn, bytes=bytes, speed=speed) out.write('Saved "%(fn)s" (%(bytes)s bytes, %(speed)s/sec).' % attrs, 1) getHandler().comicDownloaded(self.name, fn) - finally: - self.urlobj.close() return fn, True diff --git a/dosagelib/plugins/d.py b/dosagelib/plugins/d.py index 175c9b0e3..685a6dde9 100644 --- a/dosagelib/plugins/d.py +++ b/dosagelib/plugins/d.py @@ -13,7 +13,7 @@ from ..util import tagre, getQueryParams class DMFA(_BasicScraper): latestUrl = 'http://www.missmab.com/' stripUrl = latestUrl + 'Comics/Vol_%s.php' - imageSearch = compile(tagre("img", "src", r'(Comics/|Vol)[^"]+)')) + imageSearch = compile(tagre("img", "src", r'((?:Comics/|Vol)[^"]+)')) prevSearch = compile(tagre("a", "href", r'([^"])+')+ tagre("img", "src", r'(?:../)?Images/comicprev.gif')) help = 'Index format: nnn (normally, some specials)' diff --git a/dosagelib/plugins/drunkduck.py b/dosagelib/plugins/drunkduck.py index a8ca15c49..492da01bd 100644 --- a/dosagelib/plugins/drunkduck.py +++ b/dosagelib/plugins/drunkduck.py @@ -4,22 +4,29 @@ from re import compile from ..scraper import make_scraper -from ..helpers import bounceStarter, queryNamer +from ..helpers import bounceStarter from ..util import tagre def add(name): classname = 'DrunkDuck_%s' % name url = 'http://www.drunkduck.com/%s/' % name - linkSearch = tagre("a", "href", r"(/[^/]*/index\.php\?p=\d+)", quote="'", after="The %s page") + linkSearch = tagre("a", "href", r"(/%s/\d+/)" % name) + + @classmethod + def namer(cls, imageUrl, pageUrl): + index = int(pageUrl.rstrip('/').split('/')[-1]) + ext = imageUrl.rsplit('.')[-1] + return '%d.%s' % (index, ext) + globals()[classname] = make_scraper(classname, name = 'DrunkDuck/' + name, - starter = bounceStarter(url, compile(linkSearch % 'next')), - stripUrl = url + 'index.php?p=%s' % name, - imageSearch = compile(tagre("img", "src", r"(http://[a-z0-9]*\.drunkduck\.com/[^/]*/pages/[^'/]+)", quote="'")), - prevSearch= compile(linkSearch % 'previous'), + starter = bounceStarter(url, compile(linkSearch + tagre("img", "class", "arrow_next"))), + stripUrl = url + '%s/', + imageSearch = compile(tagre("img", "src", r'(http://media\.drunkduck\.com\.s3\.amazonaws\.com:80/[^"]+)', before="page-image")), + prevSearch= compile(linkSearch + tagre("img", "class", "arrow_prev")), help = 'Index format: n (unpadded)', - namer = queryNamer('p', usePageUrl=True), + namer = namer, ) comics = ( diff --git a/dosagelib/plugins/fallenangel.py b/dosagelib/plugins/fallenangel.py index d0fceea5f..b9d3fb933 100644 --- a/dosagelib/plugins/fallenangel.py +++ b/dosagelib/plugins/fallenangel.py @@ -1,47 +1,26 @@ # -*- coding: iso-8859-1 -*- # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2012 Bastian Kleineidam -from ..scraper import _BasicScraper + +from re import compile +from ..scraper import make_scraper +from ..util import asciify -def fallenangel(name, shortname): - pass # XXX - -class _TheFallenAngel(_BasicScraper): - imageSearch = compile(r'SRC="(http://www.thefallenangel.co.uk/\w+comics/.+?)"') - prevSearch = compile(r' ]+?src="http://www.thefallenangel.co.uk/images/previousday.jpg"') - help = 'Index format: yyyymmdd' - - @property - def baseUrl(self): - return 'http://www.thefallenangel.co.uk/cgi-bin/%sautokeen/autokeenlite.cgi' % (self.shortName,) - - - @property - def stripUrl(self): - return self.baseUrl + '?date=%s' - - - def starter(self): - return self.baseUrl - - - -class HighMaintenance(_TheFallenAngel): - name = 'TheFallenAngel/HighMaintenance' - shortName = 'hm' - - - -class FAWK(_TheFallenAngel): - name = 'TheFallenAngel/FAWK' - shortName = 'fawk' - - - -class MalloryChan(_TheFallenAngel): - name = 'TheFallenAngel/MalloryChan' - shortName = 'mallorychan' +def add(name, shortname): + latestUrl = 'http://www.thefallenangel.co.uk/cgi-bin/%sautokeen/autokeenlite.cgi' % shortname + classname = asciify(name) + globals()[classname] = make_scraper(classname, + latestUrl = latestUrl, + stripUrl = latestUrl + '?date=%s', + name='FallenAngel/' + name, + imageSearch = compile(r'SRC="(http://www.thefallenangel.co.uk/\w+comics/.+?)"'), + prevSearch = compile(r' ]+?src="http://www.thefallenangel.co.uk/images/previousday.jpg"'), + help = 'Index format: yyyymmdd', + ) +add('HighMaintenance', 'hm') +add('FAWK', 'fawk') +add('MalloryChan', 'mallorychan') diff --git a/dosagelib/plugins/gocomics.py b/dosagelib/plugins/gocomics.py index 7a861e309..dd6dacee2 100644 --- a/dosagelib/plugins/gocomics.py +++ b/dosagelib/plugins/gocomics.py @@ -29,30 +29,30 @@ def add(name, repl=''): # http://www.gocomics.com/features -# note that comics from creators.com are not repeated here +# Duplicate comics from creators.com are commented out add('2 Cows and a Chicken') add('9 Chickweed Lane') add('9 to 5') add('The Academia Waltz') add('Adam at Home') -add('Agnes') +#add('Agnes') add('Alley Oop', repl='-') add('Andertoons') -add('Andy Capp') +#add('Andy Capp') add('Angry Little Girls', repl='-') add('Animal Crackers') add('Annie') add('The Argyle Sweater') add('Arlo and Janis') -add('Ask Shagg') -add('BC') +#add('Ask Shagg') +#add('BC') add('Back in the Day') add('Bad Reporter') add('Baldo') -add('Ballard Street') +#add('Ballard Street') add('Banana Triangle', repl='-') add('Barkeater Lake') -add('The Barn') +#add('The Barn') add('Barney and Clyde') add('Basic Instructions') add('Beardo') @@ -81,13 +81,13 @@ add('Brewster Rockit') add('Broom Hilda') add('The Buckets') add('Buni') -add('Cafe con Leche') +#add('Cafe con Leche') add('Calvin and Hobbes') add('Candorville') add('Cathy') add('Cest la Vie') add('Cheap Thrills Cuisine', repl='-') -add('Chuckle Bros') +#add('Chuckle Bros') add('Citizen Dog') add('The City') add('Cleats') @@ -99,15 +99,15 @@ add('Cow and Boy') add('CowTown') add('Crumb') add('Cul de Sac') -add('Daddys Home') +#add('Daddys Home') add('Dark Side of the Horse') add('Deep Cover') -add('Diamond Lil') +#add('Diamond Lil') add('Dick Tracy') -add('The Dinette Set') +#add('The Dinette Set') add('Dixie Drive', repl='-') -add('Dog Eat Doug') -add('Dogs of C Kennel') +#add('Dog Eat Doug') +#add('Dogs of C Kennel') add('Domestic Abuse') add('Doonesbury') add('The Doozies') @@ -122,18 +122,18 @@ add('F Minus') add('Family Tree') add('Farcus') add('Fat Cats', repl='-') -add('Flo and Friends') +#add('Flo and Friends') add('The Flying McCoys') add('Foolish Mortals', repl='-') add('For Better or For Worse') -add('For Heavens Sake') +#add('For Heavens Sake') add('Fort Knox') add('FoxTrot') add('FoxTrot Classics') add('Frank and Ernest') add('Frazz') add('Fred Basset') -add('Free Range') +#add('Free Range') add('Freshly Squeezed') add('Frog Applause') add('The Fusco Brothers') @@ -154,9 +154,9 @@ add('Haiku Ewe') add('Ham Shears') add('Health Capsules') add('Heart of the City') -add('Heathcliff') +#add('Heathcliff') add('Heavenly Nostrils') -add('Herb and Jamaal') +#add('Herb and Jamaal') add('Herman') add('Home and Away') add('HUBRIS!') @@ -184,7 +184,7 @@ add('La Cucaracha') add('Last Kiss') add('The LeftyBosco Picture Show') add('Legend of Bill') -add('Liberty Meadows') +#add('Liberty Meadows') add('Lil Abner') add('Lio') add('Little Dog Lost') @@ -201,7 +201,7 @@ add('Maintaining') add('Marias Day') add('Marmaduke') add('McArroni') -add('The Meaning of Lila') +#add('The Meaning of Lila') add('Medium Large') add('Meg Classics') add('The Middletons') @@ -209,7 +209,7 @@ add('Mike du Jour') add('Minimum Security') add('Moderately Confused') add('Molly and the Bear') -add('Momma') +#add('Momma') add('Monty') add('Motley Classics') add('Mr. Gigi and the Squid') @@ -217,7 +217,7 @@ add('Mutt and Jeff') add('My Cage') add('MythTickle') add('Nancy') -add('Nest Heads') +#add('Nest Heads') add('NEUROTICA') add('New Adventures of Queen Victoria') add('Non Sequitur') @@ -225,10 +225,10 @@ add('The Norm Classics') add('Nothing is Not Something') add('Off the Mark') add('Ollie and Quentin') -add('On A Claire Day') -add('One Big Happy') +#add('On A Claire Day') +#add('One Big Happy') add('Ordinary Bill') -add('The Other Coast') +#add('The Other Coast') add('Out of the Gene Pool Re-Runs') add('Over the Hedge') add('Overboard') @@ -254,10 +254,10 @@ add('Reply All') add('Rip Haywire') add('Ripleys Believe It or Not') add('Rose is Rose') -add('Rubes') +#add('Rubes') add('Rudy Park') add('Savage Chickens') -add('Scary Gary') +#add('Scary Gary') add('Shirley and Son Classics') add('Shoe') add('Shoecabbage') @@ -266,11 +266,11 @@ add('Skin Horse') add('Skippy') add('Slowpoke') add('Soup to Nutz') -add('Speed Bump') +#add('Speed Bump') add('Spot the Frog') add('Starslip') add('Stone Soup') -add('Strange Brew') +#add('Strange Brew') add('The Sunshine Club') add('Sylvia') add('Tank McNamara') @@ -280,7 +280,7 @@ add('Tales of TerraTopia') add('That is Priceless') add('Thats Life') add('Thatababy') -add('Thin Lines') +#add('Thin Lines') add('Tiny Sepuku') add('TOBY') add('Todays Dogg') @@ -293,12 +293,12 @@ add('Unstrange Phenomena') add('U.S. Acres') add('Viivi and Wagner') add('Watch Your Head') -add('Wee Pals') -add('Wizard of Id') +#add('Wee Pals') +#add('Wizard of Id') add('Working Daze') -add('Working It Out') +#add('Working It Out') add('W.T. Duck') -add('Zack Hill') +#add('Zack Hill') add('Ziggy') # http://www.gocomics.com/explore/editorial_list diff --git a/dosagelib/plugins/keenspot.py b/dosagelib/plugins/keenspot.py index 7bb18117a..e0e8686b8 100644 --- a/dosagelib/plugins/keenspot.py +++ b/dosagelib/plugins/keenspot.py @@ -18,9 +18,9 @@ def add(name, urls): name='KeenSpot/' + name, latestUrl=latestUrl, stripUrl=baseUrl + 'd/%s.html', - imageSearch = compile(tagre("img", "src", r'([^"]*comics/[^"]+)')), - prevSearch = compile(tagre("a", "href", r'"([^"]*d/\d{8}\.html)') + - '(?:]+?(?:name="previous_day"|alt="Previous"|src="[^"]*back[^"]*")|Previous comic)'), + imageSearch = compile(tagre("img", "src", r'([^"]*/comics/[^"]+)')), + prevSearch = compile(tagre("a", "href", r'([^"]*/d/\d{8}\.html)') + + '(?:Previous comic|'+tagre("img", "alt", "Previous comic")+')'), help = 'Index format: yyyymmdd', ) diff --git a/dosagelib/plugins/num.py b/dosagelib/plugins/num.py index 4adb53102..7504bad25 100644 --- a/dosagelib/plugins/num.py +++ b/dosagelib/plugins/num.py @@ -11,6 +11,7 @@ from ..scraper import _BasicScraper class NineteenNinetySeven(_BasicScraper): name = '1997' latestUrl = 'http://www.1977thecomic.com/' + stripUrl = latestUrl + '%s' imageSearch = compile(tagre("img", "src", r'(http://www\.1977thecomic\.com/comics-1977/[^"]+)')) prevSearch = compile(tagre("a", "href", r'([^"]+)')+"Previous") help = 'Index format: yyyy/mm/dd/strip-name' diff --git a/dosagelib/plugins/s.py b/dosagelib/plugins/s.py index fbf5235a3..0cbd3f107 100644 --- a/dosagelib/plugins/s.py +++ b/dosagelib/plugins/s.py @@ -61,7 +61,7 @@ class Sheldon(_BasicScraper): latestUrl = 'http://www.sheldoncomics.com/' stripUrl = latestUrl + 'archive/%s.html' imageSearch = compile(tagre("img", "src", r'(/strips/[^"]+)')) - prevSearch = compile(tagre("a", "href", r'/archive/\d+\.html)', after="sidenav-prev")) + prevSearch = compile(tagre("a", "href", r'(/archive/\d+\.html)', after="sidenav-prev")) help = 'Index format: yymmdd' diff --git a/dosagelib/plugins/uc.py b/dosagelib/plugins/uc.py index 00d1ec553..22c16b243 100644 --- a/dosagelib/plugins/uc.py +++ b/dosagelib/plugins/uc.py @@ -1,280 +1,54 @@ # -*- coding: iso-8859-1 -*- # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2012 Bastian Kleineidam - -from re import compile, sub +""" +The Universal comics only have some samples, but those samples are always the newest ones. +""" +import datetime +from re import compile, escape from ..scraper import make_scraper -from ..util import fetchUrl, tagre +from ..util import tagre, asciify, getPageContent -def add(name, shortName): - homepage = 'http://content.uclick.com/a2z.html' - baseUrl = 'http://www.uclick.com/client/zzz/%s/' - latestUrl = baseUrl % shortName - classname = 'UClick_%s' % name +def parse_strdate(strdate): + """Parse date string. XXX this is locale dependant but it should not be.""" + return datetime.datetime.strptime(strdate, "%A, %B %d, %Y") + + +def add(name, category): + shortname = name.replace(' ', '').lower() + latestUrl = 'http://www.universaluclick.com/comics/%s/%s' % (category, shortname) + classname = 'UClick_%s' % asciify(name) @classmethod - def fetchSubmodules(cls): - exclusions = ('index',) - # XXX refactor this mess - submoduleSearch = compile(tagre("a", "href", r'(http://content\.uclick\.com/content/\w+\.html)')) - partsMatch = compile(tagre("a", "href", r'http://content\.uclick\.com/content/(\w+?)\.html')) - matches = fetchManyMatches(cls.homepage, (submoduleSearch,))[0] - possibles = [partsMatch.match(match).groups() for match in matches] - - def normalizeName(name): - name = sub(r'&(.)acute;', r'\1', name).title() - return ''.join([c for c in name if c.isalnum()]) - - def fetchSubmodule(module): - try: - return fetchUrl(cls.baseUrl % module, cls.imageSearch) - except Exception: - # XXX log error - return False - - return [normalizeName(name) for part, name in possibles if part not in exclusions and fetchSubmodule(part)] + def namer(cls, imageUrl, pageUrl): + """Parse publish date from page content which looks like: + Marmaduke +

published: Sunday, November 11, 2012

+ """ + data = getPageContent(pageUrl)[0] + ro = compile(tagre("img", "src", escape(imageUrl)) + r'\s+

published: ([^<]+)') + mo = ro.search(data) + if mo: + strdate = mo.group(1) + return parse_strdate(strdate).strftime("%Y%m%d") globals()[classname] = make_scraper(classname, name='UClick/' + name, latestUrl = latestUrl, stripUrl = latestUrl + '%s/', - imageSearch = compile(tagre("img", "src", r'(http://synd\.imgsrv\.uclick\.com/comics/\w+/\d{4}/[^"]+\.gif)')), - prevSearch = compile(tagre("a", "href", r'(/client/zzz/\w+/\d{4}/\d{2}/\d{2}/)') + 'Previous date'), - help = 'Index format: yyyy/mm/dd', + imageSearch = compile(tagre("img", "src", r'(http://assets\.amuniversal\.com/[^"]+)') + r'\s+

published'), + multipleImagesPerStrip = True, + prevSearch = None, + help = 'Index format: none', + namer = namer, ) +# List is from http://www.universaluclick.com/comics/list comics = { - '5thWave': 'fw', - '9To5': 'tmntf', - 'AdamHome': 'ad', - 'Agnes': 'cragn', - 'AlcarazLalo': 'la', - 'AlcarazLaloSpanish': 'spla', - 'AndersonNick': 'wpnan', - 'AndyCapp': 'crcap', - 'AnimalCrackers': 'tmani', - 'Annie': 'tmann', - 'AsayChuck': 'crcas', - 'AskShagg': 'crask', - 'AuthTony': 'ta', - 'BadReporter': 'bad', - 'Baldo': 'ba', - 'BaldoSpanish': 'be', - 'BallardStreet': 'crbal', - 'BarkEaterLake': 'bark', - 'BarstowDonna': 'dba', - 'BC': 'crbc', - 'BCSpanish': 'crbcs', - 'BeattieBruce': 'crbbe', - 'BennetClay': 'wpcbe', - 'BensonLisa': 'wplbe', - 'BensonSteve': 'crsbe', - 'BigTop': 'bt', - 'Biographic': 'biov', - 'Bleeker': 'blk', - 'BobTheSquirrel': 'bob', - 'BoilingPoint': 'boil', - 'BokChip': 'crcbo', - 'BoNanas': 'bon', - 'Boomerangs': 'boom', - 'BoondocksThe': 'bo', - 'BottomLiners': 'tmbot', - 'BoundAndGagged': 'tmbou', - 'Brainwaves': 'bwv', - 'BreenSteve': 'crsbr', - 'BrendaStarr': 'tmbre', - 'BrewsterRockit': 'tmrkt', - 'BrittChris': 'crcbr', - 'BroomHilda': 'tmbro', - 'Candorville': 'cand', - 'CarlsonStuart': 'sc', - 'CatalinoKen': 'crkca', - 'Cathy': 'ca', - 'CathySpanish': 'spca', - 'CEstLaVie': 'clv', - 'CityThe': 'derf', - 'ClearBlueWater': 'cbw', - 'Cleats': 'cle', - 'CloseToHome': 'cl', - 'CombsPaul': 'tmcmb', - 'CompuToon': 'tmcom', - 'Condorito': 'cond', - 'ConradPaul': 'tmpco', - 'Cornered': 'co', - 'CulDeSac': 'cds', - 'DanzigerJeff': 'jd', - 'DaviesMatt': 'tmmda', - 'DeepCover': 'deep', - 'DeeringJohn': 'crjde', - 'DickTracy': 'tmdic', - 'DinetteSetThe': 'crdin', - 'DogEatDoug': 'crdog', - 'DonWright': 'tmdow', - 'Doodles': 'tmdoo', - 'Doonesbury': 'db', - 'DuplexThe': 'dp', - 'Eek': 'eek', - 'ElderberriesThe': 'eld', - 'FacesInTheNews': 'kw', - 'FlightDeck': 'crfd', - 'FloAndFriends': 'crflo', - 'FlyingMccoysThe': 'fmc', - 'ForBetterOrForWorse': 'fb', - 'ForHeavenSSake': 'crfhs', - 'FoxtrotClassics': 'ftcl', - 'Foxtrot': 'ft', - 'FoxtrotSpanish': 'spft', - 'FrankAndErnest': 'fa', - 'FredBassetSpanish': 'spfba', - 'FredBasset': 'tmfba', - 'FrogApplause': 'frog', - 'FuscoBrothersThe': 'fu', - 'Garfield': 'ga', - 'GarfieldSpanish': 'gh', - 'GasolineAlley': 'tmgas', - 'GaturroSpanish': 'spgat', - 'GilThorp': 'tmgil', - 'GingerMeggs': 'gin', - 'GingerMeggsSpanish': 'spgin', - 'GirlsAndSports': 'crgis', - 'GorrellBob': 'crbgo', - 'GoTeamBob': 'gtb', - 'HammondBruce': 'hb', - 'HandelsmanWalt': 'tmwha', - 'HeartOfTheCity': 'hc', - 'Heathcliff': 'crhea', - 'HeathcliffSpanish': 'crhes', - 'HerbAndJamaal': 'crher', - 'HigginsJack': 'jh', - 'HomeAndAway': 'wphaa', - 'HorseyDavid': 'tmdho', - 'Housebroken': 'tmhou', - 'HubertAndAbby': 'haa', - 'IdiotBox': 'ibox', - 'ImagineThis': 'imt', - 'InkPen': 'ink', - 'InTheBleachers': 'bl', - 'ItsAllAboutYou': 'wpiay', - 'JamesBondSpanish': 'spjb', - 'JonesClay': 'crcjo', - 'KallaugherKevin': 'cwkal', - 'KChroniclesThe': 'kk', - 'KelleySteve': 'crske', - 'Kudzu': 'tmkud', - 'LaCucaracha': 'lc', - 'LegendOfBill': 'lob', - 'LibertyMeadows': 'crlib', - 'Lio': 'lio', - 'LittleDogLost': 'wpldl', - 'LocherDick': 'tmdlo', - 'LooseParts': 'tmloo', - 'LostSheep': 'lost', - 'LoweChan': 'tmclo', - 'LuckovichMike': 'crmlu', - 'LuckyCow': 'luc', - 'MarkstienGary': 'crgma', - 'MarletteDoug': 'tmdma', - 'MccoyGlenn': 'gm', - 'MeaningOfLilaThe': 'crlil', - 'MeehanStreak': 'tmmee', - 'MiddletonsThe': 'tmmid', - 'MinimumSecurity': 'ms', - 'ModestyBlaiseSpanish': 'spmb', - 'Momma': 'crmom', - 'MorinJim': 'cwjmo', - 'MuttJeffSpanish': 'spmut', - 'MythTickle': 'myth', - 'NAoQV': 'naqv', - 'NaturalSelection': 'crns', - 'NestHeads': 'cpnst', - 'Neurotica': 'neu', - 'NonSequitur': 'nq', - 'OhmanJack': 'tmjoh', - 'OliphantPat': 'po', - 'OnAClaireDay': 'crocd', - 'OneBigHappy': 'crobh', - 'OtherCoastThe': 'crtoc', - 'OutOfTheGenePool': 'wpgen', - 'Overboard': 'ob', - 'OverboardSpanish': 'spob', - 'PepeSpanish': 'sppep', - 'PettJoel': 'jp', - 'Pibgorn': 'pib', - 'Pickles': 'wppic', - 'Pluggers': 'tmplu', - 'PoochCafe': 'poc', - 'PoochCafeSpanish': 'sppoc', - 'PopCulture': 'pop', - 'PowellDwane': 'crdpo', - 'Preteena': 'pr', - 'PricklyCity': 'prc', - 'QuigmansThe': 'tmqui', - 'RallComic': 'tr', - 'RamirezMicheal': 'crmrm', - 'RamseyMarshall': 'crmra', - 'RealLifeAdventures': 'rl', - 'RedAndRover': 'wpred', - 'RedMeat': 'red', - 'ReynoldsUnwrapped': 'rw', - 'RonaldinhoGaucho': 'ron', - 'RonaldinhoGauchoSpanish': 'spron', - 'Rubes': 'crrub', - 'SackSteve': 'tmssa', - 'SargentBen': 'bs', - 'SargentBenSpanish': 'spbs', - 'SendHelp': 'send', - 'ShenemanDrew': 'tmdsh', - 'SherffiusDrew': 'crjsh', - 'Shoecabbage': 'shcab', - 'Shoe': 'tmsho', - 'SigmundSpanish': 'spsig', - 'Slowpoke': 'slow', - 'SmallWorld': 'small', - 'SpaceIsThePlace': 'sitp', - 'SpeedBump': 'crspe', - 'StanisScott': 'crsst', - 'StateOfTheUnion': 'crsou', - 'StayskalWayne': 'tmwst', - 'StoneSoup': 'ss', - 'StrangeBrew': 'crstr', - 'SummersDana': 'tmdsu', - 'SuttonImpact': 'stn', - 'Sylvia': 'tmsyl', - 'SzepPaul': 'crpsz', - 'TankMcnamara': 'tm', - 'TeenageMutantNinjaTurtles': 'tmnt', - 'TelnaesAnn': 'tmate', - 'TheArgyleSweater': 'tas', - 'ThePinkPanther': 'tmpnk', - 'TheWizardOfId': 'crwiz', - 'TheWizardOfIdSpanish': 'crwis', - 'ThInk': 'think', - 'ThompsonMike': 'crmth', - 'ThroughThickAndThin': 'cpthk', - 'TinySepuku': 'tiny', - 'Toby': 'toby', - 'TolesTom': 'tt', - 'TomTheDancingBug': 'td', - 'TooMuchCoffeeMan': 'tmcm', - 'Trevor': 'trev', - 'TutelandiaSpanish': 'sptut', - 'VarvelGary': 'crgva', - 'WassermanDan': 'tmdwa', - 'WatchYourHead': 'wpwyh', - 'Waylay': 'min', - 'WeePals': 'crwee', - 'WinnieThePooh': 'crwin', - 'WitOfTheWorld': 'cwwit', - 'WorkingItOut': 'crwio', - 'WriteDon': 'tmdow', - 'YennySpanish': 'spyen', - 'Yenny': 'yen', - 'ZackHill': 'crzhi', - 'ZiggySpanish': 'spzi', - 'Ziggy': 'zi', + '9 Chickweed Lane': 'strip', } -for name, shortname in comics.items(): - add(name, shortname) +for name, category in comics.items(): + add(name, category) diff --git a/dosagelib/plugins/y.py b/dosagelib/plugins/y.py index 4f7833d61..d00ee05ab 100644 --- a/dosagelib/plugins/y.py +++ b/dosagelib/plugins/y.py @@ -2,6 +2,7 @@ # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2012 Bastian Kleineidam +from re import compile from ..scraper import _BasicScraper from ..util import tagre diff --git a/dosagelib/scraper.py b/dosagelib/scraper.py index e02bb9963..144a861f2 100644 --- a/dosagelib/scraper.py +++ b/dosagelib/scraper.py @@ -22,6 +22,9 @@ class _BasicScraper(object): @cvar prevSearch: A compiled regex that will locate the URL for the previous strip when applied to a strip page. ''' + # if more than one image per URL is expected + multipleImagesPerStrip = False + # usually the index format help help = 'Sorry, no help for this comic yet.' def __init__(self, indexes=None): @@ -44,7 +47,9 @@ class _BasicScraper(object): def getStrip(self, url): """Get comic strip for given URL.""" - imageUrls = fetchUrls(url, self.imageSearch) + imageUrls = fetchUrls(url, self.imageSearch)[0] + if len(imageUrls) > 1 and not self.multipleImagesPerStrip: + raise ValueError("found %d images with %s" % (len(imageUrls), self.imageSearch.pattern)) return self.getComicStrip(url, imageUrls) def getComicStrip(self, url, imageUrls): @@ -140,11 +145,13 @@ def get_scrapers(): """ global _scrapers if _scrapers is None: + out.write("Loading comic modules...") modules = loader.get_modules() plugins = loader.get_plugins(modules, _BasicScraper) _scrapers = list(plugins) _scrapers.sort(key=lambda s: s.get_name()) check_scrapers() + out.write("... %d modules loaded." % len(_scrapers)) return _scrapers diff --git a/dosagelib/util.py b/dosagelib/util.py index 49d8f5803..c12dd0c00 100644 --- a/dosagelib/util.py +++ b/dosagelib/util.py @@ -4,6 +4,7 @@ from __future__ import division, print_function import urllib2, urlparse +import requests import sys import os import cgi @@ -42,10 +43,6 @@ def tagre(tag, attribute, value, quote='"', before="", after=""): @return: the generated regular expression suitable for re.compile() @rtype: string """ - if before: - before += "[^>]*" - if after: - after += "[^>]*" attrs = dict( tag=case_insensitive_re(tag), attribute=case_insensitive_re(attribute), @@ -54,7 +51,7 @@ def tagre(tag, attribute, value, quote='"', before="", after=""): before=before, after=after, ) - return r'<\s*%(tag)s\s+(?:[^>]*%(before)s\s+)?%(attribute)s\s*=\s*%(quote)s%(value)s%(quote)s[^>]*%(after)s>' % attrs + return r'<\s*%(tag)s\s+(?:[^>]*%(before)s[^>]*\s+)?%(attribute)s\s*=\s*%(quote)s%(value)s%(quote)s[^>]*%(after)s[^>]*>' % attrs def case_insensitive_re(name): @@ -74,7 +71,7 @@ baseSearch = re.compile(tagre("base", "href", '([^"]*)')) def getPageContent(url): # read page data page = urlopen(url) - data = page.read(MAX_FILESIZE) + data = page.text # determine base URL baseUrl = None match = baseSearch.search(data) @@ -105,7 +102,7 @@ def fetchUrls(url, imageSearch, prevSearch=None): imageUrl = match.group(1) if not imageUrl: raise ValueError("Match empty image URL at %s with pattern %s" % (url, imageSearch.pattern)) - out.write('matched image URL %r' % imageUrl, 2) + out.write('matched image URL %r with pattern %s' % (imageUrl, imageSearch.pattern), 2) imageUrls.add(normaliseURL(urlparse.urljoin(baseUrl, imageUrl))) if not imageUrls: out.write("warning: no images found at %s with pattern %s" % (url, imageSearch.pattern)) @@ -178,22 +175,18 @@ def urlopen(url, referrer=None, retries=3, retry_wait_seconds=5): out.write('Open URL %s' % url, 2) assert retries >= 0, 'invalid retry value %r' % retries assert retry_wait_seconds > 0, 'invalid retry seconds value %r' % retry_wait_seconds - req = urllib2.Request(url) + headers = {'User-Agent': UserAgent} + config = {"max_retries": retries} if referrer: - req.add_header('Referer', referrer) - req.add_header('User-Agent', UserAgent) - tries = 0 - while True: - try: - return urllib2.urlopen(req) - except IOError as err: - msg = 'URL retrieval of %s failed: %s' % (url, err) - out.write(msg) - out.write('waiting %d seconds and retrying (%d)' % (retry_wait_seconds, tries), 2) - time.sleep(retry_wait_seconds) - tries += 1 - if tries >= retries: - raise IOError(msg) + headers['Referer'] = referrer + try: + req = requests.get(url, headers=headers, config=config) + req.raise_for_status() + return req + except requests.exceptions.RequestException as err: + msg = 'URL retrieval of %s failed: %s' % (url, err) + out.write(msg) + raise IOError(msg) def get_columns (fp): @@ -259,11 +252,9 @@ def internal_error(out=sys.stderr, etype=None, evalue=None, tb=None): print("""********** Oops, I did it again. ************* You have found an internal error in %(app)s. Please write a bug report -at %(url)s and include the following information: -- your commandline arguments and any configuration file in ~/.dosage/ -- the system information below +at %(url)s and include at least the information below: -Not disclosing some of the information above due to privacy reasons is ok. +Not disclosing some of the information below due to privacy reasons is ok. I will try to help you nonetheless, but you have to give me something I can work with ;) . """ % dict(app=AppName, url=SupportUrl), file=out) @@ -308,6 +299,7 @@ def print_app_info(out=sys.stderr): {"version": sys.version, "platform": sys.platform}, file=out) stime = strtime(time.time()) print("Local time:", stime, file=out) + print("sys.argv", sys.argv, file=out) def strtime(t): diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 000000000..3288e9274 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,2 @@ +requests + diff --git a/tests/test_comics.py b/tests/test_comics.py index 8b2f353d9..c50bcf60c 100644 --- a/tests/test_comics.py +++ b/tests/test_comics.py @@ -4,6 +4,7 @@ import tempfile import shutil import re +import os from itertools import islice from unittest import TestCase from dosagelib import scraper @@ -16,6 +17,16 @@ class _ComicTester(TestCase): def setUp(self): self.name = self.scraperclass.get_name() self.url = self.scraperclass.starter() + # create a temporary directory for images + self.tmpdir = tempfile.mkdtemp() + + def tearDown(self): + shutil.rmtree(self.tmpdir) + + def get_saved_images(self): + """Get saved images.""" + dirs = tuple(self.name.split('/')) + return os.listdir(os.path.join(self.tmpdir, *dirs)) def test_comic(self): # Test a scraper. It must be able to traverse backward for @@ -23,7 +34,8 @@ class _ComicTester(TestCase): # on at least 4 pages. scraperobj = self.scraperclass() num = empty = 0 - for strip in islice(scraperobj.getAllStrips(), 0, 5): + max_strips = 5 + for strip in islice(scraperobj.getAllStrips(), 0, max_strips): images = 0 for image in strip.getImages(): images += 1 @@ -35,6 +47,15 @@ class _ComicTester(TestCase): num += 1 if self.scraperclass.prevSearch: self.check(num >= 4, 'traversal failed after %d strips, check the prevSearch pattern.' % num) + # check that at exactly or for multiple pages at least 5 images are saved + saved_images = self.get_saved_images() + num_images = len(saved_images) + if self.scraperclass.multipleImagesPerStrip: + self.check(num_images >= max_strips, + 'saved %d %s instead of at least %d images in %s' % (num_images, saved_images, max_strips, self.tmpdir)) + else: + self.check(num_images == max_strips, + 'saved %d %s instead of %d images in %s' % (num_images, saved_images, max_strips, self.tmpdir)) self.check(empty == 0, 'failed to find images on %d pages, check the imageSearch pattern.' % empty) def check_stripurl(self, strip): @@ -50,28 +71,28 @@ class _ComicTester(TestCase): self.check(mo is not None, 'strip URL %r does not match stripUrl pattern %s' % (strip.stripUrl, urlmatch)) def save(self, image): - # create a temporary directory - tmpdir = tempfile.mkdtemp() try: - image.save(tmpdir) + image.save(self.tmpdir) except Exception as msg: - self.check(False, 'could not save %s to %s: %s' % (image.url, tmpdir, msg)) - finally: - shutil.rmtree(tmpdir) + self.check(False, 'could not save %s to %s: %s' % (image.url, self.tmpdir, msg)) def check(self, condition, msg): self.assertTrue(condition, "%s %s %s" % (self.name, self.url, msg)) +def make_comic_tester(name, **kwargs): + """Create and return a _ComicTester class with given name and attributes.""" + return type(name, (_ComicTester,), kwargs) + + def generate_comic_testers(): """For each comic scraper, create a test class.""" + g = globals() # Limit number of scraper tests for now - max_scrapers = 100 + max_scrapers = 10000 for scraperclass in islice(scraper.get_scrapers(), 0, max_scrapers): name = 'Test'+scraperclass.__name__ - globals()[name] = type(name, - (_ComicTester,), - dict(scraperclass=scraperclass) - ) + g[name] = make_comic_tester(name, scraperclass=scraperclass) + generate_comic_testers()