Refactor: Make namer a method.

Once #42 is implemented, the naming of files might differ between comic
modules, so the namer logically belongs to the instance, not the class.
This commit is contained in:
Tobias Gruetzmacher 2016-04-21 08:20:49 +02:00
parent 5bd2a49f48
commit c3f32dfef7
27 changed files with 100 additions and 142 deletions

View file

@ -8,22 +8,20 @@ from __future__ import absolute_import, division, print_function
from .util import getQueryParams
def queryNamer(paramName, usePageUrl=False):
def queryNamer(param, use_page_url=False):
"""Get name from URL query part."""
@classmethod
def _namer(cls, imageUrl, pageUrl):
def _namer(self, image_url, page_url):
"""Get URL query part."""
url = pageUrl if usePageUrl else imageUrl
return getQueryParams(url)[paramName][0]
url = page_url if use_page_url else image_url
return getQueryParams(url)[param][0]
return _namer
def regexNamer(regex, usePageUrl=False):
def regexNamer(regex, use_page_url=False):
"""Get name from regular expression."""
@classmethod
def _namer(cls, imageUrl, pageUrl):
def _namer(self, image_url, page_url):
"""Get first regular expression group."""
url = pageUrl if usePageUrl else imageUrl
url = page_url if use_page_url else image_url
mo = regex.search(url)
if mo:
return mo.group(1)

View file

@ -28,8 +28,7 @@ class AbstruseGoose(_BasicScraper):
help = 'Index format: n (unpadded)'
textSearch = compile(tagre("img", "title", r'([^"]+)'))
@classmethod
def namer(cls, image_url, page_url):
def namer(self, image_url, page_url):
index = int(page_url.rstrip('/').split('/')[-1])
name = image_url.split('/')[-1].split('.')[0]
return 'c%03d-%s' % (index, name)
@ -300,8 +299,7 @@ class AstronomyPOTD(_ParserScraper):
"""Skip pages without images."""
return data.xpath('//iframe') # videos
@classmethod
def namer(cls, image_url, page_url):
def namer(self, image_url, page_url):
return '%s-%s' % (page_url.split('/')[-1].split('.')[0][2:],
image_url.split('/')[-1].split('.')[0])

View file

@ -76,11 +76,10 @@ class Beetlebum(_BasicScraper):
help = 'Index format: yyyy/mm/dd/striptitle'
lang = 'de'
@classmethod
def namer(cls, imageUrl, pageUrl):
indexes = tuple(pageUrl.rstrip('/').split('/')[-4:])
def namer(self, image_url, page_url):
indexes = tuple(page_url.rstrip('/').split('/')[-4:])
name = '%s-%s-%s-%s' % indexes
name = name + '_' + imageUrl.split('/')[-1]
name = name + '_' + image_url.split('/')[-1]
return name
@ -154,8 +153,7 @@ class BloomingFaeries(_ParserScraper):
imageSearch = '//div[@id="comic"]//img'
prevSearch = WP_PREV_SEARCH
@classmethod
def namer(cls, image_url, page_url):
def namer(self, image_url, page_url):
return "_".join(image_url.rsplit('/', 3)[1:])
@ -197,9 +195,8 @@ class BoyOnAStickAndSlither(_BasicScraper):
"<span>Next page")
help = 'Index format: n (unpadded)'
@classmethod
def namer(cls, imageUrl, pageUrl):
return pageUrl.rsplit('/')[-1]
def namer(self, image_url, page_url):
return page_url.rsplit('/')[-1]
class BratHalla(_WordPressScraper):

View file

@ -57,9 +57,8 @@ class Carciphona(_BasicScraper):
r'(view\.php\?page=[0-9]+[^"]*)'))
starter = indirectStarter
@classmethod
def namer(cls, imageUrl, pageUrl):
ip = imageUrl.split('/')
def namer(self, image_url, page_url):
ip = image_url.split('/')
return "volume_%s_page_%s" % (ip[-2], ip[-1])
@ -262,9 +261,8 @@ class CorydonCafe(_ParserScraper):
latestSearch = '//ul//a'
help = 'Index format: yyyy/stripname'
@classmethod
def namer(cls, imageUrl, pageUrl):
return pageUrl.split('/')[-1].split('.')[0]
def namer(self, image_url, page_url):
return page_url.split('/')[-1].split('.')[0]
class CourtingDisaster(_WordPressScraper):
@ -349,10 +347,9 @@ class CyanideAndHappiness(_BasicScraper):
"""Skip pages without images."""
return "/comics/play-button.png" in data[0]
@classmethod
def namer(cls, imageUrl, pageUrl):
imgname = imageUrl.split('/')[-1]
def namer(self, image_url, page_url):
imgname = image_url.split('/')[-1]
# only get the first 100 chars for the image name
imgname = imgname[:100]
imgnum = pageUrl.split('/')[-2]
imgnum = page_url.split('/')[-2]
return '%s_%s' % (imgnum, imgname)

View file

@ -24,9 +24,8 @@ def add(name, shortName, imageFolder=None, lastStrip=None):
if imageFolder is None:
imageFolder = shortName
@classmethod
def namer(cls, imageUrl, pageUrl):
return '%03d' % int(getQueryParams(pageUrl)['page'][0])
def namer(self, image_url, page_url):
return '%03d' % int(getQueryParams(page_url)['page'][0])
def _starter(self):
# first, try hopping to previous and next comic

View file

@ -22,12 +22,11 @@ class _ComicFury(_ParserScraper):
help = 'Index format: n'
starter = bounceStarter
@classmethod
def namer(cls, imageUrl, pageUrl):
parts = pageUrl.split('/')
path, ext = os.path.splitext(imageUrl)
def namer(self, image_url, page_url):
parts = page_url.split('/')
path, ext = os.path.splitext(image_url)
num = parts[-1]
return "%s_%s%s" % (cls.__name__[2:], num, ext)
return "%s_%s%s" % (self.__class__.__name__[2:], num, ext)
@property
def url(self):

View file

@ -34,10 +34,9 @@ class DamnLol(_BasicScraper):
help = 'Index format: stripname-number'
starter = bounceStarter
@classmethod
def namer(cls, imageUrl, pageUrl):
ext = imageUrl.rsplit('.', 1)[1]
path = pageUrl.rsplit('/', 1)[1][:-5]
def namer(self, image_url, page_url):
ext = image_url.rsplit('.', 1)[1]
path = page_url.rsplit('/', 1)[1][:-5]
stripname, number = path.rsplit('-', 1)
return '%s-%s.%s' % (number, stripname, ext)
@ -136,10 +135,9 @@ class DieselSweeties(_BasicScraper):
tagre("img", "src", r'(?:http://www\.dieselsweeties\.com/ximages/blackbackarrow160.png|/ximages/prev\.gif)'))
help = 'Index format: n (unpadded)'
@classmethod
def namer(cls, imageUrl, pageUrl):
index = int(imageUrl.split('/')[-1].split('.')[0])
return 'sw%02d' % (index,)
def namer(self, image_url, page_url):
index = int(image_url.split('/')[-1].split('.')[0])
return 'sw%02d' % index
class Dilbert(_BasicScraper):
@ -154,9 +152,8 @@ class Dilbert(_BasicScraper):
after="Click to see"))
help = 'Index format: yyyy-mm-dd'
@classmethod
def namer(cls, imageUrl, pageUrl):
name = pageUrl.rsplit("/", 1)[1]
def namer(self, image_url, page_url):
name = page_url.rsplit("/", 1)[1]
return "%s" % name

View file

@ -24,10 +24,9 @@ def add(name, path):
classname = '_DrunkDuck_%s' % name
_url = 'http://www.theduckwebcomics.com/%s/' % path
@classmethod
def _namer(cls, imageUrl, pageUrl):
index = int(pageUrl.rstrip('/').split('/')[-1])
ext = imageUrl.rsplit('.')[-1]
def _namer(self, image_url, page_url):
index = int(page_url.rstrip('/').split('/')[-1])
ext = image_url.rsplit('.')[-1]
return '%d.%s' % (index, ext)
def _starter(self):

View file

@ -26,13 +26,12 @@ class EarthsongSaga(_ParserScraper):
return [x.replace('earthsongsaga.com/../',
'earthsongsaga.com/') for x in urls]
@classmethod
def namer(cls, imageUrl, pageUrl):
def namer(self, image_url, page_url):
imgmatch = compile(r'images/vol(\d+)/ch(\d+)/(\d+)\.\w+$',
IGNORECASE).search(imageUrl)
IGNORECASE).search(image_url)
if not imgmatch:
imgmatch = compile(r'images/vol(\d+)/ch(\d+)/ch(\d+)cover\.\w+$',
IGNORECASE).search(imageUrl)
IGNORECASE).search(image_url)
suffix = "cover"
else:
suffix = ""

View file

@ -146,9 +146,8 @@ class GoneWithTheBlastwave(_BasicScraper):
r'<img src="images/page/default/latest')
help = 'Index format: n'
@classmethod
def namer(cls, imageUrl, pageUrl):
return '%02d' % int(compile(r'nro=(\d+)').search(pageUrl).group(1))
def namer(self, image_url, page_url):
return '%02d' % int(compile(r'nro=(\d+)').search(page_url).group(1))
class GrrlPower(_WordPressScraper):

View file

@ -26,10 +26,9 @@ class _GoComics(_ParserScraper):
def url(self):
return 'http://www.gocomics.com/' + self.path
@classmethod
def namer(cls, image_url, page_url):
def namer(self, image_url, page_url):
prefix, year, month, day = page_url.rsplit('/', 3)
return "%s_%s%s%s.gif" % (cls.__name__[2:], year, month, day)
return "%s_%s%s%s.gif" % (self.__class__.__name__[2:], year, month, day)
def getIndexStripUrl(self, index):
return self.url + self.path + '/%s' % index

View file

@ -51,10 +51,9 @@ class HarkAVagrant(_BasicScraper):
tagre("img", "src", "buttonnext.png"))
help = 'Index format: number'
@classmethod
def namer(cls, imageUrl, pageUrl):
filename = imageUrl.rsplit('/', 1)[1]
num = pageUrl.rsplit('=', 1)[1]
def namer(self, image_url, page_url):
filename = image_url.rsplit('/', 1)[1]
num = page_url.rsplit('=', 1)[1]
return '%s-%s' % (num, filename)

View file

@ -26,11 +26,10 @@ class Lackadaisy(_BasicScraper):
help = 'Index format: n'
starter = bounceStarter
@classmethod
def namer(cls, imageUrl, pageUrl):
def namer(self, image_url, page_url):
"""Use comic id for filename."""
num = pageUrl.rsplit('=', 1)[-1]
ext = imageUrl.rsplit('.', 1)[-1]
num = page_url.rsplit('=', 1)[-1]
ext = image_url.rsplit('.', 1)[-1]
return 'lackadaisy_%s.%s' % (num, ext)

View file

@ -82,9 +82,8 @@ class MarriedToTheSea(_BasicScraper):
prevSearch = compile(tagre("a", "href", r'([^"]+)') + "&lt;&lt; Yesterday")
help = 'Index format: mmddyy'
@classmethod
def namer(cls, imageUrl, pageUrl):
unused, date, filename = imageUrl.rsplit('/', 2)
def namer(self, image_url, page_url):
unused, date, filename = image_url.rsplit('/', 2)
return '%s-%s' % (date, filename)

View file

@ -38,9 +38,8 @@ class NatalieDee(_BasicScraper):
prevSearch = compile(tagre("a", "href", r'([^"]+)') + "&lt;&lt; Yesterday")
help = 'Index format: mmddyy'
@classmethod
def namer(cls, imageUrl, pageUrl):
unused, date, filename = imageUrl.rsplit('/', 2)
def namer(self, image_url, page_url):
unused, date, filename = image_url.rsplit('/', 2)
return '%s-%s' % (date, filename)

View file

@ -74,13 +74,12 @@ class OnTheFastrack(_BasicScraper):
url = 'http://onthefastrack.com/'
stripUrl = url + 'comics/%s'
firstStripUrl = stripUrl % 'november-13-2000'
imageSearch = compile(r'(http://safr\.kingfeatures\.com/idn/cnfeed/zone/js/content\.php\?file=.+)"')
imageSearch = compile(r'(https://safr\.kingfeatures\.com/idn/cnfeed/zone/js/content\.php\?file=.+)"')
prevSearch = compile(r'id="previouscomic" class="button white"><a href="(%scomics/[a-z0-9-]+/)"' % url)
help = 'Index format: monthname-dd-yyyy'
@classmethod
def namer(cls, imageUrl, pageUrl):
name = pageUrl.rsplit('/', 3)[2]
def namer(self, image_url, page_url):
name = page_url.rsplit('/', 3)[2]
if name == "onthefastrack.com":
import datetime
name = datetime.date.today().strftime("%B-%d-%Y")

View file

@ -100,9 +100,8 @@ class PennyArcade(_BasicScraper):
prevUrl = "%s/%s/%s" % (dummy, yyyy, mm)
return prevUrl
@classmethod
def namer(cls, imageUrl, pageUrl):
p = pageUrl.split('/')
def namer(self, image_url, page_url):
p = page_url.split('/')
return '%04d%02d%02d' % (int(p[4]), int(p[5]), int(p[6]))
@ -161,7 +160,7 @@ class PiledHigherAndDeeper(_BasicScraper):
prevSearch = compile(r'<a href=((comics/)?archive\.php\?comicid=\d+)>.*<img [^>]*prev_button\.gif')
nextSearch = compile(r'<a href=(archive\.php\?comicid=\d+)>.*<img [^>]*next_button\.gif')
help = 'Index format: n (unpadded)'
namer = queryNamer('comicid', usePageUrl=True)
namer = queryNamer('comicid', use_page_url=True)
class Pimpette(_ParserScraper):

View file

@ -197,10 +197,9 @@ class SexyLosers(_BasicScraper):
help = 'Index format: nnn'
starter = indirectStarter
@classmethod
def namer(cls, imageUrl, pageUrl):
index = pageUrl.split('/')[-1].split('.')[0]
title = imageUrl.split('/')[-1].split('.')[0]
def namer(self, image_url, page_url):
index = page_url.split('/')[-1].split('.')[0]
title = image_url.split('/')[-1].split('.')[0]
return index + '-' + title
@ -228,9 +227,8 @@ class ShermansLagoon(_BasicScraper):
prevSearch = compile(r'id="previouscomic" class="button white"><a href="(%scomics/[a-z0-9-]+/)"' % url)
help = 'Index format: monthname-day-year'
@classmethod
def namer(cls, imageUrl, pageUrl):
name = pageUrl.rsplit('/', 3)[2]
def namer(self, image_url, page_url):
name = page_url.rsplit('/', 3)[2]
if name == "shermanslagoon.com":
name = datetime.date.today().strftime("%B-%d-%Y").lower()
# name is monthname-day-year
@ -309,10 +307,9 @@ class SMBC(_ParserScraper):
help = 'Index format: nnnn'
textSearch = '//img[@id="comic"]/@title'
@classmethod
def namer(cls, imageUrl, pageUrl):
def namer(self, image_url, page_url):
"""Remove random noise from name."""
return imageUrl.rsplit('-', 1)[-1]
return image_url.rsplit('-', 1)[-1]
def shouldSkipUrl(self, url, data):
"""Skip promo or missing update pages."""
@ -339,11 +336,10 @@ class SnowFlame(_WordPressScraper):
def getIndexStripUrl(self, index):
return self.stripUrl % tuple(index.split('-'))
@classmethod
def namer(cls, imageUrl, pageUrl):
prefix, filename = imageUrl.rsplit('/', 1)
def namer(self, image_url, page_url):
prefix, filename = image_url.rsplit('/', 1)
ro = compile(r'snowflame-([^-]+)-([^-]+)')
mo = ro.search(pageUrl)
mo = ro.search(page_url)
chapter = mo.group(1)
page = mo.group(2)
return "%s-%s-%s" % (chapter, page, filename)
@ -434,15 +430,14 @@ class StarCrossdDestiny(_ParserScraper):
prevSearch = '//a[text()="prev"]'
help = 'Index format: nnnnnnnn'
@classmethod
def namer(cls, imageUrl, pageUrl):
if imageUrl.find('ch1') == -1:
def namer(self, image_url, page_url):
if image_url.find('ch1') == -1:
# At first all images were stored in a strips/ directory but
# that was changed with the introduction of book2
imageUrl = sub('(?:strips)|(?:images)', 'book1', imageUrl)
elif not imageUrl.find('strips') == -1:
imageUrl = imageUrl.replace('strips/', '')
directory, filename = imageUrl.split('/')[-2:]
image_url = sub('(?:strips)|(?:images)', 'book1', image_url)
elif not image_url.find('strips') == -1:
image_url = image_url.replace('strips/', '')
directory, filename = image_url.split('/')[-2:]
filename, extension = splitext(filename)
return directory + '-' + filename
@ -519,12 +514,11 @@ class StuffNoOneToldMe(_BasicScraper):
multipleImagesPerStrip = True
help = 'Index format: yyyy/mm/stripname'
@classmethod
def namer(cls, imageUrl, pageUrl):
def namer(self, image_url, page_url):
"""Use page URL to construct meaningful image name."""
parts, year, month, stripname = pageUrl.rsplit('/', 3)
parts, year, month, stripname = page_url.rsplit('/', 3)
stripname = stripname.rsplit('.', 1)[0]
parts, imagename = imageUrl.rsplit('/', 1)
parts, imagename = image_url.rsplit('/', 1)
return '%s-%s-%s-%s' % (year, month, stripname, imagename)
def shouldSkipUrl(self, url, data):

View file

@ -68,8 +68,7 @@ class _SmackJeeves(_ParserScraper):
else:
return self.fetchUrl(prevurl, data, self.nextSearch)
@classmethod
def namer(cls, image_url, page_url):
def namer(self, image_url, page_url):
parts = page_url.split('/')
name = parts[-2]
num = parts[-3]

View file

@ -20,8 +20,7 @@ class _Snafu(_ParserScraper):
def name(self):
return 'SnafuComics/' + super(_Snafu, self).name
@classmethod
def namer(cls, image_url, page_url):
def namer(self, image_url, page_url):
year, month, name = image_url.rsplit('/', 3)[1:]
return "%04s_%02s_%s" % (year, month, name)

View file

@ -85,9 +85,8 @@ class TheOrderOfTheStick(_BasicScraper):
help = 'Index format: n (unpadded)'
starter = indirectStarter
@classmethod
def namer(cls, imageUrl, pageUrl):
return pageUrl.rsplit('/', 1)[-1][:-5]
def namer(self, image_url, page_url):
return page_url.rsplit('/', 1)[-1][:-5]
class TheParkingLotIsFull(_BasicScraper):
@ -123,11 +122,10 @@ class TheThinHLine(_BasicScraper):
pageData = self.getPage(pageUrl)
return super(TheThinHLine, self).getComicStrip(pageUrl, pageData)
@classmethod
def namer(cls, imageUrl, pageUrl):
def namer(self, image_url, page_url):
"""Use page URL sequence which is apparently increasing."""
num = pageUrl.split('/')[-1]
ext = imageUrl.rsplit('.', 1)[1]
num = page_url.split('/')[-1]
ext = image_url.rsplit('.', 1)[1]
return "thethinhline-%s.%s" % (num, ext)

View file

@ -60,6 +60,5 @@ class ViiviJaWagner(_BasicScraper):
help = 'Index format: none'
lang = 'fi'
@classmethod
def namer(cls, imageUrl, pageUrl):
return imageUrl.split('=')[1]
def namer(self, image_url, page_url):
return image_url.split('=')[1]

View file

@ -64,10 +64,9 @@ class WebDesignerCOTW(_BasicScraper):
"""Skip non-comic URLs."""
return 'comics-of-the-week' not in url
@classmethod
def namer(cls, imageUrl, pageUrl):
imagename = imageUrl.rsplit('/', 1)[1]
week = compile(r'week-(\d+)').search(pageUrl).group(1)
def namer(self, image_url, page_url):
imagename = image_url.rsplit('/', 1)[1]
week = compile(r'week-(\d+)').search(page_url).group(1)
return "%s-%s" % (week, imagename)

View file

@ -20,8 +20,7 @@ class _WLPComics(_ParserScraper):
def name(self):
return 'WLP/' + super(_WLPComics, self).name
@classmethod
def namer(cls, image_url, page_url):
def namer(self, image_url, page_url):
return (page_url.rsplit('/', 1)[-1].split('.')[0] + '_' +
image_url.rsplit('/', 1)[-1])

View file

@ -26,8 +26,7 @@ class Xkcd(_BasicScraper):
textSearch = compile(tagre("img", "title", r'([^"]+)',
before=r'//imgs\.xkcd\.com/comics/'))
@classmethod
def namer(cls, image_url, page_url):
def namer(self, image_url, page_url):
index = int(page_url.rstrip('/').rsplit('/', 1)[-1])
name = image_url.rsplit('/', 1)[-1].split('.')[0]
return '%03d-%s' % (index, name)

View file

@ -34,9 +34,8 @@ class Zapiro(_BasicScraper):
r'(http://mg\.co\.za/cartoon/[^"]+)'))
help = 'Index format: yyyy-mm-dd-stripname'
@classmethod
def namer(cls, imageUrl, pageUrl):
name = imageUrl.split('/')[-3]
def namer(self, image_url, page_url):
name = image_url.split('/')[-3]
return name
@ -84,7 +83,6 @@ class Zwarwald(_BasicScraper):
self.stripUrl % '495',
)
@classmethod
def namer(cls, imageUrl, pageUrl):
prefix, year, month, name = imageUrl.rsplit('/', 3)
def namer(self, image_url, page_url):
prefix, year, month, name = image_url.rsplit('/', 3)
return "%s_%s_%s" % (year, month, name)

View file

@ -237,8 +237,7 @@ class Scraper(object):
"""Get starter URL from where to scrape comic strips."""
return self.url
@classmethod
def namer(cls, imageUrl, pageUrl):
def namer(self, image_url, page_url):
"""Return filename for given image and page URL."""
return None