Use concrete scraper objects everywhere.
This is a first step for #42. Since most access to the scraper classes is through instances, modules can now dynamically override url and name (name is now a property).
parent 0468f2f31a
commit 060281e5ff
13 changed files with 137 additions and 122 deletions
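The pattern applied throughout this diff is the switch from a getName() classmethod to a name property on instances. A minimal sketch of the before/after shape (illustrative only; Scraper is simplified here — for instance, the old getName() also honored an explicit name class attribute — and Xkcd is a stand-in, not a class from this diff):

# before: name is computed from the class, so callers pass classes around
class Scraper(object):
    @classmethod
    def getName(cls):
        return cls.__name__

# after: name is a read-only property on instances, so a comic module or
# subclass can override it dynamically per object
class Scraper(object):
    @property
    def name(self):
        return self.__class__.__name__

class Xkcd(Scraper):
    url = 'https://xkcd.com/'

scraperobj = Xkcd()
print(scraperobj.name)  # -> 'Xkcd'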
dosage (12 changed lines)
@@ -140,7 +140,7 @@ def display_help(options):
 def display_comic_help(scraperobj):
     """Print help for a comic."""
     orig_context = out.context
-    out.context = scraperobj.getName()
+    out.context = scraperobj.name
     try:
         out.info(u"URL: " + scraperobj.url)
         out.info(u"Language: " + scraperobj.language())
@@ -178,9 +178,9 @@ def vote_comic(scraperobj):
     """Vote for given comic scraper."""
     errors = 0
     orig_context = out.context
-    out.context = scraperobj.getName()
+    out.context = scraperobj.name
     try:
-        name = scraperobj.getName()
+        name = scraperobj.name
         answer = scraperobj.vote()
         out.debug(u'Vote answer %r' % answer)
         if answer == 'counted':
@@ -230,7 +230,7 @@ def do_list(column_list=True, verbose=False):
     out.info(u'Comics tagged with [%s] require age confirmation with the --adult option.' % TAG_ADULT)
     out.info(u'Non-english comics are tagged with [%s].' % TAG_LANG)
     scrapers = sorted(director.getAllScrapers(listing=True),
-                      key=lambda s: s.getName())
+                      key=lambda s: s.name)
     if column_list:
         num, disabled = do_column_list(scrapers)
     else:
@@ -293,7 +293,7 @@ def get_tagged_scraper_name(scraperobj, limit=None, reasons=None):
         suffix = " [" + ", ".join(tags) + "]"
     else:
         suffix = ""
-    name = scraperobj.getName()
+    name = scraperobj.name
     if limit is not None:
         name = strlimit(name, limit)
     return name + suffix
@@ -317,7 +317,7 @@ def main():
 def profile():
     """Profile the loading of all scrapers."""
     import cProfile
-    cProfile.run("scraper.get_scraperclasses()", "dosage.prof")
+    cProfile.run("scraper.get_scrapers()", "dosage.prof")


 def viewprof():
@@ -1,5 +1,10 @@
-# -*- coding: iso-8859-1 -*-
-# Copyright (C) 2014 Bastian Kleineidam
+# -*- coding: utf-8 -*-
+# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
+# Copyright (C) 2012-2014 Bastian Kleineidam
+# Copyright (C) 2015-2016 Tobias Gruetzmacher

+from __future__ import absolute_import, division, print_function
+
 import os
 import threading
 try:
@@ -14,6 +19,7 @@ try:
     from urllib.parse import urlparse
 except ImportError:
     from urlparse import urlparse

 from .output import out
 from . import events, scraper
 from .util import getDirname
@@ -55,6 +61,8 @@ def get_hostname(url):


 lock = threading.Lock()
+
+
 def get_host_lock(url):
     """Get lock object for given URL host."""
     hostname = get_hostname(url)
@@ -68,7 +76,7 @@ class ComicGetter(threading.Thread):
         """Store options."""
         super(ComicGetter, self).__init__()
         self.options = options
-        self.origname = self.getName()
+        self.origname = self.name
         self.stopped = False
         self.errors = 0

@@ -76,10 +84,10 @@ class ComicGetter(threading.Thread):
         """Process from queue until it is empty."""
         try:
             while not self.stopped:
-                scraperobj = jobs.get(False)
-                self.setName(scraperobj.getName())
+                scraper = jobs.get(False)
+                self.name = scraper.name
                 try:
-                    self.getStrips(scraperobj)
+                    self.getStrips(scraper)
                 finally:
                     jobs.task_done()
                     self.setName(self.origname)
@@ -93,7 +101,7 @@ class ComicGetter(threading.Thread):
         with lock:
             host_lock = get_host_lock(scraperobj.url)
         with host_lock:
-            self._getStrips(scraperobj)
+            self._getStrips(scraper)

     def _getStrips(self, scraperobj):
         """Get all strips from a scraper."""
@@ -117,7 +125,8 @@ class ComicGetter(threading.Thread):
             if self.stopped:
                 break
             if self.options.all and not (self.errors or self.options.dry_run or
-                                         self.options.cont or scraperobj.indexes):
+                                         self.options.cont or
+                                         scraperobj.indexes):
                 scraperobj.setComplete(self.options.basepath)
         except Exception as msg:
             out.exception(msg)
@@ -158,7 +167,8 @@ def getComics(options):
     events.getHandler().start()
     errors = 0
     try:
-        for scraperobj in getScrapers(options.comic, options.basepath, options.adult, options.multimatch):
+        for scraperobj in getScrapers(options.comic, options.basepath,
+                                      options.adult, options.multimatch):
             jobs.put(scraperobj)
         # start threads
         num_threads = min(options.parallel, jobs.qsize())
@@ -200,16 +210,16 @@ def getScrapers(comics, basepath=None, adult=True, multiple_allowed=False, listing=False):
         # only scrapers whose directory already exists
         if len(comics) > 1:
             out.warn(u"using '@' as comic name ignores all other specified comics.")
-        for scraperclass in scraper.get_scraperclasses():
-            dirname = getDirname(scraperclass.getName())
+        for scraperobj in scraper.get_scrapers():
+            dirname = getDirname(scraperobj.name)
             if os.path.isdir(os.path.join(basepath, dirname)):
-                if shouldRunScraper(scraperclass, adult, listing):
-                    yield scraperclass()
+                if shouldRunScraper(scraperobj, adult, listing):
+                    yield scraperobj
     elif '@@' in comics:
         # all scrapers
-        for scraperclass in scraper.get_scraperclasses():
-            if shouldRunScraper(scraperclass, adult, listing):
-                yield scraperclass()
+        for scraperobj in scraper.get_scrapers():
+            if shouldRunScraper(scraperobj, adult, listing):
+                yield scraperobj
     else:
         # get only selected comic scrapers
         # store them in a set to eliminate duplicates
@@ -227,32 +237,34 @@ def getScrapers(comics, basepath=None, adult=True, multiple_allowed=False, listing=False):
         else:
             name = comic
             indexes = None
-        scraperclasses = scraper.find_scraperclasses(name, multiple_allowed=multiple_allowed)
-        for scraperclass in scraperclasses:
-            if shouldRunScraper(scraperclass, adult, listing):
-                scraperobj = scraperclass(indexes=indexes)
+        scrapers = scraper.find_scrapers(name, multiple_allowed=multiple_allowed)
+        for scraperobj in scrapers:
+            if shouldRunScraper(scraperobj, adult, listing):
+                # FIXME: Find a better way to work with indexes
+                scraperobj.indexes = indexes
                 if scraperobj not in scrapers:
                     scrapers.add(scraperobj)
                     yield scraperobj


-def shouldRunScraper(scraperclass, adult=True, listing=False):
+def shouldRunScraper(scraperobj, adult=True, listing=False):
     if listing:
         return True
-    if not adult and scraperclass.adult:
-        warn_adult(scraperclass)
+    if not adult and scraperobj.adult:
+        warn_adult(scraperobj)
         return False
-    reasons = scraperclass.getDisabledReasons()
+    reasons = scraperobj.getDisabledReasons()
     if reasons:
-        warn_disabled(scraperclass, reasons)
+        warn_disabled(scraperobj, reasons)
         return False
     return True


-def warn_adult(scraperclass):
+def warn_adult(scraperobj):
     """Print warning about adult content."""
-    out.warn(u"skipping adult comic %s; use the --adult option to confirm your age" % scraperclass.getName())
+    out.warn(u"skipping adult comic %s; use the --adult option to confirm your age" % scraperobj.name)

-def warn_disabled(scraperclass, reasons):
+
+def warn_disabled(scraperobj, reasons):
     """Print warning about disabled comic modules."""
-    out.warn(u"Skipping comic %s: %s" % (scraperclass.getName(), ' '.join(reasons.values())))
+    out.warn(u"Skipping comic %s: %s" % (scraperobj.name, ' '.join(reasons.values())))
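A note on the FIXME in the getScrapers() hunk above: get_scrapers() (see the scraper.py hunks further down) builds one instance per comic module and caches the list, so getScrapers() now yields shared objects instead of constructing scraperclass(indexes=indexes) fresh each time; assigning scraperobj.indexes therefore mutates module-global state. A sketch of the consequence, under that caching assumption:

from dosagelib import scraper

a = scraper.get_scrapers()[0]
b = scraper.get_scrapers()[0]
assert a is b                  # same cached instance, not a new object
a.indexes = ['001']
assert b.indexes == ('001',)   # every other holder of the instance sees it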
@@ -29,7 +29,7 @@ lock = threading.Lock()

 def get_threadname():
     """Return name of current thread."""
-    return threading.current_thread().getName()
+    return threading.current_thread().name


 class Output(object):
@@ -29,9 +29,9 @@ class _ComicFury(_ParserScraper):
         num = parts[-1]
         return "%s_%s%s" % (cls.__name__[2:], num, ext)

-    @classmethod
-    def getName(cls):
-        return 'ComicFury/' + cls.__name__[2:]
+    @property
+    def name(self):
+        return 'ComicFury/' + super(_ComicFury, self).name[2:]

     def getIndexStripUrl(self, index):
         return self.url + 'comics/%s' % index
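The super(_ComicFury, self).name expression above works because a property is looked up through the super proxy and its getter runs against self: the base Scraper.name (changed later in this diff) returns self.__class__.__name__, and the plugin strips its two-character class-name prefix with [2:]. A standalone sketch (Base and CFDemo are stand-ins, not classes from this diff):

class Base(object):
    @property
    def name(self):
        return self.__class__.__name__

class CFDemo(Base):
    @property
    def name(self):
        # Base's getter runs with self, returning 'CFDemo'; [2:] drops 'CF'
        return 'ComicFury/' + super(CFDemo, self).name[2:]

print(CFDemo().name)  # -> 'ComicFury/Demo'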
@@ -14,9 +14,9 @@ class _Creators(_ParserScraper):
     prevSearch = '//a[@id="nav_prev"]'
     latestSearch = '//div[contains(@class,"caption")]/a'

-    @classmethod
-    def getName(cls):
-        return 'Creators/' + cls.__name__
+    @property
+    def name(self):
+        return 'Creators/' + super(_Creators, self).name

     def starter(self):
         start = self.url + self.path
@@ -16,9 +16,9 @@ class _GoComics(_ParserScraper):
     nextSearch = '//ul[@class="feature-nav"]//a[@class="next"]'
     help = 'Index format: yyyy/mm/dd'

-    @classmethod
-    def getName(cls):
-        return 'GoComics/' + cls.__name__[2:]
+    @property
+    def name(self):
+        return 'GoComics/' + super(_GoComics, self).name[2:]

     def starter(self):
         url1 = self.url + self.path
@@ -14,9 +14,9 @@ class _NuklearPower(_ParserScraper):
     def starter(self):
         return self.url + self.path + '/'

-    @classmethod
-    def getName(cls):
-        return 'NuklearPower/' + cls.__name__[2:]
+    @property
+    def name(self):
+        return 'NuklearPower/' + super(_NuklearPower, self).name[2:]


 class NP8BitTheater(_NuklearPower):
@@ -16,9 +16,9 @@ class _WLPComics(_ParserScraper):
     starter = bounceStarter
     help = 'Index format: nnn'

-    @classmethod
-    def getName(cls):
-        return 'WLP/' + cls.__name__
+    @property
+    def name(self):
+        return 'WLP/' + super(_WLPComics, self).name

     @classmethod
     def namer(cls, image_url, page_url):
@@ -90,13 +90,19 @@ class Scraper(object):
     # HTTP session for configuration & cookies
     session = requests_session()

-    def __init__(self, indexes=None):
+    @property
+    def indexes(self):
+        return self._indexes
+
+    @indexes.setter
+    def indexes(self, val):
+        if val:
+            self._indexes = tuple(sorted(val))
+
+    def __init__(self):
         """Initialize internal variables."""
         self.urls = set()
-        if indexes:
-            self.indexes = tuple(sorted(indexes))
-        else:
-            self.indexes = tuple()
+        self._indexes = tuple()
         self.skippedUrls = set()
         self.hitFirstStripUrl = False
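Note the asymmetry in the new property: __init__ seeds self._indexes with an empty tuple, and the setter stores only truthy values, so assigning None (as getScrapers() above may do) keeps the previous value, while real values come back sorted as a tuple. For example:

s = SomeComicScraper()       # hypothetical concrete subclass, not from this diff
s.indexes = None             # falsy: the empty default tuple survives
assert s.indexes == ()
s.indexes = ['b', 'a']
assert s.indexes == ('a', 'b')   # sorted and normalized to a tuple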
@@ -105,7 +111,7 @@ class Scraper(object):
         if not isinstance(other, Scraper):
             return 1
         # first, order by name
-        d = cmp(self.getName(), other.getName())
+        d = cmp(self.name, other.name)
         if d != 0:
             return d
         # then by indexes
@@ -113,7 +119,7 @@ class Scraper(object):

     def __hash__(self):
         """Get hash value from name and index list."""
-        return hash((self.getName(), self.indexes))
+        return hash((self.name, self.indexes))

     def shouldSkipUrl(self, url, data):
         """Determine if search for images in given URL should be skipped."""
@@ -141,7 +147,7 @@ class Scraper(object):
                                  optional=self.textOptional)
         else:
             text = None
-        return ComicStrip(self.getName(), url, imageUrls, self.namer,
+        return ComicStrip(self.name, url, imageUrls, self.namer,
                           self.session, text=text)

     def getStrips(self, maxstrips=None):
@@ -217,24 +223,21 @@ class Scraper(object):
         else:
             prevUrl = self.prevUrlModifier(prevUrl)
         out.debug(u"Found previous URL %s" % prevUrl)
-        getHandler().comicPageLink(self.getName(), url, prevUrl)
+        getHandler().comicPageLink(self.name, url, prevUrl)
         return prevUrl

     def getIndexStripUrl(self, index):
         """Get comic strip URL from index."""
         return self.stripUrl % index

-    @classmethod
-    def getName(cls):
+    @property
+    def name(self):
         """Get scraper name."""
-        if hasattr(cls, 'name'):
-            return cls.name
-        return cls.__name__
+        return self.__class__.__name__

-    @classmethod
-    def starter(cls):
+    def starter(self):
         """Get starter URL from where to scrape comic strips."""
-        return cls.url
+        return self.url

     @classmethod
     def namer(cls, imageUrl, pageUrl):
@@ -261,18 +264,17 @@ class Scraper(object):
         """Get starter URL from where to scrape comic strips."""
         return self.starter()

-    @classmethod
-    def vote(cls):
+    def vote(self):
         """Cast a public vote for this comic."""
         url = configuration.VoteUrl + 'count/'
         uid = get_system_uid()
-        data = {"name": cls.getName().replace('/', '_'), "uid": uid}
-        page = urlopen(url, cls.session, data=data)
+        data = {"name": self.name.replace('/', '_'), "uid": uid}
+        page = urlopen(url, self.session, data=data)
         return page.text

     def getCompleteFile(self, basepath):
         """Get filename indicating all comics are downloaded."""
-        dirname = getDirname(self.getName())
+        dirname = getDirname(self.name)
         return os.path.join(basepath, dirname, "complete.txt")

     def isComplete(self, basepath):
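Since vote() above drops its @classmethod decorator, callers must instantiate first; the test_vote hunk at the bottom of this diff makes exactly that move. Minimal usage (SomeComicScraper is again a hypothetical subclass):

scraperobj = SomeComicScraper()
answer = scraperobj.vote()   # posts {"name": scraperobj.name.replace('/', '_'), ...}
assert answer in ('counted', 'no')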
@@ -517,63 +519,66 @@ class _ParserScraper(Scraper):
         return res


-def find_scraperclasses(comic, multiple_allowed=False):
-    """Get a list comic scraper classes. Can return more than one entries if
-    multiple_allowed is True, else it raises a ValueError if multiple
-    modules match. The match is a case insensitive substring search."""
+def find_scrapers(comic, multiple_allowed=False):
+    """Get a list comic scraper objects.
+
+    Can return more than one entry if multiple_allowed is True, else it raises
+    a ValueError if multiple modules match. The match is a case insensitive
+    substring search.
+    """
     if not comic:
         raise ValueError("empty comic name")
     candidates = []
     cname = comic.lower()
-    for scraperclass in get_scraperclasses():
-        lname = scraperclass.getName().lower()
+    for scrapers in get_scrapers():
+        lname = scrapers.name.lower()
         if lname == cname:
             # perfect match
             if not multiple_allowed:
-                return [scraperclass]
+                return [scrapers]
             else:
-                candidates.append(scraperclass)
+                candidates.append(scrapers)
         elif cname in lname:
-            candidates.append(scraperclass)
+            candidates.append(scrapers)
     if len(candidates) > 1 and not multiple_allowed:
-        comics = ", ".join(x.getName() for x in candidates)
+        comics = ", ".join(x.name for x in candidates)
         raise ValueError('multiple comics found: %s' % comics)
     elif not candidates:
         raise ValueError('comic %r not found' % comic)
     return candidates


-_scraperclasses = None
+_scrapers = None


-def get_scraperclasses():
+def get_scrapers():
     """Find all comic scraper classes in the plugins directory.
     The result is cached.
     @return: list of Scraper classes
     @rtype: list of Scraper
     """
-    global _scraperclasses
-    if _scraperclasses is None:
+    global _scrapers
+    if _scrapers is None:
         out.debug(u"Loading comic modules...")
         modules = loader.get_modules('plugins')
         plugins = loader.get_plugins(modules, Scraper)
-        _scraperclasses = sorted(plugins, key=lambda p: p.getName())
+        _scrapers = sorted([x() for x in plugins], key=lambda p: p.name)
         check_scrapers()
-        out.debug(u"... %d modules loaded." % len(_scraperclasses))
-    return _scraperclasses
+        out.debug(u"... %d modules loaded." % len(_scrapers))
+    return _scrapers


 def check_scrapers():
-    """Check for duplicate scraper class names."""
+    """Check for duplicate scraper names."""
     d = {}
-    for scraperclass in _scraperclasses:
-        name = scraperclass.getName().lower()
+    for scraper in _scrapers:
+        name = scraper.name.lower()
         if name in d:
-            name1 = scraperclass.getName()
-            name2 = d[name].getName()
+            name1 = scraper.name
+            name2 = d[name].name
             raise ValueError('duplicate scrapers %s and %s found' %
                              (name1, name2))
-        d[name] = scraperclass
+        d[name] = scraper


 def make_scraper(classname, scraperType=_BasicScraper, **attributes):
@@ -13,8 +13,8 @@ from dosagelib import scraper
 class TestComicNames(object):

     def test_names(self):
-        for scraperclass in scraper.get_scraperclasses():
-            name = scraperclass.getName()
+        for scraperobj in scraper.get_scrapers():
+            name = scraperobj.name
             assert name.count('/') <= 1
             if '/' in name:
                 comicname = name.split('/')[1]
@@ -33,18 +33,17 @@ def get_lock(host):
     return _locks[host]


-def _get_saved_images(outdir, scraper):
+def _get_saved_images(outdir, scraperobj):
     """Get saved images."""
-    dirs = tuple(scraper.getName().split('/'))
+    dirs = tuple(scraperobj.name.split('/'))
     files = os.listdir(os.path.join(outdir, *dirs))
     files = [x for x in files if not x.endswith(".txt")]
     return files


-def test_comicmodule(tmpdir, scraperclass):
+def test_comicmodule(tmpdir, scraperobj):
     '''Test a scraper. It must be able to traverse backward for at least 5
     strips from the start, and find strip images on at least 4 pages.'''
-    scraperobj = scraperclass()
     # Limit number of connections to one host.
     host = get_host(scraperobj.url)
     try:
@@ -121,11 +120,11 @@ def _check_stripurl(strip, scraperobj):
     assert mo is not None, err


-def get_test_scraperclasses():
+def get_test_scrapers():
     """Return scrapers that should be tested."""
     if "TESTALL" in os.environ:
         # test all comics (this will take some time)
-        scraperclasses = scraper.get_scraperclasses()
+        scrapers = scraper.get_scrapers()
     else:
         if 'TESTCOMICS' in os.environ:
             scraper_pattern = re.compile(os.environ['TESTCOMICS'])
@@ -139,13 +138,13 @@ def get_test_scraperclasses():
         ]
         scraper_pattern = re.compile('|'.join(testscrapernames))

-    scraperclasses = [
-        scraperclass for scraperclass in scraper.get_scraperclasses()
-        if scraper_pattern.match(scraperclass.getName())
+    scrapers = [
+        scraperobj for scraperobj in scraper.get_scrapers()
+        if scraper_pattern.match(scraperobj.name)
     ]
-    return scraperclasses
+    return scrapers


 def pytest_generate_tests(metafunc):
-    if 'scraperclass' in metafunc.fixturenames:
-        metafunc.parametrize('scraperclass', get_test_scraperclasses())
+    if 'scraperobj' in metafunc.fixturenames:
+        metafunc.parametrize('scraperobj', get_test_scrapers())
@@ -9,20 +9,19 @@ from dosagelib import scraper
 class TestScraper(object):
     """Test scraper module functions."""

-    def test_get_scraperclasses(self):
-        for scraperclass in scraper.get_scraperclasses():
-            scraperobj = scraperclass()
-            scraperobj = scraperclass(indexes=["bla"])
-            assert scraperobj.url, "missing url in %s" % scraperobj.getName()
+    def test_get_scrapers(self):
+        for scraperobj in scraper.get_scrapers():
+            scraperobj.indexes = ["bla"]
+            assert scraperobj.url, "missing url in %s" % scraperobj.name

-    def test_find_scraperclasses_single(self):
-        result = scraper.find_scraperclasses("xkcd")
+    def test_find_scrapers_single(self):
+        result = scraper.find_scrapers("xkcd")
         assert len(result) == 1

-    def test_find_scraperclasses_multi(self):
-        result = scraper.find_scraperclasses("a", multiple_allowed=True)
+    def test_find_scrapers_multi(self):
+        result = scraper.find_scrapers("a", multiple_allowed=True)
         assert len(result) > 1

-    def test_find_scraperclasses_error(self):
+    def test_find_scrapers_error(self):
         with pytest.raises(ValueError):
-            scraper.find_scraperclasses("")
+            scraper.find_scrapers("")
@@ -12,5 +12,5 @@ class ATestScraper(scraper._BasicScraper):
 class TestVote(object):

     def test_vote(self):
-        answer = ATestScraper.vote()
+        answer = ATestScraper().vote()
         assert answer in ('counted', 'no'), 'invalid answer %r' % answer