Use concrete scraper objects everywhere.

This is a first step for #42. Since most access to the scraper classes
is through instances, modules can now dynamically override url and name
(name is now a property).
Tobias Gruetzmacher 2016-04-13 22:05:44 +02:00
parent 0468f2f31a
commit 060281e5ff
13 changed files with 137 additions and 122 deletions
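The refactor is easiest to see in miniature: class-level access through a getName() classmethod becomes plain attribute access on shared instances, with name as a property. A standalone sketch of the before/after pattern (the toy class names here are invented for illustration, not taken from the codebase):

    # Before: scrapers were used as classes; metadata came from classmethods.
    class OldStyleScraper(object):
        @classmethod
        def getName(cls):
            return cls.__name__

    # After: scrapers are shared instances; name is a property, so comic
    # modules can override it (and url) dynamically per instance.
    class NewStyleScraper(object):
        url = None

        @property
        def name(self):
            return self.__class__.__name__

    class Xkcd(NewStyleScraper):
        url = 'https://xkcd.com/'

    print(OldStyleScraper.getName())  # class-level access: 'OldStyleScraper'
    print(Xkcd().name)                # instance access: 'Xkcd'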

dosage
View file

@@ -140,7 +140,7 @@ def display_help(options):
 def display_comic_help(scraperobj):
     """Print help for a comic."""
     orig_context = out.context
-    out.context = scraperobj.getName()
+    out.context = scraperobj.name
     try:
         out.info(u"URL: " + scraperobj.url)
         out.info(u"Language: " + scraperobj.language())
@@ -178,9 +178,9 @@ def vote_comic(scraperobj):
     """Vote for given comic scraper."""
     errors = 0
     orig_context = out.context
-    out.context = scraperobj.getName()
+    out.context = scraperobj.name
     try:
-        name = scraperobj.getName()
+        name = scraperobj.name
         answer = scraperobj.vote()
         out.debug(u'Vote answer %r' % answer)
         if answer == 'counted':
@@ -230,7 +230,7 @@ def do_list(column_list=True, verbose=False):
     out.info(u'Comics tagged with [%s] require age confirmation with the --adult option.' % TAG_ADULT)
     out.info(u'Non-english comics are tagged with [%s].' % TAG_LANG)
     scrapers = sorted(director.getAllScrapers(listing=True),
-                      key=lambda s: s.getName())
+                      key=lambda s: s.name)
     if column_list:
         num, disabled = do_column_list(scrapers)
     else:
@@ -293,7 +293,7 @@ def get_tagged_scraper_name(scraperobj, limit=None, reasons=None):
         suffix = " [" + ", ".join(tags) + "]"
     else:
         suffix = ""
-    name = scraperobj.getName()
+    name = scraperobj.name
     if limit is not None:
         name = strlimit(name, limit)
     return name + suffix
@@ -317,7 +317,7 @@ def main():
 def profile():
     """Profile the loading of all scrapers."""
     import cProfile
-    cProfile.run("scraper.get_scraperclasses()", "dosage.prof")
+    cProfile.run("scraper.get_scrapers()", "dosage.prof")

 def viewprof():

View file

@@ -1,5 +1,10 @@
-# -*- coding: iso-8859-1 -*-
-# Copyright (C) 2014 Bastian Kleineidam
+# -*- coding: utf-8 -*-
+# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
+# Copyright (C) 2012-2014 Bastian Kleineidam
+# Copyright (C) 2015-2016 Tobias Gruetzmacher
+
+from __future__ import absolute_import, division, print_function
+
 import os
 import threading
 try:
@@ -14,6 +19,7 @@ try:
     from urllib.parse import urlparse
 except ImportError:
     from urlparse import urlparse
+
 from .output import out
 from . import events, scraper
 from .util import getDirname
@@ -55,6 +61,8 @@ def get_hostname(url):
 lock = threading.Lock()

+
 def get_host_lock(url):
     """Get lock object for given URL host."""
     hostname = get_hostname(url)
@@ -68,7 +76,7 @@ class ComicGetter(threading.Thread):
         """Store options."""
         super(ComicGetter, self).__init__()
         self.options = options
-        self.origname = self.getName()
+        self.origname = self.name
         self.stopped = False
         self.errors = 0
@@ -76,10 +84,10 @@ class ComicGetter(threading.Thread):
         """Process from queue until it is empty."""
         try:
             while not self.stopped:
-                scraperobj = jobs.get(False)
-                self.setName(scraperobj.getName())
+                scraper = jobs.get(False)
+                self.name = scraper.name
                 try:
-                    self.getStrips(scraperobj)
+                    self.getStrips(scraper)
                 finally:
                     jobs.task_done()
                     self.setName(self.origname)
@@ -93,7 +101,7 @@ class ComicGetter(threading.Thread):
         with lock:
             host_lock = get_host_lock(scraperobj.url)
         with host_lock:
-            self._getStrips(scraperobj)
+            self._getStrips(scraper)

     def _getStrips(self, scraperobj):
         """Get all strips from a scraper."""
@@ -117,7 +125,8 @@ class ComicGetter(threading.Thread):
                 if self.stopped:
                     break
             if self.options.all and not (self.errors or self.options.dry_run or
-                                         self.options.cont or scraperobj.indexes):
+                                         self.options.cont or
+                                         scraperobj.indexes):
                 scraperobj.setComplete(self.options.basepath)
         except Exception as msg:
             out.exception(msg)
@@ -158,7 +167,8 @@ def getComics(options):
     events.getHandler().start()
     errors = 0
     try:
-        for scraperobj in getScrapers(options.comic, options.basepath, options.adult, options.multimatch):
+        for scraperobj in getScrapers(options.comic, options.basepath,
+                                      options.adult, options.multimatch):
             jobs.put(scraperobj)
         # start threads
         num_threads = min(options.parallel, jobs.qsize())
@@ -200,16 +210,16 @@ def getScrapers(comics, basepath=None, adult=True, multiple_allowed=False, listi
         # only scrapers whose directory already exists
         if len(comics) > 1:
             out.warn(u"using '@' as comic name ignores all other specified comics.")
-        for scraperclass in scraper.get_scraperclasses():
-            dirname = getDirname(scraperclass.getName())
+        for scraperobj in scraper.get_scrapers():
+            dirname = getDirname(scraperobj.name)
             if os.path.isdir(os.path.join(basepath, dirname)):
-                if shouldRunScraper(scraperclass, adult, listing):
-                    yield scraperclass()
+                if shouldRunScraper(scraperobj, adult, listing):
+                    yield scraperobj
     elif '@@' in comics:
         # all scrapers
-        for scraperclass in scraper.get_scraperclasses():
-            if shouldRunScraper(scraperclass, adult, listing):
-                yield scraperclass()
+        for scraperobj in scraper.get_scrapers():
+            if shouldRunScraper(scraperobj, adult, listing):
+                yield scraperobj
     else:
         # get only selected comic scrapers
         # store them in a set to eliminate duplicates
@@ -227,32 +237,34 @@ def getScrapers(comics, basepath=None, adult=True, multiple_allowed=False, listi
         else:
             name = comic
             indexes = None
-        scraperclasses = scraper.find_scraperclasses(name, multiple_allowed=multiple_allowed)
-        for scraperclass in scraperclasses:
-            if shouldRunScraper(scraperclass, adult, listing):
-                scraperobj = scraperclass(indexes=indexes)
+        scrapers = scraper.find_scrapers(name, multiple_allowed=multiple_allowed)
+        for scraperobj in scrapers:
+            if shouldRunScraper(scraperobj, adult, listing):
+                # FIXME: Find a better way to work with indexes
+                scraperobj.indexes = indexes
                 if scraperobj not in scrapers:
                     scrapers.add(scraperobj)
                     yield scraperobj

-def shouldRunScraper(scraperclass, adult=True, listing=False):
+
+def shouldRunScraper(scraperobj, adult=True, listing=False):
     if listing:
         return True
-    if not adult and scraperclass.adult:
-        warn_adult(scraperclass)
+    if not adult and scraperobj.adult:
+        warn_adult(scraperobj)
         return False
-    reasons = scraperclass.getDisabledReasons()
+    reasons = scraperobj.getDisabledReasons()
     if reasons:
-        warn_disabled(scraperclass, reasons)
+        warn_disabled(scraperobj, reasons)
         return False
     return True

-def warn_adult(scraperclass):
+def warn_adult(scraperobj):
     """Print warning about adult content."""
-    out.warn(u"skipping adult comic %s; use the --adult option to confirm your age" % scraperclass.getName())
+    out.warn(u"skipping adult comic %s; use the --adult option to confirm your age" % scraperobj.name)

-def warn_disabled(scraperclass, reasons):
+def warn_disabled(scraperobj, reasons):
     """Print warning about disabled comic modules."""
-    out.warn(u"Skipping comic %s: %s" % (scraperclass.getName(), ' '.join(reasons.values())))
+    out.warn(u"Skipping comic %s: %s" % (scraperobj.name, ' '.join(reasons.values())))

View file

@@ -29,7 +29,7 @@ lock = threading.Lock()
 def get_threadname():
     """Return name of current thread."""
-    return threading.current_thread().getName()
+    return threading.current_thread().name

 class Output(object):

View file

@@ -29,9 +29,9 @@ class _ComicFury(_ParserScraper):
         num = parts[-1]
         return "%s_%s%s" % (cls.__name__[2:], num, ext)

-    @classmethod
-    def getName(cls):
-        return 'ComicFury/' + cls.__name__[2:]
+    @property
+    def name(self):
+        return 'ComicFury/' + super(_ComicFury, self).name[2:]

     def getIndexStripUrl(self, index):
         return self.url + 'comics/%s' % index

View file

@@ -14,9 +14,9 @@ class _Creators(_ParserScraper):
     prevSearch = '//a[@id="nav_prev"]'
     latestSearch = '//div[contains(@class,"caption")]/a'

-    @classmethod
-    def getName(cls):
-        return 'Creators/' + cls.__name__
+    @property
+    def name(self):
+        return 'Creators/' + super(_Creators, self).name

     def starter(self):
         start = self.url + self.path

View file

@@ -16,9 +16,9 @@ class _GoComics(_ParserScraper):
     nextSearch = '//ul[@class="feature-nav"]//a[@class="next"]'
     help = 'Index format: yyyy/mm/dd'

-    @classmethod
-    def getName(cls):
-        return 'GoComics/' + cls.__name__[2:]
+    @property
+    def name(self):
+        return 'GoComics/' + super(_GoComics, self).name[2:]

     def starter(self):
         url1 = self.url + self.path

View file

@@ -14,9 +14,9 @@ class _NuklearPower(_ParserScraper):
     def starter(self):
         return self.url + self.path + '/'

-    @classmethod
-    def getName(cls):
-        return 'NuklearPower/' + cls.__name__[2:]
+    @property
+    def name(self):
+        return 'NuklearPower/' + super(_NuklearPower, self).name[2:]

 class NP8BitTheater(_NuklearPower):

View file

@@ -16,9 +16,9 @@ class _WLPComics(_ParserScraper):
     starter = bounceStarter
     help = 'Index format: nnn'

-    @classmethod
-    def getName(cls):
-        return 'WLP/' + cls.__name__
+    @property
+    def name(self):
+        return 'WLP/' + super(_WLPComics, self).name

     @classmethod
     def namer(cls, image_url, page_url):
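The five plugin families above all get the same mechanical change: the getName() classmethod becomes a name property that takes the plain class name from Scraper.name and brands it with the family prefix. The generic shape, using an invented _Example family (assuming dosagelib is importable):

    from dosagelib.scraper import _ParserScraper

    class _Example(_ParserScraper):
        @property
        def name(self):
            # Scraper.name yields the class name, e.g. 'EXSomeComic';
            # [2:] strips the two-letter family prefix.
            return 'Example/' + super(_Example, self).name[2:]

    class EXSomeComic(_Example):
        url = 'http://example.com/'  # placeholder URL

    print(EXSomeComic().name)  # prints 'Example/SomeComic'

The [2:] slice applies where subclass names carry a prefix (as in NP8BitTheater above); the Creators and WLP variants keep the full class name instead.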

View file

@@ -90,13 +90,19 @@ class Scraper(object):
     # HTTP session for configuration & cookies
     session = requests_session()

-    def __init__(self, indexes=None):
+    @property
+    def indexes(self):
+        return self._indexes
+
+    @indexes.setter
+    def indexes(self, val):
+        if val:
+            self._indexes = tuple(sorted(val))
+
+    def __init__(self):
         """Initialize internal variables."""
         self.urls = set()
-        if indexes:
-            self.indexes = tuple(sorted(indexes))
-        else:
-            self.indexes = tuple()
+        self._indexes = tuple()
         self.skippedUrls = set()
         self.hitFirstStripUrl = False
@@ -105,7 +111,7 @@ class Scraper(object):
         if not isinstance(other, Scraper):
             return 1
         # first, order by name
-        d = cmp(self.getName(), other.getName())
+        d = cmp(self.name, other.name)
         if d != 0:
             return d
         # then by indexes
@@ -113,7 +119,7 @@ class Scraper(object):
     def __hash__(self):
         """Get hash value from name and index list."""
-        return hash((self.getName(), self.indexes))
+        return hash((self.name, self.indexes))

     def shouldSkipUrl(self, url, data):
         """Determine if search for images in given URL should be skipped."""
@@ -141,7 +147,7 @@ class Scraper(object):
                                 optional=self.textOptional)
         else:
             text = None
-        return ComicStrip(self.getName(), url, imageUrls, self.namer,
+        return ComicStrip(self.name, url, imageUrls, self.namer,
                           self.session, text=text)

     def getStrips(self, maxstrips=None):
@@ -217,24 +223,21 @@ class Scraper(object):
             else:
                 prevUrl = self.prevUrlModifier(prevUrl)
             out.debug(u"Found previous URL %s" % prevUrl)
-            getHandler().comicPageLink(self.getName(), url, prevUrl)
+            getHandler().comicPageLink(self.name, url, prevUrl)
         return prevUrl

     def getIndexStripUrl(self, index):
         """Get comic strip URL from index."""
         return self.stripUrl % index

-    @classmethod
-    def getName(cls):
+    @property
+    def name(self):
         """Get scraper name."""
-        if hasattr(cls, 'name'):
-            return cls.name
-        return cls.__name__
+        return self.__class__.__name__

-    @classmethod
-    def starter(cls):
+    def starter(self):
         """Get starter URL from where to scrape comic strips."""
-        return cls.url
+        return self.url

     @classmethod
     def namer(cls, imageUrl, pageUrl):
@@ -261,18 +264,17 @@ class Scraper(object):
         """Get starter URL from where to scrape comic strips."""
         return self.starter()

-    @classmethod
-    def vote(cls):
+    def vote(self):
         """Cast a public vote for this comic."""
         url = configuration.VoteUrl + 'count/'
         uid = get_system_uid()
-        data = {"name": cls.getName().replace('/', '_'), "uid": uid}
-        page = urlopen(url, cls.session, data=data)
+        data = {"name": self.name.replace('/', '_'), "uid": uid}
+        page = urlopen(url, self.session, data=data)
         return page.text

     def getCompleteFile(self, basepath):
         """Get filename indicating all comics are downloaded."""
-        dirname = getDirname(self.getName())
+        dirname = getDirname(self.name)
         return os.path.join(basepath, dirname, "complete.txt")

     def isComplete(self, basepath):
@@ -517,63 +519,66 @@ class _ParserScraper(Scraper):
         return res

-def find_scraperclasses(comic, multiple_allowed=False):
-    """Get a list comic scraper classes. Can return more than one entries if
-    multiple_allowed is True, else it raises a ValueError if multiple
-    modules match. The match is a case insensitive substring search."""
+def find_scrapers(comic, multiple_allowed=False):
+    """Get a list comic scraper objects.
+
+    Can return more than one entry if multiple_allowed is True, else it raises
+    a ValueError if multiple modules match. The match is a case insensitive
+    substring search.
+    """
     if not comic:
         raise ValueError("empty comic name")
     candidates = []
     cname = comic.lower()
-    for scraperclass in get_scraperclasses():
-        lname = scraperclass.getName().lower()
+    for scrapers in get_scrapers():
+        lname = scrapers.name.lower()
         if lname == cname:
             # perfect match
             if not multiple_allowed:
-                return [scraperclass]
+                return [scrapers]
             else:
-                candidates.append(scraperclass)
+                candidates.append(scrapers)
         elif cname in lname:
-            candidates.append(scraperclass)
+            candidates.append(scrapers)
     if len(candidates) > 1 and not multiple_allowed:
-        comics = ", ".join(x.getName() for x in candidates)
+        comics = ", ".join(x.name for x in candidates)
         raise ValueError('multiple comics found: %s' % comics)
     elif not candidates:
         raise ValueError('comic %r not found' % comic)
     return candidates

-_scraperclasses = None
+_scrapers = None

-def get_scraperclasses():
+def get_scrapers():
     """Find all comic scraper classes in the plugins directory.
     The result is cached.
     @return: list of Scraper classes
     @rtype: list of Scraper
     """
-    global _scraperclasses
-    if _scraperclasses is None:
+    global _scrapers
+    if _scrapers is None:
         out.debug(u"Loading comic modules...")
         modules = loader.get_modules('plugins')
         plugins = loader.get_plugins(modules, Scraper)
-        _scraperclasses = sorted(plugins, key=lambda p: p.getName())
+        _scrapers = sorted([x() for x in plugins], key=lambda p: p.name)
         check_scrapers()
-        out.debug(u"... %d modules loaded." % len(_scraperclasses))
-    return _scraperclasses
+        out.debug(u"... %d modules loaded." % len(_scrapers))
+    return _scrapers

 def check_scrapers():
-    """Check for duplicate scraper class names."""
+    """Check for duplicate scraper names."""
     d = {}
-    for scraperclass in _scraperclasses:
-        name = scraperclass.getName().lower()
+    for scraper in _scrapers:
+        name = scraper.name.lower()
         if name in d:
-            name1 = scraperclass.getName()
-            name2 = d[name].getName()
+            name1 = scraper.name
+            name2 = d[name].name
             raise ValueError('duplicate scrapers %s and %s found' %
                              (name1, name2))
-        d[name] = scraperclass
+        d[name] = scraper

 def make_scraper(classname, scraperType=_BasicScraper, **attributes):
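With the renamed module entry points, get_scrapers() instantiates every plugin class exactly once and caches the list in _scrapers, and find_scrapers() searches those shared instances. A usage sketch (function names from the hunk above; the lookup strings are examples only):

    from dosagelib import scraper

    all_scrapers = scraper.get_scrapers()   # instantiated once, then cached
    print(len(all_scrapers))

    # Case-insensitive exact-or-substring lookup.
    for s in scraper.find_scrapers('xkcd'):
        print(s.name, s.url)

    # Ambiguous substrings raise unless multiple_allowed=True.
    try:
        scraper.find_scrapers('comic')
    except ValueError as err:
        print(err)  # e.g. 'multiple comics found: ...'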

View file

@@ -13,8 +13,8 @@ from dosagelib import scraper
 class TestComicNames(object):

     def test_names(self):
-        for scraperclass in scraper.get_scraperclasses():
-            name = scraperclass.getName()
+        for scraperobj in scraper.get_scrapers():
+            name = scraperobj.name
             assert name.count('/') <= 1
             if '/' in name:
                 comicname = name.split('/')[1]

View file

@@ -33,18 +33,17 @@ def get_lock(host):
     return _locks[host]

-def _get_saved_images(outdir, scraper):
+def _get_saved_images(outdir, scraperobj):
     """Get saved images."""
-    dirs = tuple(scraper.getName().split('/'))
+    dirs = tuple(scraperobj.name.split('/'))
     files = os.listdir(os.path.join(outdir, *dirs))
     files = [x for x in files if not x.endswith(".txt")]
     return files

-def test_comicmodule(tmpdir, scraperclass):
+def test_comicmodule(tmpdir, scraperobj):
     '''Test a scraper. It must be able to traverse backward for at least 5
     strips from the start, and find strip images on at least 4 pages.'''
-    scraperobj = scraperclass()
     # Limit number of connections to one host.
     host = get_host(scraperobj.url)
     try:
@@ -121,11 +120,11 @@ def _check_stripurl(strip, scraperobj):
     assert mo is not None, err

-def get_test_scraperclasses():
+def get_test_scrapers():
     """Return scrapers that should be tested."""
     if "TESTALL" in os.environ:
         # test all comics (this will take some time)
-        scraperclasses = scraper.get_scraperclasses()
+        scrapers = scraper.get_scrapers()
     else:
         if 'TESTCOMICS' in os.environ:
             scraper_pattern = re.compile(os.environ['TESTCOMICS'])
@@ -139,13 +138,13 @@ def get_test_scraperclasses():
            ]
            scraper_pattern = re.compile('|'.join(testscrapernames))

-        scraperclasses = [
-            scraperclass for scraperclass in scraper.get_scraperclasses()
-            if scraper_pattern.match(scraperclass.getName())
-        ]
-    return scraperclasses
+        scrapers = [
+            scraperobj for scraperobj in scraper.get_scrapers()
+            if scraper_pattern.match(scraperobj.name)
+        ]
+    return scrapers

 def pytest_generate_tests(metafunc):
-    if 'scraperclass' in metafunc.fixturenames:
-        metafunc.parametrize('scraperclass', get_test_scraperclasses())
+    if 'scraperobj' in metafunc.fixturenames:
+        metafunc.parametrize('scraperobj', get_test_scrapers())

View file

@@ -9,20 +9,19 @@ from dosagelib import scraper
 class TestScraper(object):
     """Test scraper module functions."""

-    def test_get_scraperclasses(self):
-        for scraperclass in scraper.get_scraperclasses():
-            scraperobj = scraperclass()
-            scraperobj = scraperclass(indexes=["bla"])
-            assert scraperobj.url, "missing url in %s" % scraperobj.getName()
+    def test_get_scrapers(self):
+        for scraperobj in scraper.get_scrapers():
+            scraperobj.indexes = ["bla"]
+            assert scraperobj.url, "missing url in %s" % scraperobj.name

-    def test_find_scraperclasses_single(self):
-        result = scraper.find_scraperclasses("xkcd")
+    def test_find_scrapers_single(self):
+        result = scraper.find_scrapers("xkcd")
         assert len(result) == 1

-    def test_find_scraperclasses_multi(self):
-        result = scraper.find_scraperclasses("a", multiple_allowed=True)
+    def test_find_scrapers_multi(self):
+        result = scraper.find_scrapers("a", multiple_allowed=True)
         assert len(result) > 1

-    def test_find_scraperclasses_error(self):
+    def test_find_scrapers_error(self):
         with pytest.raises(ValueError):
-            scraper.find_scraperclasses("")
+            scraper.find_scrapers("")

View file

@@ -12,5 +12,5 @@ class ATestScraper(scraper._BasicScraper):
 class TestVote(object):

     def test_vote(self):
-        answer = ATestScraper.vote()
+        answer = ATestScraper().vote()
         assert answer in ('counted', 'no'), 'invalid answer %r' % answer