From 060281e5ff82c7cd12ab5eafa5a1bfc69bd93c09 Mon Sep 17 00:00:00 2001 From: Tobias Gruetzmacher Date: Wed, 13 Apr 2016 22:05:44 +0200 Subject: [PATCH] Use concrete scraper objects everywhere. This is a first step for #42. Since most access to the scraper classes is through instances, modules can now dynamically override url and name (name is now a property). --- dosage | 12 ++-- dosagelib/director.py | 70 +++++++++++++---------- dosagelib/output.py | 2 +- dosagelib/plugins/comicfury.py | 6 +- dosagelib/plugins/creators.py | 6 +- dosagelib/plugins/gocomics.py | 6 +- dosagelib/plugins/nuklearpower.py | 6 +- dosagelib/plugins/wlpcomics.py | 6 +- dosagelib/scraper.py | 95 ++++++++++++++++--------------- tests/test_comicnames.py | 4 +- tests/test_comics.py | 23 ++++---- tests/test_scraper.py | 21 ++++--- tests/test_vote.py | 2 +- 13 files changed, 137 insertions(+), 122 deletions(-) diff --git a/dosage b/dosage index 3f162bc6e..e00d5dac7 100755 --- a/dosage +++ b/dosage @@ -140,7 +140,7 @@ def display_help(options): def display_comic_help(scraperobj): """Print help for a comic.""" orig_context = out.context - out.context = scraperobj.getName() + out.context = scraperobj.name try: out.info(u"URL: " + scraperobj.url) out.info(u"Language: " + scraperobj.language()) @@ -178,9 +178,9 @@ def vote_comic(scraperobj): """Vote for given comic scraper.""" errors = 0 orig_context = out.context - out.context = scraperobj.getName() + out.context = scraperobj.name try: - name = scraperobj.getName() + name = scraperobj.name answer = scraperobj.vote() out.debug(u'Vote answer %r' % answer) if answer == 'counted': @@ -230,7 +230,7 @@ def do_list(column_list=True, verbose=False): out.info(u'Comics tagged with [%s] require age confirmation with the --adult option.' % TAG_ADULT) out.info(u'Non-english comics are tagged with [%s].' % TAG_LANG) scrapers = sorted(director.getAllScrapers(listing=True), - key=lambda s: s.getName()) + key=lambda s: s.name) if column_list: num, disabled = do_column_list(scrapers) else: @@ -293,7 +293,7 @@ def get_tagged_scraper_name(scraperobj, limit=None, reasons=None): suffix = " [" + ", ".join(tags) + "]" else: suffix = "" - name = scraperobj.getName() + name = scraperobj.name if limit is not None: name = strlimit(name, limit) return name + suffix @@ -317,7 +317,7 @@ def main(): def profile(): """Profile the loading of all scrapers.""" import cProfile - cProfile.run("scraper.get_scraperclasses()", "dosage.prof") + cProfile.run("scraper.get_scrapers()", "dosage.prof") def viewprof(): diff --git a/dosagelib/director.py b/dosagelib/director.py index 3d003486d..e3991007e 100644 --- a/dosagelib/director.py +++ b/dosagelib/director.py @@ -1,5 +1,10 @@ -# -*- coding: iso-8859-1 -*- -# Copyright (C) 2014 Bastian Kleineidam +# -*- coding: utf-8 -*- +# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs +# Copyright (C) 2012-2014 Bastian Kleineidam +# Copyright (C) 2015-2016 Tobias Gruetzmacher + +from __future__ import absolute_import, division, print_function + import os import threading try: @@ -14,6 +19,7 @@ try: from urllib.parse import urlparse except ImportError: from urlparse import urlparse + from .output import out from . import events, scraper from .util import getDirname @@ -55,6 +61,8 @@ def get_hostname(url): lock = threading.Lock() + + def get_host_lock(url): """Get lock object for given URL host.""" hostname = get_hostname(url) @@ -68,7 +76,7 @@ class ComicGetter(threading.Thread): """Store options.""" super(ComicGetter, self).__init__() self.options = options - self.origname = self.getName() + self.origname = self.name self.stopped = False self.errors = 0 @@ -76,10 +84,10 @@ class ComicGetter(threading.Thread): """Process from queue until it is empty.""" try: while not self.stopped: - scraperobj = jobs.get(False) - self.setName(scraperobj.getName()) + scraper = jobs.get(False) + self.name = scraper.name try: - self.getStrips(scraperobj) + self.getStrips(scraper) finally: jobs.task_done() self.setName(self.origname) @@ -93,7 +101,7 @@ class ComicGetter(threading.Thread): with lock: host_lock = get_host_lock(scraperobj.url) with host_lock: - self._getStrips(scraperobj) + self._getStrips(scraper) def _getStrips(self, scraperobj): """Get all strips from a scraper.""" @@ -117,7 +125,8 @@ class ComicGetter(threading.Thread): if self.stopped: break if self.options.all and not (self.errors or self.options.dry_run or - self.options.cont or scraperobj.indexes): + self.options.cont or + scraperobj.indexes): scraperobj.setComplete(self.options.basepath) except Exception as msg: out.exception(msg) @@ -158,7 +167,8 @@ def getComics(options): events.getHandler().start() errors = 0 try: - for scraperobj in getScrapers(options.comic, options.basepath, options.adult, options.multimatch): + for scraperobj in getScrapers(options.comic, options.basepath, + options.adult, options.multimatch): jobs.put(scraperobj) # start threads num_threads = min(options.parallel, jobs.qsize()) @@ -200,16 +210,16 @@ def getScrapers(comics, basepath=None, adult=True, multiple_allowed=False, listi # only scrapers whose directory already exists if len(comics) > 1: out.warn(u"using '@' as comic name ignores all other specified comics.") - for scraperclass in scraper.get_scraperclasses(): - dirname = getDirname(scraperclass.getName()) + for scraperobj in scraper.get_scrapers(): + dirname = getDirname(scraperobj.name) if os.path.isdir(os.path.join(basepath, dirname)): - if shouldRunScraper(scraperclass, adult, listing): - yield scraperclass() + if shouldRunScraper(scraperobj, adult, listing): + yield scraperobj elif '@@' in comics: # all scrapers - for scraperclass in scraper.get_scraperclasses(): - if shouldRunScraper(scraperclass, adult, listing): - yield scraperclass() + for scraperobj in scraper.get_scrapers(): + if shouldRunScraper(scraperobj, adult, listing): + yield scraperobj else: # get only selected comic scrapers # store them in a set to eliminate duplicates @@ -227,32 +237,34 @@ def getScrapers(comics, basepath=None, adult=True, multiple_allowed=False, listi else: name = comic indexes = None - scraperclasses = scraper.find_scraperclasses(name, multiple_allowed=multiple_allowed) - for scraperclass in scraperclasses: - if shouldRunScraper(scraperclass, adult, listing): - scraperobj = scraperclass(indexes=indexes) + scrapers = scraper.find_scrapers(name, multiple_allowed=multiple_allowed) + for scraperobj in scrapers: + if shouldRunScraper(scraperobj, adult, listing): + # FIXME: Find a better way to work with indexes + scraperobj.indexes = indexes if scraperobj not in scrapers: scrapers.add(scraperobj) yield scraperobj -def shouldRunScraper(scraperclass, adult=True, listing=False): +def shouldRunScraper(scraperobj, adult=True, listing=False): if listing: return True - if not adult and scraperclass.adult: - warn_adult(scraperclass) + if not adult and scraperobj.adult: + warn_adult(scraperobj) return False - reasons = scraperclass.getDisabledReasons() + reasons = scraperobj.getDisabledReasons() if reasons: - warn_disabled(scraperclass, reasons) + warn_disabled(scraperobj, reasons) return False return True -def warn_adult(scraperclass): +def warn_adult(scraperobj): """Print warning about adult content.""" - out.warn(u"skipping adult comic %s; use the --adult option to confirm your age" % scraperclass.getName()) + out.warn(u"skipping adult comic %s; use the --adult option to confirm your age" % scraperobj.name) -def warn_disabled(scraperclass, reasons): + +def warn_disabled(scraperobj, reasons): """Print warning about disabled comic modules.""" - out.warn(u"Skipping comic %s: %s" % (scraperclass.getName(), ' '.join(reasons.values()))) + out.warn(u"Skipping comic %s: %s" % (scraperobj.name, ' '.join(reasons.values()))) diff --git a/dosagelib/output.py b/dosagelib/output.py index 451ff08ca..2de380716 100644 --- a/dosagelib/output.py +++ b/dosagelib/output.py @@ -29,7 +29,7 @@ lock = threading.Lock() def get_threadname(): """Return name of current thread.""" - return threading.current_thread().getName() + return threading.current_thread().name class Output(object): diff --git a/dosagelib/plugins/comicfury.py b/dosagelib/plugins/comicfury.py index deb257536..a4caef9f4 100644 --- a/dosagelib/plugins/comicfury.py +++ b/dosagelib/plugins/comicfury.py @@ -29,9 +29,9 @@ class _ComicFury(_ParserScraper): num = parts[-1] return "%s_%s%s" % (cls.__name__[2:], num, ext) - @classmethod - def getName(cls): - return 'ComicFury/' + cls.__name__[2:] + @property + def name(self): + return 'ComicFury/' + super(_ComicFury, self).name[2:] def getIndexStripUrl(self, index): return self.url + 'comics/%s' % index diff --git a/dosagelib/plugins/creators.py b/dosagelib/plugins/creators.py index da0a48ce1..d2cd54934 100644 --- a/dosagelib/plugins/creators.py +++ b/dosagelib/plugins/creators.py @@ -14,9 +14,9 @@ class _Creators(_ParserScraper): prevSearch = '//a[@id="nav_prev"]' latestSearch = '//div[contains(@class,"caption")]/a' - @classmethod - def getName(cls): - return 'Creators/' + cls.__name__ + @property + def name(self): + return 'Creators/' + super(_Creators, self).name def starter(self): start = self.url + self.path diff --git a/dosagelib/plugins/gocomics.py b/dosagelib/plugins/gocomics.py index 36694652c..91bf7fef2 100644 --- a/dosagelib/plugins/gocomics.py +++ b/dosagelib/plugins/gocomics.py @@ -16,9 +16,9 @@ class _GoComics(_ParserScraper): nextSearch = '//ul[@class="feature-nav"]//a[@class="next"]' help = 'Index format: yyyy/mm/dd' - @classmethod - def getName(cls): - return 'GoComics/' + cls.__name__[2:] + @property + def name(self): + return 'GoComics/' + super(_GoComics, self).name[2:] def starter(self): url1 = self.url + self.path diff --git a/dosagelib/plugins/nuklearpower.py b/dosagelib/plugins/nuklearpower.py index b6d59aec5..d251d7be3 100644 --- a/dosagelib/plugins/nuklearpower.py +++ b/dosagelib/plugins/nuklearpower.py @@ -14,9 +14,9 @@ class _NuklearPower(_ParserScraper): def starter(self): return self.url + self.path + '/' - @classmethod - def getName(cls): - return 'NuklearPower/' + cls.__name__[2:] + @property + def name(self): + return 'NuklearPower/' + super(_NuklearPower, self).name[2:] class NP8BitTheater(_NuklearPower): diff --git a/dosagelib/plugins/wlpcomics.py b/dosagelib/plugins/wlpcomics.py index c854b22f5..72b6c690b 100644 --- a/dosagelib/plugins/wlpcomics.py +++ b/dosagelib/plugins/wlpcomics.py @@ -16,9 +16,9 @@ class _WLPComics(_ParserScraper): starter = bounceStarter help = 'Index format: nnn' - @classmethod - def getName(cls): - return 'WLP/' + cls.__name__ + @property + def name(self): + return 'WLP/' + super(_WLPComics, self).name @classmethod def namer(cls, image_url, page_url): diff --git a/dosagelib/scraper.py b/dosagelib/scraper.py index d408d1383..6dd7e3af6 100644 --- a/dosagelib/scraper.py +++ b/dosagelib/scraper.py @@ -90,13 +90,19 @@ class Scraper(object): # HTTP session for configuration & cookies session = requests_session() - def __init__(self, indexes=None): + @property + def indexes(self): + return self._indexes + + @indexes.setter + def indexes(self, val): + if val: + self._indexes = tuple(sorted(val)) + + def __init__(self): """Initialize internal variables.""" self.urls = set() - if indexes: - self.indexes = tuple(sorted(indexes)) - else: - self.indexes = tuple() + self._indexes = tuple() self.skippedUrls = set() self.hitFirstStripUrl = False @@ -105,7 +111,7 @@ class Scraper(object): if not isinstance(other, Scraper): return 1 # first, order by name - d = cmp(self.getName(), other.getName()) + d = cmp(self.name, other.name) if d != 0: return d # then by indexes @@ -113,7 +119,7 @@ class Scraper(object): def __hash__(self): """Get hash value from name and index list.""" - return hash((self.getName(), self.indexes)) + return hash((self.name, self.indexes)) def shouldSkipUrl(self, url, data): """Determine if search for images in given URL should be skipped.""" @@ -141,7 +147,7 @@ class Scraper(object): optional=self.textOptional) else: text = None - return ComicStrip(self.getName(), url, imageUrls, self.namer, + return ComicStrip(self.name, url, imageUrls, self.namer, self.session, text=text) def getStrips(self, maxstrips=None): @@ -217,24 +223,21 @@ class Scraper(object): else: prevUrl = self.prevUrlModifier(prevUrl) out.debug(u"Found previous URL %s" % prevUrl) - getHandler().comicPageLink(self.getName(), url, prevUrl) + getHandler().comicPageLink(self.name, url, prevUrl) return prevUrl def getIndexStripUrl(self, index): """Get comic strip URL from index.""" return self.stripUrl % index - @classmethod - def getName(cls): + @property + def name(self): """Get scraper name.""" - if hasattr(cls, 'name'): - return cls.name - return cls.__name__ + return self.__class__.__name__ - @classmethod - def starter(cls): + def starter(self): """Get starter URL from where to scrape comic strips.""" - return cls.url + return self.url @classmethod def namer(cls, imageUrl, pageUrl): @@ -261,18 +264,17 @@ class Scraper(object): """Get starter URL from where to scrape comic strips.""" return self.starter() - @classmethod - def vote(cls): + def vote(self): """Cast a public vote for this comic.""" url = configuration.VoteUrl + 'count/' uid = get_system_uid() - data = {"name": cls.getName().replace('/', '_'), "uid": uid} - page = urlopen(url, cls.session, data=data) + data = {"name": self.name.replace('/', '_'), "uid": uid} + page = urlopen(url, self.session, data=data) return page.text def getCompleteFile(self, basepath): """Get filename indicating all comics are downloaded.""" - dirname = getDirname(self.getName()) + dirname = getDirname(self.name) return os.path.join(basepath, dirname, "complete.txt") def isComplete(self, basepath): @@ -517,63 +519,66 @@ class _ParserScraper(Scraper): return res -def find_scraperclasses(comic, multiple_allowed=False): - """Get a list comic scraper classes. Can return more than one entries if - multiple_allowed is True, else it raises a ValueError if multiple - modules match. The match is a case insensitive substring search.""" +def find_scrapers(comic, multiple_allowed=False): + """Get a list comic scraper objects. + + Can return more than one entry if multiple_allowed is True, else it raises + a ValueError if multiple modules match. The match is a case insensitive + substring search. + """ if not comic: raise ValueError("empty comic name") candidates = [] cname = comic.lower() - for scraperclass in get_scraperclasses(): - lname = scraperclass.getName().lower() + for scrapers in get_scrapers(): + lname = scrapers.name.lower() if lname == cname: # perfect match if not multiple_allowed: - return [scraperclass] + return [scrapers] else: - candidates.append(scraperclass) + candidates.append(scrapers) elif cname in lname: - candidates.append(scraperclass) + candidates.append(scrapers) if len(candidates) > 1 and not multiple_allowed: - comics = ", ".join(x.getName() for x in candidates) + comics = ", ".join(x.name for x in candidates) raise ValueError('multiple comics found: %s' % comics) elif not candidates: raise ValueError('comic %r not found' % comic) return candidates -_scraperclasses = None +_scrapers = None -def get_scraperclasses(): +def get_scrapers(): """Find all comic scraper classes in the plugins directory. The result is cached. @return: list of Scraper classes @rtype: list of Scraper """ - global _scraperclasses - if _scraperclasses is None: + global _scrapers + if _scrapers is None: out.debug(u"Loading comic modules...") modules = loader.get_modules('plugins') plugins = loader.get_plugins(modules, Scraper) - _scraperclasses = sorted(plugins, key=lambda p: p.getName()) + _scrapers = sorted([x() for x in plugins], key=lambda p: p.name) check_scrapers() - out.debug(u"... %d modules loaded." % len(_scraperclasses)) - return _scraperclasses + out.debug(u"... %d modules loaded." % len(_scrapers)) + return _scrapers def check_scrapers(): - """Check for duplicate scraper class names.""" + """Check for duplicate scraper names.""" d = {} - for scraperclass in _scraperclasses: - name = scraperclass.getName().lower() + for scraper in _scrapers: + name = scraper.name.lower() if name in d: - name1 = scraperclass.getName() - name2 = d[name].getName() + name1 = scraper.name + name2 = d[name].name raise ValueError('duplicate scrapers %s and %s found' % (name1, name2)) - d[name] = scraperclass + d[name] = scraper def make_scraper(classname, scraperType=_BasicScraper, **attributes): diff --git a/tests/test_comicnames.py b/tests/test_comicnames.py index 0d9199591..29909b911 100644 --- a/tests/test_comicnames.py +++ b/tests/test_comicnames.py @@ -13,8 +13,8 @@ from dosagelib import scraper class TestComicNames(object): def test_names(self): - for scraperclass in scraper.get_scraperclasses(): - name = scraperclass.getName() + for scraperobj in scraper.get_scrapers(): + name = scraperobj.name assert name.count('/') <= 1 if '/' in name: comicname = name.split('/')[1] diff --git a/tests/test_comics.py b/tests/test_comics.py index 28fafcdd6..65047c76c 100644 --- a/tests/test_comics.py +++ b/tests/test_comics.py @@ -33,18 +33,17 @@ def get_lock(host): return _locks[host] -def _get_saved_images(outdir, scraper): +def _get_saved_images(outdir, scraperobj): """Get saved images.""" - dirs = tuple(scraper.getName().split('/')) + dirs = tuple(scraperobj.name.split('/')) files = os.listdir(os.path.join(outdir, *dirs)) files = [x for x in files if not x.endswith(".txt")] return files -def test_comicmodule(tmpdir, scraperclass): +def test_comicmodule(tmpdir, scraperobj): '''Test a scraper. It must be able to traverse backward for at least 5 strips from the start, and find strip images on at least 4 pages.''' - scraperobj = scraperclass() # Limit number of connections to one host. host = get_host(scraperobj.url) try: @@ -121,11 +120,11 @@ def _check_stripurl(strip, scraperobj): assert mo is not None, err -def get_test_scraperclasses(): +def get_test_scrapers(): """Return scrapers that should be tested.""" if "TESTALL" in os.environ: # test all comics (this will take some time) - scraperclasses = scraper.get_scraperclasses() + scrapers = scraper.get_scrapers() else: if 'TESTCOMICS' in os.environ: scraper_pattern = re.compile(os.environ['TESTCOMICS']) @@ -139,13 +138,13 @@ def get_test_scraperclasses(): ] scraper_pattern = re.compile('|'.join(testscrapernames)) - scraperclasses = [ - scraperclass for scraperclass in scraper.get_scraperclasses() - if scraper_pattern.match(scraperclass.getName()) + scrapers = [ + scraperobj for scraperobj in scraper.get_scrapers() + if scraper_pattern.match(scraperobj.name) ] - return scraperclasses + return scrapers def pytest_generate_tests(metafunc): - if 'scraperclass' in metafunc.fixturenames: - metafunc.parametrize('scraperclass', get_test_scraperclasses()) + if 'scraperobj' in metafunc.fixturenames: + metafunc.parametrize('scraperobj', get_test_scrapers()) diff --git a/tests/test_scraper.py b/tests/test_scraper.py index 6242cba43..8ac335544 100644 --- a/tests/test_scraper.py +++ b/tests/test_scraper.py @@ -9,20 +9,19 @@ from dosagelib import scraper class TestScraper(object): """Test scraper module functions.""" - def test_get_scraperclasses(self): - for scraperclass in scraper.get_scraperclasses(): - scraperobj = scraperclass() - scraperobj = scraperclass(indexes=["bla"]) - assert scraperobj.url, "missing url in %s" % scraperobj.getName() + def test_get_scrapers(self): + for scraperobj in scraper.get_scrapers(): + scraperobj.indexes = ["bla"] + assert scraperobj.url, "missing url in %s" % scraperobj.name - def test_find_scraperclasses_single(self): - result = scraper.find_scraperclasses("xkcd") + def test_find_scrapers_single(self): + result = scraper.find_scrapers("xkcd") assert len(result) == 1 - def test_find_scraperclasses_multi(self): - result = scraper.find_scraperclasses("a", multiple_allowed=True) + def test_find_scrapers_multi(self): + result = scraper.find_scrapers("a", multiple_allowed=True) assert len(result) > 1 - def test_find_scraperclasses_error(self): + def test_find_scrapers_error(self): with pytest.raises(ValueError): - scraper.find_scraperclasses("") + scraper.find_scrapers("") diff --git a/tests/test_vote.py b/tests/test_vote.py index 1600c4196..8b8f8da37 100644 --- a/tests/test_vote.py +++ b/tests/test_vote.py @@ -12,5 +12,5 @@ class ATestScraper(scraper._BasicScraper): class TestVote(object): def test_vote(self): - answer = ATestScraper.vote() + answer = ATestScraper().vote() assert answer in ('counted', 'no'), 'invalid answer %r' % answer