From 9237bd62b270a0177b7ac9ed719d605c99dff8fb Mon Sep 17 00:00:00 2001 From: Tobias Gruetzmacher Date: Thu, 1 Oct 2020 18:49:14 +0200 Subject: [PATCH] Convert scraper cache to a class This should make it easier to extend with additional entries. --- dosagelib/cmd.py | 7 ++- dosagelib/director.py | 7 ++- dosagelib/scraper.py | 122 ++++++++++++++++++++------------------ scripts/mklanguages.py | 4 +- scripts/scriptutil.py | 5 +- tests/modules/conftest.py | 8 +-- tests/test_comicnames.py | 10 ++-- tests/test_scraper.py | 12 ++-- 8 files changed, 92 insertions(+), 83 deletions(-) diff --git a/dosagelib/cmd.py b/dosagelib/cmd.py index 4db38bb93..cd0ed61fc 100644 --- a/dosagelib/cmd.py +++ b/dosagelib/cmd.py @@ -1,13 +1,14 @@ # SPDX-License-Identifier: MIT # Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2012-2014 Bastian Kleineidam -# Copyright (C) 2015-2019 Tobias Gruetzmacher +# Copyright (C) 2015-2020 Tobias Gruetzmacher import argparse import os -from . import events, configuration, singleton, director, scraper +from . import events, configuration, singleton, director from . import AppName, __version__ from .output import out +from .scraper import scrapers as allscrapers from .util import internal_error, strlimit @@ -243,7 +244,7 @@ def do_list(column_list=True, verbose=False, listall=False): out.info(u'Comics tagged with [{}] require age confirmation' ' with the --adult option.'.format(TAG_ADULT)) out.info(u'Non-english comics are tagged with [%s].' % TAG_LANG) - scrapers = sorted(scraper.get_scrapers(listall), + scrapers = sorted(allscrapers.get(listall), key=lambda s: s.name.lower()) if column_list: num, disabled = do_column_list(scrapers) diff --git a/dosagelib/director.py b/dosagelib/director.py index bb2479e96..a7811bec0 100644 --- a/dosagelib/director.py +++ b/dosagelib/director.py @@ -10,7 +10,8 @@ from queue import Queue, Empty from urllib.parse import urlparse from .output import out -from . import events, scraper +from .scraper import scrapers as allscrapers +from . import events class ComicQueue(Queue): @@ -209,7 +210,7 @@ def getScrapers(comics, basepath=None, adult=True, multiple_allowed=False, listi else: name = comic indexes = None - found_scrapers = scraper.find_scrapers(name, multiple_allowed=multiple_allowed) + found_scrapers = allscrapers.find(name, multiple_allowed=multiple_allowed) for scraperobj in found_scrapers: if shouldRunScraper(scraperobj, adult, listing): # FIXME: Find a better way to work with indexes @@ -220,7 +221,7 @@ def getScrapers(comics, basepath=None, adult=True, multiple_allowed=False, listi def get_existing_comics(basepath=None, adult=True, listing=False): - for scraperobj in scraper.get_scrapers(include_removed=True): + for scraperobj in allscrapers.get(include_removed=True): dirname = scraperobj.get_download_dir(basepath) if os.path.isdir(dirname): if shouldRunScraper(scraperobj, adult, listing): diff --git a/dosagelib/scraper.py b/dosagelib/scraper.py index c54aaac34..41402e62f 100644 --- a/dosagelib/scraper.py +++ b/dosagelib/scraper.py @@ -536,68 +536,74 @@ class _ParserScraper(Scraper): return res -def find_scrapers(comic, multiple_allowed=False): - """Get a list comic scraper objects. - - Can return more than one entry if multiple_allowed is True, else it raises - a ValueError if multiple modules match. The match is a case insensitive - substring search. +class Cache: + """Cache for comic scraper objects. The cache is initialized on first use. + This is cached, since iterating & loading a complete package might be quite + slow. """ - if not comic: - raise ValueError("empty comic name") - candidates = [] - cname = comic.lower() - for scrapers in get_scrapers(include_removed=True): - lname = scrapers.name.lower() - if lname == cname: - # perfect match - if not multiple_allowed: - return [scrapers] - else: + def __init__(self): + self.data = None + + def find(self, comic, multiple_allowed=False): + """Get a list comic scraper objects. + + Can return more than one entry if multiple_allowed is True, else it raises + a ValueError if multiple modules match. The match is a case insensitive + substring search. + """ + if not comic: + raise ValueError("empty comic name") + candidates = [] + cname = comic.lower() + for scrapers in self.get(include_removed=True): + lname = scrapers.name.lower() + if lname == cname: + # perfect match + if not multiple_allowed: + return [scrapers] + else: + candidates.append(scrapers) + elif cname in lname and scrapers.url: candidates.append(scrapers) - elif cname in lname and scrapers.url: - candidates.append(scrapers) - if len(candidates) > 1 and not multiple_allowed: - comics = ", ".join(x.name for x in candidates) - raise ValueError('multiple comics found: %s' % comics) - elif not candidates: - raise ValueError('comic %r not found' % comic) - return candidates + if len(candidates) > 1 and not multiple_allowed: + comics = ", ".join(x.name for x in candidates) + raise ValueError('multiple comics found: %s' % comics) + elif not candidates: + raise ValueError('comic %r not found' % comic) + return candidates - -_scrapers = None - - -def get_scrapers(include_removed=False): - """Find all comic scraper classes in the plugins directory. - The result is cached. - @return: list of Scraper classes - @rtype: list of Scraper - """ - global _scrapers - if _scrapers is None: - out.debug(u"Loading comic modules...") + def load(self): + out.debug("Loading comic modules...") modules = loader.get_modules('plugins') plugins = list(loader.get_plugins(modules, Scraper)) - _scrapers = sorted([m for x in plugins for m in x.getmodules()], - key=lambda p: p.name) - check_scrapers() - out.debug(u"... %d modules loaded from %d classes." % ( - len(_scrapers), len(plugins))) - if include_removed: - return _scrapers - else: - return [x for x in _scrapers if x.url] + self.data = list([m for x in plugins for m in x.getmodules()]) + self.validate() + out.debug("... %d modules loaded from %d classes." % ( + len(self.data), len(plugins))) + + def get(self, include_removed=False): + """Find all comic scraper classes in the plugins directory. + @return: list of Scraper classes + @rtype: list of Scraper + """ + if not self.data: + self.load() + if include_removed: + return self.data + else: + return [x for x in self.data if x.url] + + def validate(self): + """Check for duplicate scraper names.""" + d = {} + for scraper in self.data: + name = scraper.name.lower() + if name in d: + name1 = scraper.name + name2 = d[name].name + raise ValueError('duplicate scrapers %s and %s found' % + (name1, name2)) + d[name] = scraper -def check_scrapers(): - """Check for duplicate scraper names.""" - d = {} - for scraper in _scrapers: - name = scraper.name.lower() - if name in d: - name1 = scraper.name - name2 = d[name].name - raise ValueError('duplicate scrapers %s and %s found' % - (name1, name2)) - d[name] = scraper +scrapers = Cache() diff --git a/scripts/mklanguages.py b/scripts/mklanguages.py index 5cc3f16c4..cf8920fca 100755 --- a/scripts/mklanguages.py +++ b/scripts/mklanguages.py @@ -7,7 +7,7 @@ import os import codecs -from dosagelib.scraper import get_scrapers +from dosagelib.scraper import scrapers def main(): @@ -24,7 +24,7 @@ def main(): def get_used_languages(): languages = {} - for scraperobj in get_scrapers(): + for scraperobj in scrapers.get(): lang = scraperobj.lang if lang not in languages: languages[lang] = scraperobj.language() diff --git a/scripts/scriptutil.py b/scripts/scriptutil.py index 9c696d901..292596a5e 100644 --- a/scripts/scriptutil.py +++ b/scripts/scriptutil.py @@ -12,8 +12,9 @@ import time import lxml +from dosagelib.scraper import scrapers from dosagelib.util import get_page -from dosagelib import scraper, http +from dosagelib import http def first_lower(x): @@ -129,7 +130,7 @@ class ComicListUpdater(object): """Check if comic name already exists.""" names = [(tmpl % name).lower() for tmpl in self.dup_templates] if names: - for scraperobj in scraper.get_scrapers(): + for scraperobj in scrapers.get(): lname = scraperobj.name.lower() if lname in names: return scraperobj.name diff --git a/tests/modules/conftest.py b/tests/modules/conftest.py index c7af1ef78..0433ef575 100644 --- a/tests/modules/conftest.py +++ b/tests/modules/conftest.py @@ -1,21 +1,21 @@ # SPDX-License-Identifier: MIT # Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2012-2014 Bastian Kleineidam -# Copyright (C) 2015-2019 Tobias Gruetzmacher +# Copyright (C) 2015-2020 Tobias Gruetzmacher import re import os import pytest from xdist.dsession import LoadScopeScheduling -from dosagelib import scraper +from dosagelib.scraper import scrapers def get_test_scrapers(): """Return scrapers that should be tested.""" if "TESTALL" in os.environ: # test all comics (this will take some time) - return scraper.get_scrapers() + return scrapers.get() if 'TESTCOMICS' in os.environ: scraper_pattern = re.compile(os.environ['TESTCOMICS']) else: @@ -31,7 +31,7 @@ def get_test_scrapers(): scraper_pattern = re.compile('^(' + '|'.join(testscrapernames) + ')$') return [ - scraperobj for scraperobj in scraper.get_scrapers() + scraperobj for scraperobj in scrapers.get() if scraper_pattern.match(scraperobj.name) ] diff --git a/tests/test_comicnames.py b/tests/test_comicnames.py index f4ed860a8..7d094e0bf 100644 --- a/tests/test_comicnames.py +++ b/tests/test_comicnames.py @@ -1,17 +1,17 @@ # SPDX-License-Identifier: MIT # Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2012-2014 Bastian Kleineidam -# Copyright (C) 2015-2019 Tobias Gruetzmacher +# Copyright (C) 2015-2020 Tobias Gruetzmacher import re -from dosagelib import scraper +from dosagelib.scraper import scrapers from dosagelib.plugins import old class TestComicNames(object): def test_names(self): - for scraperobj in scraper.get_scrapers(): + for scraperobj in scrapers.get(): name = scraperobj.name assert name.count('/') <= 1 if '/' in name: @@ -21,10 +21,10 @@ class TestComicNames(object): assert re.sub("[^0-9a-zA-Z_]", "", comicname) == comicname def test_renamed(self): - for scraperobj in scraper.get_scrapers(include_removed=True): + for scraperobj in scrapers.get(include_removed=True): if not isinstance(scraperobj, old.Renamed): continue assert len(scraperobj.getDisabledReasons()) > 0 # Renamed scraper should only point to an non-disabled scraper - newscraper = scraper.find_scrapers(scraperobj.newname)[0] + newscraper = scrapers.find(scraperobj.newname)[0] assert len(newscraper.getDisabledReasons()) == 0 diff --git a/tests/test_scraper.py b/tests/test_scraper.py index 06076cddf..9a479b35b 100644 --- a/tests/test_scraper.py +++ b/tests/test_scraper.py @@ -1,26 +1,26 @@ # SPDX-License-Identifier: MIT # Copyright (C) 2013-2014 Bastian Kleineidam -# Copyright (C) 2015-2016 Tobias Gruetzmacher +# Copyright (C) 2015-2020 Tobias Gruetzmacher import pytest -from dosagelib import scraper +from dosagelib.scraper import scrapers class TestScraper(object): """Test scraper module functions.""" def test_get_scrapers(self): - for scraperobj in scraper.get_scrapers(): + for scraperobj in scrapers.get(): scraperobj.indexes = ["bla"] assert scraperobj.url, "missing url in %s" % scraperobj.name def test_find_scrapers_single(self): - result = scraper.find_scrapers("xkcd") + result = scrapers.find("xkcd") assert len(result) == 1 def test_find_scrapers_multi(self): - result = scraper.find_scrapers("a", multiple_allowed=True) + result = scrapers.find("a", multiple_allowed=True) assert len(result) > 1 def test_find_scrapers_error(self): with pytest.raises(ValueError, match='empty comic name'): - scraper.find_scrapers('') + scrapers.find('')