Convert scraper cache to a class

This should make it easier to extend with additional entries.
Tobias Gruetzmacher 2020-10-01 18:49:14 +02:00
parent 5ec0710d26
commit 9237bd62b2
8 changed files with 92 additions and 83 deletions
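
In short: the module-level functions and the global _scrapers cache in
dosagelib/scraper.py are replaced by a Cache class with a shared module-level
instance named scrapers. Call sites migrate as follows (a sketch distilled
from the diff below, not new API):

    # before: module-level functions backed by a global cache
    from dosagelib import scraper
    scraper.get_scrapers()
    scraper.find_scrapers("xkcd")

    # after: the same operations as methods on a shared Cache instance
    from dosagelib.scraper import scrapers
    scrapers.get()
    scrapers.find("xkcd")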

dosagelib/cmd.py

@@ -1,13 +1,14 @@
 # SPDX-License-Identifier: MIT
 # Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
 # Copyright (C) 2012-2014 Bastian Kleineidam
-# Copyright (C) 2015-2019 Tobias Gruetzmacher
+# Copyright (C) 2015-2020 Tobias Gruetzmacher
 import argparse
 import os

-from . import events, configuration, singleton, director, scraper
+from . import events, configuration, singleton, director
 from . import AppName, __version__
 from .output import out
+from .scraper import scrapers as allscrapers
 from .util import internal_error, strlimit
@@ -243,7 +244,7 @@ def do_list(column_list=True, verbose=False, listall=False):
     out.info(u'Comics tagged with [{}] require age confirmation'
              ' with the --adult option.'.format(TAG_ADULT))
     out.info(u'Non-english comics are tagged with [%s].' % TAG_LANG)
-    scrapers = sorted(scraper.get_scrapers(listall),
+    scrapers = sorted(allscrapers.get(listall),
                       key=lambda s: s.name.lower())
     if column_list:
         num, disabled = do_column_list(scrapers)

dosagelib/director.py

@@ -10,7 +10,8 @@ from queue import Queue, Empty
 from urllib.parse import urlparse

 from .output import out
-from . import events, scraper
+from .scraper import scrapers as allscrapers
+from . import events


 class ComicQueue(Queue):
@@ -209,7 +210,7 @@ def getScrapers(comics, basepath=None, adult=True, multiple_allowed=False, listi
         else:
             name = comic
             indexes = None
-        found_scrapers = scraper.find_scrapers(name, multiple_allowed=multiple_allowed)
+        found_scrapers = allscrapers.find(name, multiple_allowed=multiple_allowed)
         for scraperobj in found_scrapers:
             if shouldRunScraper(scraperobj, adult, listing):
                 # FIXME: Find a better way to work with indexes
@@ -220,7 +221,7 @@ def getScrapers(comics, basepath=None, adult=True, multiple_allowed=False, listi

 def get_existing_comics(basepath=None, adult=True, listing=False):
-    for scraperobj in scraper.get_scrapers(include_removed=True):
+    for scraperobj in allscrapers.get(include_removed=True):
         dirname = scraperobj.get_download_dir(basepath)
         if os.path.isdir(dirname):
             if shouldRunScraper(scraperobj, adult, listing):

dosagelib/scraper.py

@@ -536,68 +536,74 @@ class _ParserScraper(Scraper):
         return res


-def find_scrapers(comic, multiple_allowed=False):
-    """Get a list comic scraper objects.
-
-    Can return more than one entry if multiple_allowed is True, else it raises
-    a ValueError if multiple modules match. The match is a case insensitive
-    substring search.
-    """
-    if not comic:
-        raise ValueError("empty comic name")
-    candidates = []
-    cname = comic.lower()
-    for scrapers in get_scrapers(include_removed=True):
-        lname = scrapers.name.lower()
-        if lname == cname:
-            # perfect match
-            if not multiple_allowed:
-                return [scrapers]
-            else:
-                candidates.append(scrapers)
-        elif cname in lname and scrapers.url:
-            candidates.append(scrapers)
-    if len(candidates) > 1 and not multiple_allowed:
-        comics = ", ".join(x.name for x in candidates)
-        raise ValueError('multiple comics found: %s' % comics)
-    elif not candidates:
-        raise ValueError('comic %r not found' % comic)
-    return candidates
-
-
-_scrapers = None
-
-
-def get_scrapers(include_removed=False):
-    """Find all comic scraper classes in the plugins directory.
-    The result is cached.
-    @return: list of Scraper classes
-    @rtype: list of Scraper
-    """
-    global _scrapers
-    if _scrapers is None:
-        out.debug(u"Loading comic modules...")
-        modules = loader.get_modules('plugins')
-        plugins = list(loader.get_plugins(modules, Scraper))
-        _scrapers = sorted([m for x in plugins for m in x.getmodules()],
-                           key=lambda p: p.name)
-        check_scrapers()
-        out.debug(u"... %d modules loaded from %d classes." % (
-            len(_scrapers), len(plugins)))
-    if include_removed:
-        return _scrapers
-    else:
-        return [x for x in _scrapers if x.url]
-
-
-def check_scrapers():
-    """Check for duplicate scraper names."""
-    d = {}
-    for scraper in _scrapers:
-        name = scraper.name.lower()
-        if name in d:
-            name1 = scraper.name
-            name2 = d[name].name
-            raise ValueError('duplicate scrapers %s and %s found' %
-                             (name1, name2))
-        d[name] = scraper
+class Cache:
+    """Cache for comic scraper objects. The cache is initialized on first use.
+    This is cached, since iterating & loading a complete package might be quite
+    slow.
+    """
+
+    def __init__(self):
+        self.data = None
+
+    def find(self, comic, multiple_allowed=False):
+        """Get a list comic scraper objects.
+
+        Can return more than one entry if multiple_allowed is True, else it raises
+        a ValueError if multiple modules match. The match is a case insensitive
+        substring search.
+        """
+        if not comic:
+            raise ValueError("empty comic name")
+        candidates = []
+        cname = comic.lower()
+        for scrapers in self.get(include_removed=True):
+            lname = scrapers.name.lower()
+            if lname == cname:
+                # perfect match
+                if not multiple_allowed:
+                    return [scrapers]
+                else:
+                    candidates.append(scrapers)
+            elif cname in lname and scrapers.url:
+                candidates.append(scrapers)
+        if len(candidates) > 1 and not multiple_allowed:
+            comics = ", ".join(x.name for x in candidates)
+            raise ValueError('multiple comics found: %s' % comics)
+        elif not candidates:
+            raise ValueError('comic %r not found' % comic)
+        return candidates
+
+    def load(self):
+        out.debug("Loading comic modules...")
+        modules = loader.get_modules('plugins')
+        plugins = list(loader.get_plugins(modules, Scraper))
+        self.data = list([m for x in plugins for m in x.getmodules()])
+        self.validate()
+        out.debug("... %d modules loaded from %d classes." % (
+            len(self.data), len(plugins)))
+
+    def get(self, include_removed=False):
+        """Find all comic scraper classes in the plugins directory.
+        @return: list of Scraper classes
+        @rtype: list of Scraper
+        """
+        if not self.data:
+            self.load()
+        if include_removed:
+            return self.data
+        else:
+            return [x for x in self.data if x.url]
+
+    def validate(self):
+        """Check for duplicate scraper names."""
+        d = {}
+        for scraper in self.data:
+            name = scraper.name.lower()
+            if name in d:
+                name1 = scraper.name
+                name2 = d[name].name
+                raise ValueError('duplicate scrapers %s and %s found' %
+                                 (name1, name2))
+            d[name] = scraper
+
+
+scrapers = Cache()
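
The class keeps the old lazy-initialization behaviour: nothing is loaded
until the first call to get() or find() triggers load(). A minimal usage
sketch of the shared instance (the comic names are examples taken from the
test suite below):

    from dosagelib.scraper import scrapers

    # First access imports all plugin modules, checks for duplicate
    # names via validate(), and caches the result in scrapers.data.
    active = scrapers.get()                  # only scrapers with a url
    everything = scrapers.get(include_removed=True)

    # Case-insensitive substring search; an exact name match wins.
    assert len(scrapers.find("xkcd")) == 1
    matches = scrapers.find("a", multiple_allowed=True)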

scripts/mklanguages.py

@@ -7,7 +7,7 @@
 import os
 import codecs

-from dosagelib.scraper import get_scrapers
+from dosagelib.scraper import scrapers


 def main():
@@ -24,7 +24,7 @@ def main():

 def get_used_languages():
     languages = {}
-    for scraperobj in get_scrapers():
+    for scraperobj in scrapers.get():
         lang = scraperobj.lang
         if lang not in languages:
             languages[lang] = scraperobj.language()

scripts/scriptutil.py

@@ -12,8 +12,9 @@ import time
 import lxml

+from dosagelib.scraper import scrapers
 from dosagelib.util import get_page
-from dosagelib import scraper, http
+from dosagelib import http


 def first_lower(x):
@@ -129,7 +130,7 @@ class ComicListUpdater(object):
         """Check if comic name already exists."""
         names = [(tmpl % name).lower() for tmpl in self.dup_templates]
         if names:
-            for scraperobj in scraper.get_scrapers():
+            for scraperobj in scrapers.get():
                 lname = scraperobj.name.lower()
                 if lname in names:
                     return scraperobj.name

tests/conftest.py

@@ -1,21 +1,21 @@
 # SPDX-License-Identifier: MIT
 # Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
 # Copyright (C) 2012-2014 Bastian Kleineidam
-# Copyright (C) 2015-2019 Tobias Gruetzmacher
+# Copyright (C) 2015-2020 Tobias Gruetzmacher
 import re
 import os

 import pytest
 from xdist.dsession import LoadScopeScheduling

-from dosagelib import scraper
+from dosagelib.scraper import scrapers


 def get_test_scrapers():
     """Return scrapers that should be tested."""
     if "TESTALL" in os.environ:
         # test all comics (this will take some time)
-        return scraper.get_scrapers()
+        return scrapers.get()
     if 'TESTCOMICS' in os.environ:
         scraper_pattern = re.compile(os.environ['TESTCOMICS'])
     else:
@@ -31,7 +31,7 @@ def get_test_scrapers():
     scraper_pattern = re.compile('^(' + '|'.join(testscrapernames) + ')$')

     return [
-        scraperobj for scraperobj in scraper.get_scrapers()
+        scraperobj for scraperobj in scrapers.get()
         if scraper_pattern.match(scraperobj.name)
     ]

tests/test_comicnames.py

@@ -1,17 +1,17 @@
 # SPDX-License-Identifier: MIT
 # Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
 # Copyright (C) 2012-2014 Bastian Kleineidam
-# Copyright (C) 2015-2019 Tobias Gruetzmacher
+# Copyright (C) 2015-2020 Tobias Gruetzmacher
 import re

-from dosagelib import scraper
+from dosagelib.scraper import scrapers
 from dosagelib.plugins import old


 class TestComicNames(object):

     def test_names(self):
-        for scraperobj in scraper.get_scrapers():
+        for scraperobj in scrapers.get():
             name = scraperobj.name
             assert name.count('/') <= 1
             if '/' in name:
@@ -21,10 +21,10 @@ class TestComicNames(object):
             assert re.sub("[^0-9a-zA-Z_]", "", comicname) == comicname

     def test_renamed(self):
-        for scraperobj in scraper.get_scrapers(include_removed=True):
+        for scraperobj in scrapers.get(include_removed=True):
             if not isinstance(scraperobj, old.Renamed):
                 continue
             assert len(scraperobj.getDisabledReasons()) > 0
             # Renamed scraper should only point to an non-disabled scraper
-            newscraper = scraper.find_scrapers(scraperobj.newname)[0]
+            newscraper = scrapers.find(scraperobj.newname)[0]
             assert len(newscraper.getDisabledReasons()) == 0

tests/test_scraper.py

@@ -1,26 +1,26 @@
 # SPDX-License-Identifier: MIT
 # Copyright (C) 2013-2014 Bastian Kleineidam
-# Copyright (C) 2015-2016 Tobias Gruetzmacher
+# Copyright (C) 2015-2020 Tobias Gruetzmacher
 import pytest

-from dosagelib import scraper
+from dosagelib.scraper import scrapers


 class TestScraper(object):
     """Test scraper module functions."""

     def test_get_scrapers(self):
-        for scraperobj in scraper.get_scrapers():
+        for scraperobj in scrapers.get():
             scraperobj.indexes = ["bla"]
             assert scraperobj.url, "missing url in %s" % scraperobj.name

     def test_find_scrapers_single(self):
-        result = scraper.find_scrapers("xkcd")
+        result = scrapers.find("xkcd")
         assert len(result) == 1

     def test_find_scrapers_multi(self):
-        result = scraper.find_scrapers("a", multiple_allowed=True)
+        result = scrapers.find("a", multiple_allowed=True)
         assert len(result) > 1

     def test_find_scrapers_error(self):
         with pytest.raises(ValueError, match='empty comic name'):
-            scraper.find_scrapers('')
+            scrapers.find('')