Convert scraper cache to a class

This should make it easier to extend the cache with additional entries.
Tobias Gruetzmacher 2020-10-01 18:49:14 +02:00
parent 5ec0710d26
commit 9237bd62b2
8 changed files with 92 additions and 83 deletions
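
The change replaces the module-global cache in dosagelib.scraper with a small class and one shared instance. As a rough, self-contained sketch of the pattern (the name ScraperCache and the dummy load body are illustrative only, not the committed code):

class ScraperCache:
    """Sketch: build the expensive scraper list once, on first use."""

    def __init__(self):
        self.data = None  # nothing loaded yet

    def load(self):
        # Hypothetical stand-in for the real plugin scan, which walks the
        # plugins package and collects every scraper module it finds.
        self.data = ["scraper-1", "scraper-2"]

    def get(self):
        if not self.data:  # only the first call pays the loading cost
            self.load()
        return self.data

scrapers = ScraperCache()  # one lazily-filled instance shared by all callers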


@@ -1,13 +1,14 @@
 # SPDX-License-Identifier: MIT
 # Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
 # Copyright (C) 2012-2014 Bastian Kleineidam
-# Copyright (C) 2015-2019 Tobias Gruetzmacher
+# Copyright (C) 2015-2020 Tobias Gruetzmacher
 import argparse
 import os
 
-from . import events, configuration, singleton, director, scraper
+from . import events, configuration, singleton, director
 from . import AppName, __version__
 from .output import out
+from .scraper import scrapers as allscrapers
 from .util import internal_error, strlimit
@@ -243,7 +244,7 @@ def do_list(column_list=True, verbose=False, listall=False):
         out.info(u'Comics tagged with [{}] require age confirmation'
                  ' with the --adult option.'.format(TAG_ADULT))
         out.info(u'Non-english comics are tagged with [%s].' % TAG_LANG)
-    scrapers = sorted(scraper.get_scrapers(listall),
+    scrapers = sorted(allscrapers.get(listall),
                       key=lambda s: s.name.lower())
     if column_list:
         num, disabled = do_column_list(scrapers)


@@ -10,7 +10,8 @@ from queue import Queue, Empty
 from urllib.parse import urlparse
 
 from .output import out
-from . import events, scraper
+from .scraper import scrapers as allscrapers
+from . import events
 
 
 class ComicQueue(Queue):
@@ -209,7 +210,7 @@ def getScrapers(comics, basepath=None, adult=True, multiple_allowed=False, listing=False):
         else:
             name = comic
             indexes = None
-        found_scrapers = scraper.find_scrapers(name, multiple_allowed=multiple_allowed)
+        found_scrapers = allscrapers.find(name, multiple_allowed=multiple_allowed)
         for scraperobj in found_scrapers:
            if shouldRunScraper(scraperobj, adult, listing):
                # FIXME: Find a better way to work with indexes
@@ -220,7 +221,7 @@ def getScrapers(comics, basepath=None, adult=True, multiple_allowed=False, listing=False):
 def get_existing_comics(basepath=None, adult=True, listing=False):
-    for scraperobj in scraper.get_scrapers(include_removed=True):
+    for scraperobj in allscrapers.get(include_removed=True):
         dirname = scraperobj.get_download_dir(basepath)
         if os.path.isdir(dirname):
             if shouldRunScraper(scraperobj, adult, listing):


@@ -536,7 +536,15 @@ class _ParserScraper(Scraper):
         return res
 
 
-def find_scrapers(comic, multiple_allowed=False):
+class Cache:
+    """Cache for comic scraper objects. The cache is initialized on first use.
+    This is cached, since iterating & loading a complete package might be quite
+    slow.
+    """
+    def __init__(self):
+        self.data = None
+
+    def find(self, comic, multiple_allowed=False):
         """Get a list comic scraper objects.
 
         Can return more than one entry if multiple_allowed is True, else it raises
@@ -547,7 +555,7 @@ def find_scrapers(comic, multiple_allowed=False):
             raise ValueError("empty comic name")
         candidates = []
         cname = comic.lower()
-    for scrapers in get_scrapers(include_removed=True):
+        for scrapers in self.get(include_removed=True):
             lname = scrapers.name.lower()
             if lname == cname:
                 # perfect match
@@ -564,36 +572,31 @@
             raise ValueError('comic %r not found' % comic)
         return candidates
 
+    def load(self):
+        out.debug("Loading comic modules...")
+        modules = loader.get_modules('plugins')
+        plugins = list(loader.get_plugins(modules, Scraper))
+        self.data = list([m for x in plugins for m in x.getmodules()])
+        self.validate()
+        out.debug("... %d modules loaded from %d classes." % (
+            len(self.data), len(plugins)))
 
-_scrapers = None
-
-
-def get_scrapers(include_removed=False):
+    def get(self, include_removed=False):
         """Find all comic scraper classes in the plugins directory.
 
-    The result is cached.
-
         @return: list of Scraper classes
         @rtype: list of Scraper
         """
-    global _scrapers
-    if _scrapers is None:
-        out.debug(u"Loading comic modules...")
-        modules = loader.get_modules('plugins')
-        plugins = list(loader.get_plugins(modules, Scraper))
-        _scrapers = sorted([m for x in plugins for m in x.getmodules()],
-                           key=lambda p: p.name)
-        check_scrapers()
-        out.debug(u"... %d modules loaded from %d classes." % (
-            len(_scrapers), len(plugins)))
+        if not self.data:
+            self.load()
         if include_removed:
-        return _scrapers
+            return self.data
         else:
-        return [x for x in _scrapers if x.url]
+            return [x for x in self.data if x.url]
 
-
-def check_scrapers():
-    """Check for duplicate scraper names."""
-    d = {}
-    for scraper in _scrapers:
+    def validate(self):
+        """Check for duplicate scraper names."""
+        d = {}
+        for scraper in self.data:
             name = scraper.name.lower()
             if name in d:
                 name1 = scraper.name
@@ -601,3 +604,6 @@ def check_scrapers():
                 raise ValueError('duplicate scrapers %s and %s found' %
                                  (name1, name2))
             d[name] = scraper
+
+
+scrapers = Cache()
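
With the class in place, callers import the shared instance instead of the old free functions, as the remaining diffs in this commit show:

from dosagelib.scraper import scrapers

active = scrapers.get()                          # was: scraper.get_scrapers()
everything = scrapers.get(include_removed=True)  # includes removed comics
matches = scrapers.find("xkcd")                  # was: scraper.find_scrapers("xkcd")

Keeping a module-level scrapers = Cache() instance preserves the old single-import call style while moving the cache state out of module globals.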


@@ -7,7 +7,7 @@
 import os
 import codecs
 
-from dosagelib.scraper import get_scrapers
+from dosagelib.scraper import scrapers
 
 
 def main():
@@ -24,7 +24,7 @@ def main():
 def get_used_languages():
     languages = {}
-    for scraperobj in get_scrapers():
+    for scraperobj in scrapers.get():
         lang = scraperobj.lang
         if lang not in languages:
             languages[lang] = scraperobj.language()


@@ -12,8 +12,9 @@ import time
 import lxml
 
+from dosagelib.scraper import scrapers
 from dosagelib.util import get_page
-from dosagelib import scraper, http
+from dosagelib import http
 
 
 def first_lower(x):
@@ -129,7 +130,7 @@ class ComicListUpdater(object):
         """Check if comic name already exists."""
         names = [(tmpl % name).lower() for tmpl in self.dup_templates]
         if names:
-            for scraperobj in scraper.get_scrapers():
+            for scraperobj in scrapers.get():
                 lname = scraperobj.name.lower()
                 if lname in names:
                     return scraperobj.name


@@ -1,21 +1,21 @@
 # SPDX-License-Identifier: MIT
 # Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
 # Copyright (C) 2012-2014 Bastian Kleineidam
-# Copyright (C) 2015-2019 Tobias Gruetzmacher
+# Copyright (C) 2015-2020 Tobias Gruetzmacher
 import re
 import os
 
 import pytest
 from xdist.dsession import LoadScopeScheduling
 
-from dosagelib import scraper
+from dosagelib.scraper import scrapers
 
 
 def get_test_scrapers():
     """Return scrapers that should be tested."""
     if "TESTALL" in os.environ:
         # test all comics (this will take some time)
-        return scraper.get_scrapers()
+        return scrapers.get()
     if 'TESTCOMICS' in os.environ:
         scraper_pattern = re.compile(os.environ['TESTCOMICS'])
     else:
@@ -31,7 +31,7 @@ def get_test_scrapers():
     scraper_pattern = re.compile('^(' + '|'.join(testscrapernames) + ')$')
 
     return [
-        scraperobj for scraperobj in scraper.get_scrapers()
+        scraperobj for scraperobj in scrapers.get()
         if scraper_pattern.match(scraperobj.name)
     ]


@@ -1,17 +1,17 @@
 # SPDX-License-Identifier: MIT
 # Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
 # Copyright (C) 2012-2014 Bastian Kleineidam
-# Copyright (C) 2015-2019 Tobias Gruetzmacher
+# Copyright (C) 2015-2020 Tobias Gruetzmacher
 import re
 
-from dosagelib import scraper
+from dosagelib.scraper import scrapers
 from dosagelib.plugins import old
 
 
 class TestComicNames(object):
 
     def test_names(self):
-        for scraperobj in scraper.get_scrapers():
+        for scraperobj in scrapers.get():
             name = scraperobj.name
             assert name.count('/') <= 1
             if '/' in name:
@@ -21,10 +21,10 @@ class TestComicNames(object):
             assert re.sub("[^0-9a-zA-Z_]", "", comicname) == comicname
 
     def test_renamed(self):
-        for scraperobj in scraper.get_scrapers(include_removed=True):
+        for scraperobj in scrapers.get(include_removed=True):
             if not isinstance(scraperobj, old.Renamed):
                 continue
             assert len(scraperobj.getDisabledReasons()) > 0
             # Renamed scraper should only point to an non-disabled scraper
-            newscraper = scraper.find_scrapers(scraperobj.newname)[0]
+            newscraper = scrapers.find(scraperobj.newname)[0]
             assert len(newscraper.getDisabledReasons()) == 0


@@ -1,26 +1,26 @@
 # SPDX-License-Identifier: MIT
 # Copyright (C) 2013-2014 Bastian Kleineidam
-# Copyright (C) 2015-2016 Tobias Gruetzmacher
+# Copyright (C) 2015-2020 Tobias Gruetzmacher
 import pytest
 
-from dosagelib import scraper
+from dosagelib.scraper import scrapers
 
 
 class TestScraper(object):
     """Test scraper module functions."""
 
     def test_get_scrapers(self):
-        for scraperobj in scraper.get_scrapers():
+        for scraperobj in scrapers.get():
             scraperobj.indexes = ["bla"]
             assert scraperobj.url, "missing url in %s" % scraperobj.name
 
     def test_find_scrapers_single(self):
-        result = scraper.find_scrapers("xkcd")
+        result = scrapers.find("xkcd")
         assert len(result) == 1
 
     def test_find_scrapers_multi(self):
-        result = scraper.find_scrapers("a", multiple_allowed=True)
+        result = scrapers.find("a", multiple_allowed=True)
         assert len(result) > 1
 
     def test_find_scrapers_error(self):
         with pytest.raises(ValueError, match='empty comic name'):
-            scraper.find_scrapers('')
+            scrapers.find('')