Convert scraper cache to a class

This should make it easier to extend with additional entries.
This commit is contained in:
Tobias Gruetzmacher 2020-10-01 18:49:14 +02:00
parent 5ec0710d26
commit 9237bd62b2
8 changed files with 92 additions and 83 deletions

View file

@ -1,13 +1,14 @@
# SPDX-License-Identifier: MIT
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2019 Tobias Gruetzmacher
# Copyright (C) 2015-2020 Tobias Gruetzmacher
import argparse
import os
from . import events, configuration, singleton, director, scraper
from . import events, configuration, singleton, director
from . import AppName, __version__
from .output import out
from .scraper import scrapers as allscrapers
from .util import internal_error, strlimit
@ -243,7 +244,7 @@ def do_list(column_list=True, verbose=False, listall=False):
out.info(u'Comics tagged with [{}] require age confirmation'
' with the --adult option.'.format(TAG_ADULT))
out.info(u'Non-english comics are tagged with [%s].' % TAG_LANG)
scrapers = sorted(scraper.get_scrapers(listall),
scrapers = sorted(allscrapers.get(listall),
key=lambda s: s.name.lower())
if column_list:
num, disabled = do_column_list(scrapers)

View file

@ -10,7 +10,8 @@ from queue import Queue, Empty
from urllib.parse import urlparse
from .output import out
from . import events, scraper
from .scraper import scrapers as allscrapers
from . import events
class ComicQueue(Queue):
@ -209,7 +210,7 @@ def getScrapers(comics, basepath=None, adult=True, multiple_allowed=False, listi
else:
name = comic
indexes = None
found_scrapers = scraper.find_scrapers(name, multiple_allowed=multiple_allowed)
found_scrapers = allscrapers.find(name, multiple_allowed=multiple_allowed)
for scraperobj in found_scrapers:
if shouldRunScraper(scraperobj, adult, listing):
# FIXME: Find a better way to work with indexes
@ -220,7 +221,7 @@ def getScrapers(comics, basepath=None, adult=True, multiple_allowed=False, listi
def get_existing_comics(basepath=None, adult=True, listing=False):
for scraperobj in scraper.get_scrapers(include_removed=True):
for scraperobj in allscrapers.get(include_removed=True):
dirname = scraperobj.get_download_dir(basepath)
if os.path.isdir(dirname):
if shouldRunScraper(scraperobj, adult, listing):

View file

@ -536,68 +536,74 @@ class _ParserScraper(Scraper):
return res
def find_scrapers(comic, multiple_allowed=False):
"""Get a list of comic scraper objects.
Can return more than one entry if multiple_allowed is True, else it raises
a ValueError if multiple modules match. The match is a case insensitive
substring search.
class Cache:
"""Cache for comic scraper objects. The cache is initialized on first use.
This is cached, since iterating & loading a complete package might be quite
slow.
"""
if not comic:
raise ValueError("empty comic name")
candidates = []
cname = comic.lower()
for scrapers in get_scrapers(include_removed=True):
lname = scrapers.name.lower()
if lname == cname:
# perfect match
if not multiple_allowed:
return [scrapers]
else:
def __init__(self):
self.data = None
def find(self, comic, multiple_allowed=False):
"""Get a list of comic scraper objects.
Can return more than one entry if multiple_allowed is True, else it raises
a ValueError if multiple modules match. The match is a case insensitive
substring search.
"""
if not comic:
raise ValueError("empty comic name")
candidates = []
cname = comic.lower()
for scrapers in self.get(include_removed=True):
lname = scrapers.name.lower()
if lname == cname:
# perfect match
if not multiple_allowed:
return [scrapers]
else:
candidates.append(scrapers)
elif cname in lname and scrapers.url:
candidates.append(scrapers)
elif cname in lname and scrapers.url:
candidates.append(scrapers)
if len(candidates) > 1 and not multiple_allowed:
comics = ", ".join(x.name for x in candidates)
raise ValueError('multiple comics found: %s' % comics)
elif not candidates:
raise ValueError('comic %r not found' % comic)
return candidates
if len(candidates) > 1 and not multiple_allowed:
comics = ", ".join(x.name for x in candidates)
raise ValueError('multiple comics found: %s' % comics)
elif not candidates:
raise ValueError('comic %r not found' % comic)
return candidates
_scrapers = None
def get_scrapers(include_removed=False):
"""Find all comic scraper classes in the plugins directory.
The result is cached.
@return: list of Scraper classes
@rtype: list of Scraper
"""
global _scrapers
if _scrapers is None:
out.debug(u"Loading comic modules...")
def load(self):
out.debug("Loading comic modules...")
modules = loader.get_modules('plugins')
plugins = list(loader.get_plugins(modules, Scraper))
_scrapers = sorted([m for x in plugins for m in x.getmodules()],
key=lambda p: p.name)
check_scrapers()
out.debug(u"... %d modules loaded from %d classes." % (
len(_scrapers), len(plugins)))
if include_removed:
return _scrapers
else:
return [x for x in _scrapers if x.url]
self.data = list([m for x in plugins for m in x.getmodules()])
self.validate()
out.debug("... %d modules loaded from %d classes." % (
len(self.data), len(plugins)))
def get(self, include_removed=False):
"""Find all comic scraper classes in the plugins directory.
@return: list of Scraper classes
@rtype: list of Scraper
"""
if not self.data:
self.load()
if include_removed:
return self.data
else:
return [x for x in self.data if x.url]
def validate(self):
"""Check for duplicate scraper names."""
d = {}
for scraper in self.data:
name = scraper.name.lower()
if name in d:
name1 = scraper.name
name2 = d[name].name
raise ValueError('duplicate scrapers %s and %s found' %
(name1, name2))
d[name] = scraper
def check_scrapers():
"""Check for duplicate scraper names."""
d = {}
for scraper in _scrapers:
name = scraper.name.lower()
if name in d:
name1 = scraper.name
name2 = d[name].name
raise ValueError('duplicate scrapers %s and %s found' %
(name1, name2))
d[name] = scraper
scrapers = Cache()

View file

@ -7,7 +7,7 @@
import os
import codecs
from dosagelib.scraper import get_scrapers
from dosagelib.scraper import scrapers
def main():
@ -24,7 +24,7 @@ def main():
def get_used_languages():
languages = {}
for scraperobj in get_scrapers():
for scraperobj in scrapers.get():
lang = scraperobj.lang
if lang not in languages:
languages[lang] = scraperobj.language()

View file

@ -12,8 +12,9 @@ import time
import lxml
from dosagelib.scraper import scrapers
from dosagelib.util import get_page
from dosagelib import scraper, http
from dosagelib import http
def first_lower(x):
@ -129,7 +130,7 @@ class ComicListUpdater(object):
"""Check if comic name already exists."""
names = [(tmpl % name).lower() for tmpl in self.dup_templates]
if names:
for scraperobj in scraper.get_scrapers():
for scraperobj in scrapers.get():
lname = scraperobj.name.lower()
if lname in names:
return scraperobj.name

View file

@ -1,21 +1,21 @@
# SPDX-License-Identifier: MIT
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2019 Tobias Gruetzmacher
# Copyright (C) 2015-2020 Tobias Gruetzmacher
import re
import os
import pytest
from xdist.dsession import LoadScopeScheduling
from dosagelib import scraper
from dosagelib.scraper import scrapers
def get_test_scrapers():
"""Return scrapers that should be tested."""
if "TESTALL" in os.environ:
# test all comics (this will take some time)
return scraper.get_scrapers()
return scrapers.get()
if 'TESTCOMICS' in os.environ:
scraper_pattern = re.compile(os.environ['TESTCOMICS'])
else:
@ -31,7 +31,7 @@ def get_test_scrapers():
scraper_pattern = re.compile('^(' + '|'.join(testscrapernames) + ')$')
return [
scraperobj for scraperobj in scraper.get_scrapers()
scraperobj for scraperobj in scrapers.get()
if scraper_pattern.match(scraperobj.name)
]

View file

@ -1,17 +1,17 @@
# SPDX-License-Identifier: MIT
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2019 Tobias Gruetzmacher
# Copyright (C) 2015-2020 Tobias Gruetzmacher
import re
from dosagelib import scraper
from dosagelib.scraper import scrapers
from dosagelib.plugins import old
class TestComicNames(object):
def test_names(self):
for scraperobj in scraper.get_scrapers():
for scraperobj in scrapers.get():
name = scraperobj.name
assert name.count('/') <= 1
if '/' in name:
@ -21,10 +21,10 @@ class TestComicNames(object):
assert re.sub("[^0-9a-zA-Z_]", "", comicname) == comicname
def test_renamed(self):
for scraperobj in scraper.get_scrapers(include_removed=True):
for scraperobj in scrapers.get(include_removed=True):
if not isinstance(scraperobj, old.Renamed):
continue
assert len(scraperobj.getDisabledReasons()) > 0
# Renamed scraper should only point to an non-disabled scraper
newscraper = scraper.find_scrapers(scraperobj.newname)[0]
newscraper = scrapers.find(scraperobj.newname)[0]
assert len(newscraper.getDisabledReasons()) == 0

View file

@ -1,26 +1,26 @@
# SPDX-License-Identifier: MIT
# Copyright (C) 2013-2014 Bastian Kleineidam
# Copyright (C) 2015-2016 Tobias Gruetzmacher
# Copyright (C) 2015-2020 Tobias Gruetzmacher
import pytest
from dosagelib import scraper
from dosagelib.scraper import scrapers
class TestScraper(object):
"""Test scraper module functions."""
def test_get_scrapers(self):
for scraperobj in scraper.get_scrapers():
for scraperobj in scrapers.get():
scraperobj.indexes = ["bla"]
assert scraperobj.url, "missing url in %s" % scraperobj.name
def test_find_scrapers_single(self):
result = scraper.find_scrapers("xkcd")
result = scrapers.find("xkcd")
assert len(result) == 1
def test_find_scrapers_multi(self):
result = scraper.find_scrapers("a", multiple_allowed=True)
result = scrapers.find("a", multiple_allowed=True)
assert len(result) > 1
def test_find_scrapers_error(self):
with pytest.raises(ValueError, match='empty comic name'):
scraper.find_scrapers('')
scrapers.find('')