Convert scraper cache to a class
This should make it easier to extend with additional entries.
This commit is contained in:
parent
5ec0710d26
commit
9237bd62b2
8 changed files with 92 additions and 83 deletions
|
@ -1,13 +1,14 @@
|
|||
# SPDX-License-Identifier: MIT
|
||||
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
|
||||
# Copyright (C) 2012-2014 Bastian Kleineidam
|
||||
# Copyright (C) 2015-2019 Tobias Gruetzmacher
|
||||
# Copyright (C) 2015-2020 Tobias Gruetzmacher
|
||||
import argparse
|
||||
import os
|
||||
|
||||
from . import events, configuration, singleton, director, scraper
|
||||
from . import events, configuration, singleton, director
|
||||
from . import AppName, __version__
|
||||
from .output import out
|
||||
from .scraper import scrapers as allscrapers
|
||||
from .util import internal_error, strlimit
|
||||
|
||||
|
||||
|
@ -243,7 +244,7 @@ def do_list(column_list=True, verbose=False, listall=False):
|
|||
out.info(u'Comics tagged with [{}] require age confirmation'
|
||||
' with the --adult option.'.format(TAG_ADULT))
|
||||
out.info(u'Non-english comics are tagged with [%s].' % TAG_LANG)
|
||||
scrapers = sorted(scraper.get_scrapers(listall),
|
||||
scrapers = sorted(allscrapers.get(listall),
|
||||
key=lambda s: s.name.lower())
|
||||
if column_list:
|
||||
num, disabled = do_column_list(scrapers)
|
||||
|
|
|
@ -10,7 +10,8 @@ from queue import Queue, Empty
|
|||
from urllib.parse import urlparse
|
||||
|
||||
from .output import out
|
||||
from . import events, scraper
|
||||
from .scraper import scrapers as allscrapers
|
||||
from . import events
|
||||
|
||||
|
||||
class ComicQueue(Queue):
|
||||
|
@ -209,7 +210,7 @@ def getScrapers(comics, basepath=None, adult=True, multiple_allowed=False, listi
|
|||
else:
|
||||
name = comic
|
||||
indexes = None
|
||||
found_scrapers = scraper.find_scrapers(name, multiple_allowed=multiple_allowed)
|
||||
found_scrapers = allscrapers.find(name, multiple_allowed=multiple_allowed)
|
||||
for scraperobj in found_scrapers:
|
||||
if shouldRunScraper(scraperobj, adult, listing):
|
||||
# FIXME: Find a better way to work with indexes
|
||||
|
@ -220,7 +221,7 @@ def getScrapers(comics, basepath=None, adult=True, multiple_allowed=False, listi
|
|||
|
||||
|
||||
def get_existing_comics(basepath=None, adult=True, listing=False):
|
||||
for scraperobj in scraper.get_scrapers(include_removed=True):
|
||||
for scraperobj in allscrapers.get(include_removed=True):
|
||||
dirname = scraperobj.get_download_dir(basepath)
|
||||
if os.path.isdir(dirname):
|
||||
if shouldRunScraper(scraperobj, adult, listing):
|
||||
|
|
|
@ -536,68 +536,74 @@ class _ParserScraper(Scraper):
|
|||
return res
|
||||
|
||||
|
||||
def find_scrapers(comic, multiple_allowed=False):
|
||||
"""Get a list comic scraper objects.
|
||||
|
||||
Can return more than one entry if multiple_allowed is True, else it raises
|
||||
a ValueError if multiple modules match. The match is a case insensitive
|
||||
substring search.
|
||||
class Cache:
|
||||
"""Cache for comic scraper objects. The cache is initialized on first use.
|
||||
This is cached, since iterating & loading a complete package might be quite
|
||||
slow.
|
||||
"""
|
||||
if not comic:
|
||||
raise ValueError("empty comic name")
|
||||
candidates = []
|
||||
cname = comic.lower()
|
||||
for scrapers in get_scrapers(include_removed=True):
|
||||
lname = scrapers.name.lower()
|
||||
if lname == cname:
|
||||
# perfect match
|
||||
if not multiple_allowed:
|
||||
return [scrapers]
|
||||
else:
|
||||
def __init__(self):
|
||||
self.data = None
|
||||
|
||||
def find(self, comic, multiple_allowed=False):
|
||||
"""Get a list comic scraper objects.
|
||||
|
||||
Can return more than one entry if multiple_allowed is True, else it raises
|
||||
a ValueError if multiple modules match. The match is a case insensitive
|
||||
substring search.
|
||||
"""
|
||||
if not comic:
|
||||
raise ValueError("empty comic name")
|
||||
candidates = []
|
||||
cname = comic.lower()
|
||||
for scrapers in self.get(include_removed=True):
|
||||
lname = scrapers.name.lower()
|
||||
if lname == cname:
|
||||
# perfect match
|
||||
if not multiple_allowed:
|
||||
return [scrapers]
|
||||
else:
|
||||
candidates.append(scrapers)
|
||||
elif cname in lname and scrapers.url:
|
||||
candidates.append(scrapers)
|
||||
elif cname in lname and scrapers.url:
|
||||
candidates.append(scrapers)
|
||||
if len(candidates) > 1 and not multiple_allowed:
|
||||
comics = ", ".join(x.name for x in candidates)
|
||||
raise ValueError('multiple comics found: %s' % comics)
|
||||
elif not candidates:
|
||||
raise ValueError('comic %r not found' % comic)
|
||||
return candidates
|
||||
if len(candidates) > 1 and not multiple_allowed:
|
||||
comics = ", ".join(x.name for x in candidates)
|
||||
raise ValueError('multiple comics found: %s' % comics)
|
||||
elif not candidates:
|
||||
raise ValueError('comic %r not found' % comic)
|
||||
return candidates
|
||||
|
||||
|
||||
_scrapers = None
|
||||
|
||||
|
||||
def get_scrapers(include_removed=False):
|
||||
"""Find all comic scraper classes in the plugins directory.
|
||||
The result is cached.
|
||||
@return: list of Scraper classes
|
||||
@rtype: list of Scraper
|
||||
"""
|
||||
global _scrapers
|
||||
if _scrapers is None:
|
||||
out.debug(u"Loading comic modules...")
|
||||
def load(self):
|
||||
out.debug("Loading comic modules...")
|
||||
modules = loader.get_modules('plugins')
|
||||
plugins = list(loader.get_plugins(modules, Scraper))
|
||||
_scrapers = sorted([m for x in plugins for m in x.getmodules()],
|
||||
key=lambda p: p.name)
|
||||
check_scrapers()
|
||||
out.debug(u"... %d modules loaded from %d classes." % (
|
||||
len(_scrapers), len(plugins)))
|
||||
if include_removed:
|
||||
return _scrapers
|
||||
else:
|
||||
return [x for x in _scrapers if x.url]
|
||||
self.data = list([m for x in plugins for m in x.getmodules()])
|
||||
self.validate()
|
||||
out.debug("... %d modules loaded from %d classes." % (
|
||||
len(self.data), len(plugins)))
|
||||
|
||||
def get(self, include_removed=False):
    """Return the cached comic scrapers, loading them on first use.

    Loading is delegated to ``self.load()`` and happens only while
    ``self.data`` is still ``None``. An empty list is a valid "already
    loaded" state and does not trigger another load.

    @param include_removed: if true, return every cached scraper;
        otherwise return only those whose ``url`` is truthy
    @return: list of scrapers
    @rtype: list of Scraper
    """
    # Test against the None sentinel rather than truthiness: an empty list
    # means "loaded, nothing found" and must not cause a reload on every
    # call.
    if self.data is None:
        self.load()
    if include_removed:
        # NOTE: this hands out the cached list itself, not a copy.
        return self.data
    return [x for x in self.data if x.url]
|
||||
|
||||
def validate(self):
    """Ensure no two cached scrapers share a name (case-insensitive).

    Walks ``self.data`` in order and raises ``ValueError`` as soon as a
    scraper's lower-cased name has been seen before. The message names the
    current scraper first and the earlier one second.
    """
    seen = {}
    for candidate in self.data:
        key = candidate.name.lower()
        if key not in seen:
            seen[key] = candidate
            continue
        earlier = seen[key]
        raise ValueError('duplicate scrapers %s and %s found' %
                         (candidate.name, earlier.name))
|
||||
|
||||
|
||||
def check_scrapers():
|
||||
"""Check for duplicate scraper names."""
|
||||
d = {}
|
||||
for scraper in _scrapers:
|
||||
name = scraper.name.lower()
|
||||
if name in d:
|
||||
name1 = scraper.name
|
||||
name2 = d[name].name
|
||||
raise ValueError('duplicate scrapers %s and %s found' %
|
||||
(name1, name2))
|
||||
d[name] = scraper
|
||||
scrapers = Cache()
|
||||
|
|
|
@ -7,7 +7,7 @@
|
|||
import os
|
||||
import codecs
|
||||
|
||||
from dosagelib.scraper import get_scrapers
|
||||
from dosagelib.scraper import scrapers
|
||||
|
||||
|
||||
def main():
|
||||
|
@ -24,7 +24,7 @@ def main():
|
|||
|
||||
def get_used_languages():
|
||||
languages = {}
|
||||
for scraperobj in get_scrapers():
|
||||
for scraperobj in scrapers.get():
|
||||
lang = scraperobj.lang
|
||||
if lang not in languages:
|
||||
languages[lang] = scraperobj.language()
|
||||
|
|
|
@ -12,8 +12,9 @@ import time
|
|||
|
||||
import lxml
|
||||
|
||||
from dosagelib.scraper import scrapers
|
||||
from dosagelib.util import get_page
|
||||
from dosagelib import scraper, http
|
||||
from dosagelib import http
|
||||
|
||||
|
||||
def first_lower(x):
|
||||
|
@ -129,7 +130,7 @@ class ComicListUpdater(object):
|
|||
"""Check if comic name already exists."""
|
||||
names = [(tmpl % name).lower() for tmpl in self.dup_templates]
|
||||
if names:
|
||||
for scraperobj in scraper.get_scrapers():
|
||||
for scraperobj in scrapers.get():
|
||||
lname = scraperobj.name.lower()
|
||||
if lname in names:
|
||||
return scraperobj.name
|
||||
|
|
|
@ -1,21 +1,21 @@
|
|||
# SPDX-License-Identifier: MIT
|
||||
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
|
||||
# Copyright (C) 2012-2014 Bastian Kleineidam
|
||||
# Copyright (C) 2015-2019 Tobias Gruetzmacher
|
||||
# Copyright (C) 2015-2020 Tobias Gruetzmacher
|
||||
import re
|
||||
import os
|
||||
|
||||
import pytest
|
||||
from xdist.dsession import LoadScopeScheduling
|
||||
|
||||
from dosagelib import scraper
|
||||
from dosagelib.scraper import scrapers
|
||||
|
||||
|
||||
def get_test_scrapers():
|
||||
"""Return scrapers that should be tested."""
|
||||
if "TESTALL" in os.environ:
|
||||
# test all comics (this will take some time)
|
||||
return scraper.get_scrapers()
|
||||
return scrapers.get()
|
||||
if 'TESTCOMICS' in os.environ:
|
||||
scraper_pattern = re.compile(os.environ['TESTCOMICS'])
|
||||
else:
|
||||
|
@ -31,7 +31,7 @@ def get_test_scrapers():
|
|||
scraper_pattern = re.compile('^(' + '|'.join(testscrapernames) + ')$')
|
||||
|
||||
return [
|
||||
scraperobj for scraperobj in scraper.get_scrapers()
|
||||
scraperobj for scraperobj in scrapers.get()
|
||||
if scraper_pattern.match(scraperobj.name)
|
||||
]
|
||||
|
||||
|
|
|
@ -1,17 +1,17 @@
|
|||
# SPDX-License-Identifier: MIT
|
||||
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
|
||||
# Copyright (C) 2012-2014 Bastian Kleineidam
|
||||
# Copyright (C) 2015-2019 Tobias Gruetzmacher
|
||||
# Copyright (C) 2015-2020 Tobias Gruetzmacher
|
||||
import re
|
||||
|
||||
from dosagelib import scraper
|
||||
from dosagelib.scraper import scrapers
|
||||
from dosagelib.plugins import old
|
||||
|
||||
|
||||
class TestComicNames(object):
|
||||
|
||||
def test_names(self):
|
||||
for scraperobj in scraper.get_scrapers():
|
||||
for scraperobj in scrapers.get():
|
||||
name = scraperobj.name
|
||||
assert name.count('/') <= 1
|
||||
if '/' in name:
|
||||
|
@ -21,10 +21,10 @@ class TestComicNames(object):
|
|||
assert re.sub("[^0-9a-zA-Z_]", "", comicname) == comicname
|
||||
|
||||
def test_renamed(self):
|
||||
for scraperobj in scraper.get_scrapers(include_removed=True):
|
||||
for scraperobj in scrapers.get(include_removed=True):
|
||||
if not isinstance(scraperobj, old.Renamed):
|
||||
continue
|
||||
assert len(scraperobj.getDisabledReasons()) > 0
|
||||
# Renamed scraper should only point to a non-disabled scraper
|
||||
newscraper = scraper.find_scrapers(scraperobj.newname)[0]
|
||||
newscraper = scrapers.find(scraperobj.newname)[0]
|
||||
assert len(newscraper.getDisabledReasons()) == 0
|
||||
|
|
|
@ -1,26 +1,26 @@
|
|||
# SPDX-License-Identifier: MIT
|
||||
# Copyright (C) 2013-2014 Bastian Kleineidam
|
||||
# Copyright (C) 2015-2016 Tobias Gruetzmacher
|
||||
# Copyright (C) 2015-2020 Tobias Gruetzmacher
|
||||
import pytest
|
||||
from dosagelib import scraper
|
||||
from dosagelib.scraper import scrapers
|
||||
|
||||
|
||||
class TestScraper(object):
|
||||
"""Test scraper module functions."""
|
||||
|
||||
def test_get_scrapers(self):
|
||||
for scraperobj in scraper.get_scrapers():
|
||||
for scraperobj in scrapers.get():
|
||||
scraperobj.indexes = ["bla"]
|
||||
assert scraperobj.url, "missing url in %s" % scraperobj.name
|
||||
|
||||
def test_find_scrapers_single(self):
|
||||
result = scraper.find_scrapers("xkcd")
|
||||
result = scrapers.find("xkcd")
|
||||
assert len(result) == 1
|
||||
|
||||
def test_find_scrapers_multi(self):
|
||||
result = scraper.find_scrapers("a", multiple_allowed=True)
|
||||
result = scrapers.find("a", multiple_allowed=True)
|
||||
assert len(result) > 1
|
||||
|
||||
def test_find_scrapers_error(self):
|
||||
with pytest.raises(ValueError, match='empty comic name'):
|
||||
scraper.find_scrapers('')
|
||||
scrapers.find('')
|
||||
|
|
Loading…
Reference in a new issue