Convert scraper cache to a class

This should make it easier to extend with additional entries.

parent 5ec0710d26 · commit 9237bd62b2
8 changed files with 92 additions and 83 deletions
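In practice, the change replaces two module-level functions and the _scrapers
global with methods on a lazily initialized singleton. A minimal before/after
sketch of a typical call site, using only names that appear in the diff below:

# Before: module-level functions backed by a global list
from dosagelib import scraper
comics = scraper.get_scrapers()
matches = scraper.find_scrapers("xkcd")

# After: the same operations as methods on a shared Cache instance
from dosagelib.scraper import scrapers
comics = scrapers.get()
matches = scrapers.find("xkcd")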
@@ -1,13 +1,14 @@
 # SPDX-License-Identifier: MIT
 # Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
 # Copyright (C) 2012-2014 Bastian Kleineidam
-# Copyright (C) 2015-2019 Tobias Gruetzmacher
+# Copyright (C) 2015-2020 Tobias Gruetzmacher
 import argparse
 import os
 
-from . import events, configuration, singleton, director, scraper
+from . import events, configuration, singleton, director
 from . import AppName, __version__
 from .output import out
+from .scraper import scrapers as allscrapers
 from .util import internal_error, strlimit
 
 
@@ -243,7 +244,7 @@ def do_list(column_list=True, verbose=False, listall=False):
     out.info(u'Comics tagged with [{}] require age confirmation'
              ' with the --adult option.'.format(TAG_ADULT))
     out.info(u'Non-english comics are tagged with [%s].' % TAG_LANG)
-    scrapers = sorted(scraper.get_scrapers(listall),
+    scrapers = sorted(allscrapers.get(listall),
                       key=lambda s: s.name.lower())
     if column_list:
         num, disabled = do_column_list(scrapers)
@@ -10,7 +10,8 @@ from queue import Queue, Empty
 from urllib.parse import urlparse
 
 from .output import out
-from . import events, scraper
+from .scraper import scrapers as allscrapers
+from . import events
 
 
 class ComicQueue(Queue):
@@ -209,7 +210,7 @@ def getScrapers(comics, basepath=None, adult=True, multiple_allowed=False, listing=False):
         else:
             name = comic
             indexes = None
-        found_scrapers = scraper.find_scrapers(name, multiple_allowed=multiple_allowed)
+        found_scrapers = allscrapers.find(name, multiple_allowed=multiple_allowed)
         for scraperobj in found_scrapers:
             if shouldRunScraper(scraperobj, adult, listing):
                 # FIXME: Find a better way to work with indexes
@@ -220,7 +221,7 @@ def getScrapers(comics, basepath=None, adult=True, multiple_allowed=False, listing=False):
 
 
 def get_existing_comics(basepath=None, adult=True, listing=False):
-    for scraperobj in scraper.get_scrapers(include_removed=True):
+    for scraperobj in allscrapers.get(include_removed=True):
         dirname = scraperobj.get_download_dir(basepath)
         if os.path.isdir(dirname):
             if shouldRunScraper(scraperobj, adult, listing):
@@ -536,68 +536,74 @@ class _ParserScraper(Scraper):
         return res
 
 
-def find_scrapers(comic, multiple_allowed=False):
-    """Get a list comic scraper objects.
-
-    Can return more than one entry if multiple_allowed is True, else it raises
-    a ValueError if multiple modules match. The match is a case insensitive
-    substring search.
-    """
-    if not comic:
-        raise ValueError("empty comic name")
-    candidates = []
-    cname = comic.lower()
-    for scrapers in get_scrapers(include_removed=True):
-        lname = scrapers.name.lower()
-        if lname == cname:
-            # perfect match
-            if not multiple_allowed:
-                return [scrapers]
-            else:
-                candidates.append(scrapers)
-        elif cname in lname and scrapers.url:
-            candidates.append(scrapers)
-    if len(candidates) > 1 and not multiple_allowed:
-        comics = ", ".join(x.name for x in candidates)
-        raise ValueError('multiple comics found: %s' % comics)
-    elif not candidates:
-        raise ValueError('comic %r not found' % comic)
-    return candidates
-
-
-_scrapers = None
-
-
-def get_scrapers(include_removed=False):
-    """Find all comic scraper classes in the plugins directory.
-    The result is cached.
-    @return: list of Scraper classes
-    @rtype: list of Scraper
-    """
-    global _scrapers
-    if _scrapers is None:
-        out.debug(u"Loading comic modules...")
-        modules = loader.get_modules('plugins')
-        plugins = list(loader.get_plugins(modules, Scraper))
-        _scrapers = sorted([m for x in plugins for m in x.getmodules()],
-                           key=lambda p: p.name)
-        check_scrapers()
-        out.debug(u"... %d modules loaded from %d classes." % (
-            len(_scrapers), len(plugins)))
-    if include_removed:
-        return _scrapers
-    else:
-        return [x for x in _scrapers if x.url]
-
-
-def check_scrapers():
-    """Check for duplicate scraper names."""
-    d = {}
-    for scraper in _scrapers:
-        name = scraper.name.lower()
-        if name in d:
-            name1 = scraper.name
-            name2 = d[name].name
-            raise ValueError('duplicate scrapers %s and %s found' %
-                             (name1, name2))
-        d[name] = scraper
+class Cache:
+    """Cache for comic scraper objects. The cache is initialized on first use.
+    This is cached, since iterating & loading a complete package might be quite
+    slow.
+    """
+    def __init__(self):
+        self.data = None
+
+    def find(self, comic, multiple_allowed=False):
+        """Get a list comic scraper objects.
+
+        Can return more than one entry if multiple_allowed is True, else it raises
+        a ValueError if multiple modules match. The match is a case insensitive
+        substring search.
+        """
+        if not comic:
+            raise ValueError("empty comic name")
+        candidates = []
+        cname = comic.lower()
+        for scrapers in self.get(include_removed=True):
+            lname = scrapers.name.lower()
+            if lname == cname:
+                # perfect match
+                if not multiple_allowed:
+                    return [scrapers]
+                else:
+                    candidates.append(scrapers)
+            elif cname in lname and scrapers.url:
+                candidates.append(scrapers)
+        if len(candidates) > 1 and not multiple_allowed:
+            comics = ", ".join(x.name for x in candidates)
+            raise ValueError('multiple comics found: %s' % comics)
+        elif not candidates:
+            raise ValueError('comic %r not found' % comic)
+        return candidates
+
+    def load(self):
+        out.debug("Loading comic modules...")
+        modules = loader.get_modules('plugins')
+        plugins = list(loader.get_plugins(modules, Scraper))
+        self.data = list([m for x in plugins for m in x.getmodules()])
+        self.validate()
+        out.debug("... %d modules loaded from %d classes." % (
+            len(self.data), len(plugins)))
+
+    def get(self, include_removed=False):
+        """Find all comic scraper classes in the plugins directory.
+        @return: list of Scraper classes
+        @rtype: list of Scraper
+        """
+        if not self.data:
+            self.load()
+        if include_removed:
+            return self.data
+        else:
+            return [x for x in self.data if x.url]
+
+    def validate(self):
+        """Check for duplicate scraper names."""
+        d = {}
+        for scraper in self.data:
+            name = scraper.name.lower()
+            if name in d:
+                name1 = scraper.name
+                name2 = d[name].name
+                raise ValueError('duplicate scrapers %s and %s found' %
+                                 (name1, name2))
+            d[name] = scraper
+
+
+scrapers = Cache()
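The hunk above defines the module-level singleton that every other file in
this commit switches to. A usage sketch (behavior follows the methods shown
above; the query strings are only examples):

from dosagelib.scraper import scrapers

# The first call triggers Cache.load(): plugin modules are imported once,
# checked for duplicate names via validate(), and kept in self.data.
active = scrapers.get()                          # scrapers that still have a url
everything = scrapers.get(include_removed=True)  # also removed/renamed ones

# An exact (case-insensitive) name match returns immediately; otherwise a
# substring search over the cached entries collects candidates.
result = scrapers.find('xkcd')                   # list with a single scraper

# An ambiguous query raises ValueError unless multiple_allowed is set.
candidates = scrapers.find('comic', multiple_allowed=True)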
@@ -7,7 +7,7 @@
 import os
 import codecs
 
-from dosagelib.scraper import get_scrapers
+from dosagelib.scraper import scrapers
 
 
 def main():
@@ -24,7 +24,7 @@ def main():
 
 
 def get_used_languages():
     languages = {}
-    for scraperobj in get_scrapers():
+    for scraperobj in scrapers.get():
         lang = scraperobj.lang
         if lang not in languages:
             languages[lang] = scraperobj.language()
@@ -12,8 +12,9 @@ import time
 
 import lxml
 
+from dosagelib.scraper import scrapers
 from dosagelib.util import get_page
-from dosagelib import scraper, http
+from dosagelib import http
 
 
 def first_lower(x):
@@ -129,7 +130,7 @@ class ComicListUpdater(object):
         """Check if comic name already exists."""
         names = [(tmpl % name).lower() for tmpl in self.dup_templates]
         if names:
-            for scraperobj in scraper.get_scrapers():
+            for scraperobj in scrapers.get():
                 lname = scraperobj.name.lower()
                 if lname in names:
                     return scraperobj.name
@@ -1,21 +1,21 @@
 # SPDX-License-Identifier: MIT
 # Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
 # Copyright (C) 2012-2014 Bastian Kleineidam
-# Copyright (C) 2015-2019 Tobias Gruetzmacher
+# Copyright (C) 2015-2020 Tobias Gruetzmacher
 import re
 import os
 
 import pytest
 from xdist.dsession import LoadScopeScheduling
 
-from dosagelib import scraper
+from dosagelib.scraper import scrapers
 
 
 def get_test_scrapers():
     """Return scrapers that should be tested."""
     if "TESTALL" in os.environ:
         # test all comics (this will take some time)
-        return scraper.get_scrapers()
+        return scrapers.get()
     if 'TESTCOMICS' in os.environ:
         scraper_pattern = re.compile(os.environ['TESTCOMICS'])
     else:
@@ -31,7 +31,7 @@ def get_test_scrapers():
     scraper_pattern = re.compile('^(' + '|'.join(testscrapernames) + ')$')
 
     return [
-        scraperobj for scraperobj in scraper.get_scrapers()
+        scraperobj for scraperobj in scrapers.get()
         if scraper_pattern.match(scraperobj.name)
     ]
 
 
@@ -1,17 +1,17 @@
 # SPDX-License-Identifier: MIT
 # Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
 # Copyright (C) 2012-2014 Bastian Kleineidam
-# Copyright (C) 2015-2019 Tobias Gruetzmacher
+# Copyright (C) 2015-2020 Tobias Gruetzmacher
 import re
 
-from dosagelib import scraper
+from dosagelib.scraper import scrapers
 from dosagelib.plugins import old
 
 
 class TestComicNames(object):
 
     def test_names(self):
-        for scraperobj in scraper.get_scrapers():
+        for scraperobj in scrapers.get():
             name = scraperobj.name
             assert name.count('/') <= 1
             if '/' in name:
@@ -21,10 +21,10 @@ class TestComicNames(object):
             assert re.sub("[^0-9a-zA-Z_]", "", comicname) == comicname
 
     def test_renamed(self):
-        for scraperobj in scraper.get_scrapers(include_removed=True):
+        for scraperobj in scrapers.get(include_removed=True):
             if not isinstance(scraperobj, old.Renamed):
                 continue
             assert len(scraperobj.getDisabledReasons()) > 0
             # Renamed scraper should only point to an non-disabled scraper
-            newscraper = scraper.find_scrapers(scraperobj.newname)[0]
+            newscraper = scrapers.find(scraperobj.newname)[0]
             assert len(newscraper.getDisabledReasons()) == 0
@@ -1,26 +1,26 @@
 # SPDX-License-Identifier: MIT
 # Copyright (C) 2013-2014 Bastian Kleineidam
-# Copyright (C) 2015-2016 Tobias Gruetzmacher
+# Copyright (C) 2015-2020 Tobias Gruetzmacher
 import pytest
 
-from dosagelib import scraper
+from dosagelib.scraper import scrapers
 
 
 class TestScraper(object):
     """Test scraper module functions."""
 
     def test_get_scrapers(self):
-        for scraperobj in scraper.get_scrapers():
+        for scraperobj in scrapers.get():
             scraperobj.indexes = ["bla"]
             assert scraperobj.url, "missing url in %s" % scraperobj.name
 
     def test_find_scrapers_single(self):
-        result = scraper.find_scrapers("xkcd")
+        result = scrapers.find("xkcd")
         assert len(result) == 1
 
     def test_find_scrapers_multi(self):
-        result = scraper.find_scrapers("a", multiple_allowed=True)
+        result = scrapers.find("a", multiple_allowed=True)
         assert len(result) > 1
 
     def test_find_scrapers_error(self):
         with pytest.raises(ValueError, match='empty comic name'):
-            scraper.find_scrapers('')
+            scrapers.find('')
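Nothing in this commit adds a second cached entry yet, but the class layout
shows where one would go: initialize it in __init__, populate it in load().
A purely hypothetical sketch of such an extension (the name index below is
not part of dosage):

# Hypothetical follow-up, not in this commit: cache a lowercase-name index
# alongside the module list so exact-name lookups avoid a linear scan.
class IndexedCache(Cache):
    def __init__(self):
        super().__init__()
        self.names = None

    def load(self):
        super().load()
        # Second cached entry, built from the freshly loaded module list.
        self.names = {s.name.lower(): s for s in self.data}

    def find_exact(self, comic):
        if not self.data:
            self.load()
        return self.names.get(comic.lower())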