Fix complex image extraction in Unsounded
This also adds a test to ensure this extraction continues working in the future.
parent 4f932803a3
commit 0d44632d01

5 changed files with 41 additions and 10 deletions
@@ -4,9 +4,9 @@
 # SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher
 # SPDX-FileCopyrightText: © 2019 Daniel Ring
 import json
+import re
+from contextlib import suppress
 from re import compile
-from urllib.parse import urljoin
-from lxml import etree
 
 from ..scraper import BasicScraper, ParserScraper
 from ..helpers import indirectStarter
@@ -89,15 +89,30 @@ class Unsounded(ParserScraper):
     latestSearch = '//div[@id="chapter_box"][1]//a[last()]'
     multipleImagesPerStrip = True
     starter = indirectStarter
+    style_bg_regex = re.compile(r'background-image: url\((.*pageart/.*)\)')
     help = 'Index format: chapter-page'
 
     def extract_image_urls(self, url, data):
-        imageUrls = super().extract_image_urls(url, data)
+        urls = []
+        with suppress(ValueError):
+            urls.extend(super().extract_image_urls(url, data))
         # Include background for multi-image pages
-        imageRegex = compile(r'background-image: url\((pageart/.*)\)')
-        for match in imageRegex.finditer(str(etree.tostring(data))):
-            imageUrls.append(normaliseURL(urljoin(data[1], match.group(1))))
-        return imageUrls
+        cssbg = self.extract_css_bg(data)
+        if cssbg:
+            urls.append(cssbg)
+        if not urls:
+            raise ValueError(f'No comic found at {url!r}')
+        return urls
+
+    def extract_css_bg(self, page) -> str | None:
+        comicdivs = page.xpath('//div[@id="comic"]')
+        if comicdivs:
+            style = comicdivs[0].attrib.get('style')
+            if style:
+                hit = self.style_bg_regex.search(style)
+                if hit:
+                    return hit.group(1)
+        return None
 
     def namer(self, image_url, page_url):
         filename = image_url.rsplit('/', 1)[-1]
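For context, the new lookup is easy to exercise in isolation. Below is a minimal standalone sketch of the same technique, assuming a page whose div id="comic" carries the art as an inline CSS background; the sample markup and the helper name demo_extract_css_bg are illustrative, not part of the commit:

# A standalone sketch of the extraction above; sample markup and helper
# name are illustrative, not part of the commit.
import re

from lxml import html

style_bg_regex = re.compile(r'background-image: url\((.*pageart/.*)\)')

SAMPLE = '<div id="comic" style="background-image: url(../pageart/ch17_92.jpg)"></div>'

def demo_extract_css_bg(page):
    # Same shape as the new Unsounded.extract_css_bg: inspect the style
    # attribute of the comic container and pull the pageart URL out of it.
    comicdivs = page.xpath('//div[@id="comic"]')
    if comicdivs:
        style = comicdivs[0].attrib.get('style')
        if style:
            hit = style_bg_regex.search(style)
            if hit:
                return hit.group(1)
    return None

print(demo_extract_css_bg(html.fromstring(SAMPLE)))  # ../pageart/ch17_92.jpg

Note that the commit compiles style_bg_regex once at class scope, where the old code rebuilt the pattern on every call. Combined with the suppress(ValueError) around the parent lookup, a page whose normal img extraction fails can still be rescued by this background hit, and extract_image_urls only raises once both sources come up empty.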
BIN  tests/responses/unsounded-17-137.html.gz  (new file)
BIN  tests/responses/unsounded-17-92.html.gz  (new file)
BIN  tests/responses/unsounded-root.html.gz  (new file)
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2019-2021 Tobias Gruetzmacher
+# SPDX-FileCopyrightText: © 2019 Tobias Gruetzmacher
 import re
 
 import pytest
@@ -15,7 +15,7 @@ def cmd(*options):
 
 
 @pytest.mark.usefixtures('_nosleep', '_noappdirs')
-class TestModules(object):
+class TestModules:
     """Test that specific comic modules work correctly."""
 
     @responses.activate
@@ -40,9 +40,25 @@ class TestModules(object):
         cmd('--basepath', str(tmpdir), 'CalvinAndHobbesEnEspanol')
         cmd('--basepath', str(tmpdir), 'CalvinAndHobbesEnEspanol:2012/07/22')
 
+    @responses.activate
+    def test_unsounded(self, tmpdir, capfd):
+        httpmocks.page('https://www.casualvillain.com/Unsounded/comic+index/',
+            'unsounded-root')
+        httpmocks.page('https://www.casualvillain.com/Unsounded/comic/ch17/ch17_92.html',
+            'unsounded-17-92')
+        httpmocks.page('https://www.casualvillain.com/Unsounded/comic/ch17/ch17_137.html',
+            'unsounded-17-137')
+        httpmocks.jpeg(re.compile(r'.*/pageart/ch\d+_\d+.jpg'))
+
+        cmd('--basepath', str(tmpdir), 'Unsounded')
+        cmd('--basepath', str(tmpdir), 'Unsounded:17-92')
+
+        out = capfd.readouterr().out
+        assert 'ERROR' not in out
+
     @responses.activate
     @pytest.mark.skip(reason="SoloeLeveling was removed, so we have no way to test this...")
-    def test_sololeveling_geoblock(self, tmpdir):
+    def test_sololeveling_geoblock(self):
        from dosagelib.plugins.s import SoloLeveling
        from dosagelib.scraper import GeoblockedException
 
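The httpmocks helpers themselves are not part of this diff. A plausible reading, given the @responses.activate decorator and the gzipped fixtures under tests/responses/, is that page() registers a canned HTML body for a URL with the responses library, roughly along these lines (a hypothetical sketch, not the project's actual helper):

# Hypothetical sketch of what a helper like httpmocks.page() has to do:
# serve a gzipped fixture as the body for a mocked GET request. Not the
# project's actual code.
import gzip
import pathlib

import responses

def page(url, fixture):
    # Fixtures live in tests/responses/<name>.html.gz, per the files
    # added in this commit.
    body = gzip.decompress(
        pathlib.Path('tests/responses', fixture + '.html.gz').read_bytes())
    responses.add(responses.GET, url, body=body, content_type='text/html')

With the network mocked out this way, the new test can drive the full 'Unsounded' and 'Unsounded:17-92' downloads offline and assert that no ERROR line reaches the output captured by capfd.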