Fix complex image extraction in Unsounded

This also adds a test to ensure this extraction continues working in the
future.
This commit is contained in:
Tobias Gruetzmacher 2023-06-10 20:03:56 +02:00
parent 4f932803a3
commit 0d44632d01
No known key found for this signature in database
5 changed files with 41 additions and 10 deletions

View file

@ -4,9 +4,9 @@
# SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher # SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher
# SPDX-FileCopyrightText: © 2019 Daniel Ring # SPDX-FileCopyrightText: © 2019 Daniel Ring
import json import json
import re
from contextlib import suppress
from re import compile from re import compile
from urllib.parse import urljoin
from lxml import etree
from ..scraper import BasicScraper, ParserScraper from ..scraper import BasicScraper, ParserScraper
from ..helpers import indirectStarter from ..helpers import indirectStarter
@ -89,15 +89,30 @@ class Unsounded(ParserScraper):
latestSearch = '//div[@id="chapter_box"][1]//a[last()]' latestSearch = '//div[@id="chapter_box"][1]//a[last()]'
multipleImagesPerStrip = True multipleImagesPerStrip = True
starter = indirectStarter starter = indirectStarter
style_bg_regex = re.compile(r'background-image: url\((.*pageart/.*)\)')
help = 'Index format: chapter-page' help = 'Index format: chapter-page'
def extract_image_urls(self, url, data): def extract_image_urls(self, url, data):
imageUrls = super().extract_image_urls(url, data) urls = []
with suppress(ValueError):
urls.extend(super().extract_image_urls(url, data))
# Include background for multi-image pages # Include background for multi-image pages
imageRegex = compile(r'background-image: url\((pageart/.*)\)') cssbg = self.extract_css_bg(data)
for match in imageRegex.finditer(str(etree.tostring(data))): if cssbg:
imageUrls.append(normaliseURL(urljoin(data[1], match.group(1)))) urls.append(cssbg)
return imageUrls if not urls:
raise ValueError(f'No comic found at {url!r}')
return urls
def extract_css_bg(self, page) -> str | None:
comicdivs = page.xpath('//div[@id="comic"]')
if comicdivs:
style = comicdivs[0].attrib.get('style')
if style:
hit = self.style_bg_regex.search(style)
if hit:
return hit.group(1)
return None
def namer(self, image_url, page_url): def namer(self, image_url, page_url):
filename = image_url.rsplit('/', 1)[-1] filename = image_url.rsplit('/', 1)[-1]

Binary file not shown.

Binary file not shown.

Binary file not shown.

View file

@ -1,5 +1,5 @@
# SPDX-License-Identifier: MIT # SPDX-License-Identifier: MIT
# Copyright (C) 2019-2021 Tobias Gruetzmacher # SPDX-FileCopyrightText: © 2019 Tobias Gruetzmacher
import re import re
import pytest import pytest
@ -15,7 +15,7 @@ def cmd(*options):
@pytest.mark.usefixtures('_nosleep', '_noappdirs') @pytest.mark.usefixtures('_nosleep', '_noappdirs')
class TestModules(object): class TestModules:
"""Test that specific comic modules work correctly.""" """Test that specific comic modules work correctly."""
@responses.activate @responses.activate
@ -40,9 +40,25 @@ class TestModules(object):
cmd('--basepath', str(tmpdir), 'CalvinAndHobbesEnEspanol') cmd('--basepath', str(tmpdir), 'CalvinAndHobbesEnEspanol')
cmd('--basepath', str(tmpdir), 'CalvinAndHobbesEnEspanol:2012/07/22') cmd('--basepath', str(tmpdir), 'CalvinAndHobbesEnEspanol:2012/07/22')
@responses.activate
def test_unsounded(self, tmpdir, capfd):
    """Run the Unsounded module against mocked pages and expect no errors.

    Registers three HTML fixtures and a JPEG matcher for ``pageart`` URLs
    via ``httpmocks``, runs the command once for the comic as a whole and
    once for index ``17-92``, then checks that captured stdout contains no
    ``ERROR`` text.
    """
    fixtures = (
        ('https://www.casualvillain.com/Unsounded/comic+index/',
            'unsounded-root'),
        ('https://www.casualvillain.com/Unsounded/comic/ch17/ch17_92.html',
            'unsounded-17-92'),
        ('https://www.casualvillain.com/Unsounded/comic/ch17/ch17_137.html',
            'unsounded-17-137'),
    )
    # All mocks must be registered before the command is invoked.
    for address, fixture_name in fixtures:
        httpmocks.page(address, fixture_name)
    httpmocks.jpeg(re.compile(r'.*/pageart/ch\d+_\d+.jpg'))

    basepath = str(tmpdir)
    for target in ('Unsounded', 'Unsounded:17-92'):
        cmd('--basepath', basepath, target)

    captured = capfd.readouterr()
    assert 'ERROR' not in captured.out
@responses.activate @responses.activate
@pytest.mark.skip(reason="SoloeLeveling was removed, so we have no way to test this...") @pytest.mark.skip(reason="SoloeLeveling was removed, so we have no way to test this...")
def test_sololeveling_geoblock(self, tmpdir): def test_sololeveling_geoblock(self):
from dosagelib.plugins.s import SoloLeveling from dosagelib.plugins.s import SoloLeveling
from dosagelib.scraper import GeoblockedException from dosagelib.scraper import GeoblockedException