Fix complex image extraction in Unsounded
This also adds a test to ensure this extraction continues working in the future.
This commit is contained in:
parent
4f932803a3
commit
0d44632d01
5 changed files with 41 additions and 10 deletions
|
@ -4,9 +4,9 @@
|
|||
# SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher
|
||||
# SPDX-FileCopyrightText: © 2019 Daniel Ring
|
||||
import json
|
||||
import re
|
||||
from contextlib import suppress
|
||||
from re import compile
|
||||
from urllib.parse import urljoin
|
||||
from lxml import etree
|
||||
|
||||
from ..scraper import BasicScraper, ParserScraper
|
||||
from ..helpers import indirectStarter
|
||||
|
@ -89,15 +89,30 @@ class Unsounded(ParserScraper):
|
|||
latestSearch = '//div[@id="chapter_box"][1]//a[last()]'
|
||||
multipleImagesPerStrip = True
|
||||
starter = indirectStarter
|
||||
style_bg_regex = re.compile(r'background-image: url\((.*pageart/.*)\)')
|
||||
help = 'Index format: chapter-page'
|
||||
|
||||
def extract_image_urls(self, url, data):
|
||||
imageUrls = super().extract_image_urls(url, data)
|
||||
urls = []
|
||||
with suppress(ValueError):
|
||||
urls.extend(super().extract_image_urls(url, data))
|
||||
# Include background for multi-image pages
|
||||
imageRegex = compile(r'background-image: url\((pageart/.*)\)')
|
||||
for match in imageRegex.finditer(str(etree.tostring(data))):
|
||||
imageUrls.append(normaliseURL(urljoin(data[1], match.group(1))))
|
||||
return imageUrls
|
||||
cssbg = self.extract_css_bg(data)
|
||||
if cssbg:
|
||||
urls.append(cssbg)
|
||||
if not urls:
|
||||
raise ValueError(f'No comic found at {url!r}')
|
||||
return urls
|
||||
|
||||
def extract_css_bg(self, page) -> str | None:
|
||||
comicdivs = page.xpath('//div[@id="comic"]')
|
||||
if comicdivs:
|
||||
style = comicdivs[0].attrib.get('style')
|
||||
if style:
|
||||
hit = self.style_bg_regex.search(style)
|
||||
if hit:
|
||||
return hit.group(1)
|
||||
return None
|
||||
|
||||
def namer(self, image_url, page_url):
|
||||
filename = image_url.rsplit('/', 1)[-1]
|
||||
|
|
BIN
tests/responses/unsounded-17-137.html.gz
Normal file
BIN
tests/responses/unsounded-17-137.html.gz
Normal file
Binary file not shown.
BIN
tests/responses/unsounded-17-92.html.gz
Normal file
BIN
tests/responses/unsounded-17-92.html.gz
Normal file
Binary file not shown.
BIN
tests/responses/unsounded-root.html.gz
Normal file
BIN
tests/responses/unsounded-root.html.gz
Normal file
Binary file not shown.
|
@ -1,5 +1,5 @@
|
|||
# SPDX-License-Identifier: MIT
|
||||
# Copyright (C) 2019-2021 Tobias Gruetzmacher
|
||||
# SPDX-FileCopyrightText: © 2019 Tobias Gruetzmacher
|
||||
import re
|
||||
|
||||
import pytest
|
||||
|
@ -15,7 +15,7 @@ def cmd(*options):
|
|||
|
||||
|
||||
@pytest.mark.usefixtures('_nosleep', '_noappdirs')
|
||||
class TestModules(object):
|
||||
class TestModules:
|
||||
"""Test that specific comic modules work correctly."""
|
||||
|
||||
@responses.activate
|
||||
|
@ -40,9 +40,25 @@ class TestModules(object):
|
|||
cmd('--basepath', str(tmpdir), 'CalvinAndHobbesEnEspanol')
|
||||
cmd('--basepath', str(tmpdir), 'CalvinAndHobbesEnEspanol:2012/07/22')
|
||||
|
||||
@responses.activate
|
||||
def test_unsounded(self, tmpdir, capfd):
|
||||
httpmocks.page('https://www.casualvillain.com/Unsounded/comic+index/',
|
||||
'unsounded-root')
|
||||
httpmocks.page('https://www.casualvillain.com/Unsounded/comic/ch17/ch17_92.html',
|
||||
'unsounded-17-92')
|
||||
httpmocks.page('https://www.casualvillain.com/Unsounded/comic/ch17/ch17_137.html',
|
||||
'unsounded-17-137')
|
||||
httpmocks.jpeg(re.compile(r'.*/pageart/ch\d+_\d+.jpg'))
|
||||
|
||||
cmd('--basepath', str(tmpdir), 'Unsounded')
|
||||
cmd('--basepath', str(tmpdir), 'Unsounded:17-92')
|
||||
|
||||
out = capfd.readouterr().out
|
||||
assert 'ERROR' not in out
|
||||
|
||||
@responses.activate
|
||||
@pytest.mark.skip(reason="SoloLeveling was removed, so we have no way to test this...")
|
||||
def test_sololeveling_geoblock(self, tmpdir):
|
||||
def test_sololeveling_geoblock(self):
|
||||
from dosagelib.plugins.s import SoloLeveling
|
||||
from dosagelib.scraper import GeoblockedException
|
||||
|
||||
|
|
Loading…
Reference in a new issue