dosage/dosagelib/plugins/u.py

# SPDX-License-Identifier: MIT
# SPDX-FileCopyrightText: © 2004 Tristan Seligmann and Jonathan Jacobs
# SPDX-FileCopyrightText: © 2012 Bastian Kleineidam
# SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher
# SPDX-FileCopyrightText: © 2019 Daniel Ring
from __future__ import annotations

import json
import re
from contextlib import suppress
from re import compile

from ..scraper import BasicScraper, ParserScraper
from ..helpers import indirectStarter
from ..util import tagre
from .common import ComicControlScraper, WordPressScraper, WordPressNavi


class UberQuest(ParserScraper):
    """Scraper for UberQuest, driven by the site's ``wp-json`` endpoints.

    The page data handed to ``getPrevUrl`` and ``extract_image_urls`` is
    treated as a JSON document: its text content is decoded with
    ``json.loads`` and the first entry of the resulting list is used.
    """
    baseUrl = 'https://uberquest.studiokhimera.com/'
    # Chapter listing endpoint; starter() reads it to pick the start page
    url = baseUrl + 'wp-json/keeros_comics/v1/chapters'
    # Per-page endpoint, parameterised by page number
    stripUrl = baseUrl + 'wp-json/wp/v2/cfx_comic_page?page_number=%s'
    firstStripUrl = stripUrl % 'cover'

    def starter(self):
        """Return the strip URL for the last page of the last listed chapter.

        The chapter listing at ``self.url`` is fetched through
        ``self.session``; ``raise_for_status()`` is called on the response,
        so a failed request propagates as whatever exception that raises.
        """
        # Retrieve comic metadata from API
        response = self.session.get(self.url)
        response.raise_for_status()
        chapters = response.json()
        final_page = chapters[-1]['pages'][-1]
        return self.stripUrl % final_page['page_number']

    def getPrevUrl(self, url, data):
        """Return the strip URL built from the ``prev_id`` field of the page."""
        entries = json.loads(data.text_content())
        return self.stripUrl % entries[0]['prev_id']

    def extract_image_urls(self, url, data):
        """Return a one-element list holding the page's ``attachment`` value."""
        entries = json.loads(data.text_content())
        attachment = entries[0]['attachment']
        return [attachment]

    def namer(self, imageUrl, pageUrl):
        """Name the strip after whatever follows the last ``=`` in the page URL."""
        page_number = pageUrl.rsplit('=', 1)[-1]
        return 'UberQuest-' + page_number


class Underling(WordPressNavi):
    """Scraper for Underling, pointed at a web.archive.org snapshot."""
    # Archived copy (snapshot 20190806120425) of http://underlingcomic.com/
    url = ('https://web.archive.org/web/20190806120425/'
        'http://underlingcomic.com/')
    firstStripUrl = url + 'page-one/'
    # NOTE(review): presumably tells the framework the comic no longer
    # updates - confirm against the scraper base class
    endOfLife = True


class Undertow(BasicScraper):
    """Regex-based scraper for Undertow."""
    url = 'http://undertow.dreamshards.org/'
    # Pattern built by the project's tagre() helper for an "img" tag's "src"
    # attribute; the capture group takes a value ending in ".jpg"
    imageSearch = compile(tagre("img", "src", r'([^"]+\.jpg)'))
    # Captures an href value that is followed later by the text "teynpoint"
    prevSearch = compile(r'href="(.+?)".+?teynpoint')
    # Captures an href value that is followed later by "Most recent page"
    latestSearch = compile(r'href="(.+?)".+?Most recent page')
    # NOTE(review): presumably resolves the newest strip by applying
    # latestSearch to the page at url - confirm in ..helpers
    starter = indirectStarter


class unDivine(ComicControlScraper):
    """Scraper for unDivine with a namer that normalises image filenames."""
    url = 'https://www.undivinecomic.com/'
    stripUrl = url + 'comic/%s'
    firstStripUrl = stripUrl % 'page-1'

    def namer(self, imageUrl, pageUrl):
        """Derive a cleaned-up filename from the image URL.

        Takes the last path segment of ``imageUrl``, turns spaces into
        dashes, swaps a handful of known UUID-style names for readable ones
        and finally strips an 11-character prefix when the name is longer
        than 15 characters, starts with a digit and has a dash at index 10.
        ``pageUrl`` is not used.
        """
        # Fix inconsistent filenames
        renames = (
            ('10B311D9-0992-4D74-AEB8-DAB714DA67C6', 'UD-322'),
            ('99266624-7EF7-4E99-9EC9-DDB5F59CBDFD', 'UD-311'),
            ('33C6A5A1-F703-4A0A-BCD5-DE1A09359D8E', 'UD-310'),
            ('6CE01E81-C299-43C7-A221-8DE0670EFA30', 'ch4endbonusq4'),
            ('DB66D93B-1FE5-49C7-90E0-FFF981DCD6B3', 'bipolar'),
        )
        name = imageUrl.rsplit('/', 1)[-1].replace(' ', '-')
        for old, new in renames:
            name = name.replace(old, new)
        # NOTE(review): the stripped prefix looks like a 10-digit timestamp
        # plus a dash - confirm against real image URLs
        has_prefix = len(name) > 15 and name[0].isdigit() and name[10] == '-'
        return name[11:] if has_prefix else name


class UnicornJelly(BasicScraper):
    """Regex-based scraper for Unicorn Jelly."""
    baseUrl = 'http://unicornjelly.com/'
    # Entry page used as the scraper's url
    url = baseUrl + 'uni666.html'
    stripUrl = baseUrl + 'uni%s.html'
    firstStripUrl = stripUrl % '001'
    # Captures the "images/..." src of the <IMG> that directly follows
    # </TABLE> (optionally wrapped in a black <FONT> tag)
    imageSearch = compile(r'</TABLE>(?:<FONT COLOR="BLACK">)?<IMG SRC="(images/[^"]+)" WIDTH=')
    # Captures the uniNNN[b|c|s].html target of the link whose image is
    # images/back00.gif
    prevSearch = compile(r'<A HREF="(uni\d{3}[bcs]?\.html)">(<FONT COLOR="BLACK">)?<IMG SRC="images/back00\.gif"')
    help = 'Index format: nnn'


class Unsounded(ParserScraper):
    """Scraper for Unsounded.

    Image URLs come from two places: whatever the parent class extracts for
    this page, and a ``background-image`` declared in the ``style``
    attribute of the ``div`` with id ``comic``.
    """
    url = 'https://www.casualvillain.com/Unsounded/'
    startUrl = url + 'comic+index/'
    # Placeholders: chapter, chapter again, page
    stripUrl = url + 'comic/ch%s/ch%s_%s.html'
    firstStripUrl = stripUrl % ('01', '01', '01')
    imageSearch = '//div[@id="comic"]//img'
    prevSearch = '//a[d:class("back")]'
    latestSearch = '//div[@id="chapter_box"][1]//a[last()]'
    multipleImagesPerStrip = True
    starter = indirectStarter
    # Captures the url(...) argument of a background-image declaration when
    # that argument contains "pageart/"
    style_bg_regex = re.compile(r'background-image: url\((.*pageart/.*)\)')
    help = 'Index format: chapter-page'

    def extract_image_urls(self, url, data):
        """Collect image URLs from the parent extractor and the CSS background.

        A ``ValueError`` from the parent implementation is swallowed so the
        CSS background alone can satisfy the lookup.

        Raises:
            ValueError: if neither source produced a URL.
        """
        found = []
        with suppress(ValueError):
            found.extend(super().extract_image_urls(url, data))
        # Include background for multi-image pages
        background = self.extract_css_bg(data)
        if background:
            found.append(background)
        if found:
            return found
        raise ValueError(f'No comic found at {url!r}')

    def extract_css_bg(self, page) -> str | None:
        """Return the background image URL of the comic container, if any.

        Looks at the first ``div`` with id ``comic`` and applies
        ``style_bg_regex`` to its ``style`` attribute. Returns ``None`` when
        the container, the attribute or a regex match is missing.
        """
        containers = page.xpath('//div[@id="comic"]')
        if not containers:
            return None
        style = containers[0].attrib.get('style')
        if not style:
            return None
        match = self.style_bg_regex.search(style)
        return match.group(1) if match else None

    def namer(self, image_url, page_url):
        """Return the image's basename, chapter-prefixed when it differs.

        When the part before the first ``.`` of the image basename differs
        from that of the page basename, the page basename's part before its
        first ``_`` plus an underscore is put in front of the image basename.
        """
        image_name = image_url.rsplit('/', 1)[-1]
        page_name = page_url.rsplit('/', 1)[-1]
        image_stem = image_name.split('.', 1)[0]
        page_stem = page_name.split('.', 1)[0]
        if page_stem == image_stem:
            return image_name
        prefix = page_name.split('_', 1)[0]
        return prefix + '_' + image_name

    def getPrevUrl(self, url, data):
        """Return the previous strip URL, with one hard-coded override."""
        # Fix missing navigation links between chapters
        needs_override = 'ch13/you_let_me_fall' in url
        if needs_override:
            return self.stripUrl % ('13', '13', '85')
        return super().getPrevUrl(url, data)

    def getIndexStripUrl(self, index):
        """Turn a ``chapter-page`` index into a strip URL.

        ``index`` must split on ``-`` into exactly two parts; otherwise the
        tuple unpacking raises ``ValueError``.
        """
        chapter, page = index.split('-')
        return self.stripUrl % (chapter, chapter, page)


class UrgentTransformationCrisis(WordPressScraper):
    """Scraper for Urgent Transformation Crisis with filename fix-ups."""
    url = 'http://www.catomix.com/utc/'
    firstStripUrl = url + 'comic/cover1'

    def namer(self, imageUrl, pageUrl):
        """Derive a filename from the image URL.

        Uses the last path segment of ``imageUrl``, drops everything from
        the last ``?`` onwards and rewrites two known irregular name
        fragments. ``pageUrl`` is not used.
        """
        # Fix inconsistent filenames
        basename = imageUrl.rsplit('/', 1)[-1]
        basename = basename.rsplit('?', 1)[0]
        fixes = (('FVLYHD', 'LYHDpage'), ('UTC084web', '20091218c'))
        for old, new in fixes:
            basename = basename.replace(old, new)
        return basename
Update file headers The default encoding for source files is UTF-8 since Python 3, so we can drop all encoding headers. While we are at it, just replace them with SPDX headers. 2020-04-18 11:45:44 +00:00			`# SPDX-License-Identifier: MIT`
Extend scraper API with a extract_image_urls method This is just a light wrapper around fetchUrls, but frees comic modules from second-guessing for what purpose fetchUrls was called when they are overriding that API - And yes, some comic modules already got this wrong, they are now all fixed. 2023-06-10 13:05:57 +00:00			`# SPDX-FileCopyrightText: © 2004 Tristan Seligmann and Jonathan Jacobs`
			`# SPDX-FileCopyrightText: © 2012 Bastian Kleineidam`
			`# SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher`
			`# SPDX-FileCopyrightText: © 2019 Daniel Ring`
Fix modern type hint on older Python 2023-06-10 20:33:33 +00:00			`from __future__ import annotations`

Fix UberQuest 2023-06-08 06:32:40 +00:00			`import json`
Fix complex image extraction in Unsounded This also adds a test to ensure this extraction continues working in the future. 2023-06-10 18:03:56 +00:00			`import re`
			`from contextlib import suppress`
Drop Python 2 support: Obsolete future statements 2020-02-04 00:06:19 +00:00			`from re import compile`
Initial commit to Github. 2012-06-20 19:58:13 +00:00
Deprecate underscore-prefixed parent classes This is trying to strike a balance between updating as much existing classes as possible, but not making the diff too big... 2022-06-06 10:08:32 +00:00			`from ..scraper import BasicScraper, ParserScraper`
Replace xpath_class with custom xpath function 2020-07-31 20:56:30 +00:00			`from ..helpers import indirectStarter`
Sort comics alphabetically & PEP8 style fixes. 2016-03-31 21:13:54 +00:00			`from ..util import tagre`
Deprecate underscore-prefixed parent classes This is trying to strike a balance between updating as much existing classes as possible, but not making the diff too big... 2022-06-06 10:08:32 +00:00			`from .common import ComicControlScraper, WordPressScraper, WordPressNavi`
Sort comics alphabetically & PEP8 style fixes. 2016-03-31 21:13:54 +00:00
Updated documentation and fix some comics. 2012-11-20 17:53:53 +00:00
Fix UberQuest 2023-06-08 06:32:40 +00:00			`class UberQuest(ParserScraper):`
			`baseUrl = 'https://uberquest.studiokhimera.com/'`
			`url = baseUrl + 'wp-json/keeros_comics/v1/chapters'`
			`stripUrl = baseUrl + 'wp-json/wp/v2/cfx_comic_page?page_number=%s'`
			`firstStripUrl = stripUrl % 'cover'`

			`def starter(self):`
			`# Retrieve comic metadata from API`
			`data = self.session.get(self.url)`
			`data.raise_for_status()`
			`return self.stripUrl % data.json()[-1]['pages'][-1]['page_number']`

			`def getPrevUrl(self, url, data):`
			`return self.stripUrl % json.loads(data.text_content())[0]['prev_id']`

Extend scraper API with a extract_image_urls method This is just a light wrapper around fetchUrls, but frees comic modules from second-guessing for what purpose fetchUrls was called when they are overriding that API - And yes, some comic modules already got this wrong, they are now all fixed. 2023-06-10 13:05:57 +00:00			`def extract_image_urls(self, url, data):`
Fix UberQuest 2023-06-08 06:32:40 +00:00			`return [json.loads(data.text_content())[0]['attachment']]`

			`def namer(self, imageUrl, pageUrl):`
			`return 'UberQuest-' + pageUrl.rsplit('=', 1)[-1]`


Deprecate underscore-prefixed parent classes This is trying to strike a balance between updating as much existing classes as possible, but not making the diff too big... 2022-06-06 10:08:32 +00:00			`class Underling(WordPressNavi):`
Fix some old modules using the Internet Archive 2020-01-09 16:38:13 +00:00			`url = ('https://web.archive.org/web/20190806120425/'`
			`'http://underlingcomic.com/')`
Move more comics to common WordPressScraper. 2016-04-10 21:04:34 +00:00			`firstStripUrl = url + 'page-one/'`
Fix some old modules using the Internet Archive 2020-01-09 16:38:13 +00:00			`endOfLife = True`
Added comic Underling 2014-02-20 11:54:40 +00:00
Updated documentation and fix some comics. 2012-11-20 17:53:53 +00:00
Deprecate underscore-prefixed parent classes This is trying to strike a balance between updating as much existing classes as possible, but not making the diff too big... 2022-06-06 10:08:32 +00:00			`class Undertow(BasicScraper):`
Always have an url attribute in comic scrapers. 2013-02-04 20:00:26 +00:00			`url = 'http://undertow.dreamshards.org/'`
Fix comics. 2012-12-04 06:02:40 +00:00			`imageSearch = compile(tagre("img", "src", r'([^"]+\.jpg)'))`
Initial commit to Github. 2012-06-20 19:58:13 +00:00			`prevSearch = compile(r'href="(.+?)".+?teynpoint')`
Read starter parameters from class. This allows to specify starters in a more declarative and dynamic way. 2016-04-12 21:11:39 +00:00			`latestSearch = compile(r'href="(.+?)".+?Most recent page')`
Refactor: Convert starter to simple method. 2016-04-13 18:01:51 +00:00			`starter = indirectStarter`
Initial commit to Github. 2012-06-20 19:58:13 +00:00

Deprecate underscore-prefixed parent classes This is trying to strike a balance between updating as much existing classes as possible, but not making the diff too big... 2022-06-06 10:08:32 +00:00			`class unDivine(ComicControlScraper):`
Fix unDivine 2021-04-25 01:35:21 +00:00			`url = 'https://www.undivinecomic.com/'`
			`stripUrl = url + 'comic/%s'`
			`firstStripUrl = stripUrl % 'page-1'`

			`def namer(self, imageUrl, pageUrl):`
			`# Fix inconsistent filenames`
			`filename = imageUrl.rsplit('/', 1)[-1].replace(' ', '-')`
			`filename = filename.replace('10B311D9-0992-4D74-AEB8-DAB714DA67C6', 'UD-322')`
			`filename = filename.replace('99266624-7EF7-4E99-9EC9-DDB5F59CBDFD', 'UD-311')`
			`filename = filename.replace('33C6A5A1-F703-4A0A-BCD5-DE1A09359D8E', 'UD-310')`
			`filename = filename.replace('6CE01E81-C299-43C7-A221-8DE0670EFA30', 'ch4endbonusq4')`
			`filename = filename.replace('DB66D93B-1FE5-49C7-90E0-FFF981DCD6B3', 'bipolar')`
			`if len(filename) > 15 and filename[0].isdigit() and filename[10] == '-':`
			`filename = filename[11:]`
			`return filename`
Added unDivine (#147) Added comic unDivine using ComicControlScraper 2020-01-08 23:17:07 +00:00

Deprecate underscore-prefixed parent classes This is trying to strike a balance between updating as much existing classes as possible, but not making the diff too big... 2022-06-06 10:08:32 +00:00			`class UnicornJelly(BasicScraper):`
s/baseurl/baseUrl/g 2013-04-13 18:58:00 +00:00			`baseUrl = 'http://unicornjelly.com/'`
			`url = baseUrl + 'uni666.html'`
			`stripUrl = baseUrl + 'uni%s.html'`
Add firstStripUrls. 2013-04-10 21:57:09 +00:00			`firstStripUrl = stripUrl % '001'`
Initial commit to Github. 2012-06-20 19:58:13 +00:00			`imageSearch = compile(r'</TABLE>(?:<FONT COLOR="BLACK">)?<IMG SRC="(images/[^"]+)" WIDTH=')`
			`prevSearch = compile(r'<A HREF="(uni\d{3}[bcs]?\.html)">(<FONT COLOR="BLACK">)?<IMG SRC="images/back00\.gif"')`
			`help = 'Index format: nnn'`


Deprecate underscore-prefixed parent classes This is trying to strike a balance between updating as much existing classes as possible, but not making the diff too big... 2022-06-06 10:08:32 +00:00			`class Unsounded(ParserScraper):`
Extend scraper API with a extract_image_urls method This is just a light wrapper around fetchUrls, but frees comic modules from second-guessing for what purpose fetchUrls was called when they are overriding that API - And yes, some comic modules already got this wrong, they are now all fixed. 2023-06-10 13:05:57 +00:00			`url = 'https://www.casualvillain.com/Unsounded/'`
Fix Unsounded (fixes #107) 2020-01-09 21:21:20 +00:00			`startUrl = url + 'comic+index/'`
Added Unsound. 2013-04-25 19:38:18 +00:00			`stripUrl = url + 'comic/ch%s/ch%s_%s.html'`
			`firstStripUrl = stripUrl % ('01', '01', '01')`
Fix Unsounded 2021-10-12 04:55:10 +00:00			`imageSearch = '//div[@id="comic"]//img'`
Replace xpath_class with custom xpath function 2020-07-31 20:56:30 +00:00			`prevSearch = '//a[d:class("back")]'`
Fix Unsounded (fixes #107) 2020-01-09 21:21:20 +00:00			`latestSearch = '//div[@id="chapter_box"][1]//a[last()]'`
			`multipleImagesPerStrip = True`
Refactor: Convert starter to simple method. 2016-04-13 18:01:51 +00:00			`starter = indirectStarter`
Fix complex image extraction in Unsounded This also adds a test to ensure this extraction continues working in the future. 2023-06-10 18:03:56 +00:00			`style_bg_regex = re.compile(r'background-image: url\((.pageart/.)\)')`
Fix Unsounded (fixes #107) 2020-01-09 21:21:20 +00:00			`help = 'Index format: chapter-page'`

Extend scraper API with a extract_image_urls method This is just a light wrapper around fetchUrls, but frees comic modules from second-guessing for what purpose fetchUrls was called when they are overriding that API - And yes, some comic modules already got this wrong, they are now all fixed. 2023-06-10 13:05:57 +00:00			`def extract_image_urls(self, url, data):`
Fix complex image extraction in Unsounded This also adds a test to ensure this extraction continues working in the future. 2023-06-10 18:03:56 +00:00			`urls = []`
			`with suppress(ValueError):`
			`urls.extend(super().extract_image_urls(url, data))`
Fix Unsounded 2021-10-12 04:55:10 +00:00			`# Include background for multi-image pages`
Fix complex image extraction in Unsounded This also adds a test to ensure this extraction continues working in the future. 2023-06-10 18:03:56 +00:00			`cssbg = self.extract_css_bg(data)`
			`if cssbg:`
			`urls.append(cssbg)`
			`if not urls:`
			`raise ValueError(f'No comic found at {url!r}')`
			`return urls`

			`def extract_css_bg(self, page) -> str \| None:`
			`comicdivs = page.xpath('//div[@id="comic"]')`
			`if comicdivs:`
			`style = comicdivs[0].attrib.get('style')`
			`if style:`
			`hit = self.style_bg_regex.search(style)`
			`if hit:`
			`return hit.group(1)`
			`return None`
Fix Unsounded 2021-10-12 04:55:10 +00:00
Extend scraper API with a extract_image_urls method This is just a light wrapper around fetchUrls, but frees comic modules from second-guessing for what purpose fetchUrls was called when they are overriding that API - And yes, some comic modules already got this wrong, they are now all fixed. 2023-06-10 13:05:57 +00:00			`def namer(self, image_url, page_url):`
			`filename = image_url.rsplit('/', 1)[-1]`
			`pagename = page_url.rsplit('/', 1)[-1]`
Fix Unsounded 2021-10-12 04:55:10 +00:00			`if pagename.split('.', 1)[0] != filename.split('.', 1)[0]:`
			`filename = pagename.split('_', 1)[0] + '_' + filename`
			`return filename`

Fix Unsounded (fixes #107) 2020-01-09 21:21:20 +00:00			`def getPrevUrl(self, url, data):`
			`# Fix missing navigation links between chapters`
			`if 'ch13/you_let_me_fall' in url:`
			`return self.stripUrl % ('13', '13', '85')`
Extend scraper API with a extract_image_urls method This is just a light wrapper around fetchUrls, but frees comic modules from second-guessing for what purpose fetchUrls was called when they are overriding that API - And yes, some comic modules already got this wrong, they are now all fixed. 2023-06-10 13:05:57 +00:00			`return super().getPrevUrl(url, data)`
Added Unsound. 2013-04-25 19:38:18 +00:00
			`def getIndexStripUrl(self, index):`
			`chapter, num = index.split('-')`
			`return self.stripUrl % (chapter, chapter, num)`
Add comics from catomix.com. 2016-05-16 21:55:41 +00:00

Deprecate underscore-prefixed parent classes This is trying to strike a balance between updating as much existing classes as possible, but not making the diff too big... 2022-06-06 10:08:32 +00:00			`class UrgentTransformationCrisis(WordPressScraper):`
Add comics from catomix.com. 2016-05-16 21:55:41 +00:00			`url = 'http://www.catomix.com/utc/'`
			`firstStripUrl = url + 'comic/cover1'`
Fix UrgentTransformationCrisis 2019-07-06 05:23:37 +00:00
			`def namer(self, imageUrl, pageUrl):`
			`# Fix inconsistent filenames`
			`filename = imageUrl.rsplit('/', 1)[-1].rsplit('?', 1)[0]`
			`return filename.replace('FVLYHD', 'LYHDpage').replace('UTC084web', '20091218c')`