# dosage/dosagelib/plugins/u.py

# SPDX-License-Identifier: MIT
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2020 Tobias Gruetzmacher
# Copyright (C) 2019-2020 Daniel Ring
import json
from re import compile
from urllib.parse import urljoin
from lxml import etree

from ..scraper import BasicScraper, ParserScraper
from ..helpers import indirectStarter
from ..util import tagre
from .common import ComicControlScraper, WordPressScraper, WordPressNavi


class UberQuest(ParserScraper):
    # All page data is read from JSON API endpoints below baseUrl; no
    # XPath searches are defined for this comic.
    baseUrl = 'https://uberquest.studiokhimera.com/'
    url = baseUrl + 'wp-json/keeros_comics/v1/chapters'
    stripUrl = baseUrl + 'wp-json/wp/v2/cfx_comic_page?page_number=%s'
    firstStripUrl = stripUrl % 'cover'

    def starter(self):
        """Return the strip URL for the last page of the last chapter.

        Fetches the chapter list from ``self.url`` and raises (via
        ``raise_for_status``) if the request was not successful.
        """
        # Retrieve comic metadata from API
        response = self.session.get(self.url)
        response.raise_for_status()
        chapters = response.json()
        newestPage = chapters[-1]['pages'][-1]
        return self.stripUrl % newestPage['page_number']

    def getPrevUrl(self, url, data):
        """Build the previous strip URL from the 'prev_id' JSON field."""
        # The text content of data is a JSON array; only its first
        # element is consulted.
        entry = json.loads(data.text_content())[0]
        return self.stripUrl % entry['prev_id']

    def fetchUrls(self, url, data, urlSearch):
        """Return a one-element list holding the 'attachment' JSON field."""
        entry = json.loads(data.text_content())[0]
        return [entry['attachment']]

    def namer(self, imageUrl, pageUrl):
        """Name the image after whatever follows the last '=' in pageUrl."""
        pageNumber = pageUrl.rsplit('=', 1)[-1]
        return 'UberQuest-' + pageNumber


class Underling(WordPressNavi):
    # Points at a web.archive.org snapshot of underlingcomic.com and is
    # flagged as end-of-life.
    url = 'https://web.archive.org/web/20190806120425/http://underlingcomic.com/'
    firstStripUrl = url + 'page-one/'
    endOfLife = True


class Undertow(BasicScraper):
    url = 'http://undertow.dreamshards.org/'
    # NOTE(review): presumably indirectStarter uses latestSearch on url to
    # locate the newest page - confirm against ..helpers.
    starter = indirectStarter
    # First href that is followed (on the same line) by "Most recent page".
    latestSearch = compile(r'href="(.+?)".+?Most recent page')
    # First href that is followed (on the same line) by "teynpoint".
    prevSearch = compile(r'href="(.+?)".+?teynpoint')
    # Captures the src value of an img tag when it ends in ".jpg".
    imageSearch = compile(tagre('img', 'src', r'([^"]+\.jpg)'))


class unDivine(ComicControlScraper):
    url = 'https://www.undivinecomic.com/'
    stripUrl = url + 'comic/%s'
    firstStripUrl = stripUrl % 'page-1'

    def namer(self, imageUrl, pageUrl):
        """Derive a cleaned-up filename from the last segment of imageUrl.

        Spaces become '-', a handful of UUID-style names are swapped for
        readable ones, and an 11-character prefix is dropped when the
        name is longer than 15 characters, starts with a digit and has
        '-' at index 10.
        """
        # Fix inconsistent filenames
        renames = (
            ('10B311D9-0992-4D74-AEB8-DAB714DA67C6', 'UD-322'),
            ('99266624-7EF7-4E99-9EC9-DDB5F59CBDFD', 'UD-311'),
            ('33C6A5A1-F703-4A0A-BCD5-DE1A09359D8E', 'UD-310'),
            ('6CE01E81-C299-43C7-A221-8DE0670EFA30', 'ch4endbonusq4'),
            ('DB66D93B-1FE5-49C7-90E0-FFF981DCD6B3', 'bipolar'),
        )
        name = imageUrl.rsplit('/', 1)[-1].replace(' ', '-')
        for uploadId, label in renames:
            name = name.replace(uploadId, label)
        dropPrefix = len(name) > 15 and name[0].isdigit() and name[10] == '-'
        if dropPrefix:
            return name[11:]
        return name


class UnicornJelly(BasicScraper):
    baseUrl = 'http://unicornjelly.com/'
    url = baseUrl + 'uni666.html'
    stripUrl = baseUrl + 'uni%s.html'
    firstStripUrl = stripUrl % '001'
    # Image directly after a closing table tag, optionally wrapped in a
    # black FONT tag; the capture is the relative "images/..." path.
    imageSearch = compile(
        r'</TABLE>(?:<FONT COLOR="BLACK">)?'
        r'<IMG SRC="(images/[^"]+)" WIDTH=')
    # Link around the "back00.gif" button; group 1 is the target page
    # (three digits with an optional b/c/s suffix).
    prevSearch = compile(
        r'<A HREF="(uni\d{3}[bcs]?\.html)">'
        r'(<FONT COLOR="BLACK">)?'
        r'<IMG SRC="images/back00\.gif"')
    help = 'Index format: nnn'


class Unsounded(ParserScraper):
    url = 'http://www.casualvillain.com/Unsounded/'
    startUrl = url + 'comic+index/'
    stripUrl = url + 'comic/ch%s/ch%s_%s.html'
    firstStripUrl = stripUrl % ('01', '01', '01')
    imageSearch = '//div[@id="comic"]//img'
    prevSearch = '//a[d:class("back")]'
    latestSearch = '//div[@id="chapter_box"][1]//a[last()]'
    multipleImagesPerStrip = True
    starter = indirectStarter
    help = 'Index format: chapter-page'

    def fetchUrls(self, url, data, urlSearch):
        """Return the image URLs of a page, including CSS background art.

        The URLs found by the parent implementation come first. Every
        ``background-image: url(pageart/...)`` occurrence in the
        serialized page is then resolved against ``url`` and appended.

        :param url: URL of the page being scraped (used as join base).
        :param data: parsed page tree, serializable by lxml.
        :param urlSearch: search expression handed to the parent class.
        :return: list of image URLs.
        """
        imageUrls = list(super().fetchUrls(url, data, urlSearch))
        # Include background for multi-image pages
        # [^)]* stops at the first closing parenthesis so one match can
        # never swallow the rest of the document.
        imageRegex = compile(r'background-image: url\((pageart/[^)]*)\)')
        # Serialize to text directly instead of taking repr() of bytes.
        markup = etree.tostring(data, encoding='unicode')
        for match in imageRegex.finditer(markup):
            imageUrls.append(urljoin(url, match.group(1)))
        return imageUrls

    def namer(self, imageUrl, pageUrl):
        """Return the image filename, prefixed by chapter when needed.

        When the image's base name differs from the page's base name,
        the part of the page name before its first '_' is prepended.
        """
        filename = imageUrl.rsplit('/', 1)[-1]
        pagename = pageUrl.rsplit('/', 1)[-1]
        if pagename.split('.', 1)[0] != filename.split('.', 1)[0]:
            filename = pagename.split('_', 1)[0] + '_' + filename
        return filename

    def getPrevUrl(self, url, data):
        """Return the previous page URL, patching one known broken link."""
        # Fix missing navigation links between chapters
        if 'ch13/you_let_me_fall' in url:
            return self.stripUrl % ('13', '13', '85')
        return super().getPrevUrl(url, data)

    def getIndexStripUrl(self, index):
        """Convert a 'chapter-page' index into the matching strip URL."""
        chapter, num = index.split('-')
        return self.stripUrl % (chapter, chapter, num)


class UrgentTransformationCrisis(WordPressScraper):
    url = 'http://www.catomix.com/utc/'
    firstStripUrl = url + 'comic/cover1'

    def namer(self, imageUrl, pageUrl):
        """Return the last path segment of imageUrl with fixed-up naming.

        Anything after the last '?' is dropped before two known odd
        name fragments are rewritten.
        """
        # Fix inconsistent filenames
        basename = imageUrl.rsplit('/', 1)[-1]
        basename = basename.rsplit('?', 1)[0]
        for old, new in (('FVLYHD', 'LYHDpage'), ('UTC084web', '20091218c')):
            basename = basename.replace(old, new)
        return basename
Update file headers The default encoding for source files is UTF-8 since Python 3, so we can drop all encoding headers. While we are at it, just replace them with SPDX headers. 2020-04-18 11:45:44 +00:00			`# SPDX-License-Identifier: MIT`
Fixup copyright years. 2016-10-28 22:21:41 +00:00			`# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs`
Updated copyright. 2014-01-05 15:50:57 +00:00			`# Copyright (C) 2012-2014 Bastian Kleineidam`
Fix some old modules using the Internet Archive 2020-01-09 16:38:13 +00:00			`# Copyright (C) 2015-2020 Tobias Gruetzmacher`
Add self to authors list, update copyright headers 2020-01-13 06:34:05 +00:00			`# Copyright (C) 2019-2020 Daniel Ring`
Fix UberQuest 2023-06-08 06:32:40 +00:00			`import json`
Drop Python 2 support: Obsolete future statements 2020-02-04 00:06:19 +00:00			`from re import compile`
Fix Unsounded 2021-10-12 04:55:10 +00:00			`from urllib.parse import urljoin`
			`from lxml import etree`
Initial commit to Github. 2012-06-20 19:58:13 +00:00
Deprecate underscore-prefixed parent classes This is trying to strike a balance between updating as much existing classes as possible, but not making the diff too big... 2022-06-06 10:08:32 +00:00			`from ..scraper import BasicScraper, ParserScraper`
Replace xpath_class with custom xpath function 2020-07-31 20:56:30 +00:00			`from ..helpers import indirectStarter`
Sort comics alphabetically & PEP8 style fixes. 2016-03-31 21:13:54 +00:00			`from ..util import tagre`
Deprecate underscore-prefixed parent classes This is trying to strike a balance between updating as much existing classes as possible, but not making the diff too big... 2022-06-06 10:08:32 +00:00			`from .common import ComicControlScraper, WordPressScraper, WordPressNavi`
Sort comics alphabetically & PEP8 style fixes. 2016-03-31 21:13:54 +00:00
Updated documentation and fix some comics. 2012-11-20 17:53:53 +00:00
Fix UberQuest 2023-06-08 06:32:40 +00:00			`class UberQuest(ParserScraper):`
			`baseUrl = 'https://uberquest.studiokhimera.com/'`
			`url = baseUrl + 'wp-json/keeros_comics/v1/chapters'`
			`stripUrl = baseUrl + 'wp-json/wp/v2/cfx_comic_page?page_number=%s'`
			`firstStripUrl = stripUrl % 'cover'`

			`def starter(self):`
			`# Retrieve comic metadata from API`
			`data = self.session.get(self.url)`
			`data.raise_for_status()`
			`return self.stripUrl % data.json()[-1]['pages'][-1]['page_number']`

			`def getPrevUrl(self, url, data):`
			`return self.stripUrl % json.loads(data.text_content())[0]['prev_id']`

			`def fetchUrls(self, url, data, urlSearch):`
			`return [json.loads(data.text_content())[0]['attachment']]`

			`def namer(self, imageUrl, pageUrl):`
			`return 'UberQuest-' + pageUrl.rsplit('=', 1)[-1]`


Deprecate underscore-prefixed parent classes This is trying to strike a balance between updating as much existing classes as possible, but not making the diff too big... 2022-06-06 10:08:32 +00:00			`class Underling(WordPressNavi):`
Fix some old modules using the Internet Archive 2020-01-09 16:38:13 +00:00			`url = ('https://web.archive.org/web/20190806120425/'`
			`'http://underlingcomic.com/')`
Move more comics to common WordPressScraper. 2016-04-10 21:04:34 +00:00			`firstStripUrl = url + 'page-one/'`
Fix some old modules using the Internet Archive 2020-01-09 16:38:13 +00:00			`endOfLife = True`
Added comic Underling 2014-02-20 11:54:40 +00:00
Updated documentation and fix some comics. 2012-11-20 17:53:53 +00:00
Deprecate underscore-prefixed parent classes This is trying to strike a balance between updating as much existing classes as possible, but not making the diff too big... 2022-06-06 10:08:32 +00:00			`class Undertow(BasicScraper):`
Always have an url attribute in comic scrapers. 2013-02-04 20:00:26 +00:00			`url = 'http://undertow.dreamshards.org/'`
Fix comics. 2012-12-04 06:02:40 +00:00			`imageSearch = compile(tagre("img", "src", r'([^"]+\.jpg)'))`
Initial commit to Github. 2012-06-20 19:58:13 +00:00			`prevSearch = compile(r'href="(.+?)".+?teynpoint')`
Read starter parameters from class. This allows to specify starters in a more declarative and dynamic way. 2016-04-12 21:11:39 +00:00			`latestSearch = compile(r'href="(.+?)".+?Most recent page')`
Refactor: Convert starter to simple method. 2016-04-13 18:01:51 +00:00			`starter = indirectStarter`
Initial commit to Github. 2012-06-20 19:58:13 +00:00

Deprecate underscore-prefixed parent classes This is trying to strike a balance between updating as much existing classes as possible, but not making the diff too big... 2022-06-06 10:08:32 +00:00			`class unDivine(ComicControlScraper):`
Fix unDivine 2021-04-25 01:35:21 +00:00			`url = 'https://www.undivinecomic.com/'`
			`stripUrl = url + 'comic/%s'`
			`firstStripUrl = stripUrl % 'page-1'`

			`def namer(self, imageUrl, pageUrl):`
			`# Fix inconsistent filenames`
			`filename = imageUrl.rsplit('/', 1)[-1].replace(' ', '-')`
			`filename = filename.replace('10B311D9-0992-4D74-AEB8-DAB714DA67C6', 'UD-322')`
			`filename = filename.replace('99266624-7EF7-4E99-9EC9-DDB5F59CBDFD', 'UD-311')`
			`filename = filename.replace('33C6A5A1-F703-4A0A-BCD5-DE1A09359D8E', 'UD-310')`
			`filename = filename.replace('6CE01E81-C299-43C7-A221-8DE0670EFA30', 'ch4endbonusq4')`
			`filename = filename.replace('DB66D93B-1FE5-49C7-90E0-FFF981DCD6B3', 'bipolar')`
			`if len(filename) > 15 and filename[0].isdigit() and filename[10] == '-':`
			`filename = filename[11:]`
			`return filename`
Added unDivine (#147) Added comic unDivine using ComicControlScraper 2020-01-08 23:17:07 +00:00

Deprecate underscore-prefixed parent classes This is trying to strike a balance between updating as much existing classes as possible, but not making the diff too big... 2022-06-06 10:08:32 +00:00			`class UnicornJelly(BasicScraper):`
s/baseurl/baseUrl/g 2013-04-13 18:58:00 +00:00			`baseUrl = 'http://unicornjelly.com/'`
			`url = baseUrl + 'uni666.html'`
			`stripUrl = baseUrl + 'uni%s.html'`
Add firstStripUrls. 2013-04-10 21:57:09 +00:00			`firstStripUrl = stripUrl % '001'`
Initial commit to Github. 2012-06-20 19:58:13 +00:00			`imageSearch = compile(r'</TABLE>(?:<FONT COLOR="BLACK">)?<IMG SRC="(images/[^"]+)" WIDTH=')`
			`prevSearch = compile(r'<A HREF="(uni\d{3}[bcs]?\.html)">(<FONT COLOR="BLACK">)?<IMG SRC="images/back00\.gif"')`
			`help = 'Index format: nnn'`


Deprecate underscore-prefixed parent classes This is trying to strike a balance between updating as much existing classes as possible, but not making the diff too big... 2022-06-06 10:08:32 +00:00			`class Unsounded(ParserScraper):`
Added Unsound. 2013-04-25 19:38:18 +00:00			`url = 'http://www.casualvillain.com/Unsounded/'`
Fix Unsounded (fixes #107) 2020-01-09 21:21:20 +00:00			`startUrl = url + 'comic+index/'`
Added Unsound. 2013-04-25 19:38:18 +00:00			`stripUrl = url + 'comic/ch%s/ch%s_%s.html'`
			`firstStripUrl = stripUrl % ('01', '01', '01')`
Fix Unsounded 2021-10-12 04:55:10 +00:00			`imageSearch = '//div[@id="comic"]//img'`
Replace xpath_class with custom xpath function 2020-07-31 20:56:30 +00:00			`prevSearch = '//a[d:class("back")]'`
Fix Unsounded (fixes #107) 2020-01-09 21:21:20 +00:00			`latestSearch = '//div[@id="chapter_box"][1]//a[last()]'`
			`multipleImagesPerStrip = True`
Refactor: Convert starter to simple method. 2016-04-13 18:01:51 +00:00			`starter = indirectStarter`
Fix Unsounded (fixes #107) 2020-01-09 21:21:20 +00:00			`help = 'Index format: chapter-page'`

Fix Unsounded 2021-10-12 04:55:10 +00:00			`def fetchUrls(self, url, data, urlSearch):`
			`imageUrls = super(Unsounded, self).fetchUrls(url, data, urlSearch)`
			`# Include background for multi-image pages`
			`imageRegex = compile(r'background-image: url\((pageart/.*)\)')`
			`for match in imageRegex.finditer(str(etree.tostring(data))):`
			`print(match)`
			`searchUrls.append(normaliseURL(urljoin(data[1], match.group(1))))`
			`return imageUrls`

			`def namer(self, imageUrl, pageUrl):`
			`filename = imageUrl.rsplit('/', 1)[-1]`
			`pagename = pageUrl.rsplit('/', 1)[-1]`
			`if pagename.split('.', 1)[0] != filename.split('.', 1)[0]:`
			`filename = pagename.split('_', 1)[0] + '_' + filename`
			`return filename`

Fix Unsounded (fixes #107) 2020-01-09 21:21:20 +00:00			`def getPrevUrl(self, url, data):`
			`# Fix missing navigation links between chapters`
			`if 'ch13/you_let_me_fall' in url:`
			`return self.stripUrl % ('13', '13', '85')`
			`return super(Unsounded, self).getPrevUrl(url, data)`
Added Unsound. 2013-04-25 19:38:18 +00:00
			`def getIndexStripUrl(self, index):`
			`chapter, num = index.split('-')`
			`return self.stripUrl % (chapter, chapter, num)`
Add comics from catomix.com. 2016-05-16 21:55:41 +00:00

Deprecate underscore-prefixed parent classes This is trying to strike a balance between updating as much existing classes as possible, but not making the diff too big... 2022-06-06 10:08:32 +00:00			`class UrgentTransformationCrisis(WordPressScraper):`
Add comics from catomix.com. 2016-05-16 21:55:41 +00:00			`url = 'http://www.catomix.com/utc/'`
			`firstStripUrl = url + 'comic/cover1'`
Fix UrgentTransformationCrisis 2019-07-06 05:23:37 +00:00
			`def namer(self, imageUrl, pageUrl):`
			`# Fix inconsistent filenames`
			`filename = imageUrl.rsplit('/', 1)[-1].rsplit('?', 1)[0]`
			`return filename.replace('FVLYHD', 'LYHDpage').replace('UTC084web', '20091218c')`