dosage/scripts/comicskingdom.py

#!/usr/bin/env python3
# SPDX-License-Identifier: MIT
# Copyright (C) 2019-2022 Tobias Gruetzmacher
# Copyright (C) 2019 Thomas W. Littauer
"""
Script to get a list of comicskingdom.com comics and save the info in a JSON
file for further processing.
"""

from scriptutil import ComicListUpdater
from dosagelib.xml import NS


class ComicsKingdomUpdater(ComicListUpdater):
    """Collect the comics offered on comicskingdom.com.

    Comics are gathered from the comic list on the start page and from the
    paginated tile listings (the default one and the Spanish one). Every comic
    is registered through ``add_comic`` with a ``(url, lang)`` tuple as data.
    """

    # Name templates of comics in other modules.
    # NOTE(review): presumably used by ComicListUpdater to detect duplicates
    # of comics that are already available elsewhere - confirm in scriptutil.
    dup_templates = (
        "Creators/%s",
        "GoComics/%s",
        "KeenSpot/%s",
        "ComicGenesis/%s",
    )

    def handle_startpage(self, page):
        """Parse list of comics from the bottom of the start page.

        List items that contain no direct link, or whose link has no leading
        text, are skipped instead of aborting the whole run.
        """
        for li in page.xpath('//div[d:class("comics-list")]//li', namespaces=NS):
            links = li.xpath('./a')
            if not links or links[0].text is None:
                # Nothing usable in this item (no anchor / no name text).
                continue
            link = links[0]
            url = link.attrib['href']
            name = link.text.removeprefix('The ')

            self.add_comic(name, (url, None))

    def handle_listing(self, page, lang: str = None, add: str = ''):
        """Register all comics of a paginated tile listing.

        Starting at ``page``, every tile that carries a comic name is passed
        to ``add_comic``. The "page-right" link is followed as long as the
        current page produced at least one comic for which ``add_comic``
        returned a truthy value (presumably: the comic was not known yet).

        :param page: parsed first page of the listing.
        :param lang: language code stored with each comic; ``None`` for the
            default listing.
        :param add: suffix appended to every comic name of this listing.
        """
        while True:
            hasnew = False
            for comicdiv in page.xpath('//div[d:class("tile")]', namespaces=NS):
                nametag = comicdiv.xpath('./a/comic-name')
                if not nametag or nametag[0].text is None:
                    # Tile without a (named) comic, e.g. some other content.
                    continue
                name = nametag[0].text.removeprefix('The ') + add
                url = comicdiv.xpath('./a')[0].attrib['href']

                if self.add_comic(name, (url, lang)):
                    hasnew = True

            if not hasnew:
                # Nothing new on this page: stop without fetching another
                # page that would only be thrown away.
                break

            nextlink = page.xpath('//a[./img[contains(@src, "page-right")]]')
            if not nextlink:
                # Last page: there is no "next" arrow to follow.
                break
            page = self.get_url(nextlink[0].attrib['href'])

    def collect_results(self):
        """Parse the start page and all listing pages (default and Spanish)."""
        page = self.get_url('https://www.comicskingdom.com/')
        # The start page serves both as comic list and as first listing page.
        self.handle_startpage(page)
        self.handle_listing(page)
        self.handle_listing(self.get_url('https://www.comicskingdom.com/spanish'), 'es', 'Spanish')

    def get_entry(self, name: str, data: tuple[str, str]) -> str:
        """Format one comic as a ``cls(...)`` source line.

        :param name: name of the comic.
        :param data: ``(url, lang)`` tuple as passed to ``add_comic``; ``lang``
            may be ``None``, in which case no ``lang`` argument is emitted.
        :returns: the ``cls('<name>', '<path>'[, lang='<lang>']),`` line.
        """
        opt = f", lang='{data[1]}'" if data[1] else ''
        # Fourth '/'-separated part of the URL, i.e. the first path component
        # of an absolute URL such as https://host/<path>.
        return f"cls('{name}', '{data[0].split('/')[3]}'{opt}),"


if __name__ == '__main__':
    # Executed as a script: build the updater for this file and run it.
    updater = ComicsKingdomUpdater(__file__)
    updater.run()
Update file headers The default encoding for source files is UTF-8 since Python 3, so we can drop all encoding headers. While we are at it, just replace them with SPDX headers. 2020-04-18 11:45:44 +00:00			`#!/usr/bin/env python3`
			`# SPDX-License-Identifier: MIT`
Search for spanish comics on ComicsKingdom (fixes #165) 2022-06-05 22:20:12 +00:00			`# Copyright (C) 2019-2022 Tobias Gruetzmacher`
Add ComicsKingdom.com (aka King Features) (#134) Note that going back more than about seven days is a paid feature and is untested. 2019-12-16 22:18:04 +00:00			`# Copyright (C) 2019 Thomas W. Littauer`
			`"""`
Simplify ComicsKingdom extraction a bit 2019-12-16 22:47:14 +00:00			`Script to get a list of comicskingdom.com comics and save the info in a JSON`
			`file for further processing.`
Add ComicsKingdom.com (aka King Features) (#134) Note that going back more than about seven days is a paid feature and is untested. 2019-12-16 22:18:04 +00:00			`"""`

			`from scriptutil import ComicListUpdater`
Search for spanish comics on ComicsKingdom (fixes #165) 2022-06-05 22:20:12 +00:00			`from dosagelib.xml import NS`
Add ComicsKingdom.com (aka King Features) (#134) Note that going back more than about seven days is a paid feature and is untested. 2019-12-16 22:18:04 +00:00
Simplify ComicsKingdom extraction a bit 2019-12-16 22:47:14 +00:00
Add ComicsKingdom.com (aka King Features) (#134) Note that going back more than about seven days is a paid feature and is untested. 2019-12-16 22:18:04 +00:00			`class ComicsKingdomUpdater(ComicListUpdater):`
Search for spanish comics on ComicsKingdom (fixes #165) 2022-06-05 22:20:12 +00:00			`dup_templates = (`
			`"Creators/%s",`
			`"GoComics/%s",`
			`"KeenSpot/%s",`
			`"ComicGenesis/%s",`
Add ComicsKingdom.com (aka King Features) (#134) Note that going back more than about seven days is a paid feature and is untested. 2019-12-16 22:18:04 +00:00			`)`

Search for spanish comics on ComicsKingdom (fixes #165) 2022-06-05 22:20:12 +00:00			`def handle_startpage(self, page):`
			`"""Parse list of comics from the bottom of the start page."""`
			`for li in page.xpath('//div[d:class("comics-list")]//li', namespaces=NS):`
			`link = li.xpath('./a')[0]`
			`url = link.attrib['href']`
			`name = link.text.removeprefix('The ')`

			`self.add_comic(name, (url, None))`

			`def handle_listing(self, page, lang: str = None, add: str = ''):`
Add ComicsKingdom.com (aka King Features) (#134) Note that going back more than about seven days is a paid feature and is untested. 2019-12-16 22:18:04 +00:00
Search for spanish comics on ComicsKingdom (fixes #165) 2022-06-05 22:20:12 +00:00			`hasnew = True`
			`while hasnew:`
			`hasnew = False`
			`for comicdiv in page.xpath('//div[d:class("tile")]', namespaces=NS):`
			`nametag = comicdiv.xpath('./a/comic-name')`
			`if len(nametag) == 0:`
			`continue`
			`name = nametag[0].text.removeprefix('The ') + add`
			`url = comicdiv.xpath('./a')[0].attrib['href']`
Simplify ComicsKingdom extraction a bit 2019-12-16 22:47:14 +00:00
Search for spanish comics on ComicsKingdom (fixes #165) 2022-06-05 22:20:12 +00:00			`if self.add_comic(name, (url, lang)):`
			`hasnew = True`

			`nextlink = page.xpath('//a[./img[contains(@src, "page-right")]]')`
			`page = self.get_url(nextlink[0].attrib['href'])`
Add ComicsKingdom.com (aka King Features) (#134) Note that going back more than about seven days is a paid feature and is untested. 2019-12-16 22:18:04 +00:00
			`def collect_results(self):`
			`"""Parse all search result pages."""`
Search for spanish comics on ComicsKingdom (fixes #165) 2022-06-05 22:20:12 +00:00			`page = self.get_url('https://www.comicskingdom.com/')`
			`self.handle_startpage(page)`
			`self.handle_listing(page)`
			`self.handle_listing(self.get_url('https://www.comicskingdom.com/spanish'), 'es', 'Spanish')`

			`def get_entry(self, name: str, data: tuple[str, str]):`
			`opt = f", lang='{data[1]}'" if data[1] else ''`
			`return f"cls('{name}', '{data[0].split('/')[3]}'{opt}),"`
Add ComicsKingdom.com (aka King Features) (#134) Note that going back more than about seven days is a paid feature and is untested. 2019-12-16 22:18:04 +00:00

			`if __name__ == '__main__':`
			`ComicsKingdomUpdater(__file__).run()`