dosage/scripts/arcamax.py

#!/usr/bin/env python3
# SPDX-License-Identifier: MIT
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2021 Tobias Gruetzmacher
"""
Script to get arcamax comics and save the info in a JSON file for further
processing.
"""

from scriptutil import ComicListUpdater


class ArcamaxUpdater(ComicListUpdater):
    """Collect the comics listed on Arcamax.

    The listing page is scraped for comic links; the title and path slug of
    each link are handed to ``add_comic`` and ``get_entry`` renders one comic
    as a line of Python source (a ``cls(...)`` call).
    """

    # Templates for the same comic under other modules. Presumably consumed
    # by the base class for duplicate detection -- TODO confirm in scriptutil.
    dup_templates = ("Creators/%s", "DrunkDuck/%s", "GoComics/%s",
                     "KeenSpot/%s", "ComicGenesis/%s")

    # names of comics to exclude
    excluded_comics = (
        # better source available
        "Dilbert",
        "HagarTheHorrible",
    )

    def handle_url(self, url):
        """Parse one listing page and register every comic linked on it.

        :param url: address of the page, fetched through ``self.get_url``.
        """
        data = self.get_url(url)

        for comiclink in data.cssselect('a.comic-icon'):
            path = comiclink.attrib['href']
            name = comiclink.attrib['title']

            # The slug is the second-to-last '/'-separated component of the
            # href, i.e. the final path segment when the href ends in '/'.
            self.add_comic(name, path.rsplit('/', 2)[1])

    def collect_results(self):
        """Fetch the Arcamax comics listing page and register its comics."""
        # Use HTTPS: the scraped values are interpolated into generated
        # Python source, so the page should not travel over plain HTTP.
        self.handle_url('https://www.arcamax.com/comics')

    def get_entry(self, name, entry):
        """Return the source line that instantiates one comic.

        :param name: comic name used as the first ``cls`` argument.
        :param entry: value registered for the comic in ``handle_url``
            (the path slug), used as the second ``cls`` argument.
        :returns: a string of the form ``cls('<name>', '<entry>'),``.
        """
        # %r yields the same single-quoted literal as '%s' for plain strings
        # but also escapes quotes and backslashes, so scraped text can never
        # break out of the generated string literal.
        return "cls(%r, %r)," % (name, entry)


if __name__ == '__main__':
    # Script entry point: build the updater for this file, then run it.
    updater = ArcamaxUpdater(__file__)
    updater.run()
Update file headers The default encoding for source files is UTF-8 since Python 3, so we can drop all encoding headers. While we are at it, just replace them with SPDX headers. 2020-04-18 11:45:44 +00:00			`#!/usr/bin/env python3`
			`# SPDX-License-Identifier: MIT`
Fixup copyright years. 2016-10-28 22:21:41 +00:00			`# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs`
Clean up update helper scripts. 2016-04-12 22:52:16 +00:00			`# Copyright (C) 2012-2014 Bastian Kleineidam`
Remove SmackJeeves It was closed at the end of 2020... 2021-01-10 18:18:45 +00:00			`# Copyright (C) 2015-2021 Tobias Gruetzmacher`
Add comic strips from Arcamax. 2013-01-23 18:34:11 +00:00			`"""`
Clean up update helper scripts. 2016-04-12 22:52:16 +00:00			`Script to get arcamax comics and save the info in a JSON file for further`
			`processing.`
Add comic strips from Arcamax. 2013-01-23 18:34:11 +00:00			`"""`
Clean up update helper scripts. 2016-04-12 22:52:16 +00:00
Refactor update helpers: Remove duplicate code. 2016-04-14 20:22:37 +00:00			`from scriptutil import ComicListUpdater`
Clean up update helper scripts. 2016-04-12 22:52:16 +00:00

Refactor update helpers: Remove duplicate code. 2016-04-14 20:22:37 +00:00			`class ArcamaxUpdater(ComicListUpdater):`
			`dup_templates = ("Creators/%s", "DrunkDuck/%s", "GoComics/%s",`
Remove SmackJeeves It was closed at the end of 2020... 2021-01-10 18:18:45 +00:00			`"KeenSpot/%s", "ComicGenesis/%s")`
Add comic strips from Arcamax. 2013-01-23 18:34:11 +00:00
Refactor update helpers: Remove duplicate code. 2016-04-14 20:22:37 +00:00			`# names of comics to exclude`
			`excluded_comics = (`
Migrate Arcamax to single-class module. 2016-05-22 21:17:24 +00:00			`# better source available`
			`"Dilbert",`
			`"HagarTheHorrible",`
Refactor update helpers: Remove duplicate code. 2016-04-14 20:22:37 +00:00			`)`
Add comic strips from Arcamax. 2013-01-23 18:34:11 +00:00
Refactor update helpers: Remove duplicate code. 2016-04-14 20:22:37 +00:00			`def handle_url(self, url):`
			`"""Parse one search result page."""`
			`data = self.get_url(url)`
Add comic strips from Arcamax. 2013-01-23 18:34:11 +00:00
Refactor update helpers: Remove duplicate code. 2016-04-14 20:22:37 +00:00			`for comiclink in data.cssselect('a.comic-icon'):`
			`path = comiclink.attrib['href']`
			`name = comiclink.attrib['title']`
Add comic strips from Arcamax. 2013-01-23 18:34:11 +00:00
Refactor update helpers: Remove duplicate code. 2016-04-14 20:22:37 +00:00			`self.add_comic(name, path.rsplit('/', 2)[1])`
Remove make_scraper magic from Arcamax. 2016-04-13 22:17:59 +00:00
Refactor update helpers: Remove duplicate code. 2016-04-14 20:22:37 +00:00			`def collect_results(self):`
			`"""Parse all search result pages."""`
			`self.handle_url('http://www.arcamax.com/comics')`
Add comic strips from Arcamax. 2013-01-23 18:34:11 +00:00
Make auto-update script more flexible. 2016-05-22 20:55:06 +00:00			`def get_entry(self, name, entry):`
Migrate Arcamax to single-class module. 2016-05-22 21:17:24 +00:00			`return u"cls('%s', '%s')," % (name, entry)`
Add comic strips from Arcamax. 2013-01-23 18:34:11 +00:00

			`if __name__ == '__main__':`
Refactor update helpers: Remove duplicate code. 2016-04-14 20:22:37 +00:00			`ArcamaxUpdater(__file__).run()`