# dosage/scripts/comicfury.py

#!/usr/bin/env python3
# SPDX-License-Identifier: MIT
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2019 Tobias Gruetzmacher
"""
Script to get ComicFury comics and save the info in a JSON file for further
processing.
"""

import sys
from urllib.parse import urlsplit

from scriptutil import ComicListUpdater


class ComicFuryUpdater(ComicListUpdater):
    """Collect ComicFury comics from the site's search result pages.

    The search is walked page by page (see ``collect_results``); every comic
    found is registered through ``add_comic`` and later rendered into a
    ``cls(...)`` line by ``get_entry``.
    """

    # Absolute minimum number of pages a comic may have (restrict search space)
    MIN_COMICS = 90

    # Name templates of other comic sources.
    # NOTE(review): presumably consumed by the ComicListUpdater base class for
    # duplicate detection -- confirm in scriptutil.
    dup_templates = ('ComicSherpa/%s', 'Creators/%s', 'GoComics/%s',
                     'KeenSpot/%s', 'Arcamax/%s')

    # Lower-cased language name as shown in the search results -> language
    # code written into the generated entry. "english" is handled separately
    # in get_entry and needs no code.
    langmap = {
        'german': 'de',
        'spanish': 'es',
        'italian': 'it',
        'japanese': 'ja',
        'french': 'fr',
        'portuguese': 'pt',
    }

    # names of comics to exclude
    excluded_comics = (
        # unsuitable navigation
        "AlfdisAndGunnora",
        "AnAmericanNerdInAnimatedTokyo",
        "AngryAlien",
        "BoozerAndStoner",
        "Bonejangles",
        "ConradStory",
        "Crossing",
        "ChristianHumberReloaded",
        "CorkAndBlotto",
        "Democomix",
        "ErraticBeatComics",
        "EnergyWielders",
        "EvilBearorg",
        "Fiascos",
        "FateOfTheBlueStar",
        "FPK",
        "Fanartgyle",
        "FrigginRandom",
        "GoodbyeKitty",
        "GoodSirICannotDraw",
        "HighlyExperiMental",
        "IfAndCanBeFlowers",
        "JournalismStory",
        "JohnsonSuperior",
        "Keel",
        "JudgeDredBasset",
        "LomeathAndHuilii",
        "MNPB",
        "LucidsDream",
        "MadDog",
        "Minebreakers",
        "MoonlightValley",
        "MyImmortalFool",
        "NATO",
        "NothingFits",
        "OptimisticFishermenAndPessimisticFishermen",
        "Old2G",
        "NothingFitsArtBlog",
        "OutToLunchTheStingRayWhoreStory",
        "Pandemonium",
        "Pewfell",
        "ProjectX",
        "Ratantia",
        "RealLifeTrips",
        "Sandgate",
        "Secondpuberty",
        "Seconds",
        "SlightlyEccentricOrigins",
        "StardustTheCat",
        "StrangerThanFiction",
        "TalamakGreatAdventure",
        "TheBattalion",
        "TheBends",
        "TheDailyProblem",
        "TheMansionOfE",
        "ThePainter",
        "TheSeekers",
        "TheTrialsOfKlahadOfTheAbyss",
        "TheStickmen",
        "ThornsInOurSide",
        "TopHeavyVeryBustyPinUpsForAdults",
        "USBUnlimitedSimulatedBody",
        "TylerHumanRecycler",
        "UAF",
        "WhenPigsFly",
        "YeOldeLegotimeTheatre",

        # no content
        "Angst",
        "TheDevonLegacyPrologue",

        # images gone
        "BaseballCapsAndTiaras",
        "BiMorphon",
        "CROSSWORLDSNEXUS",
        "DevilSpy",
        "Fathead",
        "GOODBYEREPTILIANS",
        "KevinZombie",
        "KindergardenCrisIs",
        "NoSongsForTheDead",
        "RequiemShadowbornPariah",
        "SandboxDrama",
        "STICKFODDER",
        "TezzleAndZeek",
        "TheRealmOfKaerwyn",

        # broken HTML
        "CrossingOver",

        # unique html
        "IKilledTheHero",
        "PowerOfPower",
        "Schizmatic",
        "WakeTheSleepers",
        "WeightOfEternity",

        # moved
        "OopsComicAdventure",
    )

    def handle_url(self, url):
        """Parse one search result page.

        Every ``div.searchresult`` on the page is registered via
        ``add_comic`` with its URL, activity flag and language.

        Returns the page count of the last comic listed on the page, or 0
        when the page lists no comics at all. Returning 0 for an empty page
        lets ``collect_results`` stop instead of requesting pages forever
        once the result listing is exhausted.
        """
        data = self.get_url(url)

        # 0 is below any sensible MIN_COMICS, so an empty page ends paging.
        count = 0
        for comicdiv in data.cssselect('div.searchresult'):
            comiclink = comicdiv.cssselect('h3 a')[0]
            comicurl = comiclink.attrib['href']
            name = comiclink.text

            info = comicdiv.cssselect('span.comicinfo')
            # find out how many images this comic has
            count = int(info[1].text.strip())
            # find activity
            active = info[6].text.strip().lower() == "active"
            lang = info[7].text.strip().lower()
            self.add_comic(name, (comicurl, active, lang), count)

        return count

    def collect_results(self):
        """Parse all search result pages.

        Pages are fetched in order until ``handle_url`` reports a page count
        below ``MIN_COMICS`` (which includes the 0 it reports for an empty
        page). Progress is written to stderr.
        """
        # Sort by page count, so we can abort when we get under some threshold.
        base_url = ('https://comicfury.com/search.php?search=1&webcomics=1&'
                    'query=&worder=1&asc=0&incvi=2&incnu=2&incla=2&incse=2&'
                    'all_ge=1&all_st=1&all_la=1&page=%d')
        last_count = 999
        page = 1
        print("Parsing search result pages...", file=sys.stderr)
        while last_count >= self.MIN_COMICS:
            last_count = self.handle_url(base_url % page)
            page += 1
            print(last_count, file=sys.stderr, end=" ")

    def get_entry(self, name, entry):
        """Format one collected comic as a ``cls(...)`` source line.

        ``entry`` is the ``(url, active, lang)`` tuple stored by
        ``handle_url``. The first label of the URL's hostname is used as the
        comic's sub-domain. A language code is appended for non-English
        comics whose language is in ``langmap``; unknown languages produce a
        warning on stderr and no language argument.
        """
        # The activity flag is part of the stored tuple but is not used here.
        url, _active, lang = entry
        langopt = ''
        if lang != "english":
            if lang in self.langmap:
                langopt = ", '%s'" % self.langmap[lang]
            else:
                # Diagnostics go to stderr like the progress output above.
                print("WARNING:", "Unknown language:", lang, file=sys.stderr)

        sub = urlsplit(url).hostname.split('.', 1)[0]
        return "cls('%s', '%s'%s)," % (name, sub, langopt)


# Script entry point: build the updater for this script file and run it.
if __name__ == '__main__':
    updater = ComicFuryUpdater(__file__)
    updater.run()
Update file headers The default encoding for source files is UTF-8 since Python 3, so we can drop all encoding headers. While we are at it, just replace them with SPDX headers. 2020-04-18 11:45:44 +00:00			`#!/usr/bin/env python3`
			`# SPDX-License-Identifier: MIT`
Fixup copyright years. 2016-10-28 22:21:41 +00:00			`# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs`
Clean up update helper scripts. 2016-04-12 22:52:16 +00:00			`# Copyright (C) 2012-2014 Bastian Kleineidam`
Adopt SmackJeeves to new site design (fixes #144) Some things got lost on the way: - Since there is no comprehensive comic directory anymore, removed automatic update script. New comics need to be added manually. - Some authors used the opportunity to move from SmackJeeves somewhere else - some of those got new modules (either standalone or ComicFury) - Abunch of comics just disappeared... 2019-12-26 21:03:18 +00:00			`# Copyright (C) 2015-2019 Tobias Gruetzmacher`
Added comicfury comics. 2013-02-13 16:53:36 +00:00			`"""`
Update ComicFury comics. (+871, -245) - Remove make_scraper magic - Switch to HTML parser - Update parsing of comic listing. 2016-03-16 23:44:06 +00:00			`Script to get ComicFury comics and save the info in a JSON file for further`
			`processing.`
Added comicfury comics. 2013-02-13 16:53:36 +00:00			`"""`
Clean up update helper scripts. 2016-04-12 22:52:16 +00:00
Added comicfury comics. 2013-02-13 16:53:36 +00:00			`import sys`
Drop Python 2 support: six & other imports 2020-02-03 00:03:31 +00:00			`from urllib.parse import urlsplit`
Refactor update helpers: Remove duplicate code. 2016-04-14 20:22:37 +00:00
			`from scriptutil import ComicListUpdater`


			`class ComicFuryUpdater(ComicListUpdater):`
			`# Absolute minumum number of pages a comic may have (restrict search space)`
			`MIN_COMICS = 90`

Update ComicFury again... 2017-02-12 18:50:51 +00:00			`dup_templates = ('ComicSherpa/%s', 'Creators/%s', 'GoComics/%s',`
Adopt SmackJeeves to new site design (fixes #144) Some things got lost on the way: - Since there is no comprehensive comic directory anymore, removed automatic update script. New comics need to be added manually. - Some authors used the opportunity to move from SmackJeeves somewhere else - some of those got new modules (either standalone or ComicFury) - Abunch of comics just disappeared... 2019-12-26 21:03:18 +00:00			`'KeenSpot/%s', 'Arcamax/%s')`
Update ComicFury. 2016-04-16 11:13:47 +00:00
			`langmap = {`
			`'german': 'de',`
			`'spanish': 'es',`
			`'italian': 'it',`
			`'japanese': 'ja',`
			`'french': 'fr',`
			`'portuguese': 'pt',`
			`}`
Refactor update helpers: Remove duplicate code. 2016-04-14 20:22:37 +00:00
			`# names of comics to exclude`
			`excluded_comics = (`
			`# unsuitable navigation`
			`"AlfdisAndGunnora",`
			`"AnAmericanNerdInAnimatedTokyo",`
			`"AngryAlien",`
			`"BoozerAndStoner",`
			`"Bonejangles",`
			`"ConradStory",`
			`"Crossing",`
			`"ChristianHumberReloaded",`
			`"CorkAndBlotto",`
			`"Democomix",`
			`"ErraticBeatComics",`
			`"EnergyWielders",`
			`"EvilBearorg",`
			`"Fiascos",`
			`"FateOfTheBlueStar",`
			`"FPK",`
			`"Fanartgyle",`
			`"FrigginRandom",`
			`"GoodbyeKitty",`
Update ComicFury (again). 2016-04-17 14:19:44 +00:00			`"GoodSirICannotDraw",`
Refactor update helpers: Remove duplicate code. 2016-04-14 20:22:37 +00:00			`"HighlyExperiMental",`
			`"IfAndCanBeFlowers",`
			`"JournalismStory",`
			`"JohnsonSuperior",`
			`"Keel",`
			`"JudgeDredBasset",`
			`"LomeathAndHuilii",`
			`"MNPB",`
			`"LucidsDream",`
			`"MadDog",`
			`"Minebreakers",`
			`"MoonlightValley",`
			`"MyImmortalFool",`
			`"NATO",`
			`"NothingFits",`
			`"OptimisticFishermenAndPessimisticFishermen",`
			`"Old2G",`
			`"NothingFitsArtBlog",`
			`"OutToLunchTheStingRayWhoreStory",`
			`"Pandemonium",`
			`"Pewfell",`
			`"ProjectX",`
			`"Ratantia",`
			`"RealLifeTrips",`
			`"Sandgate",`
			`"Secondpuberty",`
			`"Seconds",`
			`"SlightlyEccentricOrigins",`
			`"StardustTheCat",`
			`"StrangerThanFiction",`
			`"TalamakGreatAdventure",`
			`"TheBattalion",`
Update ComicFury (again). 2016-04-17 14:19:44 +00:00			`"TheBends",`
Refactor update helpers: Remove duplicate code. 2016-04-14 20:22:37 +00:00			`"TheDailyProblem",`
			`"TheMansionOfE",`
			`"ThePainter",`
			`"TheSeekers",`
			`"TheTrialsOfKlahadOfTheAbyss",`
			`"TheStickmen",`
			`"ThornsInOurSide",`
			`"TopHeavyVeryBustyPinUpsForAdults",`
			`"USBUnlimitedSimulatedBody",`
			`"TylerHumanRecycler",`
			`"UAF",`
			`"WhenPigsFly",`
			`"YeOldeLegotimeTheatre",`

			`# no content`
			`"Angst",`
Update ComicFury (again). 2016-04-17 14:19:44 +00:00			`"TheDevonLegacyPrologue",`
Refactor update helpers: Remove duplicate code. 2016-04-14 20:22:37 +00:00
			`# images gone`
			`"BaseballCapsAndTiaras",`
Fix some modules. 2017-02-05 23:05:05 +00:00			`"BiMorphon",`
Refactor update helpers: Remove duplicate code. 2016-04-14 20:22:37 +00:00			`"CROSSWORLDSNEXUS",`
Update ComicFury again... 2017-02-12 18:50:51 +00:00			`"DevilSpy",`
Refactor update helpers: Remove duplicate code. 2016-04-14 20:22:37 +00:00			`"Fathead",`
Some more ComicFury comics gone... 2016-05-15 22:53:22 +00:00			`"GOODBYEREPTILIANS",`
Refactor update helpers: Remove duplicate code. 2016-04-14 20:22:37 +00:00			`"KevinZombie",`
			`"KindergardenCrisIs",`
			`"NoSongsForTheDead",`
			`"RequiemShadowbornPariah",`
Some more ComicFury comics gone... 2016-05-15 22:53:22 +00:00			`"SandboxDrama",`
Remove some broken comics from ComicFury module. 2017-05-14 20:45:12 +00:00			`"STICKFODDER",`
Refactor update helpers: Remove duplicate code. 2016-04-14 20:22:37 +00:00			`"TezzleAndZeek",`
Remove some broken comics from ComicFury module. 2017-05-14 20:45:12 +00:00			`"TheRealmOfKaerwyn",`
Refactor update helpers: Remove duplicate code. 2016-04-14 20:22:37 +00:00
			`# broken HTML`
			`"CrossingOver",`

			`# unique html`
			`"IKilledTheHero",`
			`"PowerOfPower",`
			`"Schizmatic",`
			`"WakeTheSleepers",`
			`"WeightOfEternity",`
Update ComicFury modules. 2016-10-30 09:57:50 +00:00
			`# moved`
			`"OopsComicAdventure",`
Refactor update helpers: Remove duplicate code. 2016-04-14 20:22:37 +00:00			`)`

			`def handle_url(self, url):`
			`"""Parse one search result page."""`
			`data = self.get_url(url)`

			`count = 999`
			`for comicdiv in data.cssselect('div.searchresult'):`
			`comiclink = comicdiv.cssselect('h3 a')[0]`
			`comicurl = comiclink.attrib['href']`
			`name = comiclink.text`

			`info = comicdiv.cssselect('span.comicinfo')`
			`# find out how many images this comic has`
			`count = int(info[1].text.strip())`
			`# find activity`
			`active = info[6].text.strip().lower() == "active"`
			`lang = info[7].text.strip().lower()`
			`self.add_comic(name, (comicurl, active, lang), count)`

			`return count`

			`def collect_results(self):`
			`"""Parse all search result pages."""`
			`# Sort by page count, so we can abort when we get under some threshold.`
Adopt SmackJeeves to new site design (fixes #144) Some things got lost on the way: - Since there is no comprehensive comic directory anymore, removed automatic update script. New comics need to be added manually. - Some authors used the opportunity to move from SmackJeeves somewhere else - some of those got new modules (either standalone or ComicFury) - Abunch of comics just disappeared... 2019-12-26 21:03:18 +00:00			`baseUrl = ('https://comicfury.com/search.php?search=1&webcomics=1&' +`
			`'query=&worder=1&asc=0&incvi=2&incnu=2&incla=2&incse=2&' +`
Refactor update helpers: Remove duplicate code. 2016-04-14 20:22:37 +00:00			`'all_ge=1&all_st=1&all_la=1&page=%d')`
			`last_count = 999`
			`page = 1`
			`print("Parsing search result pages...", file=sys.stderr)`
			`while last_count >= self.MIN_COMICS:`
			`last_count = self.handle_url(baseUrl % page)`
			`page += 1`
			`print(last_count, file=sys.stderr, end=" ")`

Make auto-update script more flexible. 2016-05-22 20:55:06 +00:00			`def get_entry(self, name, entry):`
Refactor update helpers: Remove duplicate code. 2016-04-14 20:22:37 +00:00			`url, active, lang = entry`
Update ComicFury. 2016-04-16 11:13:47 +00:00			`langopt = ''`
			`if lang != "english":`
			`if lang in self.langmap:`
Migrate ComicFury to single-class module. 2016-05-22 21:31:53 +00:00			`langopt = ", '%s'" % self.langmap[lang]`
Update ComicFury. 2016-04-16 11:13:47 +00:00			`else:`
			`print("WARNING:", "Unknown language:", lang)`

			`sub = urlsplit(url).hostname.split('.', 1)[0]`
Missing comma :) 2016-05-22 22:02:05 +00:00			`return u"cls('%s', '%s'%s)," % (name, sub, langopt)`
Added comicfury comics. 2013-02-13 16:53:36 +00:00

			`if __name__ == '__main__':`
Refactor update helpers: Remove duplicate code. 2016-04-14 20:22:37 +00:00			`ComicFuryUpdater(__file__).run()`