dosage/scripts/comicfury.py

#!/usr/bin/env python3
# SPDX-License-Identifier: MIT
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2022 Tobias Gruetzmacher
"""
Script to get ComicFury comics and save the info in a JSON file for further
processing.
"""
import sys
from urllib.parse import urlsplit

from scriptutil import ComicListUpdater


class ComicFuryUpdater(ComicListUpdater):
    # Absolute minimum number of pages a comic may have (restrict search space)
    MIN_COMICS = 90
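
    # URL name templates of other modules that may already host the same
    # comic (used for the duplicate check).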
    dup_templates = ('ComicSherpa/%s', 'Creators/%s', 'GoComics/%s',
                     'KeenSpot/%s', 'Arcamax/%s')
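
    # Map ComicFury's language names to two-letter ISO language codes.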
    langmap = {
        'german': 'de',
        'spanish': 'es',
        'italian': 'it',
        'japanese': 'ja',
        'french': 'fr',
        'portuguese': 'pt',
    }

    # names of comics to exclude
    excluded_comics = (
        # unsuitable navigation
        "AlfdisAndGunnora",
        "AnAmericanNerdInAnimatedTokyo",
        "AngryAlien",
        "BoozerAndStoner",
        "Bonejangles",
        "ConradStory",
        "Crossing",
        "ChristianHumberReloaded",
        "CorkAndBlotto",
        "Democomix",
        "ErraticBeatComics",
        "EnergyWielders",
        "EvilBearorg",
        "Fiascos",
        "FateOfTheBlueStar",
        "FPK",
        "Fanartgyle",
        "FrigginRandom",
        "GoodbyeKitty",
        "GoodSirICannotDraw",
        "HighlyExperiMental",
        "IfAndCanBeFlowers",
        "JournalismStory",
        "JohnsonSuperior",
        "Keel",
        "JudgeDredBasset",
        "LomeathAndHuilii",
        "MNPB",
        "LucidsDream",
        "MadDog",
        "Minebreakers",
        "MoonlightValley",
        "MyImmortalFool",
        "NATO",
        "NothingFits",
        "OptimisticFishermenAndPessimisticFishermen",
        "Old2G",
        "NothingFitsArtBlog",
        "OutToLunchTheStingRayWhoreStory",
        "Pandemonium",
        "Pewfell",
        "ProjectX",
        "Ratantia",
        "RealLifeTrips",
        "Sandgate",
        "Secondpuberty",
        "Seconds",
        "SlightlyEccentricOrigins",
        "StardustTheCat",
        "StrangerThanFiction",
        "TalamakGreatAdventure",
        "TheBattalion",
        "TheBends",
        "TheDailyProblem",
        "TheMansionOfE",
        "ThePainter",
        "TheSeekers",
        "TheTrialsOfKlahadOfTheAbyss",
        "TheStickmen",
        "ThornsInOurSide",
        "TopHeavyVeryBustyPinUpsForAdults",
        "USBUnlimitedSimulatedBody",
        "TylerHumanRecycler",
        "UAF",
        "WhenPigsFly",
        "YeOldeLegotimeTheatre",
        # no content
        "Angst",
        "TheDevonLegacyPrologue",
        # images gone
        "BaseballCapsAndTiaras",
        "BiMorphon",
        "CROSSWORLDSNEXUS",
        "DevilSpy",
        "Fathead",
        "GOODBYEREPTILIANS",
        "KevinZombie",
        "KindergardenCrisIs",
        "NoSongsForTheDead",
        "RequiemShadowbornPariah",
        "SandboxDrama",
        "STICKFODDER",
        "TezzleAndZeek",
        "TheRealmOfKaerwyn",
        # broken HTML
        "CrossingOver",
        # unique html
        "IKilledTheHero",
        "PowerOfPower",
        "Schizmatic",
        "WakeTheSleepers",
        "WeightOfEternity",
        # moved
        "OopsComicAdventure",
    )

    def handle_url(self, url):
        """Parse one search result page."""
        data = self.get_url(url)

        for comicdiv in data.cssselect('div.webcomic-result'):
            comiclink = comicdiv.cssselect('div.webcomic-result-title a')[0]
            comicurl = comiclink.attrib['href']
            name = comiclink.text

            info = comicdiv.cssselect('span.stat-value')
            # find out how many images this comic has
            count = int(info[0].text.strip())
            self.add_comic(name, comicurl, count)
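
        # Follow the link to the next result page, if there is one.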
        nextlink = data.cssselect('div.search-next-page a')
        if nextlink:
            return nextlink[0].attrib['href']
        else:
            return None

    def collect_results(self):
        """Parse all search result pages."""
        # Sort by page count, so we can abort when we get under some threshold.
        url = ('https://comicfury.com/search.php?query=&lastupdate=0&' +
               'completed=1&fn=2&fv=2&fs=2&fl=2&sort=0')
        print("Parsing search result pages...", file=sys.stderr)

        while url:
            url = self.handle_url(url)

    def get_entry(self, name, entry):
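        """Return one entry line for the generated module list."""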
        url = entry
        sub = urlsplit(url).hostname.split('.', 1)[0]
        return f"cls('{name}', '{sub}'),"


if __name__ == '__main__':
    ComicFuryUpdater(__file__).run()