dosage/scripts/comicfury.py
Tobias Gruetzmacher 963db7f448 Adopt SmackJeeves to new site design (fixes #144)
Some things got lost on the way:
- Since there is no comprehensive comic directory anymore, removed
  automatic update script. New comics need to be added manually.
- Some authors used the opportunity to move from SmackJeeves somewhere
  else - some of those got new modules (either standalone or ComicFury)
- A bunch of comics just disappeared...
2019-12-26 22:03:18 +01:00

186 lines
5.1 KiB
Python
Executable file

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2019 Tobias Gruetzmacher
"""
Script to get ComicFury comics and save the info in a JSON file for further
processing.
"""
from __future__ import absolute_import, division, print_function
import sys
from six.moves.urllib.parse import urlsplit
from scriptutil import ComicListUpdater
class ComicFuryUpdater(ComicListUpdater):
    """Collect ComicFury comics from the site's search result pages.

    The search is walked page by page (sorted by page count) and every
    comic found is registered through ``add_comic``.  ``get_entry`` renders
    one registered comic as a ``cls(...)`` source line.
    """

    # Absolute minimum number of pages a comic may have (restrict search space)
    MIN_COMICS = 90

    dup_templates = ('ComicSherpa/%s', 'Creators/%s', 'GoComics/%s',
                     'KeenSpot/%s', 'Arcamax/%s')

    # Lower-case language name as listed in the search results, mapped to the
    # two-letter code emitted by get_entry.  English is the implicit default
    # and therefore not listed.
    langmap = {
        'german': 'de',
        'spanish': 'es',
        'italian': 'it',
        'japanese': 'ja',
        'french': 'fr',
        'portuguese': 'pt',
    }

    # names of comics to exclude
    excluded_comics = (
        # unsuitable navigation
        "AlfdisAndGunnora",
        "AnAmericanNerdInAnimatedTokyo",
        "AngryAlien",
        "BoozerAndStoner",
        "Bonejangles",
        "ConradStory",
        "Crossing",
        "ChristianHumberReloaded",
        "CorkAndBlotto",
        "Democomix",
        "ErraticBeatComics",
        "EnergyWielders",
        "EvilBearorg",
        "Fiascos",
        "FateOfTheBlueStar",
        "FPK",
        "Fanartgyle",
        "FrigginRandom",
        "GoodbyeKitty",
        "GoodSirICannotDraw",
        "HighlyExperiMental",
        "IfAndCanBeFlowers",
        "JournalismStory",
        "JohnsonSuperior",
        "Keel",
        "JudgeDredBasset",
        "LomeathAndHuilii",
        "MNPB",
        "LucidsDream",
        "MadDog",
        "Minebreakers",
        "MoonlightValley",
        "MyImmortalFool",
        "NATO",
        "NothingFits",
        "OptimisticFishermenAndPessimisticFishermen",
        "Old2G",
        "NothingFitsArtBlog",
        "OutToLunchTheStingRayWhoreStory",
        "Pandemonium",
        "Pewfell",
        "ProjectX",
        "Ratantia",
        "RealLifeTrips",
        "Sandgate",
        "Secondpuberty",
        "Seconds",
        "SlightlyEccentricOrigins",
        "StardustTheCat",
        "StrangerThanFiction",
        "TalamakGreatAdventure",
        "TheBattalion",
        "TheBends",
        "TheDailyProblem",
        "TheMansionOfE",
        "ThePainter",
        "TheSeekers",
        "TheTrialsOfKlahadOfTheAbyss",
        "TheStickmen",
        "ThornsInOurSide",
        "TopHeavyVeryBustyPinUpsForAdults",
        "USBUnlimitedSimulatedBody",
        "TylerHumanRecycler",
        "UAF",
        "WhenPigsFly",
        "YeOldeLegotimeTheatre",

        # no content
        "Angst",
        "TheDevonLegacyPrologue",

        # images gone
        "BaseballCapsAndTiaras",
        "BiMorphon",
        "CROSSWORLDSNEXUS",
        "DevilSpy",
        "Fathead",
        "GOODBYEREPTILIANS",
        "KevinZombie",
        "KindergardenCrisIs",
        "NoSongsForTheDead",
        "RequiemShadowbornPariah",
        "SandboxDrama",
        "STICKFODDER",
        "TezzleAndZeek",
        "TheRealmOfKaerwyn",

        # broken HTML
        "CrossingOver",

        # unique html
        "IKilledTheHero",
        "PowerOfPower",
        "Schizmatic",
        "WakeTheSleepers",
        "WeightOfEternity",

        # moved
        "OopsComicAdventure",
    )

    def handle_url(self, url):
        """Parse one search result page.

        Every ``div.searchresult`` on the page is registered through
        ``add_comic`` with its URL, activity flag and language.

        Returns the image count of the last comic on the page, or 0 when
        the page lists no comics at all, so that callers comparing the
        result against ``MIN_COMICS`` stop instead of paging forever.
        """
        data = self.get_url(url)

        # An empty page must report a count below any threshold; otherwise
        # collect_results would keep requesting pages past the last one.
        count = 0
        for comicdiv in data.cssselect('div.searchresult'):
            comiclink = comicdiv.cssselect('h3 a')[0]
            comicurl = comiclink.attrib['href']
            name = comiclink.text

            info = comicdiv.cssselect('span.comicinfo')
            # find out how many images this comic has
            count = int(info[1].text.strip())
            # find activity
            active = info[6].text.strip().lower() == "active"
            lang = info[7].text.strip().lower()
            self.add_comic(name, (comicurl, active, lang), count)

        return count

    def collect_results(self):
        """Parse all search result pages.

        Pages are requested in order until the last comic seen on a page has
        fewer than ``MIN_COMICS`` images (or a page comes back empty).
        Progress is written to stderr.
        """
        # Sort by page count, so we can abort when we get under some threshold.
        base_url = ('https://comicfury.com/search.php?search=1&webcomics=1&' +
                    'query=&worder=1&asc=0&incvi=2&incnu=2&incla=2&incse=2&' +
                    'all_ge=1&all_st=1&all_la=1&page=%d')
        last_count = 999
        page = 1
        print("Parsing search result pages...", file=sys.stderr)
        while last_count >= self.MIN_COMICS:
            last_count = self.handle_url(base_url % page)
            page += 1
            print(last_count, file=sys.stderr, end=" ")

    def get_entry(self, name, entry):
        """Render one comic as a ``cls('name', 'sub'[, 'lang']),`` line.

        ``entry`` is the ``(url, active, lang)`` tuple stored by
        ``handle_url``.  ``sub`` is the first label of the URL's host name.
        The language argument is only emitted for non-English comics whose
        language is present in ``langmap``; unknown languages produce a
        warning on stderr and are otherwise treated like English.
        """
        url, active, lang = entry
        langopt = ''
        if lang != "english":
            if lang in self.langmap:
                langopt = ", '%s'" % self.langmap[lang]
            else:
                # Diagnostics go to stderr, like the progress output of
                # collect_results, so they never mix with generated entries.
                print("WARNING:", "Unknown language:", lang, file=sys.stderr)

        sub = urlsplit(url).hostname.split('.', 1)[0]
        return u"cls('%s', '%s'%s)," % (name, sub, langopt)
if __name__ == '__main__':
    # Executed as a script: build the updater with this file's path and run it.
    updater = ComicFuryUpdater(__file__)
    updater.run()