183 lines
5.1 KiB
Python
Executable file
183 lines
5.1 KiB
Python
Executable file
#!/usr/bin/env python
|
|
# -*- coding: utf-8 -*-
|
|
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
|
|
# Copyright (C) 2012-2014 Bastian Kleineidam
|
|
# Copyright (C) 2015-2016 Tobias Gruetzmacher
|
|
"""
|
|
Script to get ComicFury comics and save the info in a JSON file for further
|
|
processing.
|
|
"""
|
|
from __future__ import absolute_import, division, print_function
|
|
|
|
import sys
|
|
from six.moves.urllib.parse import urlsplit
|
|
|
|
from scriptutil import ComicListUpdater
|
|
|
|
|
|
class ComicFuryUpdater(ComicListUpdater):
    """Collect ComicFury comics by walking the site's search result pages.

    The search is requested sorted by page count, so ``collect_results`` can
    stop as soon as a result page ends below ``MIN_COMICS``.
    """

    # Absolute minimum number of pages a comic may have (restrict search space)
    MIN_COMICS = 90

    # NOTE(review): not used in this class; presumably read by
    # ComicListUpdater to spot comics already provided by other modules -
    # confirm against scriptutil.
    dup_templates = ('Creators/%s', 'DrunkDuck/%s', 'GoComics/%s',
                     'KeenSpot/%s', 'SmackJeeves/%s', 'Arcamax/%s')

    # Maps the lower-cased language name shown in a search result to the
    # two-letter code emitted by get_entry(). English needs no entry.
    langmap = {
        'german': 'de',
        'spanish': 'es',
        'italian': 'it',
        'japanese': 'ja',
        'french': 'fr',
        'portuguese': 'pt',
    }

    # names of comics to exclude
    # NOTE(review): not used in this class; presumably applied by
    # ComicListUpdater when comics are added - confirm against scriptutil.
    excluded_comics = (
        # unsuitable navigation
        "AlfdisAndGunnora",
        "AnAmericanNerdInAnimatedTokyo",
        "AngryAlien",
        "BoozerAndStoner",
        "Bonejangles",
        "ConradStory",
        "Crossing",
        "ChristianHumberReloaded",
        "CorkAndBlotto",
        "Democomix",
        "ErraticBeatComics",
        "EnergyWielders",
        "EvilBearorg",
        "Fiascos",
        "FateOfTheBlueStar",
        "FPK",
        "Fanartgyle",
        "FrigginRandom",
        "GoodbyeKitty",
        "GoodSirICannotDraw",
        "HighlyExperiMental",
        "IfAndCanBeFlowers",
        "JournalismStory",
        "JohnsonSuperior",
        "Keel",
        "JudgeDredBasset",
        "LomeathAndHuilii",
        "MNPB",
        "LucidsDream",
        "MadDog",
        "Minebreakers",
        "MoonlightValley",
        "MyImmortalFool",
        "NATO",
        "NothingFits",
        "OptimisticFishermenAndPessimisticFishermen",
        "Old2G",
        "NothingFitsArtBlog",
        "OutToLunchTheStingRayWhoreStory",
        "Pandemonium",
        "Pewfell",
        "ProjectX",
        "Ratantia",
        "RealLifeTrips",
        "Sandgate",
        "Secondpuberty",
        "Seconds",
        "SlightlyEccentricOrigins",
        "StardustTheCat",
        "StrangerThanFiction",
        "TalamakGreatAdventure",
        "TheBattalion",
        "TheBends",
        "TheDailyProblem",
        "TheMansionOfE",
        "ThePainter",
        "TheSeekers",
        "TheTrialsOfKlahadOfTheAbyss",
        "TheStickmen",
        "ThornsInOurSide",
        "TopHeavyVeryBustyPinUpsForAdults",
        "USBUnlimitedSimulatedBody",
        "TylerHumanRecycler",
        "UAF",
        "WhenPigsFly",
        "YeOldeLegotimeTheatre",

        # no content
        "Angst",
        "TheDevonLegacyPrologue",

        # images gone
        "BaseballCapsAndTiaras",
        "CROSSWORLDSNEXUS",
        "Fathead",
        "GOODBYEREPTILIANS",
        "KevinZombie",
        "KindergardenCrisIs",
        "NoSongsForTheDead",
        "RequiemShadowbornPariah",
        "STICKFODDER",
        "SandboxDrama",
        "TezzleAndZeek",

        # broken HTML
        "CrossingOver",

        # unique html
        "IKilledTheHero",
        "PowerOfPower",
        "Schizmatic",
        "WakeTheSleepers",
        "WeightOfEternity",

        # moved
        "OopsComicAdventure",
    )

    def handle_url(self, url):
        """Parse one search result page.

        Every ``div.searchresult`` element on the page is passed to
        ``add_comic`` together with its URL, activity flag and language.

        Returns the image count of the last comic on the page, or 0 when the
        page holds no search results at all.
        """
        data = self.get_url(url)

        # Start at 0 rather than a large sentinel: 0 is below MIN_COMICS, so
        # a page without results ends the crawl in collect_results() instead
        # of making it request further pages forever.
        count = 0
        for comicdiv in data.cssselect('div.searchresult'):
            comiclink = comicdiv.cssselect('h3 a')[0]
            comicurl = comiclink.attrib['href']
            name = comiclink.text

            info = comicdiv.cssselect('span.comicinfo')
            # find out how many images this comic has
            count = int(info[1].text.strip())
            # find activity
            active = info[6].text.strip().lower() == "active"
            lang = info[7].text.strip().lower()
            self.add_comic(name, (comicurl, active, lang), count)

        return count

    def collect_results(self):
        """Parse all search result pages.

        Pages are fetched in order, starting at page 1, until a page reports
        a last image count below ``MIN_COMICS``. Progress (the last count of
        each page) is written to stderr.
        """
        # Sort by page count, so we can abort when we get under some threshold.
        base_url = ('http://comicfury.com/search.php?search=1&webcomics=1&'
                    'query=&worder=1&asc=0&incvi=1&incse=1&incnu=1&incla=1&'
                    'all_ge=1&all_st=1&all_la=1&page=%d')
        print("Parsing search result pages...", file=sys.stderr)
        page = 1
        while True:
            last_count = self.handle_url(base_url % page)
            print(last_count, file=sys.stderr, end=" ")
            if last_count < self.MIN_COMICS:
                break
            page += 1

    def get_entry(self, name, entry):
        """Format one collected comic as a ``cls(...)`` source line.

        ``entry`` is the ``(url, active, lang)`` tuple stored by
        ``handle_url``. The result is ``cls('<name>', '<sub>'),`` where
        ``<sub>`` is the first label of the URL's host name; for a
        non-English language found in ``langmap`` the language code is
        appended as a third argument. Unknown languages produce a warning on
        stderr and no language argument.
        """
        url, active, lang = entry
        langopt = ''
        if lang != "english":
            if lang in self.langmap:
                langopt = ", '%s'" % self.langmap[lang]
            else:
                # Diagnostics go to stderr, like the progress output in
                # collect_results().
                print("WARNING:", "Unknown language:", lang, file=sys.stderr)

        sub = urlsplit(url).hostname.split('.', 1)[0]
        return u"cls('%s', '%s'%s)," % (name, sub, langopt)
|
|
|
|
|
|
# Script entry point: build the updater for this file and run it.
if __name__ == '__main__':
    updater = ComicFuryUpdater(__file__)
    updater.run()
|