dosage/scripts/comicfury.py

187 lines
5.2 KiB
Python
Raw Normal View History

2013-02-13 16:53:36 +00:00
#!/usr/bin/env python
2016-04-12 22:52:16 +00:00
# -*- coding: utf-8 -*-
2016-10-28 22:21:41 +00:00
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
2016-04-12 22:52:16 +00:00
# Copyright (C) 2012-2014 Bastian Kleineidam
2017-02-05 23:05:05 +00:00
# Copyright (C) 2015-2017 Tobias Gruetzmacher
2013-02-13 16:53:36 +00:00
"""
Script to get ComicFury comics and save the info in a JSON file for further
processing.
2013-02-13 16:53:36 +00:00
"""
2016-04-12 22:52:16 +00:00
from __future__ import absolute_import, division, print_function
2013-02-13 16:53:36 +00:00
import sys
from six.moves.urllib.parse import urlsplit
from scriptutil import ComicListUpdater
class ComicFuryUpdater(ComicListUpdater):
    """Collect comics hosted on ComicFury from its search pages.

    Walks the site's search results (sorted by page count, descending),
    records each comic's subdomain, activity flag and language, and emits
    one ``cls(...)`` configuration line per comic via get_entry().
    """

    # Absolute minimum number of pages a comic may have (restricts the
    # search space; smaller comics are not worth scraping).
    MIN_COMICS = 90

    # Name templates checked for duplicates hosted on other aggregators.
    dup_templates = ('ComicSherpa/%s', 'Creators/%s', 'GoComics/%s',
                     'KeenSpot/%s', 'SmackJeeves/%s', 'Arcamax/%s')

    # Map ComicFury's English language names to ISO 639-1 codes used
    # in the generated module entries.
    langmap = {
        'german': 'de',
        'spanish': 'es',
        'italian': 'it',
        'japanese': 'ja',
        'french': 'fr',
        'portuguese': 'pt',
    }

    # names of comics to exclude
    excluded_comics = (
        # unsuitable navigation
        "AlfdisAndGunnora",
        "AnAmericanNerdInAnimatedTokyo",
        "AngryAlien",
        "BoozerAndStoner",
        "Bonejangles",
        "ConradStory",
        "Crossing",
        "ChristianHumberReloaded",
        "CorkAndBlotto",
        "Democomix",
        "ErraticBeatComics",
        "EnergyWielders",
        "EvilBearorg",
        "Fiascos",
        "FateOfTheBlueStar",
        "FPK",
        "Fanartgyle",
        "FrigginRandom",
        "GoodbyeKitty",
        "GoodSirICannotDraw",
        "HighlyExperiMental",
        "IfAndCanBeFlowers",
        "JournalismStory",
        "JohnsonSuperior",
        "Keel",
        "JudgeDredBasset",
        "LomeathAndHuilii",
        "MNPB",
        "LucidsDream",
        "MadDog",
        "Minebreakers",
        "MoonlightValley",
        "MyImmortalFool",
        "NATO",
        "NothingFits",
        "OptimisticFishermenAndPessimisticFishermen",
        "Old2G",
        "NothingFitsArtBlog",
        "OutToLunchTheStingRayWhoreStory",
        "Pandemonium",
        "Pewfell",
        "ProjectX",
        "Ratantia",
        "RealLifeTrips",
        "Sandgate",
        "Secondpuberty",
        "Seconds",
        "SlightlyEccentricOrigins",
        "StardustTheCat",
        "StrangerThanFiction",
        "TalamakGreatAdventure",
        "TheBattalion",
        "TheBends",
        "TheDailyProblem",
        "TheMansionOfE",
        "ThePainter",
        "TheSeekers",
        "TheTrialsOfKlahadOfTheAbyss",
        "TheStickmen",
        "ThornsInOurSide",
        "TopHeavyVeryBustyPinUpsForAdults",
        "USBUnlimitedSimulatedBody",
        "TylerHumanRecycler",
        "UAF",
        "WhenPigsFly",
        "YeOldeLegotimeTheatre",
        # no content
        "Angst",
        "TheDevonLegacyPrologue",
        # images gone
        "BaseballCapsAndTiaras",
        "BiMorphon",
        "CROSSWORLDSNEXUS",
        "DevilSpy",
        "Fathead",
        "GOODBYEREPTILIANS",
        "KevinZombie",
        "KindergardenCrisIs",
        "NoSongsForTheDead",
        "RequiemShadowbornPariah",
        "SandboxDrama",
        "STICKFODDER",
        "TezzleAndZeek",
        "TheRealmOfKaerwyn",
        # broken HTML
        "CrossingOver",
        # unique html
        "IKilledTheHero",
        "PowerOfPower",
        "Schizmatic",
        "WakeTheSleepers",
        "WeightOfEternity",
        # moved
        "OopsComicAdventure",
    )

    def handle_url(self, url):
        """Parse one search result page.

        Adds every comic found on the page and returns the page count of
        the last comic seen.  Returns 0 when the page has no results, so
        collect_results() stops paginating past the final page.
        """
        data = self.get_url(url)
        # BUGFIX: start at 0, not 999.  With 999, an empty result page
        # (i.e. one past the last page of results) returned a value above
        # MIN_COMICS and collect_results() looped forever.
        count = 0
        for comicdiv in data.cssselect('div.searchresult'):
            comiclink = comicdiv.cssselect('h3 a')[0]
            comicurl = comiclink.attrib['href']
            name = comiclink.text
            info = comicdiv.cssselect('span.comicinfo')
            # find out how many images this comic has
            count = int(info[1].text.strip())
            # find activity
            active = info[6].text.strip().lower() == "active"
            lang = info[7].text.strip().lower()
            self.add_comic(name, (comicurl, active, lang), count)
        # Results are sorted by page count (descending), so this is the
        # smallest count on the page.
        return count

    def collect_results(self):
        """Parse all search result pages."""
        # Sort by page count, so we can abort when we get under some threshold.
        baseUrl = ('http://comicfury.com/search.php?search=1&webcomics=1&' +
                   'query=&worder=1&asc=0&incvi=1&incse=1&incnu=1&incla=1&' +
                   'all_ge=1&all_st=1&all_la=1&page=%d')
        last_count = 999
        page = 1
        print("Parsing search result pages...", file=sys.stderr)
        while last_count >= self.MIN_COMICS:
            last_count = self.handle_url(baseUrl % page)
            page += 1
            print(last_count, file=sys.stderr, end=" ")

    def get_entry(self, name, entry):
        """Return one generated module line for a comic.

        *entry* is the (url, active, lang) tuple stored by handle_url();
        the comic's subdomain is derived from the URL's hostname.
        """
        url, active, lang = entry
        langopt = ''
        if lang != "english":
            if lang in self.langmap:
                langopt = ", '%s'" % self.langmap[lang]
            else:
                # Unknown language: emit no language option, just warn.
                print("WARNING:", "Unknown language:", lang)
        sub = urlsplit(url).hostname.split('.', 1)[0]
        return u"cls('%s', '%s'%s)," % (name, sub, langopt)
2013-02-13 16:53:36 +00:00
if __name__ == '__main__':
    # Build the updater for this script and run the full collection pass.
    updater = ComicFuryUpdater(__file__)
    updater.run()