#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2016 Tobias Gruetzmacher
"""
Script to get ComicFury comics and save the info in a JSON file for further
processing.
"""
from __future__ import absolute_import, division, print_function

import codecs
import sys
import os

import requests
from lxml import html
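# Make dosagelib importable from the parent directory when run as a script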
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) # noqa
from dosagelib.util import get_page
from dosagelib.scraper import get_scraperclasses
from scriptutil import (contains_case_insensitive, save_result, load_result,
                        truncate_name, format_name)

# Absolute minimum number of pages a comic may have (restricts the search space)
MIN_COMICS = 90
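# JSON file where the collected comic data is stored between runs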
json_file = __file__.replace(".py", ".json")
# names of comics to exclude
exclude_comics = [
# unsuitable navigation
"AlfdisAndGunnora",
"AnAmericanNerdInAnimatedTokyo",
"AngryAlien",
"BoozerAndStoner",
"Bonejangles",
"ConradStory",
"Crossing",
"ChristianHumberReloaded",
"CorkAndBlotto",
"Democomix",
"ErraticBeatComics",
"EnergyWielders",
"EvilBearorg",
"Fiascos",
"FateOfTheBlueStar",
"FPK",
"Fanartgyle",
"FrigginRandom",
"GoodbyeKitty",
"HighlyExperiMental",
"IfAndCanBeFlowers",
"JournalismStory",
"JohnsonSuperior",
"Keel",
"JudgeDredBasset",
"LomeathAndHuilii",
"MNPB",
"LucidsDream",
"MadDog",
"Minebreakers",
"MoonlightValley",
"MyImmortalFool",
"NATO",
"NothingFits",
"OptimisticFishermenAndPessimisticFishermen",
"Old2G",
"NothingFitsArtBlog",
"OutToLunchTheStingRayWhoreStory",
"Pandemonium",
"Pewfell",
"ProjectX",
"Ratantia",
"RealLifeTrips",
"Sandgate",
"Secondpuberty",
"Seconds",
"SlightlyEccentricOrigins",
"StardustTheCat",
"StrangerThanFiction",
"TalamakGreatAdventure",
"TheBattalion",
"TheDailyProblem",
"TheMansionOfE",
"ThePainter",
"TheSeekers",
"TheTrialsOfKlahadOfTheAbyss",
"TheStickmen",
"ThornsInOurSide",
"TopHeavyVeryBustyPinUpsForAdults",
"USBUnlimitedSimulatedBody",
"TylerHumanRecycler",
"UAF",
"WhenPigsFly",
"YeOldeLegotimeTheatre",
# no content
"Angst",
# images gone
"BaseballCapsAndTiaras",
"CROSSWORLDSNEXUS",
"Fathead",
"KevinZombie",
"KindergardenCrisIs",
"NoSongsForTheDead",
"RequiemShadowbornPariah",
"TezzleAndZeek",
# broken HTML
"CrossingOver",
# unique html
"IKilledTheHero",
"PowerOfPower",
"Schizmatic",
"WakeTheSleepers",
"WeightOfEternity",
]


def handle_url(url, session, res):
"""Parse one search result page."""
print("Parsing", url, file=sys.stderr)
try:
data = html.document_fromstring(get_page(url, session).text)
data.make_links_absolute(url)
except IOError as msg:
print("ERROR:", msg, file=sys.stderr)
return
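    # default page count, returned unchanged if the page has no search results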
num = 999
for comicdiv in data.cssselect('div.searchresult'):
comiclink = comicdiv.cssselect('h3 a')[0]
comicurl = comiclink.attrib['href']
name = format_name(comiclink.text)
if contains_case_insensitive(res, name):
# we cannot handle two comics that only differ in case
print("INFO: skipping possible duplicate", repr(name),
file=sys.stderr)
continue
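        # comicinfo spans: index 1 holds the page count, 6 the activity
        # status and 7 the language (used below)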
info = comicdiv.cssselect('span.comicinfo')
# find out how many images this comic has
num = int(info[1].text.strip())
# find activity
active = info[6].text.strip().lower() == "active"
lang = info[7].text.strip().lower()
res[name] = [comicurl, num, active, lang]
    return num


def get_results():
"""Parse all search result pages."""
# store info in a dictionary {name -> shortname}
res = {}
session = requests.Session()
# Sort by page count, so we can abort when we get under some threshold.
baseUrl = ('http://comicfury.com/search.php?search=1&webcomics=1&query=' +
'&worder=1&asc=0&incvi=1&incse=1&incnu=1&incla=1&all_ge=1' +
'&all_st=1&all_la=1&page=%d')
last_count = 999
page = 1
print("Parsing search result pages...", file=sys.stderr)
while last_count >= MIN_COMICS:
last_count = handle_url(baseUrl % page, session, res)
page += 1
print(last_count, file=sys.stderr, end=" ")
    save_result(res, json_file)


def find_dups(name):
"""Check if comic name already exists."""
names = [
("Creators/%s" % name).lower(),
("DrunkDuck/%s" % name).lower(),
("GoComics/%s" % name).lower(),
("KeenSpot/%s" % name).lower(),
("SmackJeeves/%s" % name).lower(),
("Arcamax/%s" % name).lower(),
]
for scraperclass in get_scraperclasses():
lname = scraperclass.getName().lower()
if lname in names:
return scraperclass.getName().lower()
    return None


def first_lower(x):
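    """Sort key: comic name, compared case-insensitively."""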
    return x[0].lower()


def print_results(args):
"""Print all comics that have at least the given number of minimum
comic strips."""
min_comics, filename = args
min_comics = int(min_comics)
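    # open in append mode so existing content of the output file is preserved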
with codecs.open(filename, 'a', 'utf-8') as fp:
data = load_result(json_file)
for name, entry in sorted(data.items(), key=first_lower):
url, num, active, lang = entry
if name in exclude_comics:
fp.write(u"# %s is excluded\n" % name)
continue
if num < min_comics:
continue
dup = find_dups(name)
if dup is not None:
fp.write(u"# %s has a duplicate in %s\n" % (name, dup))
            else:
                fp.write(u"class CF%s(_ComicFury):\n    url = %r\n\n\n" % (
                    truncate_name(name), str(url)))


if __name__ == '__main__':
if len(sys.argv) > 1:
print_results(sys.argv[1:])
else:
get_results()