dosage/scripts/comicfury.py

357 lines
14 KiB
Python
Raw Normal View History

2013-02-13 16:53:36 +00:00
#!/usr/bin/env python
2014-01-05 15:50:57 +00:00
# Copyright (C) 2013-2014 Bastian Kleineidam
2013-02-13 16:53:36 +00:00
"""
Script to get ComicFury comics and save the info in a JSON file for further processing.
"""
from __future__ import print_function
2013-05-22 20:29:03 +00:00
import codecs
2013-02-13 16:53:36 +00:00
import re
import sys
import os
import requests
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
from dosagelib.util import getPageContent
from dosagelib.scraper import get_scraperclasses
from scriptutil import contains_case_insensitive, save_result, load_result, truncate_name, format_name, format_description
# Result file that sits next to this script, with the same base name.
# os.path.splitext swaps only the final extension, so a ".py" elsewhere in the
# path (or a ".pyc" suffix) can no longer be mangled as with str.replace().
json_file = os.path.splitext(__file__)[0] + ".json"
# Link to a comic inside a search result heading; group 1 is the comic URL.
url_matcher = re.compile(r'<h3><a href="([^"]+)">')
# Description text up to the following "[<a href" link; may span several lines.
desc_matcher = re.compile(r'<span class="subtext">(.*?)\[<a href', re.DOTALL)
# Number of comic strips, captured as a string of digits.
num_matcher = re.compile(r'<b>Comics:</b> <span class="comicinfo">(\d+)</span>')
# Genre text of the comic.
genre_matcher = re.compile(r'<b>Genre:</b> <span class="comicinfo">([^<]+)</span>')
# Activity status text of the comic.
activity_matcher = re.compile(r'<b>Activity status:</b> <span class="comicinfo">([^<]+)</span>')
# Comic names that are skipped while parsing and while printing results;
# the trailing comment on each entry gives the reason for the exclusion.
exclude_comics = [
    "1000",                         # unsuitable navigation
    "12yearsofmissj",               # unsuitable navigation
    "3DGlasses",                    # unsuitable navigation
    "30Days",                       # unsuitable navigation
    "6tsc",                         # unsuitable navigation
    "Abyss",                        # unsuitable navigation
    "Acelestialstory",              # unsuitable navigation
    "Actdr",                        # unsuitable navigation
    "Aerosol",                      # unsuitable navigation
    "Ahtiventures",                 # unsuitable navigation
    "Alienirony",                   # unsuitable navigation
    "Aloonaticstale",               # unsuitable navigation
    "Amity",                        # unsuitable navigation
    "Angelguardian",                # unsuitable navigation
    "AngelguardianEspanol",         # unsuitable navigation
    "Angryalien",                   # unsuitable navigation
    "Animangitis",                  # unsuitable navigation
    "Archininja",                   # unsuitable navigation
    "Arveytoonz",                   # unsuitable navigation
    "AsperitasAstraalia",           # unsuitable navigation
    "AttackoftheRobofemoids",       # unsuitable navigation
    "Auriga",                       # unsuitable navigation
    "Bedlam",                       # unsuitable navigation
    "BITCHSquad",                   # missing images
    "Bidoof",                       # unsuitable navigation
    "Blobworld",                    # unsuitable navigation
    "BlockTales",                   # unsuitable navigation
    "Bobcomix",                     # unsuitable navigation
    "Bonejangles",                  # unsuitable navigation
    "BookOfLiesComic",              # unsuitable navigation
    "BoozerandStoner",              # unsuitable navigation
    "Boyaurus",                     # unsuitable navigation
    "Brainfood",                    # unsuitable navigation
    "Bromosworld",                  # unsuitable navigation
    "BulletMythology",              # unsuitable navigation
    "Bunnysher",                    # page moved
    "BUXY",                         # unsuitable navigation
    "CafeGruesome",                 # unsuitable navigation
    "Castofmadness",                # unsuitable navigation
    "Chanpuru",                     # unsuitable navigation
    "Christmaswithmaddog",          # unsuitable navigation
    "ChroniclesOfLillian",          # unsuitable navigation
    "Comicshortsmain",              # unsuitable navigation
    "Conrads",                      # unsuitable navigation
    "ConradTheCaterpillar",         # unsuitable navigation
    "ConsequencesOfChoice",         # unsuitable navigation
    "CoolYuleComics",               # unsuitable navigation
    "Crossworldsnexus",             # unsuitable navigation
    "Colorforce",                   # unsuitable navigation
    "Coolstorybro",                 # unsuitable navigation
    "Crepusculars",                 # unsuitable navigation
    "CtrlZ",                        # unsuitable navigation
    "DeadNight",                    # unsuitable navigation
    "Democomix",                    # unsuitable navigation
    "Dinosaurkingdom",              # unsuitable navigation
    "Donutsforsharks",              # unsuitable navigation
    "Dotcomic",                     # unsuitable navigation
    "Droned",                       # unsuitable navigation
    "Druids",                       # unsuitable navigation
    "Effingukookoo",                # unsuitable navigation
    "Elijahandazuuclassic",         # unsuitable navigation
    "ErraticBeat",                  # unsuitable navigation
    "ErraticE",                     # unsuitable navigation
    "EternalKnights",               # unsuitable navigation
    "Evilbear",                     # unsuitable navigation
    "Ewmic",                        # unsuitable navigation
    "Fannicklas",                   # unsuitable navigation
    "Fateofthebluestar",            # unsuitable navigation
    "Fishbowl",                     # unsuitable navigation
    "Foe",                          # unsuitable navigation
    "Foreignterritory",             # unsuitable navigation
    "Freakingawfulpuns",            # page is gone
    "Frigginrandom",                # unsuitable navigation
    "Frostfire",                    # unsuitable navigation
    "Furnerdy",                     # unsuitable navigation
    "Fuzzylittleninjas",            # unsuitable navigation
    "Garfieldminusjon",             # unsuitable navigation
    "Gatito",                       # unsuitable navigation
    "Gbksayonara",                  # unsuitable navigation
    "Gillimurphyorig",              # unsuitable navigation
    "Gratz",                        # unsuitable navigation
    "Greygaroutopheavyartwork",     # unsuitable navigation
    "GrimReaperSchool",             # unsuitable navigation
    "Goldrush",                     # unsuitable navigation
    "GRIND",                        # unsuitable navigation
    "Haywire",                      # unsuitable navigation
    "Hallodri",                     # unsuitable navigation
    "Harrysorehead",                # unsuitable navigation
    "HazSci",                       # unsuitable navigation
    "Hellboundarchive",             # unsuitable navigation
    "Herecomesskeeter",             # unsuitable navigation
    "Highlyexperimental",           # unsuitable navigation
    "Holycowcomics",                # unsuitable navigation
    "Hourlykelly",                  # unsuitable navigation
    "Houseescapeold",               # unsuitable navigation
    "Horizongakuen",                # unsuitable navigation
    "Icannotdraw",                  # unsuitable navigation
    "Ign",                          # unsuitable navigation
    "Illusionoftime",               # unsuitable navigation
    "InsideOuT",                    # unsuitable navigation
    "Introvert",                    # unsuitable navigation
    "Immortalfool",                 # unsuitable navigation
    "Insectia",                     # unsuitable navigation
    "Jackitandfriends",             # unsuitable navigation
    "Jenffersshow5",                # unsuitable navigation
    "Johnsonsuperior",              # unsuitable navigation
    "Joostdailies",                 # unsuitable navigation
    "Journ",                        # unsuitable navigation
    "JourneyToRaifina",             # unsuitable navigation
    "Junk",                         # unsuitable navigation
    "Kaze",                         # unsuitable navigation
    "Kmlssticks",                   # unsuitable navigation
    "KiLAiLO",                      # unsuitable navigation
    "Kingdomprettycure",            # unsuitable navigation
    "Kmfe",                         # unsuitable navigation
    "Lately",                       # unsuitable navigation
    "Legendoftheredphantom",        # unsuitable navigation
    "LiteBites",                    # unsuitable navigation
    "Littlephoenix",                # unsuitable navigation
    "Llwhoelterran",                # unsuitable navigation
    "Lomeathandhuilii",             # unsuitable navigation
    "Longandexcitingjourney",       # unsuitable navigation
    "Lovekillsslowly",              # unsuitable navigation
    "Mannack",                      # unsuitable navigation
    "Mars",                         # unsuitable navigation
    "MaskoftheAryans",              # unsuitable navigation
    "Megamaiden",                   # unsuitable navigation
    "Minebreakers",                 # unsuitable navigation
    "Minecraft2b2t",                # unsuitable navigation
    "Mischeif",                     # unsuitable navigation
    "Mitadakesaga",                 # unsuitable navigation
    "Mlpfib",                       # unsuitable navigation
    "Monsterloverdp",               # unsuitable navigation
    "MoonlightValley",              # unsuitable navigation
    "MurghComics",                  # unsuitable navigation
    "MVPL",                         # unsuitable navigation
    "Monobow",                      # unsuitable navigation
    "Mytvisevil",                   # unsuitable navigation
    "Natao",                        # unsuitable navigation
    "Nemution",                     # unsuitable navigation
    "NMG",                          # unsuitable navigation
    "Noche",                        # unsuitable navigation
    "Noprrkele",                    # unsuitable navigation
    "Nothingfits",                  # unsuitable navigation
    "Nothingfitsartblog",           # unsuitable navigation
    "NotYoursAmI",                  # unsuitable navigation
    "Oeight",                       # unsuitable navigation
    "Ofpf",                         # unsuitable navigation
    "Old2g",                        # unsuitable navigation
    "Outtolunch",                   # unsuitable navigation
    "Parisel313",                   # unsuitable navigation
    "Patchworkpeople",              # unsuitable navigation
    "Pewfell",                      # unsuitable navigation
    "Phoenix",                      # unsuitable navigation
    "Pi5a",                         # unsuitable navigation
    "Pokemonwarpers",               # unsuitable navigation
    "Princeofcats",                 # unsuitable navigation
    "Princess",                     # unsuitable navigation
    "ProjectX",                     # unsuitable navigation
    "ReadershipofOne",              # unsuitable navigation
    "Rebuildofgenericmanga",        # unsuitable navigation
    "Queenie",                      # unsuitable navigation
    "Rain",                         # unsuitable navigation
    "Ratantia",                     # unsuitable navigation
    "Rath",                         # unsuitable navigation
    "RawLatex",                     # unsuitable navigation
    "Remnants",                     # unsuitable navigation
    "Requiem",                      # unsuitable navigation
    "Retrofiyora",                  # unsuitable navigation
    "Rexfordavenue",                # unsuitable navigation
    "Rocr",                         # unsuitable navigation
    "Rosie",                        # unsuitable navigation
    "S",                            # unsuitable navigation
    "Sandgate",                     # unsuitable navigation
    "Shadowstories",                # unsuitable navigation
    "Sigh",                         # unsuitable navigation
    "Sleazyspacesage",              # unsuitable navigation
    "Slightlyeccentric",            # unsuitable navigation
    "Slightlyeccentricorigins",     # unsuitable navigation
    "Smbhax",                       # unsuitable navigation
    "SpiritSquire1",                # unsuitable navigation
    "Stampedegirl",                 # unsuitable navigation
    "Stardustthecat",               # unsuitable navigation
    "Sticklife",                    # unsuitable navigation
    "StickMisadventures",           # unsuitable navigation
    "Stinkoman",                    # unsuitable navigation
    "StrangerThanFiction",          # unsuitable navigation
    "SundaySmash",                  # unsuitable navigation
    "Superproultimatewrestling",    # unsuitable navigation
    "Sweetcheeriosandorangejuice",  # unsuitable navigation
    "Synapticisms",                 # unsuitable navigation
    "Talesofspoons",                # unsuitable navigation
    "Terwilligers",                 # unsuitable navigation
    "Thedevilshorn",                # unsuitable navigation
    "TheEntity",                    # unsuitable navigation
    "Theworldjumper",               # unsuitable navigation
    "TheWorldofUh",                 # unsuitable navigation
    "Thewriter13",                  # unsuitable navigation
    "ToC",                          # unsuitable navigation
    "TOGM",                         # unsuitable navigation
    "Townburgcity",                 # unsuitable navigation
    "Tuhinaloota",                  # unsuitable navigation
    "Tezzleandzeek",                # unsuitable navigation
    "TheDragonFistsofSmortySmythe", # unsuitable navigation
    "Theredeemers",                 # unsuitable navigation
    "Thestickmen",                  # unsuitable navigation
    "Thingsthatannoyme",            # unsuitable navigation
    "ThornsInOurSide",              # unsuitable navigation
    "Two_Rooks",                    # unsuitable navigation
    "Unichat",                      # unsuitable navigation
    "UFPA",                         # unsuitable navigation
    "V4",                           # unsuitable navigation
    "Verboten",                     # unsuitable navigation
    "Warg",                         # unsuitable navigation
    "Warrior27",                    # unsuitable navigation
    "Wastedpotential",              # unsuitable navigation
    "Wcf",                          # unsuitable navigation
    "Whoseline",                    # unsuitable navigation
    "WindRiders",                   # unsuitable navigation
    "WitchesTeaParty",              # unsuitable navigation
    "Woohooligan",                  # unsuitable navigation
    "Xenozone",                     # unsuitable navigation
    "XWingAlliance",                # unsuitable navigation
    "Yppcomic",                     # unsuitable navigation
    "Zeroeffort",                   # unsuitable navigation
]
def _match_info(matcher, label, data, start):
    """Return group 1 of the first matcher hit in data at or after start.

    When nothing matches, an error naming label is printed to stderr together
    with the first 300 characters after start, and None is returned.
    """
    # search(data, start) scans in place instead of copying data[start:]
    mo = matcher.search(data, start)
    if mo is None:
        print("ERROR matching %s:" % label, repr(data[start:start+300]), file=sys.stderr)
        return None
    return mo.group(1)


def handle_url(url, session, res):
    """Parse one search result page and add the comics found to res.

    url is the search result page to fetch, session is handed on to
    getPageContent(), and res is the result dictionary, updated in place
    with {name -> [comicurl, desc, num, genre, active]}.

    Comics are skipped when their name is in exclude_comics, when the name
    is already in res ignoring case, or when one of the info fields cannot
    be matched. Problems are reported on stderr. An IOError while fetching
    the page is reported and ends the call; nothing is returned.
    """
    print("Parsing", url, file=sys.stderr)
    try:
        # the second item of the result is not needed here
        data, _base_url = getPageContent(url, session)
    except IOError as msg:
        print("ERROR:", msg, file=sys.stderr)
        return
    for match in url_matcher.finditer(data):
        comicurl = match.group(1)
        # The comic name is the first host label of the comic URL. Drop the
        # scheme by its separator instead of a fixed 7 characters, so that
        # https:// URLs are handled the same way as http:// ones.
        host = comicurl.split('://', 1)[-1]
        name = format_name(host.split('.', 1)[0])
        if name in exclude_comics:
            continue
        if contains_case_insensitive(res, name):
            # we cannot handle two comics that only differ in case
            print("INFO: skipping possible duplicate", repr(name), file=sys.stderr)
            continue
        # all info fields are searched from the end of the comic link onwards
        end = match.end()
        # find description
        desc = _match_info(desc_matcher, "description", data, end)
        if desc is None:
            continue
        desc = format_description(desc)
        # find out how many images this comic has
        num = _match_info(num_matcher, "number", data, end)
        if num is None:
            continue
        num = int(num)
        # find genre
        genre = _match_info(genre_matcher, "genre", data, end)
        if genre is None:
            continue
        # find activity
        activity = _match_info(activity_matcher, "activity", data, end)
        if activity is None:
            continue
        active = activity.lower() == "active"
        res[name] = [comicurl, desc, num, genre, active]
    # res may already hold comics from earlier calls, so this only reports
    # the case where nothing has been collected at all so far
    if not res:
        print("ERROR:", "did not match any comics", file=sys.stderr)
def get_results():
    """Parse all search result pages and save the collected comic info.

    Every result page is fetched through one shared HTTP session and parsed
    with handle_url(); the accumulated dictionary is then written to
    json_file with save_result(). Takes no arguments and returns nothing.
    """
    # store info in a dictionary
    # {name -> [url, description, number of comics, genre, active flag]}
    res = {}
    baseUrl = 'http://comicfury.com/search.php?search=1&webcomics=Search+for+webcomics&query=&worder=5&asc=1&incvi=1&incse=1&incnu=1&incla=1&all_ge=1&all_st=1&all_la=1&page='
    # number of result pages to request (hard-coded; presumably the page
    # count of the search at the time of writing -- verify before a new run)
    pages = 382
    # use the session as a context manager so it is closed once all pages
    # have been handled, even if parsing raises
    with requests.Session() as session:
        for i in range(1, pages+1):
            url = baseUrl + str(i)
            handle_url(url, session, res)
    save_result(res, json_file)
def has_comic(name):
    """Tell whether a scraper for this comic name is already registered.

    The name is checked, ignoring case, under each of the module prefixes
    listed below against the names of all classes from get_scraperclasses().
    Returns True on the first hit, else False.
    """
    tail = "/%s" % name
    prefixes = (
        "Creators",
        "DrunkDuck",
        "GoComics",
        "KeenSpot",
        "SmackJeeves",
        "Arcamax",
    )
    candidates = [(prefix + tail).lower() for prefix in prefixes]
    return any(
        scraperclass.getName().lower() in candidates
        for scraperclass in get_scraperclasses()
    )
def print_results(args):
    """Append an add(...) line for each comic with enough comic strips.

    args holds exactly two items: the minimum number of comic strips
    (converted with int()) and the name of the output file, which is opened
    for appending as UTF-8. Comics from the saved result in json_file are
    handled in sorted order; those in exclude_comics or with fewer strips
    than the minimum are left out. Lines for comics that has_comic() already
    knows are written with a leading '#'.
    """
    threshold, outfile = args
    threshold = int(threshold)
    with codecs.open(outfile, 'a', 'utf-8') as out:
        for name, entry in sorted(load_result(json_file).items()):
            if name in exclude_comics:
                continue
            # genre and activity flag are stored but not used for the output
            url, desc, num, _genre, _active = entry
            if num < threshold:
                continue
            prefix = u'#' if has_comic(name) else u''
            line = u"%sadd(%r, %r, %r)\n" % (
                prefix, str(truncate_name(name)), str(url), desc
            )
            out.write(line)
2013-02-13 16:53:36 +00:00
if __name__ == '__main__':
    # With command line arguments (minimum strip count, output file) print
    # the saved results; without any, fetch and save the results.
    cli_args = sys.argv[1:]
    if cli_args:
        print_results(cli_args)
    else:
        get_results()