Update ComicFury comics. (+871, -245)

- Remove make_scraper magic
- Switch to HTML parser
- Update parsing of comic listing.
This commit is contained in:
Tobias Gruetzmacher 2016-03-17 00:44:06 +01:00
parent 6727e9b559
commit 552f29e5fc
3 changed files with 4169 additions and 677 deletions

File diff suppressed because it is too large Load diff

View file

@ -1,6 +1,7 @@
# -*- coding: iso-8859-1 -*- # -*- coding: utf-8 -*-
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam # Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2016 Tobias Gruetzmacher
from re import compile, escape from re import compile, escape
@ -50,16 +51,6 @@ class Damonk(_BasicScraper):
help = 'Index format: yyyymmdd' help = 'Index format: yyyymmdd'
# XXX disallowed /search by robots.txt
class _DandyAndCompany(_BasicScraper):
url = 'http://www.dandyandcompany.com/'
stripUrl = None
multipleImagesPerStrip = True
imageSearch = compile(tagre("a", "href", r'(http://\d+\.bp\.blogspot\.com/[^"]+)', after="imageanchor"))
prevSearch = compile(tagre("a", "href", r"([^']+)", quote="'", after="Older Posts"))
help = 'Index format: none'
class DangerouslyChloe(_BasicScraper): class DangerouslyChloe(_BasicScraper):
url = 'http://www.dangerouslychloe.com/' url = 'http://www.dangerouslychloe.com/'
stripUrl = url + 'strips-dc/%s' stripUrl = url + 'strips-dc/%s'

View file

@ -1,253 +1,118 @@
#!/usr/bin/env python #!/usr/bin/env python
# Copyright (C) 2013-2014 Bastian Kleineidam # Copyright (C) 2013-2014 Bastian Kleineidam
# Copyright (C) 2016 Tobias Gruetzmacher
""" """
Script to get arcamax comics and save the info in a JSON file for further processing. Script to get ComicFury comics and save the info in a JSON file for further
processing.
""" """
from __future__ import print_function from __future__ import print_function, absolute_import
import codecs import codecs
import re
import sys import sys
import os import os
import requests import requests
sys.path.append(os.path.join(os.path.dirname(__file__), "..")) from lxml import html
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) # noqa
from dosagelib.util import getPageContent from dosagelib.util import getPageContent
from dosagelib.scraper import get_scraperclasses from dosagelib.scraper import get_scraperclasses
from scriptutil import contains_case_insensitive, save_result, load_result, truncate_name, format_name from scriptutil import (contains_case_insensitive, save_result, load_result,
truncate_name, format_name)
# Absolute minimum number of pages a comic may have (restrict search space)
MIN_COMICS = 90
json_file = __file__.replace(".py", ".json") json_file = __file__.replace(".py", ".json")
url_matcher = re.compile(r'<h3><a href="([^"]+)">')
num_matcher = re.compile(r'<b>Comics:</b> <span class="comicinfo">(\d+)</span>')
genre_matcher = re.compile(r'<b>Genre:</b> <span class="comicinfo">([^<]+)</span>')
activity_matcher = re.compile(r'<b>Activity status:</b> <span class="comicinfo">([^<]+)</span>')
# names of comics to exclude # names of comics to exclude
exclude_comics = [ exclude_comics = [
"1000", # unsuitable navigation # unsuitable navigation
"12yearsofmissj", # unsuitable navigation "AlfdisAndGunnora",
"3DGlasses", # unsuitable navigation "AnAmericanNerdinAnimatedTokyo",
"30Days", # unsuitable navigation "AngryAlien",
"6tsc", # unsuitable navigation "BoozerAndStoner",
"Abyss", # unsuitable navigation "Bonejangles",
"Acelestialstory", # unsuitable navigation "ConradStory",
"Actdr", # unsuitable navigation "Crossing",
"Aerosol", # unsuitable navigation "ChristianHumberReloaded",
"Ahtiventures", # unsuitable navigation "CorkandBlotto",
"Alienirony", # unsuitable navigation "Democomix",
"Aloonaticstale", # unsuitable navigation "ErraticBeatComics",
"Amity", # unsuitable navigation "EnergyWielders",
"Angelguardian", # unsuitable navigation "EvilBearorg",
"AngelguardianEspanol", # unsuitable navigation "Fiascos",
"Angryalien", # unsuitable navigation "FateoftheBlueStar",
"Animangitis", # unsuitable navigation "FPK",
"Archininja", # unsuitable navigation "Fanartgyle",
"Arveytoonz", # unsuitable navigation "FrigginRandom",
"AsperitasAstraalia", # unsuitable navigation "GoodbyeKitty",
"AttackoftheRobofemoids", # unsuitable navigation "HighlyExperiMental",
"Auriga", # unsuitable navigation "IfAndCanBeFlowers",
"Bedlam", # unsuitable navigation "JournalismStory",
"BITCHSquad", # missing images "JohnsonSuperior",
"Bidoof", # unsuitable navigation "Keel",
"Blobworld", # unsuitable navigation "JudgeDredBasset",
"BlockTales", # unsuitable navigation "LomeathAndHuilii",
"Bobcomix", # unsuitable navigation "MNPB",
"Bonejangles", # unsuitable navigation "LucidsDream",
"BookOfLiesComic", # unsuitable navigation "MadDog",
"BoozerandStoner", # unsuitable navigation "Minebreakers",
"Boyaurus", # unsuitable navigation "Moonlightvalley",
"Brainfood", # unsuitable navigation "MyImmortalFool",
"Bromosworld", # unsuitable navigation "NATO",
"BulletMythology", # unsuitable navigation "NothingFits",
"Bunnysher", # page moved "OptimisticFishermenandPessimisticFishermen",
"BUXY", # unsuitable navigation "Old2G",
"CafeGruesome", # unsuitable navigation "NothingFitsArtBlog",
"Castofmadness", # unsuitable navigation "OutToLunchTheStingRayWhoreStory",
"Chanpuru", # unsuitable navigation "Pandemonium",
"Christmaswithmaddog", # unsuitable navigation "Pewfell",
"ChroniclesOfLillian", # unsuitable navigation "ProjectX",
"Comicshortsmain", # unsuitable navigation "Ratantia",
"Conrads", # unsuitable navigation "RealLifeTrips",
"ConradTheCaterpillar", # unsuitable navigation "Sandgate",
"ConsequencesOfChoice", # unsuitable navigation "Secondpuberty",
"CoolYuleComics", # unsuitable navigation "Seconds",
"Crossworldsnexus", # unsuitable navigation "SlightlyEccentricOrigins",
"Colorforce", # unsuitable navigation "StardusttheCat",
"Coolstorybro", # unsuitable navigation "StrangerthanFiction",
"Crepusculars", # unsuitable navigation "TalamakGreatAdventure",
"CtrlZ", # unsuitable navigation "TheBattalion",
"DeadNight", # unsuitable navigation "TheDailyProblem",
"Democomix", # unsuitable navigation "TheMansionofE",
"Dinosaurkingdom", # unsuitable navigation "ThePainter",
"Donutsforsharks", # unsuitable navigation "TheSeekers",
"Dotcomic", # unsuitable navigation "TheTrialsofKlahadoftheAbyss",
"Droned", # unsuitable navigation "TheStickmen",
"Druids", # unsuitable navigation "ThornsInOurSide",
"Effingukookoo", # unsuitable navigation "TopHeavyVeryBustyPinUpsForAdults",
"Elijahandazuuclassic", # unsuitable navigation "USBUnlimitedsimulatedbody",
"ErraticBeat", # unsuitable navigation "TylerHumanRecycler",
"ErraticE", # unsuitable navigation "UAF",
"EternalKnights", # unsuitable navigation "WhenPigsFly",
"Evilbear", # unsuitable navigation "YeOldeLegotimeTheatre",
"Ewmic", # unsuitable navigation
"Fannicklas", # unsuitable navigation # no content
"Fateofthebluestar", # unsuitable navigation "Angst",
"Fishbowl", # unsuitable navigation
"Foe", # unsuitable navigation # images gone
"Foreignterritory", # unsuitable navigation "BaseballCapsandTiaras",
"Freakingawfulpuns", # page is gone "CROSSWORLDSNEXUS",
"Frigginrandom", # unsuitable navigation "Fathead",
"Frostfire", # unsuitable navigation "KevinZombie",
"Furnerdy", # unsuitable navigation "KindergardenCrisIs",
"Fuzzylittleninjas", # unsuitable navigation "NoSongsForTheDead",
"Garfieldminusjon", # unsuitable navigation "RequiemShadowbornPariah",
"Gatito", # unsuitable navigation "TezzleandZeek",
"Gbksayonara", # unsuitable navigation
"Gillimurphyorig", # unsuitable navigation # broken HTML
"Gratz", # unsuitable navigation "CrossingOver",
"Greygaroutopheavyartwork", # unsuitable navigation
"GrimReaperSchool", # unsuitable navigation # unique html
"Goldrush", # unsuitable navigation "IKilledtheHero",
"GRIND", # unsuitable navigation "PowerofPower",
"Haywire", # unsuitable navigation "Schizmatic",
"Hallodri", # unsuitable navigation "WaketheSleepers",
"Harrysorehead", # unsuitable navigation "WeightofEternity",
"HazSci", # unsuitable navigation
"Hellboundarchive", # unsuitable navigation
"Herecomesskeeter", # unsuitable navigation
"Highlyexperimental", # unsuitable navigation
"Holycowcomics", # unsuitable navigation
"Hourlykelly", # unsuitable navigation
"Houseescapeold", # unsuitable navigation
"Horizongakuen", # unsuitable navigation
"Icannotdraw", # unsuitable navigation
"Ign", # unsuitable navigation
"Illusionoftime", # unsuitable navigation
"InsideOuT", # unsuitable navigation
"Introvert", # unsuitable navigation
"Immortalfool", # unsuitable navigation
"Insectia", # unsuitable navigation
"Jackitandfriends", # unsuitable navigation
"Jenffersshow5", # unsuitable navigation
"Johnsonsuperior", # unsuitable navigation
"Joostdailies", # unsuitable navigation
"Journ", # unsuitable navigation
"JourneyToRaifina", # unsuitable navigation
"Junk", # unsuitable navigation
"Kaze", # unsuitable navigation
"Kmlssticks", # unsuitable navigation
"KiLAiLO", # unsuitable navigation
"Kingdomprettycure", # unsuitable navigation
"Kmfe", # unsuitable navigation
"Lately", # unsuitable navigation
"Legendoftheredphantom", # unsuitable navigation
"LiteBites", # unsuitable navigation
"Littlephoenix", # unsuitable navigation
"Llwhoelterran", # unsuitable navigation
"Lomeathandhuilii", # unsuitable navigation
"Longandexcitingjourney", # unsuitable navigation
"Lovekillsslowly", # unsuitable navigation
"Mannack", # unsuitable navigation
"Mars", # unsuitable navigation
"MaskoftheAryans", # unsuitable navigation
"Megamaiden", # unsuitable navigation
"Minebreakers", # unsuitable navigation
"Minecraft2b2t", # unsuitable navigation
"Mischeif", # unsuitable navigation
"Mitadakesaga", # unsuitable navigation
"Mlpfib", # unsuitable navigation
"Monsterloverdp", # unsuitable navigation
"MoonlightValley", # unsuitable navigation
"MurghComics", # unsuitable navigation
"MVPL", # unsuitable navigation
"Monobow", # unsuitable navigation
"Mytvisevil", # unsuitable navigation
"Natao", # unsuitable navigation
"Nemution", # unsuitable navigation
"NMG", # unsuitable navigation
"Noche", # unsuitable navigation
"Noprrkele", # unsuitable navigation
"Nothingfits", # unsuitable navigation
"Nothingfitsartblog", # unsuitable navigation
"NotYoursAmI", # unsuitable navigation
"Oeight", # unsuitable navigation
"Ofpf", # unsuitable navigation
"Old2g", # unsuitable navigation
"Outtolunch", # unsuitable navigation
"Parisel313", # unsuitable navigation
"Patchworkpeople", # unsuitable navigation
"Pewfell", # unsuitable navigation
"Phoenix", # unsuitable navigation
"Pi5a", # unsuitable navigation
"Pokemonwarpers", # unsuitable navigation
"Princeofcats", # unsuitable navigation
"Princess", # unsuitable navigation
"ProjectX", # unsuitable navigation
"ReadershipofOne", # unsuitable navigation
"Rebuildofgenericmanga", # unsuitable navigation
"Queenie", # unsuitable navigation
"Rain", # unsuitable navigation
"Ratantia", # unsuitable navigation
"Rath", # unsuitable navigation
"RawLatex", # unsuitable navigation
"Remnants", # unsuitable navigation
"Requiem", # unsuitable navigation
"Retrofiyora", # unsuitable navigation
"Rexfordavenue", # unsuitable navigation
"Rocr", # unsuitable navigation
"Rosie", # unsuitable navigation
"S", # unsuitable navigation
"Sandgate", # unsuitable navigation
"Shadowstories", # unsuitable navigation
"Sigh", # unsuitable navigation
"Sleazyspacesage", # unsuitable navigation
"Slightlyeccentric", # unsuitable navigation
"Slightlyeccentricorigins", # unsuitable navigation
"Smbhax", # unsuitable navigation
"SpiritSquire1", # unsuitable navigation
"Stampedegirl", # unsuitable navigation
"Stardustthecat", # unsuitable navigation
"Sticklife", # unsuitable navigation
"StickMisadventures", # unsuitable navigation
"Stinkoman", # unsuitable navigation
"StrangerThanFiction", # unsuitable navigation
"SundaySmash", # unsuitable navigation
"Superproultimatewrestling", # unsuitable navigation
"Sweetcheeriosandorangejuice", # unsuitable navigation
"Synapticisms", # unsuitable navigation
"Talesofspoons", # unsuitable navigation
"Terwilligers", # unsuitable navigation
"Thedevilshorn", # unsuitable navigation
"TheEntity", # unsuitable navigation
"Theworldjumper", # unsuitable navigation
"TheWorldofUh", # unsuitable navigation
"Thewriter13", # unsuitable navigation
"ToC", # unsuitable navigation
"TOGM", # unsuitable navigation
"Townburgcity", # unsuitable navigation
"Tuhinaloota", # unsuitable navigation
"Tezzleandzeek", # unsuitable navigation
"TheDragonFistsofSmortySmythe", # unsuitable navigation
"Theredeemers", # unsuitable navigation
"Thestickmen", # unsuitable navigation
"Thingsthatannoyme", # unsuitable navigation
"ThornsInOurSide", # unsuitable navigation
"Two_Rooks", # unsuitable navigation
"Unichat", # unsuitable navigation
"UFPA", # unsuitable navigation
"V4", # unsuitable navigation
"Verboten", # unsuitable navigation
"Warg", # unsuitable navigation
"Warrior27", # unsuitable navigation
"Wastedpotential", # unsuitable navigation
"Wcf", # unsuitable navigation
"Whoseline", # unsuitable navigation
"WindRiders", # unsuitable navigation
"WitchesTeaParty", # unsuitable navigation
"Woohooligan", # unsuitable navigation
"Xenozone", # unsuitable navigation
"XWingAlliance", # unsuitable navigation
"Yppcomic", # unsuitable navigation
"Zeroeffort", # unsuitable navigation
] ]
@ -255,35 +120,32 @@ def handle_url(url, session, res):
"""Parse one search result page.""" """Parse one search result page."""
print("Parsing", url, file=sys.stderr) print("Parsing", url, file=sys.stderr)
try: try:
data = getPageContent(url, session) data = html.document_fromstring(getPageContent(url, session))
data.make_links_absolute(url)
except IOError as msg: except IOError as msg:
print("ERROR:", msg, file=sys.stderr) print("ERROR:", msg, file=sys.stderr)
return return
for match in url_matcher.finditer(data):
comicurl = match.group(1) num = 999
name = format_name(comicurl.split('.', 1)[0][7:]) for comicdiv in data.cssselect('div.searchresult'):
if name in exclude_comics: comiclink = comicdiv.cssselect('h3 a')[0]
continue comicurl = comiclink.attrib['href']
name = format_name(comiclink.text)
if contains_case_insensitive(res, name): if contains_case_insensitive(res, name):
# we cannot handle two comics that only differ in case # we cannot handle two comics that only differ in case
print("INFO: skipping possible duplicate", repr(name), file=sys.stderr) print("INFO: skipping possible duplicate", repr(name),
file=sys.stderr)
continue continue
info = comicdiv.cssselect('span.comicinfo')
# find out how many images this comic has # find out how many images this comic has
end = match.end() num = int(info[1].text.strip())
mo = num_matcher.search(data[end:])
if not mo:
print("ERROR matching number:", repr(data[end:end+300]), file=sys.stderr)
continue
num = int(mo.group(1))
# find activity # find activity
mo = activity_matcher.search(data[end:]) active = info[6].text.strip().lower() == "active"
if not mo: lang = info[7].text.strip().lower()
print("ERROR matching activity:", repr(data[end:end+300]), file=sys.stderr) res[name] = [comicurl, num, active, lang]
continue
active = mo.group(1).lower() == "active" return num
res[name] = [comicurl, num, active]
if not res:
print("ERROR:", "did not match any comics", file=sys.stderr)
def get_results(): def get_results():
@ -291,15 +153,21 @@ def get_results():
# store info in a dictionary {name -> shortname} # store info in a dictionary {name -> shortname}
res = {} res = {}
session = requests.Session() session = requests.Session()
baseUrl = 'http://comicfury.com/search.php?search=1&webcomics=Search+for+webcomics&query=&worder=5&asc=1&incvi=1&incse=1&incnu=1&incla=1&all_ge=1&all_st=1&all_la=1&page=' # Sort by page count, so we can abort when we get under some threshold.
pages = 382 baseUrl = ('http://comicfury.com/search.php?search=1&webcomics=1&query=' +
for i in range(1, pages+1): '&worder=1&asc=0&incvi=1&incse=1&incnu=1&incla=1&all_ge=1' +
url = baseUrl + str(i) '&all_st=1&all_la=1&page=%d')
handle_url(url, session, res) last_count = 999
page = 1
print("Parsing search result pages...", file=sys.stderr)
while last_count >= MIN_COMICS:
last_count = handle_url(baseUrl % page, session, res)
page += 1
print(last_count, file=sys.stderr, end=" ")
save_result(res, json_file) save_result(res, json_file)
def has_comic(name): def find_dups(name):
"""Check if comic name already exists.""" """Check if comic name already exists."""
names = [ names = [
("Creators/%s" % name).lower(), ("Creators/%s" % name).lower(),
@ -312,28 +180,29 @@ def has_comic(name):
for scraperclass in get_scraperclasses(): for scraperclass in get_scraperclasses():
lname = scraperclass.getName().lower() lname = scraperclass.getName().lower()
if lname in names: if lname in names:
return True return scraperclass.getName().lower()
return False return None
def print_results(args): def print_results(args):
"""Print all comics that have at least the given number of minimum comic strips.""" """Print all comics that have at least the given number of minimum
comic strips."""
min_comics, filename = args min_comics, filename = args
min_comics = int(min_comics) min_comics = int(min_comics)
with codecs.open(filename, 'a', 'utf-8') as fp: with codecs.open(filename, 'a', 'utf-8') as fp:
for name, entry in sorted(load_result(json_file).items()): for name, entry in sorted(load_result(json_file).items()):
url, num, active, lang = entry
if name in exclude_comics: if name in exclude_comics:
fp.write(u"# %s is excluded\n" % name)
continue continue
url, num, active = entry
if num < min_comics: if num < min_comics:
continue continue
if has_comic(name): dup = find_dups(name)
prefix = u'#' if dup is not None:
fp.write(u"# %s has a duplicate in %s\n" % (name, dup))
else: else:
prefix = u'' fp.write(u"class CF%s(_ComicFury):\n url = %r\n\n\n" % (
fp.write(u"%sadd(%r, %r)\n" % ( truncate_name(name), str(url)))
prefix, str(truncate_name(name)), str(url)
))
if __name__ == '__main__': if __name__ == '__main__':