Update ComicFury comics. (+871, -245)
- Remove make_scraper magic
- Switch to HTML parser
- Update parsing of comic listing
parent 6727e9b559, commit 552f29e5fc
3 changed files with 4169 additions and 677 deletions
File diff suppressed because it is too large.
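The two diffs shown below touch a comic plugin module (which drops the defunct _DandyAndCompany scraper and switches the encoding header to UTF-8) and the ComicFury updater script; the suppressed file is presumably the regenerated JSON data the script writes. The heart of the change is the second bullet above: instead of scanning the search-result HTML with hand-written regular expressions, the script now builds an lxml document and walks it with CSS selectors. A minimal sketch of that parsing style, with the selectors taken from the diff but a hypothetical page URL and plain requests in place of dosage's getPageContent helper:

import requests
from lxml import html

def comics_on_page(url):
    """Yield (name, link) pairs for one ComicFury search-result page."""
    doc = html.document_fromstring(requests.get(url).text)
    doc.make_links_absolute(url)  # turn relative hrefs into absolute URLs
    for result in doc.cssselect('div.searchresult'):
        link = result.cssselect('h3 a')[0]
        yield link.text, link.attrib['href']

# Hypothetical usage:
# for name, link in comics_on_page('http://comicfury.com/search.php?...&page=1'):
#     print(name, link)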
@@ -1,6 +1,7 @@
-# -*- coding: iso-8859-1 -*-
+# -*- coding: utf-8 -*-
 # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
 # Copyright (C) 2012-2014 Bastian Kleineidam
+# Copyright (C) 2015-2016 Tobias Gruetzmacher
 
 from re import compile, escape
 
@@ -50,16 +51,6 @@ class Damonk(_BasicScraper):
     help = 'Index format: yyyymmdd'
 
 
-# XXX disallowed /search by robots.txt
-class _DandyAndCompany(_BasicScraper):
-    url = 'http://www.dandyandcompany.com/'
-    stripUrl = None
-    multipleImagesPerStrip = True
-    imageSearch = compile(tagre("a", "href", r'(http://\d+\.bp\.blogspot\.com/[^"]+)', after="imageanchor"))
-    prevSearch = compile(tagre("a", "href", r"([^']+)", quote="'", after="Older Posts"))
-    help = 'Index format: none'
-
-
 class DangerouslyChloe(_BasicScraper):
     url = 'http://www.dangerouslychloe.com/'
     stripUrl = url + 'strips-dc/%s'

@@ -1,253 +1,118 @@
 #!/usr/bin/env python
 # Copyright (C) 2013-2014 Bastian Kleineidam
+# Copyright (C) 2016 Tobias Gruetzmacher
 """
-Script to get arcamax comics and save the info in a JSON file for further processing.
+Script to get ComicFury comics and save the info in a JSON file for further
+processing.
 """
-from __future__ import print_function
+from __future__ import print_function, absolute_import
 import codecs
-import re
 import sys
 import os
 import requests
-sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
+from lxml import html
 
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))  # noqa
 from dosagelib.util import getPageContent
 from dosagelib.scraper import get_scraperclasses
-from scriptutil import contains_case_insensitive, save_result, load_result, truncate_name, format_name
+from scriptutil import (contains_case_insensitive, save_result, load_result,
+                        truncate_name, format_name)
 
+# Absolute minumum number of pages a comic may have (restrict search space)
+MIN_COMICS = 90
 
 json_file = __file__.replace(".py", ".json")
 
-url_matcher = re.compile(r'<h3><a href="([^"]+)">')
-num_matcher = re.compile(r'<b>Comics:</b> <span class="comicinfo">(\d+)</span>')
-genre_matcher = re.compile(r'<b>Genre:</b> <span class="comicinfo">([^<]+)</span>')
-activity_matcher = re.compile(r'<b>Activity status:</b> <span class="comicinfo">([^<]+)</span>')
-
 # names of comics to exclude
 exclude_comics = [
"1000", # unsuitable navigation
|
# unsuitable navigation
|
||||||
"12yearsofmissj", # unsuitable navigation
|
"AlfdisAndGunnora",
|
||||||
"3DGlasses", # unsuitable navigation
|
"AnAmericanNerdinAnimatedTokyo",
|
||||||
"30Days", # unsuitable navigation
|
"AngryAlien",
|
||||||
"6tsc", # unsuitable navigation
|
"BoozerAndStoner",
|
||||||
"Abyss", # unsuitable navigation
|
"Bonejangles",
|
||||||
"Acelestialstory", # unsuitable navigation
|
"ConradStory",
|
||||||
"Actdr", # unsuitable navigation
|
"Crossing",
|
||||||
"Aerosol", # unsuitable navigation
|
"ChristianHumberReloaded",
|
||||||
"Ahtiventures", # unsuitable navigation
|
"CorkandBlotto",
|
||||||
"Alienirony", # unsuitable navigation
|
"Democomix",
|
||||||
"Aloonaticstale", # unsuitable navigation
|
"ErraticBeatComics",
|
||||||
"Amity", # unsuitable navigation
|
"EnergyWielders",
|
||||||
"Angelguardian", # unsuitable navigation
|
"EvilBearorg",
|
||||||
"AngelguardianEspanol", # unsuitable navigation
|
"Fiascos",
|
||||||
"Angryalien", # unsuitable navigation
|
"FateoftheBlueStar",
|
||||||
"Animangitis", # unsuitable navigation
|
"FPK",
|
||||||
"Archininja", # unsuitable navigation
|
"Fanartgyle",
|
||||||
"Arveytoonz", # unsuitable navigation
|
"FrigginRandom",
|
||||||
"AsperitasAstraalia", # unsuitable navigation
|
"GoodbyeKitty",
|
||||||
"AttackoftheRobofemoids", # unsuitable navigation
|
"HighlyExperiMental",
|
||||||
"Auriga", # unsuitable navigation
|
"IfAndCanBeFlowers",
|
||||||
"Bedlam", # unsuitable navigation
|
"JournalismStory",
|
||||||
"BITCHSquad", # missing images
|
"JohnsonSuperior",
|
||||||
"Bidoof", # unsuitable navigation
|
"Keel",
|
||||||
"Blobworld", # unsuitable navigation
|
"JudgeDredBasset",
|
||||||
"BlockTales", # unsuitable navigation
|
"LomeathAndHuilii",
|
||||||
"Bobcomix", # unsuitable navigation
|
"MNPB",
|
||||||
"Bonejangles", # unsuitable navigation
|
"LucidsDream",
|
||||||
"BookOfLiesComic", # unsuitable navigation
|
"MadDog",
|
||||||
"BoozerandStoner", # unsuitable navigation
|
"Minebreakers",
|
||||||
"Boyaurus", # unsuitable navigation
|
"Moonlightvalley",
|
||||||
"Brainfood", # unsuitable navigation
|
"MyImmortalFool",
|
||||||
"Bromosworld", # unsuitable navigation
|
"NATO",
|
||||||
"BulletMythology", # unsuitable navigation
|
"NothingFits",
|
||||||
"Bunnysher", # page moved
|
"OptimisticFishermenandPessimisticFishermen",
|
||||||
"BUXY", # unsuitable navigation
|
"Old2G",
|
||||||
"CafeGruesome", # unsuitable navigation
|
"NothingFitsArtBlog",
|
||||||
"Castofmadness", # unsuitable navigation
|
"OutToLunchTheStingRayWhoreStory",
|
||||||
"Chanpuru", # unsuitable navigation
|
"Pandemonium",
|
||||||
"Christmaswithmaddog", # unsuitable navigation
|
"Pewfell",
|
||||||
"ChroniclesOfLillian", # unsuitable navigation
|
"ProjectX",
|
||||||
"Comicshortsmain", # unsuitable navigation
|
"Ratantia",
|
||||||
"Conrads", # unsuitable navigation
|
"RealLifeTrips",
|
||||||
"ConradTheCaterpillar", # unsuitable navigation
|
"Sandgate",
|
||||||
"ConsequencesOfChoice", # unsuitable navigation
|
"Secondpuberty",
|
||||||
"CoolYuleComics", # unsuitable navigation
|
"Seconds",
|
||||||
"Crossworldsnexus", # unsuitable navigation
|
"SlightlyEccentricOrigins",
|
||||||
"Colorforce", # unsuitable navigation
|
"StardusttheCat",
|
||||||
"Coolstorybro", # unsuitable navigation
|
"StrangerthanFiction",
|
||||||
"Crepusculars", # unsuitable navigation
|
"TalamakGreatAdventure",
|
||||||
"CtrlZ", # unsuitable navigation
|
"TheBattalion",
|
||||||
"DeadNight", # unsuitable navigation
|
"TheDailyProblem",
|
||||||
"Democomix", # unsuitable navigation
|
"TheMansionofE",
|
||||||
"Dinosaurkingdom", # unsuitable navigation
|
"ThePainter",
|
||||||
"Donutsforsharks", # unsuitable navigation
|
"TheSeekers",
|
||||||
"Dotcomic", # unsuitable navigation
|
"TheTrialsofKlahadoftheAbyss",
|
||||||
"Droned", # unsuitable navigation
|
"TheStickmen",
|
||||||
"Druids", # unsuitable navigation
|
"ThornsInOurSide",
|
||||||
"Effingukookoo", # unsuitable navigation
|
"TopHeavyVeryBustyPinUpsForAdults",
|
||||||
"Elijahandazuuclassic", # unsuitable navigation
|
"USBUnlimitedsimulatedbody",
|
||||||
"ErraticBeat", # unsuitable navigation
|
"TylerHumanRecycler",
|
||||||
"ErraticE", # unsuitable navigation
|
"UAF",
|
||||||
"EternalKnights", # unsuitable navigation
|
"WhenPigsFly",
|
||||||
"Evilbear", # unsuitable navigation
|
"YeOldeLegotimeTheatre",
|
||||||
"Ewmic", # unsuitable navigation
|
|
||||||
"Fannicklas", # unsuitable navigation
|
# no content
|
||||||
"Fateofthebluestar", # unsuitable navigation
|
"Angst",
|
||||||
"Fishbowl", # unsuitable navigation
|
|
||||||
"Foe", # unsuitable navigation
|
# images gone
|
||||||
"Foreignterritory", # unsuitable navigation
|
"BaseballCapsandTiaras",
|
||||||
"Freakingawfulpuns", # page is gone
|
"CROSSWORLDSNEXUS",
|
||||||
"Frigginrandom", # unsuitable navigation
|
"Fathead",
|
||||||
"Frostfire", # unsuitable navigation
|
"KevinZombie",
|
||||||
"Furnerdy", # unsuitable navigation
|
"KindergardenCrisIs",
|
||||||
"Fuzzylittleninjas", # unsuitable navigation
|
"NoSongsForTheDead",
|
||||||
"Garfieldminusjon", # unsuitable navigation
|
"RequiemShadowbornPariah",
|
||||||
"Gatito", # unsuitable navigation
|
"TezzleandZeek",
|
||||||
"Gbksayonara", # unsuitable navigation
|
|
||||||
"Gillimurphyorig", # unsuitable navigation
|
# broken HTML
|
||||||
"Gratz", # unsuitable navigation
|
"CrossingOver",
|
||||||
"Greygaroutopheavyartwork", # unsuitable navigation
|
|
||||||
"GrimReaperSchool", # unsuitable navigation
|
# unique html
|
||||||
"Goldrush", # unsuitable navigation
|
"IKilledtheHero",
|
||||||
"GRIND", # unsuitable navigation
|
"PowerofPower",
|
||||||
"Haywire", # unsuitable navigation
|
"Schizmatic",
|
||||||
"Hallodri", # unsuitable navigation
|
"WaketheSleepers",
|
||||||
"Harrysorehead", # unsuitable navigation
|
"WeightofEternity",
|
||||||
"HazSci", # unsuitable navigation
|
|
||||||
"Hellboundarchive", # unsuitable navigation
|
|
||||||
"Herecomesskeeter", # unsuitable navigation
|
|
||||||
"Highlyexperimental", # unsuitable navigation
|
|
||||||
"Holycowcomics", # unsuitable navigation
|
|
||||||
"Hourlykelly", # unsuitable navigation
|
|
||||||
"Houseescapeold", # unsuitable navigation
|
|
||||||
"Horizongakuen", # unsuitable navigation
|
|
||||||
"Icannotdraw", # unsuitable navigation
|
|
||||||
"Ign", # unsuitable navigation
|
|
||||||
"Illusionoftime", # unsuitable navigation
|
|
||||||
"InsideOuT", # unsuitable navigation
|
|
||||||
"Introvert", # unsuitable navigation
|
|
||||||
"Immortalfool", # unsuitable navigation
|
|
||||||
"Insectia", # unsuitable navigation
|
|
||||||
"Jackitandfriends", # unsuitable navigation
|
|
||||||
"Jenffersshow5", # unsuitable navigation
|
|
||||||
"Johnsonsuperior", # unsuitable navigation
|
|
||||||
"Joostdailies", # unsuitable navigation
|
|
||||||
"Journ", # unsuitable navigation
|
|
||||||
"JourneyToRaifina", # unsuitable navigation
|
|
||||||
"Junk", # unsuitable navigation
|
|
||||||
"Kaze", # unsuitable navigation
|
|
||||||
"Kmlssticks", # unsuitable navigation
|
|
||||||
"KiLAiLO", # unsuitable navigation
|
|
||||||
"Kingdomprettycure", # unsuitable navigation
|
|
||||||
"Kmfe", # unsuitable navigation
|
|
||||||
"Lately", # unsuitable navigation
|
|
||||||
"Legendoftheredphantom", # unsuitable navigation
|
|
||||||
"LiteBites", # unsuitable navigation
|
|
||||||
"Littlephoenix", # unsuitable navigation
|
|
||||||
"Llwhoelterran", # unsuitable navigation
|
|
||||||
"Lomeathandhuilii", # unsuitable navigation
|
|
||||||
"Longandexcitingjourney", # unsuitable navigation
|
|
||||||
"Lovekillsslowly", # unsuitable navigation
|
|
||||||
"Mannack", # unsuitable navigation
|
|
||||||
"Mars", # unsuitable navigation
|
|
||||||
"MaskoftheAryans", # unsuitable navigation
|
|
||||||
"Megamaiden", # unsuitable navigation
|
|
||||||
"Minebreakers", # unsuitable navigation
|
|
||||||
"Minecraft2b2t", # unsuitable navigation
|
|
||||||
"Mischeif", # unsuitable navigation
|
|
||||||
"Mitadakesaga", # unsuitable navigation
|
|
||||||
"Mlpfib", # unsuitable navigation
|
|
||||||
"Monsterloverdp", # unsuitable navigation
|
|
||||||
"MoonlightValley", # unsuitable navigation
|
|
||||||
"MurghComics", # unsuitable navigation
|
|
||||||
"MVPL", # unsuitable navigation
|
|
||||||
"Monobow", # unsuitable navigation
|
|
||||||
"Mytvisevil", # unsuitable navigation
|
|
||||||
"Natao", # unsuitable navigation
|
|
||||||
"Nemution", # unsuitable navigation
|
|
||||||
"NMG", # unsuitable navigation
|
|
||||||
"Noche", # unsuitable navigation
|
|
||||||
"Noprrkele", # unsuitable navigation
|
|
||||||
"Nothingfits", # unsuitable navigation
|
|
||||||
"Nothingfitsartblog", # unsuitable navigation
|
|
||||||
"NotYoursAmI", # unsuitable navigation
|
|
||||||
"Oeight", # unsuitable navigation
|
|
||||||
"Ofpf", # unsuitable navigation
|
|
||||||
"Old2g", # unsuitable navigation
|
|
||||||
"Outtolunch", # unsuitable navigation
|
|
||||||
"Parisel313", # unsuitable navigation
|
|
||||||
"Patchworkpeople", # unsuitable navigation
|
|
||||||
"Pewfell", # unsuitable navigation
|
|
||||||
"Phoenix", # unsuitable navigation
|
|
||||||
"Pi5a", # unsuitable navigation
|
|
||||||
"Pokemonwarpers", # unsuitable navigation
|
|
||||||
"Princeofcats", # unsuitable navigation
|
|
||||||
"Princess", # unsuitable navigation
|
|
||||||
"ProjectX", # unsuitable navigation
|
|
||||||
"ReadershipofOne", # unsuitable navigation
|
|
||||||
"Rebuildofgenericmanga", # unsuitable navigation
|
|
||||||
"Queenie", # unsuitable navigation
|
|
||||||
"Rain", # unsuitable navigation
|
|
||||||
"Ratantia", # unsuitable navigation
|
|
||||||
"Rath", # unsuitable navigation
|
|
||||||
"RawLatex", # unsuitable navigation
|
|
||||||
"Remnants", # unsuitable navigation
|
|
||||||
"Requiem", # unsuitable navigation
|
|
||||||
"Retrofiyora", # unsuitable navigation
|
|
||||||
"Rexfordavenue", # unsuitable navigation
|
|
||||||
"Rocr", # unsuitable navigation
|
|
||||||
"Rosie", # unsuitable navigation
|
|
||||||
"S", # unsuitable navigation
|
|
||||||
"Sandgate", # unsuitable navigation
|
|
||||||
"Shadowstories", # unsuitable navigation
|
|
||||||
"Sigh", # unsuitable navigation
|
|
||||||
"Sleazyspacesage", # unsuitable navigation
|
|
||||||
"Slightlyeccentric", # unsuitable navigation
|
|
||||||
"Slightlyeccentricorigins", # unsuitable navigation
|
|
||||||
"Smbhax", # unsuitable navigation
|
|
||||||
"SpiritSquire1", # unsuitable navigation
|
|
||||||
"Stampedegirl", # unsuitable navigation
|
|
||||||
"Stardustthecat", # unsuitable navigation
|
|
||||||
"Sticklife", # unsuitable navigation
|
|
||||||
"StickMisadventures", # unsuitable navigation
|
|
||||||
"Stinkoman", # unsuitable navigation
|
|
||||||
"StrangerThanFiction", # unsuitable navigation
|
|
||||||
"SundaySmash", # unsuitable navigation
|
|
||||||
"Superproultimatewrestling", # unsuitable navigation
|
|
||||||
"Sweetcheeriosandorangejuice", # unsuitable navigation
|
|
||||||
"Synapticisms", # unsuitable navigation
|
|
||||||
"Talesofspoons", # unsuitable navigation
|
|
||||||
"Terwilligers", # unsuitable navigation
|
|
||||||
"Thedevilshorn", # unsuitable navigation
|
|
||||||
"TheEntity", # unsuitable navigation
|
|
||||||
"Theworldjumper", # unsuitable navigation
|
|
||||||
"TheWorldofUh", # unsuitable navigation
|
|
||||||
"Thewriter13", # unsuitable navigation
|
|
||||||
"ToC", # unsuitable navigation
|
|
||||||
"TOGM", # unsuitable navigation
|
|
||||||
"Townburgcity", # unsuitable navigation
|
|
||||||
"Tuhinaloota", # unsuitable navigation
|
|
||||||
"Tezzleandzeek", # unsuitable navigation
|
|
||||||
"TheDragonFistsofSmortySmythe", # unsuitable navigation
|
|
||||||
"Theredeemers", # unsuitable navigation
|
|
||||||
"Thestickmen", # unsuitable navigation
|
|
||||||
"Thingsthatannoyme", # unsuitable navigation
|
|
||||||
"ThornsInOurSide", # unsuitable navigation
|
|
||||||
"Two_Rooks", # unsuitable navigation
|
|
||||||
"Unichat", # unsuitable navigation
|
|
||||||
"UFPA", # unsuitable navigation
|
|
||||||
"V4", # unsuitable navigation
|
|
||||||
"Verboten", # unsuitable navigation
|
|
||||||
"Warg", # unsuitable navigation
|
|
||||||
"Warrior27", # unsuitable navigation
|
|
||||||
"Wastedpotential", # unsuitable navigation
|
|
||||||
"Wcf", # unsuitable navigation
|
|
||||||
"Whoseline", # unsuitable navigation
|
|
||||||
"WindRiders", # unsuitable navigation
|
|
||||||
"WitchesTeaParty", # unsuitable navigation
|
|
||||||
"Woohooligan", # unsuitable navigation
|
|
||||||
"Xenozone", # unsuitable navigation
|
|
||||||
"XWingAlliance", # unsuitable navigation
|
|
||||||
"Yppcomic", # unsuitable navigation
|
|
||||||
"Zeroeffort", # unsuitable navigation
|
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
@@ -255,35 +120,32 @@ def handle_url(url, session, res):
     """Parse one search result page."""
     print("Parsing", url, file=sys.stderr)
     try:
-        data = getPageContent(url, session)
+        data = html.document_fromstring(getPageContent(url, session))
+        data.make_links_absolute(url)
     except IOError as msg:
         print("ERROR:", msg, file=sys.stderr)
         return
-    for match in url_matcher.finditer(data):
-        comicurl = match.group(1)
-        name = format_name(comicurl.split('.', 1)[0][7:])
-        if name in exclude_comics:
-            continue
+    num = 999
+    for comicdiv in data.cssselect('div.searchresult'):
+        comiclink = comicdiv.cssselect('h3 a')[0]
+        comicurl = comiclink.attrib['href']
+        name = format_name(comiclink.text)
         if contains_case_insensitive(res, name):
             # we cannot handle two comics that only differ in case
-            print("INFO: skipping possible duplicate", repr(name), file=sys.stderr)
+            print("INFO: skipping possible duplicate", repr(name),
+                  file=sys.stderr)
             continue
 
+        info = comicdiv.cssselect('span.comicinfo')
         # find out how many images this comic has
-        end = match.end()
-        mo = num_matcher.search(data[end:])
-        if not mo:
-            print("ERROR matching number:", repr(data[end:end+300]), file=sys.stderr)
-            continue
-        num = int(mo.group(1))
+        num = int(info[1].text.strip())
         # find activity
-        mo = activity_matcher.search(data[end:])
-        if not mo:
-            print("ERROR matching activity:", repr(data[end:end+300]), file=sys.stderr)
-            continue
-        active = mo.group(1).lower() == "active"
-        res[name] = [comicurl, num, active]
-    if not res:
-        print("ERROR:", "did not match any comics", file=sys.stderr)
+        active = info[6].text.strip().lower() == "active"
+        lang = info[7].text.strip().lower()
+        res[name] = [comicurl, num, active, lang]
+
+    return num
 
 
 def get_results():
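In the rewritten handle_url, each search result is one div.searchresult element and its span.comicinfo children carry the metadata; the code indexes that list directly (info[1] for the strip count, info[6] for the activity status, info[7] for the language). A self-contained sketch of the same extraction against invented markup: the real ComicFury result blocks contain more fields, but the span positions below mirror the indices the diff relies on.

from lxml import html

# Invented fragment; only the positions of the comicinfo spans matter here.
FRAGMENT = """
<div class="searchresult">
  <h3><a href="/example/">Example Comic</a></h3>
  <span class="comicinfo">slug</span>
  <span class="comicinfo">123</span>
  <span class="comicinfo">a</span><span class="comicinfo">b</span>
  <span class="comicinfo">c</span><span class="comicinfo">d</span>
  <span class="comicinfo">Active</span>
  <span class="comicinfo">English</span>
</div>
"""

div = html.fromstring(FRAGMENT)
info = div.cssselect('span.comicinfo')
num = int(info[1].text.strip())                    # number of strips
active = info[6].text.strip().lower() == "active"  # activity status
lang = info[7].text.strip().lower()                # language
print(num, active, lang)                           # 123 True english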
@@ -291,15 +153,21 @@ def get_results():
     # store info in a dictionary {name -> shortname}
     res = {}
     session = requests.Session()
-    baseUrl = 'http://comicfury.com/search.php?search=1&webcomics=Search+for+webcomics&query=&worder=5&asc=1&incvi=1&incse=1&incnu=1&incla=1&all_ge=1&all_st=1&all_la=1&page='
-    pages = 382
-    for i in range(1, pages+1):
-        url = baseUrl + str(i)
-        handle_url(url, session, res)
+    # Sort by page count, so we can abort when we get under some threshold.
+    baseUrl = ('http://comicfury.com/search.php?search=1&webcomics=1&query=' +
+               '&worder=1&asc=0&incvi=1&incse=1&incnu=1&incla=1&all_ge=1' +
+               '&all_st=1&all_la=1&page=%d')
+    last_count = 999
+    page = 1
+    print("Parsing search result pages...", file=sys.stderr)
+    while last_count >= MIN_COMICS:
+        last_count = handle_url(baseUrl % page, session, res)
+        page += 1
+        print(last_count, file=sys.stderr, end=" ")
     save_result(res, json_file)
 
 
-def has_comic(name):
+def find_dups(name):
     """Check if comic name already exists."""
     names = [
         ("Creators/%s" % name).lower(),
|
||||||
for scraperclass in get_scraperclasses():
|
for scraperclass in get_scraperclasses():
|
||||||
lname = scraperclass.getName().lower()
|
lname = scraperclass.getName().lower()
|
||||||
if lname in names:
|
if lname in names:
|
||||||
return True
|
return scraperclass.getName().lower()
|
||||||
return False
|
return None
|
||||||
|
|
||||||
|
|
||||||
def print_results(args):
|
def print_results(args):
|
||||||
"""Print all comics that have at least the given number of minimum comic strips."""
|
"""Print all comics that have at least the given number of minimum
|
||||||
|
comic strips."""
|
||||||
min_comics, filename = args
|
min_comics, filename = args
|
||||||
min_comics = int(min_comics)
|
min_comics = int(min_comics)
|
||||||
with codecs.open(filename, 'a', 'utf-8') as fp:
|
with codecs.open(filename, 'a', 'utf-8') as fp:
|
||||||
for name, entry in sorted(load_result(json_file).items()):
|
for name, entry in sorted(load_result(json_file).items()):
|
||||||
|
url, num, active, lang = entry
|
||||||
if name in exclude_comics:
|
if name in exclude_comics:
|
||||||
|
fp.write(u"# %s is excluded\n" % name)
|
||||||
continue
|
continue
|
||||||
url, num, active = entry
|
|
||||||
if num < min_comics:
|
if num < min_comics:
|
||||||
continue
|
continue
|
||||||
if has_comic(name):
|
dup = find_dups(name)
|
||||||
prefix = u'#'
|
if dup is not None:
|
||||||
|
fp.write(u"# %s has a duplicate in %s\n" % (name, dup))
|
||||||
else:
|
else:
|
||||||
prefix = u''
|
fp.write(u"class CF%s(_ComicFury):\n url = %r\n\n\n" % (
|
||||||
fp.write(u"%sadd(%r, %r)\n" % (
|
truncate_name(name), str(url)))
|
||||||
prefix, str(truncate_name(name)), str(url)
|
|
||||||
))
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
|
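This last hunk is where the "Remove make_scraper magic" bullet lands in the generator: print_results used to emit add(name, url) lines (make_scraper-style factory calls) into the generated module, and now writes one explicit class per comic derived from a shared _ComicFury scraper base. A runnable sketch of the two output styles; the comic name and URL are invented, and truncate_name is skipped:

name = 'ExampleComic'                      # invented name
url = 'http://examplecomic.webcomic.ws/'   # invented URL

old_line = u"add(%r, %r)\n" % (str(name), str(url))
new_block = u"class CF%s(_ComicFury):\n    url = %r\n\n\n" % (name, str(url))

print(old_line)   # add('ExampleComic', 'http://examplecomic.webcomic.ws/')
print(new_block)  # class CFExampleComic(_ComicFury):
                  #     url = 'http://examplecomic.webcomic.ws/'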