Clean up ComicGenesis
This commit is contained in:
parent
c2c699a1d5
commit
e6f18a2027
3 changed files with 23 additions and 500 deletions
|
@ -1,37 +1,26 @@
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
|
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
|
||||||
# Copyright (C) 2012-2014 Bastian Kleineidam
|
# Copyright (C) 2012-2014 Bastian Kleineidam
|
||||||
# Copyright (C) 2015-2016 Tobias Gruetzmacher
|
# Copyright (C) 2015-2017 Tobias Gruetzmacher
|
||||||
|
|
||||||
from __future__ import absolute_import, division, print_function
|
from __future__ import absolute_import, division, print_function
|
||||||
|
|
||||||
from re import compile
|
from ..scraper import _ParserScraper
|
||||||
|
|
||||||
from ..scraper import _BasicScraper
|
|
||||||
from ..util import tagre
|
|
||||||
|
|
||||||
# Comicgenesis has a lot of comics, but most of them are disallowed by
|
# Comicgenesis has a lot of comics, but most of them are disallowed by
|
||||||
# robots.txt
|
# robots.txt
|
||||||
|
|
||||||
|
|
||||||
class ComicGenesis(_BasicScraper):
|
class ComicGenesis(_ParserScraper):
|
||||||
imageSearch = compile(tagre("img", "src", r'([^"]*/comics/[^"]+)'))
|
|
||||||
prevSearch = compile(tagre("a", "href", r'([^"]*/d/\d{8}\.html)') +
|
|
||||||
'(?:Previous comic' + '|' +
|
|
||||||
tagre("img", "alt", "Previous comic") + '|' +
|
|
||||||
tagre("img", "src", "images/back\.gif") +
|
|
||||||
')')
|
|
||||||
multipleImagesPerStrip = True
|
multipleImagesPerStrip = True
|
||||||
|
imageSearch = '//img[contains(@src, "/comics/")]'
|
||||||
|
prevSearch = (
|
||||||
|
'//a[img/@alt="Previous comic"]',
|
||||||
|
'//a[text()="Previous comic"]',
|
||||||
|
)
|
||||||
help = 'Index format: yyyymmdd'
|
help = 'Index format: yyyymmdd'
|
||||||
|
|
||||||
def link_modifier(self, fromurl, tourl):
|
def __init__(self, name, sub=None, last=None, baseUrl=None, lang=None):
|
||||||
return tourl.replace(
|
|
||||||
"keenspace.com", "comicgenesis.com").replace(
|
|
||||||
"keenspot.com", "comicgenesis.com").replace(
|
|
||||||
"toonspace.com", "comicgenesis.com").replace(
|
|
||||||
"comicgen.com", "comicgenesis.com")
|
|
||||||
|
|
||||||
def __init__(self, name, sub=None, last=None, baseUrl=None):
|
|
||||||
super(ComicGenesis, self).__init__('ComicGenesis/' + name)
|
super(ComicGenesis, self).__init__('ComicGenesis/' + name)
|
||||||
|
|
||||||
if sub:
|
if sub:
|
||||||
|
@ -44,12 +33,12 @@ class ComicGenesis(_BasicScraper):
|
||||||
else:
|
else:
|
||||||
self.url = baseUrl
|
self.url = baseUrl
|
||||||
|
|
||||||
|
if lang:
|
||||||
|
self.lang = lang
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def getmodules(cls):
|
def getmodules(cls):
|
||||||
return [
|
return (
|
||||||
# do not edit anything below since these entries are generated from
|
|
||||||
# scripts/update_plugins.sh
|
|
||||||
# START AUTOUPDATE
|
|
||||||
cls('AAAAA', 'aaaaa'),
|
cls('AAAAA', 'aaaaa'),
|
||||||
cls('AdventuresofKiltman', 'kiltman'),
|
cls('AdventuresofKiltman', 'kiltman'),
|
||||||
cls('AmorModerno', 'amormoderno'),
|
cls('AmorModerno', 'amormoderno'),
|
||||||
|
@ -61,9 +50,12 @@ class ComicGenesis(_BasicScraper):
|
||||||
cls('BendyStrawVampires', 'bsvampires'),
|
cls('BendyStrawVampires', 'bsvampires'),
|
||||||
cls('BlindSight', 'blindsight'),
|
cls('BlindSight', 'blindsight'),
|
||||||
cls('BreakingtheDoldrum', 'breakingthedoldrum'),
|
cls('BreakingtheDoldrum', 'breakingthedoldrum'),
|
||||||
|
cls('BrotherSwan', 'warlordofnoodles'),
|
||||||
cls('Candi', baseUrl='http://candicomics.com/'),
|
cls('Candi', baseUrl='http://candicomics.com/'),
|
||||||
|
cls('Cerintha', 'cerintha'),
|
||||||
cls('CorporateLife', 'corporatelife'),
|
cls('CorporateLife', 'corporatelife'),
|
||||||
cls('DarkWelkin', 'darkwelkin'),
|
cls('DarkWelkin', 'darkwelkin'),
|
||||||
|
cls('DeepBlue', 'gjbivin', last='20131109'),
|
||||||
cls('DemonEater', 'demoneater'),
|
cls('DemonEater', 'demoneater'),
|
||||||
cls('DoodleDiaries', 'doodlediaries'),
|
cls('DoodleDiaries', 'doodlediaries'),
|
||||||
cls('DormSweetDorm', 'dormsweetdorm'),
|
cls('DormSweetDorm', 'dormsweetdorm'),
|
||||||
|
@ -78,7 +70,6 @@ class ComicGenesis(_BasicScraper):
|
||||||
cls('Flounderville', 'flounderville'),
|
cls('Flounderville', 'flounderville'),
|
||||||
cls('GEM', 'keltzy'),
|
cls('GEM', 'keltzy'),
|
||||||
cls('Gonefor300days', 'g4300d'),
|
cls('Gonefor300days', 'g4300d'),
|
||||||
cls('IBlameDanny', 'vileterror'),
|
|
||||||
cls('ImpendingDoom', 'impending'),
|
cls('ImpendingDoom', 'impending'),
|
||||||
cls('InANutshell', 'nutshellcomics'),
|
cls('InANutshell', 'nutshellcomics'),
|
||||||
cls('KernyMantisComics', 'kernymantis'),
|
cls('KernyMantisComics', 'kernymantis'),
|
||||||
|
@ -91,12 +82,14 @@ class ComicGenesis(_BasicScraper):
|
||||||
cls('LumiasKingdom', 'lumia'),
|
cls('LumiasKingdom', 'lumia'),
|
||||||
cls('Majestic7', 'majestic7'),
|
cls('Majestic7', 'majestic7'),
|
||||||
cls('MaximumWhimsy', 'maximumwhimsy'),
|
cls('MaximumWhimsy', 'maximumwhimsy'),
|
||||||
cls('MenschunsererZeitGerman', 'muz'),
|
cls('MenschUnsererZeitGerman', 'muz', lang='de', last='20090630'),
|
||||||
|
cls('MenschUnsererZeit', 'rabe', last='20090630'),
|
||||||
cls('MoonCrest24', 'mooncrest', last='20121117'),
|
cls('MoonCrest24', 'mooncrest', last='20121117'),
|
||||||
cls('Mushian', 'tentoumushi'),
|
cls('Mushian', 'tentoumushi'),
|
||||||
cls('NightwolfCentral', 'nightwolfcentral'),
|
cls('NightwolfCentral', 'nightwolfcentral'),
|
||||||
cls('NoTimeForLife', 'randyraven'),
|
|
||||||
cls('NoneMoreComic', 'nonemore'),
|
cls('NoneMoreComic', 'nonemore'),
|
||||||
|
cls('NoTimeForLife', 'randyraven', last='20100510'),
|
||||||
|
cls('OcculTango', 'occultango'),
|
||||||
cls('ODCKS', 'odcks'),
|
cls('ODCKS', 'odcks'),
|
||||||
cls('OfDoom', 'ofdoom'),
|
cls('OfDoom', 'ofdoom'),
|
||||||
cls('OpportunityofaLifetime', 'carpathia'),
|
cls('OpportunityofaLifetime', 'carpathia'),
|
||||||
|
@ -119,12 +112,11 @@ class ComicGenesis(_BasicScraper):
|
||||||
cls('TheAdventuresofVindibuddSuperheroInTraining', 'vindibudd', last='20070720'),
|
cls('TheAdventuresofVindibuddSuperheroInTraining', 'vindibudd', last='20070720'),
|
||||||
cls('TheEasyBreather', 'easybreather'),
|
cls('TheEasyBreather', 'easybreather'),
|
||||||
cls('TheMisadventuresofOkk', 'okk'),
|
cls('TheMisadventuresofOkk', 'okk'),
|
||||||
cls('ThePath', 'thepath'),
|
cls('ThePath', 'thepath', '20081226'),
|
||||||
cls('TheTalesofKalduras', 'kalduras'),
|
cls('TheTalesofKalduras', 'kalduras'),
|
||||||
cls('Unconventional', 'unconventional'),
|
cls('Unconventional', 'unconventional'),
|
||||||
cls('WarMageNC17', 'warmage'),
|
cls('WarMageNC17', 'warmage'),
|
||||||
cls('WebcomicTheWebcomicWebcomicWebcomicWebcomic', 'dannormnsanidey'),
|
cls('WebcomicTheWebcomicWebcomicWebcomicWebcomic', 'dannormnsanidey'),
|
||||||
cls('WhatYouDontSee', 'phantomlady4'),
|
cls('WhatYouDontSee', 'phantomlady4'),
|
||||||
cls('Wierdman', 'asa'),
|
cls('Wierdman', 'asa'),
|
||||||
# END AUTOUPDATE
|
)
|
||||||
]
|
|
||||||
|
|
|
@ -191,6 +191,7 @@ class Removed(Scraper):
|
||||||
cls('ComicFury/Wowwithatwistdamaclesandkejallcomic'),
|
cls('ComicFury/Wowwithatwistdamaclesandkejallcomic'),
|
||||||
cls('ComicFury/YouAreNowEnteringAshburg'),
|
cls('ComicFury/YouAreNowEnteringAshburg'),
|
||||||
cls('ComicGenesis/CryHavoc'),
|
cls('ComicGenesis/CryHavoc'),
|
||||||
|
cls('ComicGenesis/IBlameDanny'),
|
||||||
cls('ComicGenesis/SueosdelSur'),
|
cls('ComicGenesis/SueosdelSur'),
|
||||||
cls('Commissioned'),
|
cls('Commissioned'),
|
||||||
cls('CowboyJedi', 'brk'),
|
cls('CowboyJedi', 'brk'),
|
||||||
|
|
|
@ -1,470 +0,0 @@
|
||||||
#!/usr/bin/env python
|
|
||||||
# -*- coding: utf-8 -*-
|
|
||||||
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
|
|
||||||
# Copyright (C) 2012-2014 Bastian Kleineidam
|
|
||||||
# Copyright (C) 2015-2016 Tobias Gruetzmacher
|
|
||||||
"""
|
|
||||||
Script to get a list of ComicGenesis comics and save the info in a
|
|
||||||
JSON file for further processing.
|
|
||||||
"""
|
|
||||||
from __future__ import absolute_import, division, print_function
|
|
||||||
|
|
||||||
import codecs
|
|
||||||
import re
|
|
||||||
import sys
|
|
||||||
import os
|
|
||||||
|
|
||||||
import requests
|
|
||||||
|
|
||||||
sys.path.append(os.path.join(os.path.dirname(__file__), "..")) # noqa
|
|
||||||
from dosagelib.util import get_page, tagre, check_robotstxt
|
|
||||||
from dosagelib.scraper import get_scrapers
|
|
||||||
from scriptutil import (contains_case_insensitive, save_result, load_result,
|
|
||||||
truncate_name, format_name)
|
|
||||||
|
|
||||||
json_file = __file__.replace(".py", ".json")
|
|
||||||
|
|
||||||
# <div class="comictitle"><strong><a target="_blank"
|
|
||||||
# onclick="pageTrackerCG._link('http://collegepros.comicgenesis.com'); return
|
|
||||||
# false;" href="http://collegepros.comicgenesis.com">Adventures of the College
|
|
||||||
# Pros</a>
|
|
||||||
url_matcher = re.compile(r'<div class="comictitle"><strong>' +
|
|
||||||
tagre("a", "href", r'(http://[^"]+)') +
|
|
||||||
r'([^<]+)</a>')
|
|
||||||
num_matcher = re.compile(r'Number of Days: (\d+)')
|
|
||||||
|
|
||||||
# names of comics to exclude
|
|
||||||
exclude_comics = [
|
|
||||||
"10", # page is gone
|
|
||||||
"54sinRed", # page is 403 forbidden
|
|
||||||
"6D4", # redirected to another page
|
|
||||||
"AaaSoCAwesomenessandaSliceofCheese", # broken images
|
|
||||||
"AcrossthePond", # page moved
|
|
||||||
"ACDeceptibotscomic", # no images
|
|
||||||
"AdamandSei", # page has 403 forbidden
|
|
||||||
"AdamsRoadGang", # page is gone
|
|
||||||
"ADVENTURERS", # page is gone
|
|
||||||
"AiYaiYai", # page moved
|
|
||||||
"AlltheCommies", # missing images
|
|
||||||
"AltaModaMetro", # page redirected
|
|
||||||
"AltarGirl", # page redirected
|
|
||||||
"Amerika", # no images
|
|
||||||
"Angels", # page has 403 forbidden
|
|
||||||
"AngryDMonkey", # page redirected
|
|
||||||
"Angst", # page redirected
|
|
||||||
"Animenifesto", # too few images
|
|
||||||
"Anna", # no images
|
|
||||||
"Arcana", # archive broken
|
|
||||||
"Area15", # no images
|
|
||||||
"BaidheTu", # no images
|
|
||||||
"BasilFlint", # page redirected
|
|
||||||
"beerkada", # no images
|
|
||||||
"BelovedLeader", # broken images
|
|
||||||
"BigMouthComics", # page does not follow standard layout
|
|
||||||
"BilltheMagician", # page does not follow standard layout
|
|
||||||
"BlackBlue", # page moved
|
|
||||||
"BlackMagic", # page does not follow standard layout
|
|
||||||
"BloodBound", # page moved
|
|
||||||
"bloodofthedragon", # page does not follow standard layout
|
|
||||||
"BloodWing", # broken images
|
|
||||||
"BlueZombie", # broken page
|
|
||||||
"BoomerExpress", # redirection to another page
|
|
||||||
"BobOnline", # missing images
|
|
||||||
"BottomFlavor", # page does not follow standard layout
|
|
||||||
"BradTheVampire", # page does not follow standard layout
|
|
||||||
"BreakpointCity", # page moved
|
|
||||||
"Brinkerhoff", # page redirected
|
|
||||||
"CampusSafari", # page moved
|
|
||||||
"CapturetheMoment", # page moved
|
|
||||||
"CaseyandAndy", # page moved
|
|
||||||
"Catalyst", # page moved
|
|
||||||
"Cats", # broken images
|
|
||||||
"Chair", # page moved
|
|
||||||
"ChildrenAtPlay", # page does not follow standard layout
|
|
||||||
"Chu", # broken images
|
|
||||||
"CoACityofAscii", # only ascii images
|
|
||||||
"ComicMischief", # page moved
|
|
||||||
"ComputerGameAddicts", # page moved
|
|
||||||
"Concession", # page moved
|
|
||||||
"Countyoursheep", # broken links
|
|
||||||
"CorridorZ", # page does not follow standard layout
|
|
||||||
"CrashBoomMagic", # page moved
|
|
||||||
"CrazySlowlyGoing", # page has 403 forbidden
|
|
||||||
"CrimsonWings", # page moved
|
|
||||||
"DakotasRidge", # page moved
|
|
||||||
"DATAROM", # broken images
|
|
||||||
"DazeinaHaze", # page moved
|
|
||||||
"DIABOLICA", # broken images
|
|
||||||
"DIfIK", # page does not follow standard layout
|
|
||||||
"DigitalWar", # page is gone
|
|
||||||
"DimBulbComics", # page is gone
|
|
||||||
"DIVE", # page is gone
|
|
||||||
"DominicDeegan", # page moved
|
|
||||||
"DownwardBound", # page does not follow standard layout
|
|
||||||
"DungeonDamage", # page does not follow standard layout
|
|
||||||
"Dylan", # page has 403 forbidden
|
|
||||||
"EarthRiser", # redirects to a new page
|
|
||||||
"EdgetheDevilhunter", # page is gone
|
|
||||||
"EdibleDirt", # page moved
|
|
||||||
"EinstiensDesk", # page is gone
|
|
||||||
"ElfOnlyInn", # page moved
|
|
||||||
"Ensuing", # broken links
|
|
||||||
"etch", # broken images
|
|
||||||
"EternalCaffeineJunkie", # page does not follow standard layout
|
|
||||||
"EternityComplex", # page does not follow standard layout
|
|
||||||
"Evilish", # page moved
|
|
||||||
"EvolBara", # page is gone
|
|
||||||
"FaerieTales", # page does not follow standard layout
|
|
||||||
"FairestandFallen", # page does not follow standard layout
|
|
||||||
"FairyTaleNewVillage", # missing images
|
|
||||||
"FatesTear", # page moved
|
|
||||||
"FaultyLogic", # page does not follow standard layout
|
|
||||||
"FireontheMountain", # page does not follow standard layout
|
|
||||||
"FiveBucksanHour", # page is gone
|
|
||||||
"Flatwood", # page moved
|
|
||||||
"FLEMComics", # page moved
|
|
||||||
"FletchersCave", # page is broken
|
|
||||||
"FlipandSplog", # page does not follow standard layout
|
|
||||||
"ForcesofGoodandEvil", # page does not follow standard layout
|
|
||||||
"Framed", # page does not follow standard layout
|
|
||||||
"FurryBlackDevil", # page moved
|
|
||||||
"Galacticus", # page has 403 forbidden
|
|
||||||
"GamerPsychotica", # page does not follow standard layout
|
|
||||||
"GeebasonParade", # page does not follow standard layout
|
|
||||||
"Geeks", # page moved
|
|
||||||
"GeminiBright", # page does not follow standard layout
|
|
||||||
"GemutationsPlague", # page does not follow standard layout
|
|
||||||
"GeorgetheSecond", # page does not follow standard layout
|
|
||||||
"Ghostz", # page does not follow standard layout
|
|
||||||
"GODLIKE", # page has 403 forbidden
|
|
||||||
"GoForIt", # page is gone
|
|
||||||
"GothBoy", # page moved
|
|
||||||
"Gravity", # page does not follow standard layout
|
|
||||||
"Grimage", # page moved
|
|
||||||
"GrossePointeDogs", # page is broken
|
|
||||||
"GUComics", # page moved
|
|
||||||
"HalflightBreaking", # page does not follow standard layout
|
|
||||||
"HardUnderbelly", # page does not follow standard layout
|
|
||||||
"HazardousScience", # page is gone
|
|
||||||
"HereThereBeDragons", # page moved
|
|
||||||
"HighMaintenance", # missing images
|
|
||||||
"HighSchoolRPG", # page does not follow standard layout
|
|
||||||
"Horndog", # page moved
|
|
||||||
"HorseshoesandHandgrenades", # missing images
|
|
||||||
"HotelGrim", # missing images
|
|
||||||
"IAlwaysWakeUpLazy", # page moved
|
|
||||||
"Ihatesteve", # page is gone
|
|
||||||
"IllicitMiracles", # page does not follow standard layout
|
|
||||||
"IndefensiblePositions", # page does not follow standard layout
|
|
||||||
"InsanityFair", # page does not follow standard layout
|
|
||||||
"InsideJoke", # page is gone
|
|
||||||
"InsidetheBox", # page has 403 forbidden
|
|
||||||
"InternationalHopeFoundation", # page does not follow standard layout
|
|
||||||
"Inverloch", # page does not follow standard layout
|
|
||||||
"JamieandNick", # page moved
|
|
||||||
"JasonLovesHisGrandpa", # page is gone
|
|
||||||
"JavanteasFate", # page is gone
|
|
||||||
"JBBcomics", # page is gone
|
|
||||||
"JedandDark", # page does not follow standard layout
|
|
||||||
"JoBeth", # page moved
|
|
||||||
"Joyride", # page moved
|
|
||||||
"JustAnotherEscape", # page moved
|
|
||||||
"JustWeird", # page has 403 forbidden
|
|
||||||
"JuvenileDiversion", # page moved
|
|
||||||
"JWalkinAndapos", # missing images
|
|
||||||
"KarmaSlave", # page moved
|
|
||||||
"KeenLace", # page is gone
|
|
||||||
"khaoskomic", # page moved
|
|
||||||
"KillingTime", # page is gone
|
|
||||||
"KnightsOfTheNexus", # page does not follow standard layout
|
|
||||||
"KoFightClub", # page moved
|
|
||||||
"LabGoatsInc", # page moved
|
|
||||||
"LandofGreed", # page is gone
|
|
||||||
"LeanOnMe", # page has 403 forbidden
|
|
||||||
"LegendsofRovana", # page has 403 forbidden
|
|
||||||
"LifeatBayside", # page does not follow standard layout
|
|
||||||
"LifeinaNutshell", # page does not follow standard layout
|
|
||||||
"Lifesuchasitis", # page has 403 forbidden
|
|
||||||
"LinktotheBoards", # page does not follow standard layout
|
|
||||||
"LinT", # page moved
|
|
||||||
"LiterallySpeaking", # page does not follow standard layout
|
|
||||||
"LifeonForbez", # missing images
|
|
||||||
"LoxieAndZoot", # page does not follow standard layout
|
|
||||||
"Lunchtable", # missing images
|
|
||||||
"MacHall", # page does not follow standard layout
|
|
||||||
"MadWorld", # page has 403 forbidden
|
|
||||||
"Magellan", # page does not follow standard layout
|
|
||||||
"Marachan", # missing images
|
|
||||||
"MassProduction", # page does not follow standard layout
|
|
||||||
"MayIHelpYou", # page has 403 forbidden
|
|
||||||
"Meiosis", # page moved
|
|
||||||
"Michikomonogatari", # page does not follow standard layout
|
|
||||||
"MidnorthFlourCo", # page has 403 forbidden
|
|
||||||
"Mindmistress", # page does not follow standard layout
|
|
||||||
"MintCondition", # page moved
|
|
||||||
"MisadventuresinPhysics", # page has 403 forbidden
|
|
||||||
"MobileMadness", # page does not follow standard layout
|
|
||||||
"MrPinkBlob", # page does not follow standard layout
|
|
||||||
"MyAngelYouAreAngel", # page is gone
|
|
||||||
"MyBrainHurts", # page does not follow standard layout
|
|
||||||
"NAFTANorthAmericanFreeToonAgreementalsoYankuckcanee", # page does not follow standard layout
|
|
||||||
"NeglectedMarioCharacterComix", # page does not follow standard layout
|
|
||||||
"NekoTheKitty", # page does not follow standard layout
|
|
||||||
"Nemutionjewel", # page does not follow standard layout
|
|
||||||
"Nerdgasm", # missing images
|
|
||||||
"Nerdz", # page is gone
|
|
||||||
"Nervillsaga", # page does not follow standard layout
|
|
||||||
"NetherOakasuburbanadventure", # page does not follow standard layout
|
|
||||||
"NoNeedForBushido", # page moved
|
|
||||||
"Nothingcomesnaturally", # page does not follow standard layout
|
|
||||||
"NymphsoftheWest", # too few images
|
|
||||||
"OffTheWall", # page does not follow standard layout
|
|
||||||
"OneHourAxis", # page is gone
|
|
||||||
"OnlyOne", # page is gone
|
|
||||||
"OopsNevermind", # page is gone
|
|
||||||
"PacoStand", # page has 403 forbidden
|
|
||||||
"Pander", # page is gone
|
|
||||||
"PANDORA", # page is missing pages
|
|
||||||
"PhilosophyBites", # missing images
|
|
||||||
"PhilosophyMonkey", # page is gone
|
|
||||||
"PicpakDog", # page moved
|
|
||||||
"PictureDiary", # page is gone
|
|
||||||
"PillarsofFaith", # page does not follow standard layout
|
|
||||||
"Pimpette", # page moved
|
|
||||||
"PokC3A9Chow", # page has 403 forbidden
|
|
||||||
"PolleninArabia", # page does not follow standard layout
|
|
||||||
"PranMan", # page moved
|
|
||||||
"QueensOfRandomness", # broken images
|
|
||||||
"QuestionableTales", # page does not follow standard layout
|
|
||||||
"RadioactiveFanboys", # page does not follow standard layout
|
|
||||||
"RandomAssembly", # page is gone
|
|
||||||
"RandomInk", # page is gone
|
|
||||||
"ReceptorFatigue", # page does not follow standard layout
|
|
||||||
"Remsi", # page does not follow standard layout
|
|
||||||
"Reset", # page does not follow standard layout
|
|
||||||
"ResistanceLine", # page does not follow standard layout
|
|
||||||
"ReturntoDonnelly", # page is gone
|
|
||||||
"Riboflavin", # page does not follow standard layout
|
|
||||||
"RitualsandOfferings", # page is gone
|
|
||||||
"RiverCityHigh", # page is gone
|
|
||||||
"RMsothercomics", # page does not follow standard layout
|
|
||||||
"RogerAndDominic", # page does not follow standard layout
|
|
||||||
"RoleoftheDie", # page is gone
|
|
||||||
"RonnieRaccoon", # page moved
|
|
||||||
"RosalarianAndapossRandomCreepyTales", # page is gone
|
|
||||||
"RulesofMakeBelieve", # page is gone
|
|
||||||
"Rveillerie", # page has 403 forbidden
|
|
||||||
"SaintPetersCross", # page does not follow standard layout
|
|
||||||
"Saturnalia", # page moved
|
|
||||||
"SavageIslands", # page has 403 forbidden
|
|
||||||
"SaveMeGebus", # page does not follow standard layout
|
|
||||||
"Sawdust", # page has 403 forbidden
|
|
||||||
"Scooterboy1234", # page has 403 forbidden
|
|
||||||
"SecondNight", # page moved
|
|
||||||
"Sempiternal", # page moved
|
|
||||||
"Senioritis", # page has 403 forbidden
|
|
||||||
"ShivaeStudios", # page moved
|
|
||||||
"ShonenAiKudasai", # page is gone
|
|
||||||
"ShootMeNow", # page does not follow standard layout
|
|
||||||
"SidandLasker", # page moved
|
|
||||||
"SillyConeV", # page is gone
|
|
||||||
"Skunk", # page moved
|
|
||||||
"SLAGIT", # missing images
|
|
||||||
"SmithStone", # page has 403 forbidden
|
|
||||||
"SnowflakeStudios", # page is gone
|
|
||||||
"Sockd", # page is gone
|
|
||||||
"Soks", # page is gone
|
|
||||||
"SoManyLevels", # page moved
|
|
||||||
"SomethingSoft", # page is gone
|
|
||||||
"Sorcery101", # page moved
|
|
||||||
"Spacejams", # page does not follow standard layout
|
|
||||||
"SpellBinder", # page is gone
|
|
||||||
"SPQRBlues", # page moved
|
|
||||||
"StationV3", # page moved
|
|
||||||
"SticksandStuff", # page does not follow standard layout
|
|
||||||
"StickyFingers", # page does not follow standard layout
|
|
||||||
"Stubble", # page moved
|
|
||||||
"SurrealKins", # page is gone
|
|
||||||
"SwirlyMarkYume", # page does not follow standard layout
|
|
||||||
"SynapticMisfiring", # page is gone
|
|
||||||
"TalesoftheQuestor", # page moved
|
|
||||||
"TAVISION", # page moved
|
|
||||||
"ThatWasMcPherson", # page moved
|
|
||||||
"The6GUYSInMyHead", # page has 403 forbidden
|
|
||||||
"TheAdventuresofCaptainMooki", # page moved
|
|
||||||
"TheAdventuresofLilDenverPastrami", # page is gone
|
|
||||||
"TheAdventuresofPeppyThePipingPirate", # page is gone
|
|
||||||
"TheAmoeba", # page is gone
|
|
||||||
"TheAvatar", # page does not follow standard layout
|
|
||||||
"TheBessEffectGerman", # page moved
|
|
||||||
"TheBestandtheBrightest", # page moved
|
|
||||||
"TheCrossoverlord", # missing images
|
|
||||||
"TheDevilsPanties", # page moved
|
|
||||||
"TheDoctorPepperShow", # page has 403 forbidden
|
|
||||||
"TheFantasticalBestiary", # page has 403 forbidden
|
|
||||||
"TheGreenAvenger", # missing images
|
|
||||||
"TheGodsPack", # page has 403 forbidden
|
|
||||||
"TheMadBrothers", # page does not follow standard layout
|
|
||||||
"TheMediocres", # missing images
|
|
||||||
"TheNamelessStory", # page has 403 forbidden
|
|
||||||
"Thenoob", # page moved
|
|
||||||
"TheOrangeArrow", # page is gone
|
|
||||||
"TheSailorNeopetsRPG", # page does not follow standard layout
|
|
||||||
"TheWayoftheWorld", # page moved
|
|
||||||
"TheWorldofUh", # broken images
|
|
||||||
"TheWotch", # page does not follow standard layout
|
|
||||||
"ThunderandLightning", # page moved
|
|
||||||
"TinysWorld", # page does not follow standard layout
|
|
||||||
"ToonPimpsPalace", # page moved
|
|
||||||
"Tossers", # page moved
|
|
||||||
"Towner", # page does not follow standard layout
|
|
||||||
"Townies", # page is gone
|
|
||||||
"TracyandTristan", # page moved
|
|
||||||
"TrialsintheLight", # page does not follow standard layout
|
|
||||||
"Ttskr", # page does not follow standard layout
|
|
||||||
"Twelvedragons", # page does not follow standard layout
|
|
||||||
"TwoEvilScientists", # page moved
|
|
||||||
"TwoLumps", # page moved
|
|
||||||
"TwoSidesWide", # page moved
|
|
||||||
"Untitled", # page does not follow standard layout
|
|
||||||
"UBERGEEKSpriteWorld", # page is gone
|
|
||||||
"Vendetta", # page moved
|
|
||||||
"VictimsoftheSystem", # page moved
|
|
||||||
"Victor", # page moved
|
|
||||||
"WARPZONEthinkwithinthecube", # page does not follow standard layout
|
|
||||||
"WayoftheDodo", # page does not follow standard layout
|
|
||||||
"Wedontgetiteither", # page moved
|
|
||||||
"WeishauptScholars", # page does not follow standard layout
|
|
||||||
"Werechild", # page has 403 forbidden
|
|
||||||
"WhiskeyAndMelancholy", # missing pages
|
|
||||||
"YellowMoon", # page has 403 forbidden
|
|
||||||
"YouScrewedUp", # missing images
|
|
||||||
"YUMEdream", # page moved
|
|
||||||
"Zap", # page moved
|
|
||||||
"ZebraGirl", # page moved
|
|
||||||
"Zeek", # page moved
|
|
||||||
"Zootz", # page is gone
|
|
||||||
]
|
|
||||||
|
|
||||||
# links to last valid strips
|
|
||||||
url_overrides = {
|
|
||||||
"BallofYarn": "http://ballofyarn.comicgenesis.com/d/20020624.html",
|
|
||||||
"AmazonSpaceRangers": "http://amazons.comicgenesis.com/d/20051015.html",
|
|
||||||
"ArroganceinSimplicity": "http://arrogance.comicgenesis.com/d/20030217.html",
|
|
||||||
"ATasteofEvil": "http://atasteofevil.comicgenesis.com/d/20050314.html",
|
|
||||||
'Candi': 'http://candicomics.com/',
|
|
||||||
"CanYouKeepaSecret": "http://cykas.comicgenesis.com/d/20041035.html",
|
|
||||||
"CapturetheMoment": "http://capturethemoment.comicgenesis.com/d/20100927.html",
|
|
||||||
"CornerAlley13": "http://corneralley.comicgenesis.com/d/20101010.html",
|
|
||||||
"FreakU": "http://freaku.comicgenesis.com/d/20080827.html",
|
|
||||||
"FreeParking": "http://freeparking.comicgenesis.com/d/20051029.html",
|
|
||||||
"GoneAstray": "http://goneastray.comicgenesis.com/d/20100305.html",
|
|
||||||
"GoodnEvil": "http://gne.comicgenesis.com/d/20040814.html",
|
|
||||||
"HealerOnFeatheredWings": "http://selsachronicles.comicgenesis.com/",
|
|
||||||
"HowNottoRunAComic": "http://hownottorunacomic.comicgenesis.com/d/19950719.html",
|
|
||||||
"HurricaneParty": "http://hurricaneparty.comicgenesis.com/d/20040123.html",
|
|
||||||
"MaryQuiteContrary": "http://marycontrary.comicgenesis.com/d/20070824.html",
|
|
||||||
"MoonCrest24": "http://mooncrest.comicgenesis.com/d/20121117.html",
|
|
||||||
"NekkoandJoruba": "http://nekkoandjoruba.comicgenesis.com/d/20050816.html",
|
|
||||||
"No4thWalltoBreak": "http://no4thwalltobreak.comicgenesis.com/d/20041025.html",
|
|
||||||
"OtakuKyokai": "http://otakukyokai.comicgenesis.com/d/20060818.html",
|
|
||||||
"PandP": "http://pandpcomic.comicgenesis.com/d/20021002.html",
|
|
||||||
"Paradigm": "http://paradigm.comicgenesis.com/d/20020716.html",
|
|
||||||
"ParallelDementia": "http://paralleldementia.comicgenesis.com/d/20071221.html",
|
|
||||||
"PET": "http://petcomic.comicgenesis.com/d/20070413.html",
|
|
||||||
"PlanetsCollide": "http://ruthcomix.comicgenesis.com/d/20010706.html",
|
|
||||||
"RuneMaster": "http://runemaster.comicgenesis.com/d/20050607.html",
|
|
||||||
"ShinobiHigh": "http://shinobihigh.comicgenesis.com/d/20020118.html",
|
|
||||||
"TheAdventuresofVindibuddSuperheroInTraining": "http://vindibudd.comicgenesis.com/d/20070720.html",
|
|
||||||
"TriumphantLosers": "http://triumphantlosers.comicgenesis.com/d/20081006.html",
|
|
||||||
"Zortic": "http://zortic.comicgenesis.com/d/20030922.html",
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
def handle_url(url, session, res):
    """Parse one guide page and add the comics it lists to ``res``.

    The page at ``url`` is fetched with ``get_page``. Every match of
    ``url_matcher`` yields a comic link and title. A comic is stored as
    ``res[name] = (url, num)``, where ``num`` is the integer that follows
    "Number of Days:" after the match, unless one of these holds:

    * its formatted name is listed in ``exclude_comics``;
    * ``contains_case_insensitive(res, name)`` is true (reported as a
      possible duplicate);
    * no "Number of Days" figure is found after the match;
    * ``check_robotstxt`` raises ``IOError`` for the comic's archive URL.

    If fetching the page raises ``IOError``, the error is printed to stderr
    and the function returns without touching ``res``.
    """
    print("Parsing", url, file=sys.stderr)
    try:
        page = get_page(url, session).text
    except IOError as err:
        print("ERROR:", err, file=sys.stderr)
        return

    for hit in url_matcher.finditer(page):
        comic_url = hit.group(1) + '/'
        name = format_name(hit.group(2))
        if name in exclude_comics:
            continue
        if contains_case_insensitive(res, name):
            # two comics whose names differ only in case cannot both be kept
            print("INFO: skipping possible duplicate", repr(name), file=sys.stderr)
            continue

        # the strip count is searched for in the text following the link
        remainder = page[hit.end():]
        days = num_matcher.search(remainder)
        if not days:
            print("ERROR:", repr(remainder[:300]), file=sys.stderr)
            continue
        count = int(days.group(1))

        # a hand-maintained override replaces the scraped URL when present
        comic_url = url_overrides.get(name, comic_url)
        # URLs that do not already point below /d/ are checked at their d/ path
        robots_target = comic_url if "/d/" in comic_url else comic_url + "d/"
        try:
            check_robotstxt(robots_target, session)
        except IOError:
            print("INFO: robots.txt denied for comicgenesis", repr(name))
            continue
        res[name] = (comic_url, count)
|
|
||||||
|
|
||||||
|
|
||||||
def get_results():
    """Parse all guide index pages and save the collected comics.

    One page per entry of '0' and 'A'-'Z' is passed to ``handle_url``
    using a single shared ``requests`` session. The resulting dictionary
    is then written with ``save_result`` to ``json_file``.
    """
    base = 'http://guide.comicgenesis.com/Keenspace_%s.html'
    # filled in by handle_url: {name -> (url, number of days)}
    found = {}
    session = requests.Session()
    page_urls = [base % letter for letter in '0ABCDEFGHIJKLMNOPQRSTUVWXYZ']
    for page_url in page_urls:
        handle_url(page_url, session, found)
    save_result(found, json_file)
|
|
||||||
|
|
||||||
|
|
||||||
def has_comic(name):
    """Check if comic name already exists.

    Returns True when any scraper yielded by ``get_scrapers()`` has a
    ``name`` equal, ignoring case, to "Creators/<name>" or
    "GoComics/<name>"; otherwise returns False.
    """
    # A set makes the per-scraper membership test constant time.
    names = {
        ("Creators/%s" % name).lower(),
        ("GoComics/%s" % name).lower(),
    }
    # The loop variable must be the one that is read: the previous body
    # iterated as "scraperobj" but looked up "scraperclass", which is not
    # defined and raised NameError on the first scraper.
    return any(
        scraperobj.name.lower() in names for scraperobj in get_scrapers()
    )
|
|
||||||
|
|
||||||
|
|
||||||
def print_results(args):
    """Append an ``add(name, url)`` line for every sufficiently large comic.

    ``args`` is a two-item sequence: the minimum number of strips (converted
    with ``int``) and the name of the output file, which is opened in append
    mode as UTF-8. Entries loaded from ``json_file`` are processed in sorted
    order. Comics listed in ``exclude_comics`` or with fewer strips than the
    minimum are skipped. Lines for comics for which ``has_comic`` is true
    are prefixed with '#'.
    """
    threshold, filename = args
    threshold = int(threshold)
    with codecs.open(filename, 'a', 'utf-8') as out:
        for name, entry in sorted(load_result(json_file).items()):
            if name in exclude_comics:
                continue
            url, num = entry
            if num >= threshold:
                url = url.replace("comicgen.com", "comicgenesis.com")
                # comics that already exist are written commented out
                prefix = u'#' if has_comic(name) else u''
                line = u"%sadd(%r, %r)\n" % (
                    prefix, str(truncate_name(name)), str(url))
                out.write(line)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
    # With command line arguments, print the stored results using those
    # arguments; without any, fetch and save a fresh result list.
    cli_args = sys.argv[1:]
    if cli_args:
        print_results(cli_args)
    else:
        get_results()
|
|
Loading…
Reference in a new issue