-# Adventures of the College Pros
# Pattern for one comic entry on a guide page: group 1 captures the comic's
# http:// address from the link's href, group 2 the text that follows the link
# tag up to the next '<'.
# NOTE(review): the leading r'' is empty -- presumably a markup prefix was lost
# from this pattern; confirm against the upstream script.
url_matcher = re.compile(
    r''
    + tagre("a", "href", r'(http://[^"]+)')
    + r'([^<]+)'
)
# Pattern capturing the integer after the "Number of Days: " label.
num_matcher = re.compile(r'Number of Days: (\d+)')
-
-# names of comics to exclude
-exclude_comics = [
- "10", # page is gone
- "54sinRed", # page is 403 forbidden
- "6D4", # redirected to another page
- "AaaSoCAwesomenessandaSliceofCheese", # broken images
- "AcrossthePond", # page moved
- "ACDeceptibotscomic", # no images
- "AdamandSei", # page has 403 forbidden
- "AdamsRoadGang", # page is gone
- "ADVENTURERS", # page is gone
- "AiYaiYai", # page moved
- "AlltheCommies", # missing images
- "AltaModaMetro", # page redirected
- "AltarGirl", # page redirected
- "Amerika", # no images
- "Angels", # page has 403 forbidden
- "AngryDMonkey", # page redirected
- "Angst", # page redirected
- "Animenifesto", # too few images
- "Anna", # no images
- "Arcana", # archive broken
- "Area15", # no images
- "BaidheTu", # no images
- "BasilFlint", # page redirected
- "beerkada", # no images
- "BelovedLeader", # broken images
- "BigMouthComics", # page does not follow standard layout
- "BilltheMagician", # page does not follow standard layout
- "BlackBlue", # page moved
- "BlackMagic", # page does not follow standard layout
- "BloodBound", # page moved
- "bloodofthedragon", # page does not follow standard layout
- "BloodWing", # broken images
- "BlueZombie", # broken page
- "BoomerExpress", # redirection to another page
- "BobOnline", # missing images
- "BottomFlavor", # page does not follow standard layout
- "BradTheVampire", # page does not follow standard layout
- "BreakpointCity", # page moved
- "Brinkerhoff", # page redirected
- "CampusSafari", # page moved
- "CapturetheMoment", # page moved
- "CaseyandAndy", # page moved
- "Catalyst", # page moved
- "Cats", # broken images
- "Chair", # page moved
- "ChildrenAtPlay", # page does not follow standard layout
- "Chu", # broken images
- "CoACityofAscii", # only ascii images
- "ComicMischief", # page moved
- "ComputerGameAddicts", # page moved
- "Concession", # page moved
- "Countyoursheep", # broken links
- "CorridorZ", # page does not follow standard layout
- "CrashBoomMagic", # page moved
- "CrazySlowlyGoing", # page has 403 forbidden
- "CrimsonWings", # page moved
- "DakotasRidge", # page moved
- "DATAROM", # broken images
- "DazeinaHaze", # page moved
- "DIABOLICA", # broken images
- "DIfIK", # page does not follow standard layout
- "DigitalWar", # page is gone
- "DimBulbComics", # page is gone
- "DIVE", # page is gone
- "DominicDeegan", # page moved
- "DownwardBound", # page does not follow standard layout
- "DungeonDamage", # page does not follow standard layout
- "Dylan", # page has 403 forbidden
- "EarthRiser", # redirects to a new page
- "EdgetheDevilhunter", # page is gone
- "EdibleDirt", # page moved
- "EinstiensDesk", # page is gone
- "ElfOnlyInn", # page moved
- "Ensuing", # broken links
- "etch", # broken images
- "EternalCaffeineJunkie", # page does not follow standard layout
- "EternityComplex", # page does not follow standard layout
- "Evilish", # page moved
- "EvolBara", # page is gone
- "FaerieTales", # page does not follow standard layout
- "FairestandFallen", # page does not follow standard layout
- "FairyTaleNewVillage", # missing images
- "FatesTear", # page moved
- "FaultyLogic", # page does not follow standard layout
- "FireontheMountain", # page does not follow standard layout
- "FiveBucksanHour", # page is gone
- "Flatwood", # page moved
- "FLEMComics", # page moved
- "FletchersCave", # page is broken
- "FlipandSplog", # page does not follow standard layout
- "ForcesofGoodandEvil", # page does not follow standard layout
- "Framed", # page does not follow standard layout
- "FurryBlackDevil", # page moved
- "Galacticus", # page has 403 forbidden
- "GamerPsychotica", # page does not follow standard layout
- "GeebasonParade", # page does not follow standard layout
- "Geeks", # page moved
- "GeminiBright", # page does not follow standard layout
- "GemutationsPlague", # page does not follow standard layout
- "GeorgetheSecond", # page does not follow standard layout
- "Ghostz", # page does not follow standard layout
- "GODLIKE", # page has 403 forbidden
- "GoForIt", # page is gone
- "GothBoy", # page moved
- "Gravity", # page does not follow standard layout
- "Grimage", # page moved
- "GrossePointeDogs", # page is broken
- "GUComics", # page moved
- "HalflightBreaking", # page does not follow standard layout
- "HardUnderbelly", # page does not follow standard layout
- "HazardousScience", # page is gone
- "HereThereBeDragons", # page moved
- "HighMaintenance", # missing images
- "HighSchoolRPG", # page does not follow standard layout
- "Horndog", # page moved
- "HorseshoesandHandgrenades", # missing images
- "HotelGrim", # missing images
- "IAlwaysWakeUpLazy", # page moved
- "Ihatesteve", # page is gone
- "IllicitMiracles", # page does not follow standard layout
- "IndefensiblePositions", # page does not follow standard layout
- "InsanityFair", # page does not follow standard layout
- "InsideJoke", # page is gone
- "InsidetheBox", # page has 403 forbidden
- "InternationalHopeFoundation", # page does not follow standard layout
- "Inverloch", # page does not follow standard layout
- "JamieandNick", # page moved
- "JasonLovesHisGrandpa", # page is gone
- "JavanteasFate", # page is gone
- "JBBcomics", # page is gone
- "JedandDark", # page does not follow standard layout
- "JoBeth", # page moved
- "Joyride", # page moved
- "JustAnotherEscape", # page moved
- "JustWeird", # page has 403 forbidden
- "JuvenileDiversion", # page moved
- "JWalkinAndapos", # missing images
- "KarmaSlave", # page moved
- "KeenLace", # page is gone
- "khaoskomic", # page moved
- "KillingTime", # page is gone
- "KnightsOfTheNexus", # page does not follow standard layout
- "KoFightClub", # page moved
- "LabGoatsInc", # page moved
- "LandofGreed", # page is gone
- "LeanOnMe", # page has 403 forbidden
- "LegendsofRovana", # page has 403 forbidden
- "LifeatBayside", # page does not follow standard layout
- "LifeinaNutshell", # page does not follow standard layout
- "Lifesuchasitis", # page has 403 forbidden
- "LinktotheBoards", # page does not follow standard layout
- "LinT", # page moved
- "LiterallySpeaking", # page does not follow standard layout
- "LifeonForbez", # missing images
- "LoxieAndZoot", # page does not follow standard layout
- "Lunchtable", # missing images
- "MacHall", # page does not follow standard layout
- "MadWorld", # page has 403 forbidden
- "Magellan", # page does not follow standard layout
- "Marachan", # missing images
- "MassProduction", # page does not follow standard layout
- "MayIHelpYou", # page has 403 forbidden
- "Meiosis", # page moved
- "Michikomonogatari", # page does not follow standard layout
- "MidnorthFlourCo", # page has 403 forbidden
- "Mindmistress", # page does not follow standard layout
- "MintCondition", # page moved
- "MisadventuresinPhysics", # page has 403 forbidden
- "MobileMadness", # page does not follow standard layout
- "MrPinkBlob", # page does not follow standard layout
- "MyAngelYouAreAngel", # page is gone
- "MyBrainHurts", # page does not follow standard layout
- "NAFTANorthAmericanFreeToonAgreementalsoYankuckcanee", # page does not follow standard layout
- "NeglectedMarioCharacterComix", # page does not follow standard layout
- "NekoTheKitty", # page does not follow standard layout
- "Nemutionjewel", # page does not follow standard layout
- "Nerdgasm", # missing images
- "Nerdz", # page is gone
- "Nervillsaga", # page does not follow standard layout
- "NetherOakasuburbanadventure", # page does not follow standard layout
- "NoNeedForBushido", # page moved
- "Nothingcomesnaturally", # page does not follow standard layout
- "NymphsoftheWest", # too few images
- "OffTheWall", # page does not follow standard layout
- "OneHourAxis", # page is gone
- "OnlyOne", # page is gone
- "OopsNevermind", # page is gone
- "PacoStand", # page has 403 forbidden
- "Pander", # page is gone
- "PANDORA", # page is missing pages
- "PhilosophyBites", # missing images
- "PhilosophyMonkey", # page is gone
- "PicpakDog", # page moved
- "PictureDiary", # page is gone
- "PillarsofFaith", # page does not follow standard layout
- "Pimpette", # page moved
- "PokC3A9Chow", # page has 403 forbidden
- "PolleninArabia", # page does not follow standard layout
- "PranMan", # page moved
- "QueensOfRandomness", # broken images
- "QuestionableTales", # page does not follow standard layout
- "RadioactiveFanboys", # page does not follow standard layout
- "RandomAssembly", # page is gone
- "RandomInk", # page is gone
- "ReceptorFatigue", # page does not follow standard layout
- "Remsi", # page does not follow standard layout
- "Reset", # page does not follow standard layout
- "ResistanceLine", # page does not follow standard layout
- "ReturntoDonnelly", # page is gone
- "Riboflavin", # page does not follow standard layout
- "RitualsandOfferings", # page is gone
- "RiverCityHigh", # page is gone
- "RMsothercomics", # page does not follow standard layout
- "RogerAndDominic", # page does not follow standard layout
- "RoleoftheDie", # page is gone
- "RonnieRaccoon", # page moved
- "RosalarianAndapossRandomCreepyTales", # page is gone
- "RulesofMakeBelieve", # page is gone
- "Rveillerie", # page has 403 forbidden
- "SaintPetersCross", # page does not follow standard layout
- "Saturnalia", # page moved
- "SavageIslands", # page has 403 forbidden
- "SaveMeGebus", # page does not follow standard layout
- "Sawdust", # page has 403 forbidden
- "Scooterboy1234", # page has 403 forbidden
- "SecondNight", # page moved
- "Sempiternal", # page moved
- "Senioritis", # page has 403 forbidden
- "ShivaeStudios", # page moved
- "ShonenAiKudasai", # page is gone
- "ShootMeNow", # page does not follow standard layout
- "SidandLasker", # page moved
- "SillyConeV", # page is gone
- "Skunk", # page moved
- "SLAGIT", # missing images
- "SmithStone", # page has 403 forbidden
- "SnowflakeStudios", # page is gone
- "Sockd", # page is gone
- "Soks", # page is gone
- "SoManyLevels", # page moved
- "SomethingSoft", # page is gone
- "Sorcery101", # page moved
- "Spacejams", # page does not follow standard layout
- "SpellBinder", # page is gone
- "SPQRBlues", # page moved
- "StationV3", # page moved
- "SticksandStuff", # page does not follow standard layout
- "StickyFingers", # page does not follow standard layout
- "Stubble", # page moved
- "SurrealKins", # page is gone
- "SwirlyMarkYume", # page does not follow standard layout
- "SynapticMisfiring", # page is gone
- "TalesoftheQuestor", # page moved
- "TAVISION", # page moved
- "ThatWasMcPherson", # page moved
- "The6GUYSInMyHead", # page has 403 forbidden
- "TheAdventuresofCaptainMooki", # page moved
- "TheAdventuresofLilDenverPastrami", # page is gone
- "TheAdventuresofPeppyThePipingPirate", # page is gone
- "TheAmoeba", # page is gone
- "TheAvatar", # page does not follow standard layout
- "TheBessEffectGerman", # page moved
- "TheBestandtheBrightest", # page moved
- "TheCrossoverlord", # missing images
- "TheDevilsPanties", # page moved
- "TheDoctorPepperShow", # page has 403 forbidden
- "TheFantasticalBestiary", # page has 403 forbidden
- "TheGreenAvenger", # missing images
- "TheGodsPack", # page has 403 forbidden
- "TheMadBrothers", # page does not follow standard layout
- "TheMediocres", # missing images
- "TheNamelessStory", # page has 403 forbidden
- "Thenoob", # page moved
- "TheOrangeArrow", # page is gone
- "TheSailorNeopetsRPG", # page does not follow standard layout
- "TheWayoftheWorld", # page moved
- "TheWorldofUh", # broken images
- "TheWotch", # page does not follow standard layout
- "ThunderandLightning", # page moved
- "TinysWorld", # page does not follow standard layout
- "ToonPimpsPalace", # page moved
- "Tossers", # page moved
- "Towner", # page does not follow standard layout
- "Townies", # page is gone
- "TracyandTristan", # page moved
- "TrialsintheLight", # page does not follow standard layout
- "Ttskr", # page does not follow standard layout
- "Twelvedragons", # page does not follow standard layout
- "TwoEvilScientists", # page moved
- "TwoLumps", # page moved
- "TwoSidesWide", # page moved
- "Untitled", # page does not follow standard layout
- "UBERGEEKSpriteWorld", # page is gone
- "Vendetta", # page moved
- "VictimsoftheSystem", # page moved
- "Victor", # page moved
- "WARPZONEthinkwithinthecube", # page does not follow standard layout
- "WayoftheDodo", # page does not follow standard layout
- "Wedontgetiteither", # page moved
- "WeishauptScholars", # page does not follow standard layout
- "Werechild", # page has 403 forbidden
- "WhiskeyAndMelancholy", # missing pages
- "YellowMoon", # page has 403 forbidden
- "YouScrewedUp", # missing images
- "YUMEdream", # page moved
- "Zap", # page moved
- "ZebraGirl", # page moved
- "Zeek", # page moved
- "Zootz", # page is gone
-]
-
# Per-comic replacement URLs pointing at the last valid strip; handle_url()
# looks a comic up here by name and uses the entry instead of the scraped URL.
url_overrides = {
    "BallofYarn": "http://ballofyarn.comicgenesis.com/d/20020624.html",
    "AmazonSpaceRangers": "http://amazons.comicgenesis.com/d/20051015.html",
    "ArroganceinSimplicity": "http://arrogance.comicgenesis.com/d/20030217.html",
    "ATasteofEvil": "http://atasteofevil.comicgenesis.com/d/20050314.html",
    "Candi": "http://candicomics.com/",
    # NOTE(review): "20041035" is not a calendar date -- confirm this URL.
    "CanYouKeepaSecret": "http://cykas.comicgenesis.com/d/20041035.html",
    # NOTE(review): this name is also listed in exclude_comics, so the
    # override looks unreachable -- confirm which of the two is intended.
    "CapturetheMoment": "http://capturethemoment.comicgenesis.com/d/20100927.html",
    "CornerAlley13": "http://corneralley.comicgenesis.com/d/20101010.html",
    "FreakU": "http://freaku.comicgenesis.com/d/20080827.html",
    "FreeParking": "http://freeparking.comicgenesis.com/d/20051029.html",
    "GoneAstray": "http://goneastray.comicgenesis.com/d/20100305.html",
    "GoodnEvil": "http://gne.comicgenesis.com/d/20040814.html",
    "HealerOnFeatheredWings": "http://selsachronicles.comicgenesis.com/",
    "HowNottoRunAComic": "http://hownottorunacomic.comicgenesis.com/d/19950719.html",
    "HurricaneParty": "http://hurricaneparty.comicgenesis.com/d/20040123.html",
    "MaryQuiteContrary": "http://marycontrary.comicgenesis.com/d/20070824.html",
    "MoonCrest24": "http://mooncrest.comicgenesis.com/d/20121117.html",
    "NekkoandJoruba": "http://nekkoandjoruba.comicgenesis.com/d/20050816.html",
    "No4thWalltoBreak": "http://no4thwalltobreak.comicgenesis.com/d/20041025.html",
    "OtakuKyokai": "http://otakukyokai.comicgenesis.com/d/20060818.html",
    "PandP": "http://pandpcomic.comicgenesis.com/d/20021002.html",
    "Paradigm": "http://paradigm.comicgenesis.com/d/20020716.html",
    "ParallelDementia": "http://paralleldementia.comicgenesis.com/d/20071221.html",
    "PET": "http://petcomic.comicgenesis.com/d/20070413.html",
    "PlanetsCollide": "http://ruthcomix.comicgenesis.com/d/20010706.html",
    "RuneMaster": "http://runemaster.comicgenesis.com/d/20050607.html",
    "ShinobiHigh": "http://shinobihigh.comicgenesis.com/d/20020118.html",
    "TheAdventuresofVindibuddSuperheroInTraining": "http://vindibudd.comicgenesis.com/d/20070720.html",
    "TriumphantLosers": "http://triumphantlosers.comicgenesis.com/d/20081006.html",
    "Zortic": "http://zortic.comicgenesis.com/d/20030922.html",
}
-
-
def handle_url(url, session, res):
    """Parse one search result page and collect its comics into ``res``.

    Args:
        url: address of the guide page to fetch.
        session: session object passed through to ``get_page`` and
            ``check_robotstxt``.
        res: dictionary updated in place; every accepted comic is stored as
            ``res[name] = (comic_url, num)``.

    A page that cannot be fetched (``IOError``) is reported on stderr and
    skipped. All diagnostics go to stderr.
    """
    print("Parsing", url, file=sys.stderr)
    try:
        data = get_page(url, session).text
    except IOError as msg:
        print("ERROR:", msg, file=sys.stderr)
        return
    for match in url_matcher.finditer(data):
        # Keep the page URL parameter intact; use a separate name per comic.
        comic_url = match.group(1) + '/'
        name = format_name(match.group(2))
        if name in exclude_comics:
            continue
        if contains_case_insensitive(res, name):
            # we cannot handle two comics that only differ in case
            print("INFO: skipping possible duplicate", repr(name), file=sys.stderr)
            continue
        # find out how many images this comic has: the count is the first
        # "Number of Days" value found after the matched link
        end = match.end()
        mo = num_matcher.search(data[end:])
        if not mo:
            print("ERROR:", repr(data[end:end + 300]), file=sys.stderr)
            continue
        num = int(mo.group(1))
        comic_url = url_overrides.get(name, comic_url)
        # Overrides may already point below /d/; otherwise check the /d/
        # directory of the comic's base URL.
        robots_target = comic_url if "/d/" in comic_url else comic_url + "d/"
        try:
            check_robotstxt(robots_target, session)
        except IOError:
            # Report on stderr like every other diagnostic in this function.
            print("INFO: robots.txt denied for comicgenesis", repr(name),
                  file=sys.stderr)
            continue
        res[name] = (comic_url, num)
-
-
def get_results():
    """Parse all search result pages and save the collected comics.

    Visits the guide index page for '0' and for each letter 'A'-'Z', lets
    ``handle_url`` fill the result dictionary, then writes that dictionary
    with ``save_result`` to ``json_file``.
    """
    # store info in a dictionary {name -> (url, num)}, where num is the
    # "Number of Days" count parsed by handle_url
    res = {}
    base = 'http://guide.comicgenesis.com/Keenspace_%s.html'
    # Use the session as a context manager so it is closed even when a page
    # handler raises.
    with requests.Session() as session:
        for c in '0ABCDEFGHIJKLMNOPQRSTUVWXYZ':
            handle_url(base % c, session, res)
    save_result(res, json_file)
-
-
def has_comic(name):
    """Check if comic name already exists.

    Returns True when any scraper yielded by ``get_scrapers()`` is named
    ``Creators/<name>`` or ``GoComics/<name>``, compared case-insensitively;
    otherwise returns False.
    """
    names = {
        ("Creators/%s" % name).lower(),
        ("GoComics/%s" % name).lower(),
    }
    for scraperobj in get_scrapers():
        # Read the name from the loop variable; the name `scraperclass` is
        # not defined in this function.
        if scraperobj.name.lower() in names:
            return True
    return False
-
-
def print_results(args):
    """Append an ``add(...)`` line for every sufficiently large comic.

    ``args`` is a two-item sequence: the minimum strip count (converted with
    ``int``) and the name of the output file, which is opened in append mode
    as UTF-8. Comics listed in ``exclude_comics`` or with fewer strips than
    the minimum are skipped; comics for which ``has_comic`` is true are
    written with a leading ``#``.
    """
    threshold, filename = args
    threshold = int(threshold)
    with codecs.open(filename, 'a', 'utf-8') as fp:
        stored = load_result(json_file)
        for name, entry in sorted(stored.items()):
            if name in exclude_comics:
                continue
            url, num = entry
            if num >= threshold:
                url = url.replace("comicgen.com", "comicgenesis.com")
                prefix = u'#' if has_comic(name) else u''
                line = u"%sadd(%r, %r)\n" % (
                    prefix, str(truncate_name(name)), str(url))
                fp.write(line)
-
-
if __name__ == '__main__':
    # With command-line arguments: write the add(...) lines from the saved
    # results. Without any: re-scan the guide pages and save the results.
    cli_args = sys.argv[1:]
    if cli_args:
        print_results(cli_args)
    else:
        get_results()