diff --git a/dosagelib/plugins/comicgenesis.py b/dosagelib/plugins/comicgenesis.py index 59d95d020..64cce5882 100644 --- a/dosagelib/plugins/comicgenesis.py +++ b/dosagelib/plugins/comicgenesis.py @@ -1,37 +1,26 @@ # -*- coding: utf-8 -*- # Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2012-2014 Bastian Kleineidam -# Copyright (C) 2015-2016 Tobias Gruetzmacher +# Copyright (C) 2015-2017 Tobias Gruetzmacher from __future__ import absolute_import, division, print_function -from re import compile - -from ..scraper import _BasicScraper -from ..util import tagre +from ..scraper import _ParserScraper # Comicgenesis has a lot of comics, but most of them are disallowed by # robots.txt -class ComicGenesis(_BasicScraper): - imageSearch = compile(tagre("img", "src", r'([^"]*/comics/[^"]+)')) - prevSearch = compile(tagre("a", "href", r'([^"]*/d/\d{8}\.html)') + - '(?:Previous comic' + '|' + - tagre("img", "alt", "Previous comic") + '|' + - tagre("img", "src", "images/back\.gif") + - ')') +class ComicGenesis(_ParserScraper): multipleImagesPerStrip = True + imageSearch = '//img[contains(@src, "/comics/")]' + prevSearch = ( + '//a[img/@alt="Previous comic"]', + '//a[text()="Previous comic"]', + ) help = 'Index format: yyyymmdd' - def link_modifier(self, fromurl, tourl): - return tourl.replace( - "keenspace.com", "comicgenesis.com").replace( - "keenspot.com", "comicgenesis.com").replace( - "toonspace.com", "comicgenesis.com").replace( - "comicgen.com", "comicgenesis.com") - - def __init__(self, name, sub=None, last=None, baseUrl=None): + def __init__(self, name, sub=None, last=None, baseUrl=None, lang=None): super(ComicGenesis, self).__init__('ComicGenesis/' + name) if sub: @@ -44,12 +33,12 @@ class ComicGenesis(_BasicScraper): else: self.url = baseUrl + if lang: + self.lang = lang + @classmethod def getmodules(cls): - return [ - # do not edit anything below since these entries are generated from - # scripts/update_plugins.sh - # START AUTOUPDATE + return ( cls('AAAAA', 'aaaaa'), cls('AdventuresofKiltman', 'kiltman'), cls('AmorModerno', 'amormoderno'), @@ -61,9 +50,12 @@ class ComicGenesis(_BasicScraper): cls('BendyStrawVampires', 'bsvampires'), cls('BlindSight', 'blindsight'), cls('BreakingtheDoldrum', 'breakingthedoldrum'), + cls('BrotherSwan', 'warlordofnoodles'), cls('Candi', baseUrl='http://candicomics.com/'), + cls('Cerintha', 'cerintha'), cls('CorporateLife', 'corporatelife'), cls('DarkWelkin', 'darkwelkin'), + cls('DeepBlue', 'gjbivin', last='20131109'), cls('DemonEater', 'demoneater'), cls('DoodleDiaries', 'doodlediaries'), cls('DormSweetDorm', 'dormsweetdorm'), @@ -78,7 +70,6 @@ class ComicGenesis(_BasicScraper): cls('Flounderville', 'flounderville'), cls('GEM', 'keltzy'), cls('Gonefor300days', 'g4300d'), - cls('IBlameDanny', 'vileterror'), cls('ImpendingDoom', 'impending'), cls('InANutshell', 'nutshellcomics'), cls('KernyMantisComics', 'kernymantis'), @@ -91,12 +82,14 @@ class ComicGenesis(_BasicScraper): cls('LumiasKingdom', 'lumia'), cls('Majestic7', 'majestic7'), cls('MaximumWhimsy', 'maximumwhimsy'), - cls('MenschunsererZeitGerman', 'muz'), + cls('MenschUnsererZeitGerman', 'muz', lang='de', last='20090630'), + cls('MenschUnsererZeit', 'rabe', last='20090630'), cls('MoonCrest24', 'mooncrest', last='20121117'), cls('Mushian', 'tentoumushi'), cls('NightwolfCentral', 'nightwolfcentral'), - cls('NoTimeForLife', 'randyraven'), cls('NoneMoreComic', 'nonemore'), + cls('NoTimeForLife', 'randyraven', last='20100510'), + cls('OcculTango', 'occultango'), cls('ODCKS', 'odcks'), cls('OfDoom', 'ofdoom'), cls('OpportunityofaLifetime', 'carpathia'), @@ -119,12 +112,11 @@ class ComicGenesis(_BasicScraper): cls('TheAdventuresofVindibuddSuperheroInTraining', 'vindibudd', last='20070720'), cls('TheEasyBreather', 'easybreather'), cls('TheMisadventuresofOkk', 'okk'), - cls('ThePath', 'thepath'), + cls('ThePath', 'thepath', '20081226'), cls('TheTalesofKalduras', 'kalduras'), cls('Unconventional', 'unconventional'), cls('WarMageNC17', 'warmage'), cls('WebcomicTheWebcomicWebcomicWebcomicWebcomic', 'dannormnsanidey'), cls('WhatYouDontSee', 'phantomlady4'), cls('Wierdman', 'asa'), - # END AUTOUPDATE - ] + ) diff --git a/dosagelib/plugins/old.py b/dosagelib/plugins/old.py index 8a933fbc3..747f7c1af 100644 --- a/dosagelib/plugins/old.py +++ b/dosagelib/plugins/old.py @@ -191,6 +191,7 @@ class Removed(Scraper): cls('ComicFury/Wowwithatwistdamaclesandkejallcomic'), cls('ComicFury/YouAreNowEnteringAshburg'), cls('ComicGenesis/CryHavoc'), + cls('ComicGenesis/IBlameDanny'), cls('ComicGenesis/SueosdelSur'), cls('Commissioned'), cls('CowboyJedi', 'brk'), diff --git a/scripts/comicgenesis.py b/scripts/comicgenesis.py deleted file mode 100755 index 82eb95c96..000000000 --- a/scripts/comicgenesis.py +++ /dev/null @@ -1,470 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs -# Copyright (C) 2012-2014 Bastian Kleineidam -# Copyright (C) 2015-2016 Tobias Gruetzmacher -""" -Script to get a list of ComicGenesis comics and save the info in a -JSON file for further processing. -""" -from __future__ import absolute_import, division, print_function - -import codecs -import re -import sys -import os - -import requests - -sys.path.append(os.path.join(os.path.dirname(__file__), "..")) # noqa -from dosagelib.util import get_page, tagre, check_robotstxt -from dosagelib.scraper import get_scrapers -from scriptutil import (contains_case_insensitive, save_result, load_result, - truncate_name, format_name) - -json_file = __file__.replace(".py", ".json") - -#
Adventures of the College -# Pros -url_matcher = re.compile(r'
' + - tagre("a", "href", r'(http://[^"]+)') + - r'([^<]+)') -num_matcher = re.compile(r'Number of Days: (\d+)') - -# names of comics to exclude -exclude_comics = [ - "10", # page is gone - "54sinRed", # page is 403 forbidden - "6D4", # redirected to another page - "AaaSoCAwesomenessandaSliceofCheese", # broken images - "AcrossthePond", # page moved - "ACDeceptibotscomic", # no images - "AdamandSei", # page has 403 forbidden - "AdamsRoadGang", # page is gone - "ADVENTURERS", # page is gone - "AiYaiYai", # page moved - "AlltheCommies", # missing images - "AltaModaMetro", # page redirected - "AltarGirl", # page redirected - "Amerika", # no images - "Angels", # page has 403 forbidden - "AngryDMonkey", # page redirected - "Angst", # page redirected - "Animenifesto", # too few images - "Anna", # no images - "Arcana", # archive broken - "Area15", # no images - "BaidheTu", # no images - "BasilFlint", # page redirected - "beerkada", # no images - "BelovedLeader", # broken images - "BigMouthComics", # page does not follow standard layout - "BilltheMagician", # page does not follow standard layout - "BlackBlue", # page moved - "BlackMagic", # page does not follow standard layout - "BloodBound", # page moved - "bloodofthedragon", # page does not follow standard layout - "BloodWing", # broken images - "BlueZombie", # broken page - "BoomerExpress", # redirection to another page - "BobOnline", # missing images - "BottomFlavor", # page does not follow standard layout - "BradTheVampire", # page does not follow standard layout - "BreakpointCity", # page moved - "Brinkerhoff", # page redirected - "CampusSafari", # page moved - "CapturetheMoment", # page moved - "CaseyandAndy", # page moved - "Catalyst", # page moved - "Cats", # broken images - "Chair", # page moved - "ChildrenAtPlay", # page does not follow standard layout - "Chu", # broken images - "CoACityofAscii", # only ascii images - "ComicMischief", # page moved - "ComputerGameAddicts", # page moved - "Concession", # page moved - "Countyoursheep", # broken links - "CorridorZ", # page does not follow standard layout - "CrashBoomMagic", # page moved - "CrazySlowlyGoing", # page has 403 forbidden - "CrimsonWings", # page moved - "DakotasRidge", # page moved - "DATAROM", # broken images - "DazeinaHaze", # page moved - "DIABOLICA", # broken images - "DIfIK", # page does not follow standard layout - "DigitalWar", # page is gone - "DimBulbComics", # page is gone - "DIVE", # page is gone - "DominicDeegan", # page moved - "DownwardBound", # page does not follow standard layout - "DungeonDamage", # page does not follow standard layout - "Dylan", # page has 403 forbidden - "EarthRiser", # redirects to a new page - "EdgetheDevilhunter", # page is gone - "EdibleDirt", # page moved - "EinstiensDesk", # page is gone - "ElfOnlyInn", # page moved - "Ensuing", # broken links - "etch", # broken images - "EternalCaffeineJunkie", # page does not follow standard layout - "EternityComplex", # page does not follow standard layout - "Evilish", # page moved - "EvolBara", # page is gone - "FaerieTales", # page does not follow standard layout - "FairestandFallen", # page does not follow standard layout - "FairyTaleNewVillage", # missing images - "FatesTear", # page moved - "FaultyLogic", # page does not follow standard layout - "FireontheMountain", # page does not follow standard layout - "FiveBucksanHour", # page is gone - "Flatwood", # page moved - "FLEMComics", # page moved - "FletchersCave", # page is broken - "FlipandSplog", # page does not follow standard layout - "ForcesofGoodandEvil", # page does not follow standard layout - "Framed", # page does not follow standard layout - "FurryBlackDevil", # page moved - "Galacticus", # page has 403 forbidden - "GamerPsychotica", # page does not follow standard layout - "GeebasonParade", # page does not follow standard layout - "Geeks", # page moved - "GeminiBright", # page does not follow standard layout - "GemutationsPlague", # page does not follow standard layout - "GeorgetheSecond", # page does not follow standard layout - "Ghostz", # page does not follow standard layout - "GODLIKE", # page has 403 forbidden - "GoForIt", # page is gone - "GothBoy", # page moved - "Gravity", # page does not follow standard layout - "Grimage", # page moved - "GrossePointeDogs", # page is broken - "GUComics", # page moved - "HalflightBreaking", # page does not follow standard layout - "HardUnderbelly", # page does not follow standard layout - "HazardousScience", # page is gone - "HereThereBeDragons", # page moved - "HighMaintenance", # missing images - "HighSchoolRPG", # page does not follow standard layout - "Horndog", # page moved - "HorseshoesandHandgrenades", # missing images - "HotelGrim", # missing images - "IAlwaysWakeUpLazy", # page moved - "Ihatesteve", # page is gone - "IllicitMiracles", # page does not follow standard layout - "IndefensiblePositions", # page does not follow standard layout - "InsanityFair", # page does not follow standard layout - "InsideJoke", # page is gone - "InsidetheBox", # page has 403 forbidden - "InternationalHopeFoundation", # page does not follow standard layout - "Inverloch", # page does not follow standard layout - "JamieandNick", # page moved - "JasonLovesHisGrandpa", # page is gone - "JavanteasFate", # page is gone - "JBBcomics", # page is gone - "JedandDark", # page does not follow standard layout - "JoBeth", # page moved - "Joyride", # page moved - "JustAnotherEscape", # page moved - "JustWeird", # page has 403 forbidden - "JuvenileDiversion", # page moved - "JWalkinAndapos", # missing images - "KarmaSlave", # page moved - "KeenLace", # page is gone - "khaoskomic", # page moved - "KillingTime", # page is gone - "KnightsOfTheNexus", # page does not follow standard layout - "KoFightClub", # page moved - "LabGoatsInc", # page moved - "LandofGreed", # page is gone - "LeanOnMe", # page has 403 forbidden - "LegendsofRovana", # page has 403 forbidden - "LifeatBayside", # page does not follow standard layout - "LifeinaNutshell", # page does not follow standard layout - "Lifesuchasitis", # page has 403 forbidden - "LinktotheBoards", # page does not follow standard layout - "LinT", # page moved - "LiterallySpeaking", # page does not follow standard layout - "LifeonForbez", # missing images - "LoxieAndZoot", # page does not follow standard layout - "Lunchtable", # missing images - "MacHall", # page does not follow standard layout - "MadWorld", # page has 403 forbidden - "Magellan", # page does not follow standard layout - "Marachan", # missing images - "MassProduction", # page does tno follow standard layout - "MayIHelpYou", # page has 403 forbidden - "Meiosis", # page moved - "Michikomonogatari", # page does not follow standard layout - "MidnorthFlourCo", # page has 403 forbidden - "Mindmistress", # page does not follow standard layout - "MintCondition", # page moved - "MisadventuresinPhysics", # page has 403 forbidden - "MobileMadness", # page does not follow standard layout - "MrPinkBlob", # page does not follow standard layout - "MyAngelYouAreAngel", # page is gone - "MyBrainHurts", # page does not follow standard layout - "NAFTANorthAmericanFreeToonAgreementalsoYankuckcanee", # page does not follow standard layout - "NeglectedMarioCharacterComix", # page does not follow standard layout - "NekoTheKitty", # page does not follow standard layout - "Nemutionjewel", # page does not follow standard layout - "Nerdgasm", # missing images - "Nerdz", # page is gone - "Nervillsaga", # page does not follow standard layout - "NetherOakasuburbanadventure", # page does not follow standard layout - "NoNeedForBushido", # page moved - "Nothingcomesnaturally", # page does not follow standard layout - "NymphsoftheWest", # too few images - "OffTheWall", # page does not follow standard layout - "OneHourAxis", # page is gone - "OnlyOne", # page is gone - "OopsNevermind", # page is gone - "PacoStand", # page has 403 forbidden - "Pander", # page is gone - "PANDORA", # page is missing pages - "PhilosophyBites", # missing images - "PhilosophyMonkey", # page is gone - "PicpakDog", # page moved - "PictureDiary", # page is gone - "PillarsofFaith", # page does not follow standard layout - "Pimpette", # page moved - "PokC3A9Chow", # page has 403 forbidden - "PolleninArabia", # page does not follow standard layout - "PranMan", # page moved - "QueensOfRandomness", # broken images - "QuestionableTales", # page does not follow standard layout - "RadioactiveFanboys", # page does not follow standard layout - "RandomAssembly", # page is gone - "RandomInk", # page is gone - "ReceptorFatigue", # page does not follow standard layout - "Remsi", # page does not follow standard layout - "Reset", # page does not follow standard layout - "ResistanceLine", # page does not follow standard layout - "ReturntoDonnelly", # page is gone - "Riboflavin", # page does not follow standard layout - "RitualsandOfferings", # page is gone - "RiverCityHigh", # page is gone - "RMsothercomics", # page does not follow standard layout - "RogerAndDominic", # page does not follow standard layout - "RoleoftheDie", # page is gone - "RonnieRaccoon", # page moved - "RosalarianAndapossRandomCreepyTales", # page is gone - "RulesofMakeBelieve", # page is gone - "Rveillerie", # page has 403 forbidden - "SaintPetersCross", # page does not follow standard layout - "Saturnalia", # page moved - "SavageIslands", # page has 403 forbidden - "SaveMeGebus", # page does not follow standard layout - "Sawdust", # page has 403 forbidden - "Scooterboy1234", # page has 403 forbidden - "SecondNight", # page moved - "Sempiternal", # page moved - "Senioritis", # page has 403 forbidden - "ShivaeStudios", # page moved - "ShonenAiKudasai", # page is gone - "ShootMeNow", # page does not follow standard layout - "SidandLasker", # page moved - "SillyConeV", # page is gone - "Skunk", # page moved - "SLAGIT", # missing images - "SmithStone", # page has 403 forbidden - "SnowflakeStudios", # page is gone - "Sockd", # page is gone - "Soks", # page is gone - "SoManyLevels", # page moved - "SomethingSoft", # page is gone - "Sorcery101", # page moved - "Spacejams", # page does not follow standard layout - "SpellBinder", # page is gone - "SPQRBlues", # page moved - "StationV3", # page moved - "SticksandStuff", # page does not follow standard layout - "StickyFingers", # page does not follow standard layout - "Stubble", # page moved - "SurrealKins", # page is gone - "SwirlyMarkYume", # page does not follow standard layout - "SynapticMisfiring", # page is gone - "TalesoftheQuestor", # page moved - "TAVISION", # page moved - "ThatWasMcPherson", # page moved - "The6GUYSInMyHead", # page has 403 forbidden - "TheAdventuresofCaptainMooki", # page moved - "TheAdventuresofLilDenverPastrami", # page is gone - "TheAdventuresofPeppyThePipingPirate", # page is gone - "TheAmoeba", # page is gone - "TheAvatar", # page does not follow standard layout - "TheBessEffectGerman", # page moved - "TheBestandtheBrightest", # page moved - "TheCrossoverlord", # missing images - "TheDevilsPanties", # page moved - "TheDoctorPepperShow", # page has 403 forbidden - "TheFantasticalBestiary", # page has 403 forbidden - "TheGreenAvenger", # missing images - "TheGodsPack", # page has 403 forbidden - "TheMadBrothers", # page does not follow standard layout - "TheMediocres", # missing images - "TheNamelessStory", # page has 403 forbidden - "Thenoob", # page moved - "TheOrangeArrow", # page is gone - "TheSailorNeopetsRPG", # page does not follow standard layout - "TheWayoftheWorld", # page moved - "TheWorldofUh", # broken images - "TheWotch", # page does not follow standard layout - "ThunderandLightning", # page moved - "TinysWorld", # page does not follow standard layout - "ToonPimpsPalace", # page moved - "Tossers", # page moved - "Towner", # page does not follow standard layout - "Townies", # page is gone - "TracyandTristan", # page moved - "TrialsintheLight", # page does not follow standard layout - "Ttskr", # page does not follow standard layout - "Twelvedragons", # page does not follow standard layout - "TwoEvilScientists", # page moved - "TwoLumps", # page moved - "TwoSidesWide", # page moved - "Untitled", # page does not follow standard layout - "UBERGEEKSpriteWorld", # page is gone - "Vendetta", # page moved - "VictimsoftheSystem", # page moved - "Victor", # page moved - "WARPZONEthinkwithinthecube", # page does not follow standard layout - "WayoftheDodo", # page does not follow standard layout - "Wedontgetiteither", # page moved - "WeishauptScholars", # page does not follow standard layout - "Werechild", # page has 403 forbidden - "WhiskeyAndMelancholy", # missing pages - "YellowMoon", # page has 403 forbidden - "YouScrewedUp", # missing images - "YUMEdream", # page moved - "Zap", # page moved - "ZebraGirl", # page moved - "Zeek", # page moved - "Zootz", # page is gone -] - -# links to last valid strips -url_overrides = { - "BallofYarn": "http://ballofyarn.comicgenesis.com/d/20020624.html", - "AmazonSpaceRangers": "http://amazons.comicgenesis.com/d/20051015.html", - "ArroganceinSimplicity": "http://arrogance.comicgenesis.com/d/20030217.html", - "ATasteofEvil": "http://atasteofevil.comicgenesis.com/d/20050314.html", - 'Candi': 'http://candicomics.com/', - "CanYouKeepaSecret": "http://cykas.comicgenesis.com/d/20041035.html", - "CapturetheMoment": "http://capturethemoment.comicgenesis.com/d/20100927.html", - "CornerAlley13": "http://corneralley.comicgenesis.com/d/20101010.html", - "FreakU": "http://freaku.comicgenesis.com/d/20080827.html", - "FreeParking": "http://freeparking.comicgenesis.com/d/20051029.html", - "GoneAstray": "http://goneastray.comicgenesis.com/d/20100305.html", - "GoodnEvil": "http://gne.comicgenesis.com/d/20040814.html", - "HealerOnFeatheredWings": "http://selsachronicles.comicgenesis.com/", - "HowNottoRunAComic": "http://hownottorunacomic.comicgenesis.com/d/19950719.html", - "HurricaneParty": "http://hurricaneparty.comicgenesis.com/d/20040123.html", - "MaryQuiteContrary": "http://marycontrary.comicgenesis.com/d/20070824.html", - "MoonCrest24": "http://mooncrest.comicgenesis.com/d/20121117.html", - "NekkoandJoruba": "http://nekkoandjoruba.comicgenesis.com/d/20050816.html", - "No4thWalltoBreak": "http://no4thwalltobreak.comicgenesis.com/d/20041025.html", - "OtakuKyokai": "http://otakukyokai.comicgenesis.com/d/20060818.html", - "PandP": "http://pandpcomic.comicgenesis.com/d/20021002.html", - "Paradigm": "http://paradigm.comicgenesis.com/d/20020716.html", - "ParallelDementia": "http://paralleldementia.comicgenesis.com/d/20071221.html", - "PET": "http://petcomic.comicgenesis.com/d/20070413.html", - "PlanetsCollide": "http://ruthcomix.comicgenesis.com/d/20010706.html", - "RuneMaster": "http://runemaster.comicgenesis.com/d/20050607.html", - "ShinobiHigh": "http://shinobihigh.comicgenesis.com/d/20020118.html", - "TheAdventuresofVindibuddSuperheroInTraining": "http://vindibudd.comicgenesis.com/d/20070720.html", - "TriumphantLosers": "http://triumphantlosers.comicgenesis.com/d/20081006.html", - "Zortic": "http://zortic.comicgenesis.com/d/20030922.html", -} - - -def handle_url(url, session, res): - """Parse one search result page.""" - print("Parsing", url, file=sys.stderr) - try: - data = get_page(url, session).text - except IOError as msg: - print("ERROR:", msg, file=sys.stderr) - return - for match in url_matcher.finditer(data): - url = match.group(1) + '/' - name = format_name(match.group(2)) - if name in exclude_comics: - continue - if contains_case_insensitive(res, name): - # we cannot handle two comics that only differ in case - print("INFO: skipping possible duplicate", repr(name), file=sys.stderr) - continue - # find out how many images this comic has - end = match.end() - mo = num_matcher.search(data[end:]) - if not mo: - print("ERROR:", repr(data[end:end + 300]), file=sys.stderr) - continue - num = int(mo.group(1)) - url = url_overrides.get(name, url) - try: - if "/d/" not in url: - check_robotstxt(url + "d/", session) - else: - check_robotstxt(url, session) - except IOError: - print("INFO: robots.txt denied for comicgenesis", repr(name)) - continue - else: - res[name] = (url, num) - - -def get_results(): - """Parse all search result pages.""" - # store info in a dictionary {name -> shortname} - res = {} - session = requests.Session() - base = 'http://guide.comicgenesis.com/Keenspace_%s.html' - for c in '0ABCDEFGHIJKLMNOPQRSTUVWXYZ': - handle_url(base % c, session, res) - save_result(res, json_file) - - -def has_comic(name): - """Check if comic name already exists.""" - names = [ - ("Creators/%s" % name).lower(), - ("GoComics/%s" % name).lower(), - ] - for scraperobj in get_scrapers(): - lname = scraperclass.name.lower() - if lname in names: - return True - return False - - -def print_results(args): - """Print all comics that have at least the given number of minimum comic strips.""" - min_comics, filename = args - min_comics = int(min_comics) - with codecs.open(filename, 'a', 'utf-8') as fp: - for name, entry in sorted(load_result(json_file).items()): - if name in exclude_comics: - continue - url, num = entry - if num < min_comics: - continue - url = url.replace("comicgen.com", "comicgenesis.com") - if has_comic(name): - prefix = u'#' - else: - prefix = u'' - fp.write(u"%sadd(%r, %r)\n" % ( - prefix, str(truncate_name(name)), str(url)) - ) - - -if __name__ == '__main__': - if len(sys.argv) > 1: - print_results(sys.argv[1:]) - else: - get_results()