2012-11-28 17:15:12 +00:00
#!/usr/bin/env python
2013-01-09 21:21:19 +00:00
# Copyright (C) 2012-2013 Bastian Kleineidam
2012-11-28 17:15:12 +00:00
"""
2013-03-11 20:50:49 +00:00
Script to get a list of ComicGenesis comics and save the info in a
JSON file for further processing.
2012-11-28 17:15:12 +00:00
"""
from __future__ import print_function
2013-05-22 20:29:03 +00:00
import codecs
2012-11-28 17:15:12 +00:00
import re
import sys
import os
2013-02-12 20:53:57 +00:00
import requests
2012-11-28 17:15:12 +00:00
sys . path . append ( os . path . join ( os . path . dirname ( __file__ ) , " .. " ) )
2013-03-12 19:46:48 +00:00
from dosagelib . util import getPageContent , asciify , unescape , tagre , check_robotstxt
2013-02-13 18:59:13 +00:00
from dosagelib . scraper import get_scraperclasses
2013-01-09 21:20:03 +00:00
from scriptutil import contains_case_insensitive , capfirst , save_result , load_result , truncate_name
2012-11-28 17:15:12 +00:00
# The JSON result file sits next to this script and shares its base name.
# splitext() swaps only the final extension, so a ".py" occurring elsewhere
# in the path is left alone and a compiled ".pyc" path still maps to ".json".
json_file = os.path.splitext(__file__)[0] + ".json"

# Matches one comic entry of a guide page; handle_url() reads group 1 as the
# comic URL and group 2 as the comic title.  Example of the matched markup:
# <div class="comictitle"><strong><a target="_blank" onclick="pageTrackerCG._link('http://collegepros.comicgenesis.com'); return false;" href="http://collegepros.comicgenesis.com">Adventures of the College Pros</a>
url_matcher = re.compile(
    r'<div class="comictitle"><strong>'
    + tagre("a", "href", r'(http://[^"]+)')
    + r'([^<]+)</a>'
)
# Matches the "Number of Days" figure that follows a comic entry; group 1 is
# the number.
num_matcher = re.compile(r'Number of Days: (\d+)')
2012-11-29 05:46:58 +00:00
# Names of comics to exclude.  Each entry is the comic name as produced by
# handle_url() (unescaped, asciified, first letter capitalized); the trailing
# comment records why the comic cannot be supported.
exclude_comics = [
    "10",  # page is gone
    "54sinRed",  # page is 403 forbidden
    "6D4",  # redirected to another page
    "AaaSoCAwesomenessandaSliceofCheese",  # broken images
    "AcrossthePond",  # page moved
    "ACDeceptibotscomic",  # no images
    "AdamandSei",  # page has 403 forbidden
    "AdamsRoadGang",  # page is gone
    "ADVENTURERS",  # page is gone
    "AiYaiYai",  # page moved
    "AlltheCommies",  # missing images
    "AltaModaMetro",  # page redirected
    "AltarGirl",  # page redirected
    "Amerika",  # no images
    "Angels",  # page has 403 forbidden
    "AngryDMonkey",  # page redirected
    "Angst",  # page redirected
    "Animenifesto",  # too few images
    "Anna",  # no images
    "Arcana",  # archive broken
    "Area15",  # no images
    "BaidheTu",  # no images
    "BasilFlint",  # page redirected
    "beerkada",  # no images
    "BelovedLeader",  # broken images
    "BigMouthComics",  # page does not follow standard layout
    "BilltheMagician",  # page does not follow standard layout
    "BlackBlue",  # page moved
    "BlackMagic",  # page does not follow standard layout
    "BloodBound",  # page moved
    "bloodofthedragon",  # page does not follow standard layout
    "BloodWing",  # broken images
    "BlueZombie",  # broken page
    "BoomerExpress",  # redirection to another page
    "BobOnline",  # missing images
    "BottomFlavor",  # page does not follow standard layout
    "BradTheVampire",  # page does not follow standard layout
    "BreakpointCity",  # page moved
    "Brinkerhoff",  # page redirected
    "CampusSafari",  # page moved
    "CapturetheMoment",  # page moved
    "CaseyandAndy",  # page moved
    "Catalyst",  # page moved
    "Cats",  # broken images
    "Chair",  # page moved
    "ChildrenAtPlay",  # page does not follow standard layout
    "Chu",  # broken images
    "CoACityofAscii",  # only ascii images
    "ComicMischief",  # page moved
    "ComputerGameAddicts",  # page moved
    "Concession",  # page moved
    "Countyoursheep",  # broken links
    "CorridorZ",  # page does not follow standard layout
    "CrashBoomMagic",  # page moved
    "CrazySlowlyGoing",  # page has 403 forbidden
    "CrimsonWings",  # page moved
    "DakotasRidge",  # page moved
    "DATAROM",  # broken images
    "DazeinaHaze",  # page moved
    "DIABOLICA",  # broken images
    "DIfIK",  # page does not follow standard layout
    "DigitalWar",  # page is gone
    "DimBulbComics",  # page is gone
    "DIVE",  # page is gone
    "DominicDeegan",  # page moved
    "DownwardBound",  # page does not follow standard layout
    "DungeonDamage",  # page does not follow standard layout
    "Dylan",  # page has 403 forbidden
    "EarthRiser",  # redirects to a new page
    "EdgetheDevilhunter",  # page is gone
    "EdibleDirt",  # page moved
    "EinstiensDesk",  # page is gone
    "ElfOnlyInn",  # page moved
    "Ensuing",  # broken links
    "etch",  # broken images
    "EternalCaffeineJunkie",  # page does not follow standard layout
    "EternityComplex",  # page does not follow standard layout
    "Evilish",  # page moved
    "EvolBara",  # page is gone
    "FaerieTales",  # page does not follow standard layout
    "FairestandFallen",  # page does not follow standard layout
    "FairyTaleNewVillage",  # missing images
    "FatesTear",  # page moved
    "FaultyLogic",  # page does not follow standard layout
    "FireontheMountain",  # page does not follow standard layout
    "FiveBucksanHour",  # page is gone
    "Flatwood",  # page moved
    "FLEMComics",  # page moved
    "FletchersCave",  # page is broken
    "FlipandSplog",  # page does not follow standard layout
    "ForcesofGoodandEvil",  # page does not follow standard layout
    "Framed",  # page does not follow standard layout
    "FurryBlackDevil",  # page moved
    "Galacticus",  # page has 403 forbidden
    "GamerPsychotica",  # page does not follow standard layout
    "GeebasonParade",  # page does not follow standard layout
    "Geeks",  # page moved
    "GeminiBright",  # page does not follow standard layout
    "GemutationsPlague",  # page does not follow standard layout
    "GeorgetheSecond",  # page does not follow standard layout
    "Ghostz",  # page does not follow standard layout
    "GODLIKE",  # page has 403 forbidden
    "GoForIt",  # page is gone
    "GothBoy",  # page moved
    "Gravity",  # page does not follow standard layout
    "Grimage",  # page moved
    "GrossePointeDogs",  # page is broken
    "GUComics",  # page moved
    "HalflightBreaking",  # page does not follow standard layout
    "HardUnderbelly",  # page does not follow standard layout
    "HazardousScience",  # page is gone
    "HereThereBeDragons",  # page moved
    "HighMaintenance",  # missing images
    "HighSchoolRPG",  # page does not follow standard layout
    "Horndog",  # page moved
    "HorseshoesandHandgrenades",  # missing images
    "HotelGrim",  # missing images
    "IAlwaysWakeUpLazy",  # page moved
    "Ihatesteve",  # page is gone
    "IllicitMiracles",  # page does not follow standard layout
    "IndefensiblePositions",  # page does not follow standard layout
    "InsanityFair",  # page does not follow standard layout
    "InsideJoke",  # page is gone
    "InsidetheBox",  # page has 403 forbidden
    "InternationalHopeFoundation",  # page does not follow standard layout
    "Inverloch",  # page does not follow standard layout
    "JamieandNick",  # page moved
    "JasonLovesHisGrandpa",  # page is gone
    "JavanteasFate",  # page is gone
    "JBBcomics",  # page is gone
    "JedandDark",  # page does not follow standard layout
    "JoBeth",  # page moved
    "Joyride",  # page moved
    "JustAnotherEscape",  # page moved
    "JustWeird",  # page has 403 forbidden
    "JuvenileDiversion",  # page moved
    "JWalkinAndapos",  # missing images
    "KarmaSlave",  # page moved
    "KeenLace",  # page is gone
    "khaoskomic",  # page moved
    "KillingTime",  # page is gone
    "KnightsOfTheNexus",  # page does not follow standard layout
    "KoFightClub",  # page moved
    "LabGoatsInc",  # page moved
    "LandofGreed",  # page is gone
    "LeanOnMe",  # page has 403 forbidden
    "LegendsofRovana",  # page has 403 forbidden
    "LifeatBayside",  # page does not follow standard layout
    "LifeinaNutshell",  # page does not follow standard layout
    "Lifesuchasitis",  # page has 403 forbidden
    "LinktotheBoards",  # page does not follow standard layout
    "LinT",  # page moved
    "LiterallySpeaking",  # page does not follow standard layout
    "LifeonForbez",  # missing images
    "LoxieAndZoot",  # page does not follow standard layout
    "Lunchtable",  # missing images
    "MacHall",  # page does not follow standard layout
    "MadWorld",  # page has 403 forbidden
    "Magellan",  # page does not follow standard layout
    "Marachan",  # missing images
    "MassProduction",  # page does not follow standard layout
    "MayIHelpYou",  # page has 403 forbidden
    "Meiosis",  # page moved
    "Michikomonogatari",  # page does not follow standard layout
    "MidnorthFlourCo",  # page has 403 forbidden
    "Mindmistress",  # page does not follow standard layout
    "MintCondition",  # page moved
    "MisadventuresinPhysics",  # page has 403 forbidden
    "MobileMadness",  # page does not follow standard layout
    "MrPinkBlob",  # page does not follow standard layout
    "MyAngelYouAreAngel",  # page is gone
    "MyBrainHurts",  # page does not follow standard layout
    "NAFTANorthAmericanFreeToonAgreementalsoYankuckcanee",  # page does not follow standard layout
    "NeglectedMarioCharacterComix",  # page does not follow standard layout
    "NekoTheKitty",  # page does not follow standard layout
    "Nemutionjewel",  # page does not follow standard layout
    "Nerdgasm",  # missing images
    "Nerdz",  # page is gone
    "Nervillsaga",  # page does not follow standard layout
    "NetherOakasuburbanadventure",  # page does not follow standard layout
    "NoNeedForBushido",  # page moved
    "Nothingcomesnaturally",  # page does not follow standard layout
    "NymphsoftheWest",  # too few images
    "OffTheWall",  # page does not follow standard layout
    "OneHourAxis",  # page is gone
    "OnlyOne",  # page is gone
    "OopsNevermind",  # page is gone
    "PacoStand",  # page has 403 forbidden
    "Pander",  # page is gone
    "PANDORA",  # page is missing pages
    "PhilosophyBites",  # missing images
    "PhilosophyMonkey",  # page is gone
    "PicpakDog",  # page moved
    "PictureDiary",  # page is gone
    "PillarsofFaith",  # page does not follow standard layout
    "Pimpette",  # page moved
    "PokC3A9Chow",  # page has 403 forbidden
    "PolleninArabia",  # page does not follow standard layout
    "PranMan",  # page moved
    "QueensOfRandomness",  # broken images
    "QuestionableTales",  # page does not follow standard layout
    "RadioactiveFanboys",  # page does not follow standard layout
    "RandomAssembly",  # page is gone
    "RandomInk",  # page is gone
    "ReceptorFatigue",  # page does not follow standard layout
    "Remsi",  # page does not follow standard layout
    "Reset",  # page does not follow standard layout
    "ResistanceLine",  # page does not follow standard layout
    "ReturntoDonnelly",  # page is gone
    "Riboflavin",  # page does not follow standard layout
    "RitualsandOfferings",  # page is gone
    "RiverCityHigh",  # page is gone
    "RMsothercomics",  # page does not follow standard layout
    "RogerAndDominic",  # page does not follow standard layout
    "RoleoftheDie",  # page is gone
    "RonnieRaccoon",  # page moved
    "RosalarianAndapossRandomCreepyTales",  # page is gone
    "RulesofMakeBelieve",  # page is gone
    "Rveillerie",  # page has 403 forbidden
    "SaintPetersCross",  # page does not follow standard layout
    "Saturnalia",  # page moved
    "SavageIslands",  # page has 403 forbidden
    "SaveMeGebus",  # page does not follow standard layout
    "Sawdust",  # page has 403 forbidden
    "Scooterboy1234",  # page has 403 forbidden
    "SecondNight",  # page moved
    "Sempiternal",  # page moved
    "Senioritis",  # page has 403 forbidden
    "ShivaeStudios",  # page moved
    "ShonenAiKudasai",  # page is gone
    "ShootMeNow",  # page does not follow standard layout
    "SidandLasker",  # page moved
    "SillyConeV",  # page is gone
    "Skunk",  # page moved
    "SLAGIT",  # missing images
    "SmithStone",  # page has 403 forbidden
    "SnowflakeStudios",  # page is gone
    "Sockd",  # page is gone
    "Soks",  # page is gone
    "SoManyLevels",  # page moved
    "SomethingSoft",  # page is gone
    "Sorcery101",  # page moved
    "Spacejams",  # page does not follow standard layout
    "SpellBinder",  # page is gone
    "SPQRBlues",  # page moved
    "StationV3",  # page moved
    "SticksandStuff",  # page does not follow standard layout
    "StickyFingers",  # page does not follow standard layout
    "Stubble",  # page moved
    "SurrealKins",  # page is gone
    "SwirlyMarkYume",  # page does not follow standard layout
    "SynapticMisfiring",  # page is gone
    "TalesoftheQuestor",  # page moved
    "TAVISION",  # page moved
    "ThatWasMcPherson",  # page moved
    "The6GUYSInMyHead",  # page has 403 forbidden
    "TheAdventuresofCaptainMooki",  # page moved
    "TheAdventuresofLilDenverPastrami",  # page is gone
    "TheAdventuresofPeppyThePipingPirate",  # page is gone
    "TheAmoeba",  # page is gone
    "TheAvatar",  # page does not follow standard layout
    "TheBessEffectGerman",  # page moved
    "TheBestandtheBrightest",  # page moved
    "TheCrossoverlord",  # missing images
    "TheDevilsPanties",  # page moved
    "TheDoctorPepperShow",  # page has 403 forbidden
    "TheFantasticalBestiary",  # page has 403 forbidden
    "TheGodsPack",  # page has 403 forbidden
    "TheMadBrothers",  # page does not follow standard layout
    "TheMediocres",  # missing images
    "TheNamelessStory",  # page has 403 forbidden
    "Thenoob",  # page moved
    "TheOrangeArrow",  # page is gone
    "TheSailorNeopetsRPG",  # page does not follow standard layout
    "TheWayoftheWorld",  # page moved
    "TheWorldofUh",  # broken images
    "TheWotch",  # page does not follow standard layout
    "ThunderandLightning",  # page moved
    "TinysWorld",  # page does not follow standard layout
    "ToonPimpsPalace",  # page moved
    "Tossers",  # page moved
    "Towner",  # page does not follow standard layout
    "Townies",  # page is gone
    "TracyandTristan",  # page moved
    "TrialsintheLight",  # page does not follow standard layout
    "Ttskr",  # page does not follow standard layout
    "Twelvedragons",  # page does not follow standard layout
    "TwoEvilScientists",  # page moved
    "TwoLumps",  # page moved
    "TwoSidesWide",  # page moved
    "Untitled",  # page does not follow standard layout
    "UBERGEEKSpriteWorld",  # page is gone
    "Vendetta",  # page moved
    "VictimsoftheSystem",  # page moved
    "Victor",  # page moved
    "WARPZONEthinkwithinthecube",  # page does not follow standard layout
    "WayoftheDodo",  # page does not follow standard layout
    "Wedontgetiteither",  # page moved
    "WeishauptScholars",  # page does not follow standard layout
    "Werechild",  # page has 403 forbidden
    "WhiskeyAndMelancholy",  # missing pages
    "YellowMoon",  # page has 403 forbidden
    "YouScrewedUp",  # missing images
    "YUMEdream",  # page moved
    "Zap",  # page moved
    "ZebraGirl",  # page moved
    "Zeek",  # page moved
    "Zootz",  # page is gone
]
2012-12-04 06:02:40 +00:00
# Links to last valid strips.  Maps a comic name to the URL that handle_url()
# stores instead of the URL scraped from the guide page.
url_overrides = {
    "BallofYarn": "http://ballofyarn.comicgenesis.com/d/20020624.html",
    "AmazonSpaceRangers": "http://amazons.comicgenesis.com/d/20051015.html",
    "ArroganceinSimplicity": "http://arrogance.comicgenesis.com/d/20030217.html",
    "ATasteofEvil": "http://atasteofevil.comicgenesis.com/d/20050314.html",
    "Candi": "http://candicomics.com/",
    # NOTE(review): 20041035 is not a valid calendar date - confirm this URL.
    "CanYouKeepaSecret": "http://cykas.comicgenesis.com/d/20041035.html",
    # NOTE(review): this name is also listed in exclude_comics, and
    # handle_url() skips excluded names before looking up an override, so
    # this entry is currently never used.
    "CapturetheMoment": "http://capturethemoment.comicgenesis.com/d/20100927.html",
    "CornerAlley13": "http://corneralley.comicgenesis.com/d/20101010.html",
    "FreakU": "http://freaku.comicgenesis.com/d/20080827.html",
    "FreeParking": "http://freeparking.comicgenesis.com/d/20051029.html",
    "GoneAstray": "http://goneastray.comicgenesis.com/d/20100305.html",
    "GoodnEvil": "http://gne.comicgenesis.com/d/20040814.html",
    "HealerOnFeatheredWings": "http://selsachronicles.comicgenesis.com/",
    "HowNottoRunAComic": "http://hownottorunacomic.comicgenesis.com/d/19950719.html",
    "HurricaneParty": "http://hurricaneparty.comicgenesis.com/d/20040123.html",
    "MaryQuiteContrary": "http://marycontrary.comicgenesis.com/d/20070824.html",
    "MoonCrest24": "http://mooncrest.comicgenesis.com/d/20121117.html",
    "NekkoandJoruba": "http://nekkoandjoruba.comicgenesis.com/d/20050816.html",
    "No4thWalltoBreak": "http://no4thwalltobreak.comicgenesis.com/d/20041025.html",
    "OtakuKyokai": "http://otakukyokai.comicgenesis.com/d/20060818.html",
    "PandP": "http://pandpcomic.comicgenesis.com/d/20021002.html",
    "Paradigm": "http://paradigm.comicgenesis.com/d/20020716.html",
    "ParallelDementia": "http://paralleldementia.comicgenesis.com/d/20071221.html",
    "PET": "http://petcomic.comicgenesis.com/d/20070413.html",
    "PlanetsCollide": "http://ruthcomix.comicgenesis.com/d/20010706.html",
    "RuneMaster": "http://runemaster.comicgenesis.com/d/20050607.html",
    "ShinobiHigh": "http://shinobihigh.comicgenesis.com/d/20020118.html",
    "TheAdventuresofVindibuddSuperheroInTraining": "http://vindibudd.comicgenesis.com/d/20070720.html",
    "TriumphantLosers": "http://triumphantlosers.comicgenesis.com/d/20081006.html",
    "Zortic": "http://zortic.comicgenesis.com/d/20030922.html",
}
2012-11-28 17:15:12 +00:00
2013-02-12 20:53:57 +00:00
def handle_url(url, session, res):
    """Parse one search result page and record its comics in ``res``.

    url: address of the guide page to fetch.
    session: session object handed on to getPageContent() and
        check_robotstxt().
    res: dictionary that is updated in place; each accepted comic is stored
        as ``res[name] = (comic_url, num)``.

    Returns None.  A page that cannot be fetched (IOError) is reported on
    stderr and skipped; all diagnostics go to stderr.
    """
    print("Parsing", url, file=sys.stderr)
    try:
        data, _base_url = getPageContent(url, session)
    except IOError as msg:
        print("ERROR:", msg, file=sys.stderr)
        return
    for match in url_matcher.finditer(data):
        comic_url = match.group(1) + '/'
        # normalize the title into the name used as dictionary key
        name = unescape(match.group(2))
        name = asciify(name.replace('&', 'And').replace('@', 'At'))
        name = capfirst(name)
        if name in exclude_comics:
            continue
        if contains_case_insensitive(res, name):
            # we cannot handle two comics that only differ in case
            print("INFO: skipping possible duplicate", repr(name), file=sys.stderr)
            continue
        # find out how many images this comic has; the count is searched
        # in the page text following the matched entry
        end = match.end()
        mo = num_matcher.search(data[end:])
        if not mo:
            # show the start of the unparsable text to ease debugging
            print("ERROR:", repr(data[end:end + 300]), file=sys.stderr)
            continue
        num = int(mo.group(1))
        comic_url = url_overrides.get(name, comic_url)
        try:
            # URLs without a "/d/" part are checked with "d/" appended
            if "/d/" not in comic_url:
                check_robotstxt(comic_url + "d/", session)
            else:
                check_robotstxt(comic_url, session)
        except IOError:
            # reported on stderr like every other diagnostic of this function
            print("INFO: robots.txt denied for", repr(name), file=sys.stderr)
            continue
        else:
            res[name] = (comic_url, num)
2012-11-28 17:15:12 +00:00
def get_results():
    """Parse all search result pages and save the result in json_file.

    One guide page is fetched per character of '0ABCDEFGHIJKLMNOPQRSTUVWXYZ';
    handle_url() fills the result dictionary, which is then written with
    save_result().
    """
    # store info in a dictionary {name -> (url, number of images)}
    res = {}
    session = requests.Session()
    try:
        base = 'http://guide.comicgenesis.com/Keenspace_%s.html'
        for c in '0ABCDEFGHIJKLMNOPQRSTUVWXYZ':
            handle_url(base % c, session, res)
    finally:
        # release the pooled connections even when parsing fails
        session.close()
    save_result(res, json_file)
2012-11-28 17:15:12 +00:00
def has_comic(name):
    """Check if comic name already exists.

    Returns True when one of the classes from get_scraperclasses() is named
    "Creators/<name>" or "GoComics/<name>" (compared case-insensitively),
    else False.
    """
    names = (
        ("Creators/%s" % name).lower(),
        ("GoComics/%s" % name).lower(),
    )
    return any(
        scraperclass.getName().lower() in names
        for scraperclass in get_scraperclasses()
    )
2012-12-12 16:41:29 +00:00
2012-11-28 17:15:12 +00:00
def print_results(args):
    """Print all comics that have at least the given number of minimum comic strips.

    args: sequence of exactly two items, ``(min_comics, filename)``.
        min_comics is converted with int(); filename is the file the
        generated lines are appended to (UTF-8 encoded).

    One ``add(name, url)`` line is written per comic loaded from json_file,
    in sorted name order.  Comics listed in exclude_comics or with fewer
    than min_comics strips are skipped; comics for which has_comic() is true
    are written commented out with a leading '#'.
    """
    min_comics, filename = args
    min_comics = int(min_comics)
    # mode 'a': lines are appended, existing file content is kept
    with codecs.open(filename, 'a', 'utf-8') as fp:
        for name, entry in sorted(load_result(json_file).items()):
            if name in exclude_comics:
                continue
            url, num = entry
            if num < min_comics:
                continue
            url = url.replace("comicgen.com", "comicgenesis.com")
            prefix = u'#' if has_comic(name) else u''
            fp.write(u"%sadd(%r, %r)\n" % (
                prefix, str(truncate_name(name)), str(url))
            )
2012-11-28 17:15:12 +00:00
if __name__ == '__main__':
    # With command line arguments (min_comics, filename) write the add(...)
    # lines from the saved JSON result; without arguments scrape the guide
    # pages and save the JSON result.
    if len(sys.argv) > 1:
        print_results(sys.argv[1:])
    else:
        get_results()