Fix more comics.

Bastian Kleineidam 2012-12-05 21:52:52 +01:00
parent 387dff79a9
commit e5d9002f09
16 changed files with 366 additions and 44 deletions

View file

@@ -68,7 +68,7 @@ pyflakes:
 	pyflakes $(PY_FILES_DIRS)

 count:
-	@sloccount dosage dosagelib | grep "Total Physical Source Lines of Code"
+	@sloccount $(PY_FILES_DIRS) | grep "Total Physical Source Lines of Code"

 clean:
 	find . -name \*.pyc -delete

View file

@@ -7,7 +7,7 @@ import rfc822
 import time

 from .output import out
-from .util import urlopen, normaliseURL, unquote, strsize
+from .util import getImageObject, normaliseURL, unquote, strsize
 from .events import getHandler


 class FetchComicError(IOError):
@@ -52,7 +52,7 @@ class ComicImage(object):
     def connect(self):
         """Connect to host and get meta information."""
         try:
-            self.urlobj = urlopen(self.url, referrer=self.referrer)
+            self.urlobj = getImageObject(self.url, self.referrer)
         except IOError as msg:
             raise FetchComicError('Unable to retrieve URL.', self.url, msg)

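Note: connect() now fetches images through the getImageObject() helper added to util.py later in this commit, which applies the 20 MB image size limit. A minimal sketch of the call (the comic URL is hypothetical, for illustration only):

    from dosagelib.util import getImageObject

    # referrer is passed positionally; the size check happens inside urlopen()
    urlobj = getImageObject("http://example.com/comics/strip.png",
                            "http://example.com/")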
View file

@@ -9,7 +9,10 @@ from ..util import tagre

 _imageSearch = compile(tagre("img", "src", r'([^"]*/comics/[^"]+)'))
 _prevSearch = compile(tagre("a", "href", r'([^"]*/d/\d{8}\.html)') +
-    '(?:Previous comic|'+tagre("img", "alt", "Previous comic")+')')
+    '(?:Previous comic' + '|' +
+    tagre("img", "alt", "Previous comic") + '|' +
+    tagre("img", "src", "images/back\.gif") +
+    ')')


 def add(name, url):
     classname = 'KeenSpot_%s' % name
@@ -17,7 +20,9 @@ def add(name, url):
     @classmethod
     def _prevUrlModifier(cls, prevUrl):
         if prevUrl:
-            return prevUrl.replace("keenspace", "comicgenesis"
+            return prevUrl.replace("keenspace.com", "comicgenesis.com"
+            ).replace("keenspot.com", "comicgenesis.com"
+            ).replace("toonspace.com", "comicgenesis.com"
             ).replace("comicgen.com", "comicgenesis.com")

 globals()[classname] = make_scraper(classname,

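Note: the rewritten _prevUrlModifier maps every known KeenSpace/KeenSpot mirror host onto comicgenesis.com instead of replacing only the bare "keenspace" substring. A standalone sketch of the effect (hypothetical comic URL):

    def prevUrlModifier(prevUrl):
        # same replacement chain as the classmethod above
        if prevUrl:
            return prevUrl.replace("keenspace.com", "comicgenesis.com"
            ).replace("keenspot.com", "comicgenesis.com"
            ).replace("toonspace.com", "comicgenesis.com"
            ).replace("comicgen.com", "comicgenesis.com")

    print(prevUrlModifier("http://demo.keenspot.com/d/20121205.html"))
    # -> http://demo.comicgenesis.com/d/20121205.html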
View file

@@ -6,16 +6,17 @@ from re import compile
 from ..scraper import make_scraper
 from ..util import tagre

-_imageSearch = compile(tagre("img", "src", r'(http://www\.nuklearpower\.com/comics/[^"]+)'))
+_imageSearch = compile(tagre("img", "src", r'(http://v\.cdn\.nuklearpower\.com/comics/[^"]+)'))
 _prevSearch = compile(tagre("a", "href", r'([^"]+)') + "Previous")


 def add(name, shortname):
-    baseUrl = 'http://www.nuklearpower.com/' + shortname + '/'
+    baseUrl = 'http://www.nuklearpower.com/'
+    latestUrl = baseUrl + shortname + '/'
     classname = 'NuklearPower_%s' % name

     globals()[classname] = make_scraper(classname,
         name='NuklearPower/' + name,
-        latestUrl = baseUrl,
+        latestUrl = latestUrl,
         stripUrl = baseUrl + '%s',
         imageSearch = _imageSearch,
         prevSearch = _prevSearch,

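Note: splitting baseUrl and latestUrl changes what a strip index looks like: stripUrl is now rooted at the bare domain, so an index must carry the comic's own path segment. A sketch with a hypothetical shortname:

    baseUrl = 'http://www.nuklearpower.com/'
    shortname = '8bit'                      # hypothetical
    latestUrl = baseUrl + shortname + '/'   # http://www.nuklearpower.com/8bit/
    stripUrl = baseUrl + '%s'
    # a strip index now includes the path, e.g.:
    print(stripUrl % '8bit/2012/12/05/some-episode/')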
View file

@@ -8,8 +8,8 @@ from ..util import tagre

 _imageSearch = compile(tagre("img", "src", r'(http://(?:www|img2)\.smackjeeves\.com/images/uploaded/comics/[^"]+)'))
 _linkSearch = tagre("a", "href", r'([^"]*/comics/\d+/[^"]*)')
-_prevSearch = compile(_linkSearch + '(?:<img[^>]*alt="< Previous"|&lt; Back)')
-_nextSearch = compile(_linkSearch + '(?:<img[^>]*alt="Next >"|Next &gt;)')
+_prevSearch = compile(_linkSearch + '(?:<img[^>]*alt="< Previous"|&lt; Back|. previous)')
+_nextSearch = compile(_linkSearch + '(?:<img[^>]*alt="Next >"|Next &gt;|next )')


 def add(name):
     classname = 'SmackJeeves/' + name
@@ -39,6 +39,3 @@ add('durian')
 add('heard')
 add('mpmcomic')
 add('nlmo-project')
-add('paranoidloyd')
-add('thatdreamagain')
-add('wowcomics')

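Note: in the widened alternations, the leading '.' in '. previous' is an unescaped regex wildcard, so it matches any single character before ' previous' (for example the ';' that closes an HTML entity). A standalone check of just that alternation, without the surrounding link pattern:

    import re

    # previous-link alternatives from the diff above
    prev = re.compile('(?:<img[^>]*alt="< Previous"|&lt; Back|. previous)')

    print(bool(prev.search('&laquo; previous')))  # True: '.' matches ';'
    print(bool(prev.search('&lt; Back')))         # True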
View file

@@ -23,10 +23,8 @@ def add(name, host):
     )

-add('Grim', 'grim')
 add('KOF', 'kof')
 add('PowerPuffGirls', 'ppg')
-add('Snafu', 'www')
 add('Tin', 'tin')
 add('TW', 'tw')
 add('Sugar', 'sugar')

View file

@@ -18,7 +18,7 @@ _imageSearch = compile(tagre("img", "src", r'(http://assets\.amuniversal\.com/[^
 def add(name, shortname):
     latestUrl = 'http://www.universaluclick.com%s' % shortname
-    classname = 'UClick_%s' % name
+    classname = 'Universal_%s' % name

     @classmethod
     def namer(cls, imageUrl, pageUrl):
@@ -34,7 +34,7 @@ def add(name, shortname):
         return parse_strdate(strdate).strftime("%Y%m%d")

     globals()[classname] = make_scraper(classname,
-        name='UClick/' + name,
+        name='Universal/' + name,
         latestUrl = latestUrl,
         stripUrl = latestUrl + '%s/',
         imageSearch = _imageSearch,

View file

@@ -17,18 +17,16 @@ def add(name, subpath):
         latestUrl = baseUrl + subpath,
         stripUrl = baseUrl + '?view=archive&amp;chapter=%s',
         imageSearch = _imageSearch,
+        multipleImagesPerStrip = True,
         prevSearch = _prevSearch,
+        # the prevSearch is a redirect
+        prevUrlMatchesStripUrl = False,
         help = 'Index format: nnnn (non-contiguous)',
     )

 add('AgnesQuill', 'daveroman/agnes/')
-add('Elvenbaath', 'tdotodot2k/elvenbaath/')
-add('IrrationalFears', 'uvernon/irrationalfears/')
-add('KismetHuntersMoon', 'laylalawlor/huntersmoon/')
-add('SaikoAndLavender', 'gc/saiko/')
 add('MyMuse', 'gc/muse/')
 add('NekkoAndJoruba', 'nekkoandjoruba/nekkoandjoruba/')
 add('JaxEpoch', 'johngreen/quicken/')
-add('QuantumRockOfAges', 'DreamchildNYC/quantum/')
 add('ClownSamurai', 'qsamurai/clownsamurai/')

View file

@@ -22,11 +22,17 @@ class _BasicScraper(object):
     @cvar prevSearch: A compiled regex that will locate the URL for the
         previous strip when applied to a strip page.
     '''

     # if more than one image per URL is expected
     multipleImagesPerStrip = False

+    # set to False if previous URLs do not match the strip URL (ie. because of redirects)
+    prevUrlMatchesStripUrl = True
+
     # usually the index format help
     help = 'Sorry, no help for this comic yet.'

     def __init__(self, indexes=None):
        """Initialize internal variables."""
        self.urls = set()

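Note: plugins opt out per scraper class; the new attribute is just another keyword passed to make_scraper, as the plugin change earlier in this commit shows. A hypothetical plugin definition using the flag:

    globals()['Example'] = make_scraper('Example',
        name = 'Example',
        latestUrl = 'http://example.com/',
        stripUrl = 'http://example.com/?strip=%s',
        imageSearch = _imageSearch,
        prevSearch = _prevSearch,
        # previous links are redirects, so they never match stripUrl
        prevUrlMatchesStripUrl = False,
    )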
View file

@@ -21,7 +21,12 @@ if os.name == 'nt':
 has_curses = has_module("curses")

-MAX_FILESIZE = 1024*1024*1 # 1MB
+# Maximum content size for HTML pages
+MaxContentBytes = 1024 * 1024 * 2 # 2 MB
+
+# Maximum content size for images
+MaxImageBytes = 1024 * 1024 * 20 # 20 MB


 def tagre(tag, attribute, value, quote='"', before="", after=""):
     """Return a regular expression matching the given HTML tag, attribute
@@ -71,9 +76,9 @@ def case_insensitive_re(name):

 baseSearch = re.compile(tagre("base", "href", '([^"]*)'))

-def getPageContent(url):
+def getPageContent(url, max_content_bytes=MaxContentBytes):
     # read page data
-    page = urlopen(url)
+    page = urlopen(url, max_content_bytes=max_content_bytes)
     data = page.text
     # determine base URL
     baseUrl = None
@@ -85,6 +90,11 @@ def getPageContent(url):
     return data, baseUrl


+def getImageObject(url, referrer, max_content_bytes=MaxImageBytes):
+    """Get response object for given image URL."""
+    return urlopen(url, referrer=referrer, max_content_bytes=max_content_bytes)
+
+
 def fetchUrl(url, urlSearch):
     data, baseUrl = getPageContent(url)
     match = urlSearch.search(data)
@@ -116,7 +126,6 @@ def fetchUrls(url, imageSearch, prevSearch=None):
             prevUrl = match.group(1)
             if not prevUrl:
                 raise ValueError("Match empty previous URL at %s with pattern %s" % (url, prevSearch.pattern))
-            out.write('matched previous URL %r' % prevUrl, 2)
             prevUrl = normaliseURL(urlparse.urljoin(baseUrl, prevUrl))
         else:
             out.write('no previous URL %s at %s' % (prevSearch.pattern, url), 2)
@@ -174,7 +183,7 @@ def normaliseURL(url):
     return urlparse.urlunparse(pu)


-def urlopen(url, referrer=None, retries=3, retry_wait_seconds=5):
+def urlopen(url, referrer=None, retries=3, retry_wait_seconds=5, max_content_bytes=None):
     out.write('Open URL %s' % url, 2)
     assert retries >= 0, 'invalid retry value %r' % retries
     assert retry_wait_seconds > 0, 'invalid retry seconds value %r' % retry_wait_seconds
@@ -183,7 +192,8 @@ def urlopen(url, referrer=None, retries=3, retry_wait_seconds=5):
     if referrer:
         headers['Referer'] = referrer
     try:
-        req = requests.get(url, headers=headers, config=config)
+        req = requests.get(url, headers=headers, config=config, prefetch=False)
+        check_content_size(url, req.headers, max_content_bytes)
         req.raise_for_status()
         return req
     except requests.exceptions.RequestException as err:
@@ -191,6 +201,15 @@ def urlopen(url, referrer=None, retries=3, retry_wait_seconds=5):
         out.write(msg)
         raise IOError(msg)


+def check_content_size(url, headers, max_content_bytes):
+    if not max_content_bytes:
+        return
+    if 'content-length' in headers:
+        size = int(headers['content-length'])
+        if size > max_content_bytes:
+            msg = 'URL content of %s with %d Bytes exceeds %d Bytes.' % (url, size, max_content_bytes)
+            raise IOError(msg)
+
+
 def get_columns (fp):
     """Return number of columns for given file."""

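Note: the size guard works purely from the response headers. With prefetch=False the body is not downloaded up front, so oversized files are rejected before any content is read. A minimal standalone sketch of the check (hypothetical URL and header values; the real function is check_content_size above):

    def check_content_size(url, headers, max_content_bytes):
        # reject a download when the announced size exceeds the limit
        if not max_content_bytes:
            return
        if 'content-length' in headers:
            size = int(headers['content-length'])
            if size > max_content_bytes:
                raise IOError('URL content of %s with %d Bytes exceeds %d Bytes.'
                              % (url, size, max_content_bytes))

    # a 30 MB image against the 20 MB image limit raises IOError
    check_content_size('http://example.com/huge.png',
                       {'content-length': str(1024 * 1024 * 30)},
                       1024 * 1024 * 20)

One caveat: servers that omit Content-Length pass the check, so the limit is only enforced when the header is present.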
View file

@@ -170,7 +170,7 @@ def handle_url(url, url_matcher, num_matcher, res):
         end = match.end(1)
         mo = num_matcher.search(data[end:])
         if not mo:
-            print("ERROR:", repr(data[end:end+300], file=sys.stderr))
+            print("ERROR:", repr(data[end:end+300]), file=sys.stderr)
             continue
         num = int(mo.group(1))
         res[name] = num

File diff suppressed because one or more lines are too long

View file

@@ -47,23 +47,269 @@ exclude_comics = [
     "beerkada", # no images
     "BelovedLeader", # broken images
     "BigMouthComics", # page does not follow standard layout
-    "", # page is gone
-    "", # page is gone
-    "", # page is gone
+    "BilltheMagician", # page does not follow standard layout
+    "BlackBlue", # page moved
+    "BlackMagic", # page does not follow standard layout
+    "BloodBound", # page moved
+    "bloodofthedragon", # page does not follow standard layout
+    "BloodWing", # broken images
     "BlueZombie", # broken page
     "BoomerExpress", # redirection to another page
"BobOnline", # missing images
"BottomFlavor", # page does not follow standard layout
"BradTheVampire", # page does not follow standard layout
"BreakpointCity", # page moved
"Brinkerhoff", # page redirected
"CampusSafari", # page moved
"CapturetheMoment", # page moved
"CaseyandAndy", # page moved
"Catalyst", # page moved
"Cats", # broken images
"Chair", # page moved
"ChildrenAtPlay", # page does not follow standard layout
"chu", # broken images
"CoACityofAscii", # only ascii images
"ComicMischief", # page moved
"ComputerGameAddicts", # page moved
"Concession", # page moved
"CorridorZ", # page does not follow standard layout
"CrashBoomMagic", # page moved
"CrazySlowlyGoing", # page has 403 forbidden
"CrimsonWings", # page moved
"DakotasRidge", # page moved
"DATAROM", # broken images
"DazeinaHaze", # page moved
"DIABOLICA", # broken images
"DIfIK", # page does not follow standard layout
"DigitalWar", # page is gone
"DimBulbComics", # page is gone
"DIVE", # page is gone
"DominicDeegan", # page moved
"DungeonDamage", # page does not follow standard layout "DungeonDamage", # page does not follow standard layout
"Dylan", # page has 403 forbidden
"EarthRiser", # redirects to a new page "EarthRiser", # redirects to a new page
"EdgetheDevilhunter", # page is gone
"EdibleDirt", # page moved
"Einstien27sDesk", # page is gone
"ElfOnlyInn", # page moved
"Ensuing", # broken links
"etch", # broken images
"EternalCaffeineJunkie", # page does not follow standard layout
"EternityComplex", # page does not follow standard layout
"Evilish", # page moved
"EvolBara", # page is gone
"FaerieTales", # page does not follow standard layout
"FairyTaleNewVillage", # missing images
"Fate27sTear", # page moved
"FaultyLogic", # page does not follow standard layout "FaultyLogic", # page does not follow standard layout
"FireontheMountain", # page does not follow standard layout
"FiveBucksanHour", # page is gone
"Flatwood", # page moved
"FLEMComics", # page moved
"FletchersCave", # page is broken
"ForcesofGoodandEvil", # page does not follow standard layout
"FurryBlackDevil", # page moved
"Galacticus", # page has 403 forbidden
"GeebasonParade", # page does not follow standard layout
"geeks", # page moved
"GeminiBright", # page does not follow standard layout
"GemutationsPlague", # page does not follow standard layout
"GeorgetheSecond", # page does not follow standard layout
"Ghostz", # page does not follow standard layout
"GODLIKE", # page has 403 forbidden
"GoForIt", # page is gone "GoForIt", # page is gone
"JuvenileDiversion", # page moved "GothBoy", # page moved
"Grimage", # page moved
"GrossePointeDogs", # page is broken
"GUComics", # page moved
"HardUnderbelly", # page does not follow standard layout
"HazardousScience", # page is gone
"HereThereBeDragons", # page moved
"HighMaintenance", # missing images
"HighSchoolRPG", # page does not follow standard layout
"Horndog", # page moved
"HorseshoesandHandgrenades", # missing images
"HotelGrim", # missing images
"IAlwaysWakeUpLazy", # page moved
"ihatesteve", # page is gone
"IllicitMiracles", # page does not follow standard layout
"IndefensiblePositions", # page does not follow standard layout
"InsanityFair", # page does not follow standard layout
"InsideJoke", # page is gone
"InsidetheBox", # page has 403 forbidden
"InternationalHopeFoundation", # page does not follow standard layout
"JamieandNick", # page moved
"JasonLovesHisGrandpa", # page is gone
"JavanteasFate", # page is gone
"JBBcomics", # page is gone
"JedandDark", # page does not follow standard layout
"JoBeth", # page moved
"Joyride", # page moved
"JustAnotherEscape", # page moved
"JustWeird", # page has 403 forbidden "JustWeird", # page has 403 forbidden
"Michikomonogatari", # page does not follow standard layout "JuvenileDiversion", # page moved
"MobileMadness", # page does not follow standard layout "JWalkinAndapos", # missing images
"KarmaSlave", # page moved
"KeenLace", # page is gone
"khaoskomic", # page moved
"KillingTime", # page is gone
"KnightsOfTheNexus", # page does not follow standard layout "KnightsOfTheNexus", # page does not follow standard layout
"KoFightClub", # page moved
"LabGoatsInc", # page moved
"LandofGreed", # page is gone
"LeanOnMe", # page has 403 forbidden
"LegendsofRovana", # page has 403 forbidden
"LifeatBayside", # page does not follow standard layout
"LifeinaNutshell", # page does not follow standard layout
"Lifesuchasitis", # page has 403 forbidden
"LinktotheBoards", # page does not follow standard layout
"LinT", # page moved
"LiterallySpeaking", # page does not follow standard layout
"LoxieAndZoot", # page does not follow standard layout
"Lunchtable", # missing images
"MadWorld", # page has 403 forbidden
"Magellan", # page does not follow standard layout
"Marachan", # missing images
"MassProduction", # page does tno follow standard layout
"MayIHelpYou", # page has 403 forbidden
"Meiosis", # page moved
"Michikomonogatari", # page does not follow standard layout
"MidnorthFlourCo", # page has 403 forbidden
"MintCondition", # page moved
"MisadventuresinPhysics", # page has 403 forbidden
"MobileMadness", # page does not follow standard layout
"MyAngelYouAreAngel", # page is gone
"MyBrainHurts", # page does not follow standard layout
"NAFTANorthAmericanFreeToonAgreementalsoYankuckcanee", # page does not follow standard layout
"NeglectedMarioCharacterComix", # page does not follow standard layout
"Nemutionjewel", # page does not follow standard layout
"Nerdgasm", # missing images
"Nerdz", # page is gone
"Nervillsaga", # page does not follow standard layout
"NetherOakasuburbanadventure", # page does not follow standard layout
"NoNeedForBushido", # page moved
"nothingcomesnaturally", # page does not follow standard layout
"NymphsoftheWest", # too few images
"OffTheWall", # page does not follow standard layout
"OneHourAxis", # page is gone
"OnlyOne", # page is gone
"OopsNevermind", # page is gone
"PacoStand", # page has 403 forbidden
"Pander", # page is gone
"PANDORA", # page is missing pages
"PhilosophyBites", # missing images
"PhilosophyMonkey", # page is gone
"PicpakDog", # page moved
"PictureDiary", # page is gone
"PillarsofFaith", # page does not follow standard layout
"Pimpette", # page moved
"PokC3A9Chow", # page has 403 forbidden
"PolleninArabia", # page does not follow standard layout
"PranMan", # page moved
"QueensOfRandomness", # broken images
"QuestionableTales", # page does not follow standard layout
"RadioactiveFanboys", # page does not follow standard layout
"RandomAssembly", # page is gone
"RandomInk", # page is gone
"ReceptorFatigue", # page does not follow standard layout
"Remsi", # page does not follow standard layout
"Reset", # page does not follow standard layout
"ResistanceLine", # page does not follow standard layout
"ReturntoDonnelly", # page is gone
"Riboflavin", # page does not follow standard layout
"RitualsandOfferings", # page is gone
"RiverCityHigh", # page is gone
"RM27sothercomics", # page does not follow standard layout
"RogerAndDominic", # page does not follow standard layout "RogerAndDominic", # page does not follow standard layout
"RoleoftheDie", # page is gone
"RonnieRaccoon", # page moved
"RosalarianAndapossRandomCreepyTales", # page is gone
"RulesofMakeBelieve", # page is gone
"Rveillerie", # page has 403 forbidden
"SaintPeter27sCross", # page does not follow standard layout
"Saturnalia", # page moved
"SavageIslands", # page has 403 forbidden
"SaveMeGebus", # page does not follow standard layout "SaveMeGebus", # page does not follow standard layout
"Sawdust", # page has 403 forbidden
"Scooterboy1234", # page has 403 forbidden
"SecondNight", # page moved
"Sempiternal", # page moved
"Senioritis", # page has 403 forbidden
"ShivaeStudios", # page moved
"ShonenAiKudasai", # page is gone
"ShootMeNow", # page does not follow standard layout
"SidandLasker", # page moved
"SillyConeV", # page is gone
"Skunk", # page moved
"SLAGIT", # missing images
"SmithStone", # page has 403 forbidden
"SnowflakeStudios", # page is gone
"Sock27d", # page is gone
"Soks", # page is gone
"SoManyLevels", # page moved
"SomethingSoft", # page is gone
"Sorcery101", # page moved
"SpellBinder", # page is gone
"SPQRBlues", # page moved
"StationV3", # page moved
"SticksandStuff", # page does not follow standard layout
"StickyFingers", # page does not follow standard layout
"Stubble", # page moved
"SurrealKins", # page is gone
"SwirlyMarkYume", # page does not follow standard layout
"SynapticMisfiring", # page is gone
"TalesoftheQuestor", # page moved
"TAVISION", # page moved
"ThatWasMcPherson", # page moved
"The6GUYSInMyHead", # page has 403 forbidden
"TheAdventuresofCaptainMooki", # page moved
"TheAdventuresofLi27lDenverPastrami", # page is gone
"TheAdventuresofPeppyThePipingPirate", # page is gone
"TheAmoeba", # page is gone
"TheAvatar", # page does not follow standard layout "TheAvatar", # page does not follow standard layout
"TheBessEffectGerman", # page moved
"TheBestandtheBrightest", # page moved
"TheDevilsPanties", # page moved
"TheDoctorPepperShow", # page has 403 forbidden
"TheGods27Pack", # page has 403 forbidden
"TheMadBrothers", # page does not follow standard layout
"TheMediocres", # missing images
"TheNamelessStory", # page has 403 forbidden
"Thenoob", # page moved
"TheOrangeArrow", # page is gone
"TheSailorNeopetsRPG", # page does not follow standard layout
"TheWayoftheWorld", # page moved
"TheWorldofUh", # broken images
"TheWotch", # page does not follow standard layout
"ThunderandLightning", # page moved
"TinysWorld", # page does not follow standard layout
"ToonPimp27sPalace", # page moved
"Tossers", # page moved
"Towner", # page does not follow standard layout
"Townies", # page is gone
"TracyandTristan", # page moved
"TrialsintheLight", # page does not follow standard layout
"ttskr", # page does not follow standard layout
"twelvedragons", # page does not follow standard layout
"TwoEvilScientists", # page moved
"TwoLumps", # page moved
"TwoSidesWide", # page moved
"Vendetta", # page moved
"VictimsoftheSystem", # page moved
"Victor", # page moved
"WARPZONEthinkwithinthecube", # page does not follow standard layout
"WayoftheDodo", # page does not follow standard layout
"Wedontgetiteither", # page moved
"WeishauptScholars", # page does not follow standard layout
"Werechild", # page has 403 forbidden
"WhiskeyAndMelancholy", # missing pages
"YellowMoon", # page has 403 forbidden
"YouScrewedUp", # missing images
"YUMEdream", # page moved
"Zap", # page moved
"ZebraGirl", # page moved
"Zeek", # page moved
"Zootz", # page is gone
] ]
# links to last valid strips # links to last valid strips
@@ -72,8 +318,37 @@ url_overrides = {
     "AmazonSpaceRangers": "http://amazons.comicgenesis.com/d/20051015.html",
     "ArroganceinSimplicity": "http://arrogance.comicgenesis.com/d/20030217.html",
     "ATasteofEvil": "http://atasteofevil.comicgenesis.com/d/20050314.html",
-    "": "",
-    "": "",
+    "CanYouKeepaSecret": "http://cykas.comicgenesis.com/d/20041035.html",
+    "CapturetheMoment": "http://capturethemoment.comicgenesis.com/d/20100927.html",
+    "CornerAlley13": "http://corneralley.comicgenesis.com/d/20101010.html",
+    "Countyoursheep": "http://countyoursheep.keenspot.com/",
+    "FreakU": "http://freaku.comicgenesis.com//d/20080827.html",
+    "FreeParking": "http://freeparking.comicgenesis.com//d/20051029.html",
+    "GamerPsychotica": "http://gp.comicgenesis.com/d/20060113.html",
+    "GoneAstray": "http://goneastray.comicgenesis.com/d/20100305.html",
+    "GoodnEvil": "http://gne.comicgenesis.com/d/20040814.html",
+    "HalflightBreaking": "http://halflight.comicgenesis.com/d/20021031.html",
+    "HealerOnFeatheredWings": "http://selsachronicles.comicgenesis.com/",
+    "HowNottoRunAComic": "http://hownottorunacomic.comicgenesis.com/d/19950719.html",
+    "HurricaneParty": "http://hurricaneparty.comicgenesis.com/d/20040123.html",
+    "MacHall": "http://machall.comicgenesis.com/d/20020125.html",
+    "MaryQuiteContrary": "http://marycontrary.comicgenesis.com/d/20070824.html",
+    "MoonCrest24": "http://mooncrest.comicgenesis.com/d/20121117.html",
+    "MrPinkBlob": "http://mrpinkblob.comicgenesis.com/d/100.html",
+    "NekkoandJoruba": "http://nekkoandjoruba.comicgenesis.com/d/20050816.html",
+    "No4thWalltoBreak": "http://no4thwalltobreak.comicgenesis.com/d/20041025.html",
+    "OtakuKyokai": "http://otakukyokai.comicgenesis.com/d/20060818.html",
+    "PandP": "http://pandpcomic.comicgenesis.com/d/20021002.html",
+    "Paradigm": "http://paradigm.comicgenesis.com/d/20020716.html",
+    "ParallelDementia": "http://paralleldementia.comicgenesis.com/d/20071221.html",
+    "PET": "http://petcomic.comicgenesis.com/d/20070413.html",
+    "PlanetsCollide": "http://ruthcomix.comicgenesis.com/d/20010706.html",
+    "RuneMaster": "http://runemaster.comicgenesis.com/d/20050607.html",
+    "ShinobiHigh": "http://shinobihigh.comicgenesis.com/d/20020118.html",
+    "spacejams": "http://spacejams.comicgenesis.com/d/20020820.html",
+    "TheAdventuresofVindibuddSuperheroInTraining": "http://vindibudd.comicgenesis.com/d/20070720.html",
+    "TriumphantLosers": "http://triumphantlosers.comicgenesis.com/d/20081006.html",
+    "Zortic": "http://zortic.comicgenesis.com/d/20030922.html",
 }


 def handle_url(url, res):

View file

@ -17,12 +17,21 @@ htmltemplate = """
<meta name="viewport" content="width=device-width"> <meta name="viewport" content="width=device-width">
<link rel="stylesheet" href="css/normalize.css"> <link rel="stylesheet" href="css/normalize.css">
<link rel="stylesheet" href="css/main.css"> <link rel="stylesheet" href="css/main.css">
<link rel="stylesheet" href="css/dosage.css">
<script src="js/masonry.min.js"></script>
</head> </head>
<body> <body>
<p>Dosage test results from %(date)s</p> <p>Dosage test results from %(date)s</p>
<ul> <div id="container">
%(content)s %(content)s
</ul> </div>
<script>
window.onload = function() {
var wall = new Masonry( document.getElementById('container'), {
columnWidth: 240
});
};
</script>
</body> </body>
</html> </html>
""" """
@ -80,7 +89,7 @@ def get_content(filename):
inner = '<a href="%s" class="%s">%s</a>' % (url, css, name) inner = '<a href="%s" class="%s">%s</a>' % (url, css, name)
else: else:
inner = '<span class="%s">%s</span>' % (css, name) inner = '<span class="%s">%s</span>' % (css, name)
res.append(' <li>%s</li>' % inner) res.append(' <div class="item">%s</div>' % inner)
return os.linesep.join(res) return os.linesep.join(res)

View file

@@ -20,6 +20,20 @@ url_matcher = re.compile(r'<li><a href="(/comics/[^"]+)">([^<]+)</a>')

 # names of comics to exclude
 exclude_comics = [
+    "BusinessAndFinance", # not a comic
+    "ComicPanel", # not a comic
+    "ComicsAZ", # not a comic
+    "ComicStrip", # not a comic
+    "Espaol", # not a comic
+    "Family", # not a comic
+    "ForKids", # not a comic
+    "JamesBond", # not a comic
+    "Men", # not a comic
+    "NEA", # not a comic
+    "Pets", # not a comic
+    "SundayOnly", # not a comic
+    "WebExclusive", # not a comic
+    "Women", # not a comic
 ]

View file

@@ -43,7 +43,7 @@ class _ComicTester(TestCase):
             self.check(images > 0, 'failed to find images at %s' % strip.stripUrl)
             if not self.scraperclass.multipleImagesPerStrip:
                 self.check(images == 1, 'found %d instead of 1 image at %s' % (images, strip.stripUrl))
-            if num > 0:
+            if num > 0 and self.scraperclass.prevUrlMatchesStripUrl:
                 self.check_stripurl(strip)
             num += 1
         if self.scraperclass.prevSearch:
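
Note: with the new condition, check_stripurl() is only exercised for scrapers whose previous links actually stay on strip pages. A minimal sketch of the gate (Scraper is a hypothetical stand-in for scraperclass):

    class Scraper:
        prevUrlMatchesStripUrl = False  # previous URLs are redirects

    num = 1
    if num > 0 and Scraper.prevUrlMatchesStripUrl:
        print('verify strip URL format')
    else:
        print('skipped: previous URL is a redirect')  # this branch runs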