Fix more comics.
This commit is contained in:
parent
387dff79a9
commit
e5d9002f09
16 changed files with 366 additions and 44 deletions
2
Makefile
2
Makefile
|
@ -68,7 +68,7 @@ pyflakes:
|
|||
pyflakes $(PY_FILES_DIRS)
|
||||
|
||||
count:
|
||||
@sloccount dosage dosagelib | grep "Total Physical Source Lines of Code"
|
||||
@sloccount $(PY_FILES_DIRS) | grep "Total Physical Source Lines of Code"
|
||||
|
||||
clean:
|
||||
find . -name \*.pyc -delete
|
||||
|
|
|
@ -7,7 +7,7 @@ import rfc822
|
|||
import time
|
||||
|
||||
from .output import out
|
||||
from .util import urlopen, normaliseURL, unquote, strsize
|
||||
from .util import getImageObject, normaliseURL, unquote, strsize
|
||||
from .events import getHandler
|
||||
|
||||
class FetchComicError(IOError):
|
||||
|
@ -52,7 +52,7 @@ class ComicImage(object):
|
|||
def connect(self):
|
||||
"""Connect to host and get meta information."""
|
||||
try:
|
||||
self.urlobj = urlopen(self.url, referrer=self.referrer)
|
||||
self.urlobj = getImageObject(self.url, self.referrer)
|
||||
except IOError as msg:
|
||||
raise FetchComicError('Unable to retrieve URL.', self.url, msg)
|
||||
|
||||
|
|
|
@ -9,7 +9,10 @@ from ..util import tagre
|
|||
|
||||
_imageSearch = compile(tagre("img", "src", r'([^"]*/comics/[^"]+)'))
|
||||
_prevSearch = compile(tagre("a", "href", r'([^"]*/d/\d{8}\.html)') +
|
||||
'(?:Previous comic|'+tagre("img", "alt", "Previous comic")+')')
|
||||
'(?:Previous comic' + '|' +
|
||||
tagre("img", "alt", "Previous comic") + '|' +
|
||||
tagre("img", "src", "images/back\.gif") +
|
||||
')')
|
||||
|
||||
def add(name, url):
|
||||
classname = 'KeenSpot_%s' % name
|
||||
|
@ -17,7 +20,9 @@ def add(name, url):
|
|||
@classmethod
|
||||
def _prevUrlModifier(cls, prevUrl):
|
||||
if prevUrl:
|
||||
return prevUrl.replace("keenspace", "comicgenesis"
|
||||
return prevUrl.replace("keenspace.com", "comicgenesis.com"
|
||||
).replace("keenspot.com", "comicgenesis.com"
|
||||
).replace("toonspace.com", "comicgenesis.com"
|
||||
).replace("comicgen.com", "comicgenesis.com")
|
||||
|
||||
globals()[classname] = make_scraper(classname,
|
||||
|
|
|
@ -6,16 +6,17 @@ from re import compile
|
|||
from ..scraper import make_scraper
|
||||
from ..util import tagre
|
||||
|
||||
_imageSearch = compile(tagre("img", "src", r'(http://www\.nuklearpower\.com/comics/[^"]+)'))
|
||||
_imageSearch = compile(tagre("img", "src", r'(http://v\.cdn\.nuklearpower\.com/comics/[^"]+)'))
|
||||
_prevSearch = compile(tagre("a", "href", r'([^"]+)') + "Previous")
|
||||
|
||||
def add(name, shortname):
|
||||
baseUrl = 'http://www.nuklearpower.com/' + shortname + '/'
|
||||
baseUrl = 'http://www.nuklearpower.com/'
|
||||
latestUrl = baseUrl + shortname + '/'
|
||||
classname = 'NuklearPower_%s' % name
|
||||
|
||||
globals()[classname] = make_scraper(classname,
|
||||
name='NuklearPower/' + name,
|
||||
latestUrl = baseUrl,
|
||||
latestUrl = latestUrl,
|
||||
stripUrl = baseUrl + '%s',
|
||||
imageSearch = _imageSearch,
|
||||
prevSearch = _prevSearch,
|
||||
|
|
|
@ -8,8 +8,8 @@ from ..util import tagre
|
|||
|
||||
_imageSearch = compile(tagre("img", "src", r'(http://(?:www|img2)\.smackjeeves\.com/images/uploaded/comics/[^"]+)'))
|
||||
_linkSearch = tagre("a", "href", r'([^"]*/comics/\d+/[^"]*)')
|
||||
_prevSearch = compile(_linkSearch + '(?:<img[^>]*alt="< Previous"|< Back)')
|
||||
_nextSearch = compile(_linkSearch + '(?:<img[^>]*alt="Next >"|Next >)')
|
||||
_prevSearch = compile(_linkSearch + '(?:<img[^>]*alt="< Previous"|< Back|. previous)')
|
||||
_nextSearch = compile(_linkSearch + '(?:<img[^>]*alt="Next >"|Next >|next )')
|
||||
|
||||
def add(name):
|
||||
classname = 'SmackJeeves/' + name
|
||||
|
@ -39,6 +39,3 @@ add('durian')
|
|||
add('heard')
|
||||
add('mpmcomic')
|
||||
add('nlmo-project')
|
||||
add('paranoidloyd')
|
||||
add('thatdreamagain')
|
||||
add('wowcomics')
|
||||
|
|
|
@ -23,10 +23,8 @@ def add(name, host):
|
|||
)
|
||||
|
||||
|
||||
add('Grim', 'grim')
|
||||
add('KOF', 'kof')
|
||||
add('PowerPuffGirls', 'ppg')
|
||||
add('Snafu', 'www')
|
||||
add('Tin', 'tin')
|
||||
add('TW', 'tw')
|
||||
add('Sugar', 'sugar')
|
||||
|
|
|
@ -18,7 +18,7 @@ _imageSearch = compile(tagre("img", "src", r'(http://assets\.amuniversal\.com/[^
|
|||
|
||||
def add(name, shortname):
|
||||
latestUrl = 'http://www.universaluclick.com%s' % shortname
|
||||
classname = 'UClick_%s' % name
|
||||
classname = 'Universal_%s' % name
|
||||
|
||||
@classmethod
|
||||
def namer(cls, imageUrl, pageUrl):
|
||||
|
@ -34,7 +34,7 @@ def add(name, shortname):
|
|||
return parse_strdate(strdate).strftime("%Y%m%d")
|
||||
|
||||
globals()[classname] = make_scraper(classname,
|
||||
name='UClick/' + name,
|
||||
name='Universal/' + name,
|
||||
latestUrl = latestUrl,
|
||||
stripUrl = latestUrl + '%s/',
|
||||
imageSearch = _imageSearch,
|
||||
|
|
|
@ -17,18 +17,16 @@ def add(name, subpath):
|
|||
latestUrl = baseUrl + subpath,
|
||||
stripUrl = baseUrl + '?view=archive&chapter=%s',
|
||||
imageSearch = _imageSearch,
|
||||
multipleImagesPerStrip = True,
|
||||
prevSearch = _prevSearch,
|
||||
# the prevSearch is a redirect
|
||||
prevUrlMatchesStripUrl = False,
|
||||
help = 'Index format: nnnn (non-contiguous)',
|
||||
)
|
||||
|
||||
|
||||
add('AgnesQuill', 'daveroman/agnes/')
|
||||
add('Elvenbaath', 'tdotodot2k/elvenbaath/')
|
||||
add('IrrationalFears', 'uvernon/irrationalfears/')
|
||||
add('KismetHuntersMoon', 'laylalawlor/huntersmoon/')
|
||||
add('SaikoAndLavender', 'gc/saiko/')
|
||||
add('MyMuse', 'gc/muse/')
|
||||
add('NekkoAndJoruba', 'nekkoandjoruba/nekkoandjoruba/')
|
||||
add('JaxEpoch', 'johngreen/quicken/')
|
||||
add('QuantumRockOfAges', 'DreamchildNYC/quantum/')
|
||||
add('ClownSamurai', 'qsamurai/clownsamurai/')
|
||||
|
|
|
@ -22,11 +22,17 @@ class _BasicScraper(object):
|
|||
@cvar prevSearch: A compiled regex that will locate the URL for the
|
||||
previous strip when applied to a strip page.
|
||||
'''
|
||||
|
||||
# if more than one image per URL is expected
|
||||
multipleImagesPerStrip = False
|
||||
|
||||
# set to False if previous URLs do not match the strip URL (i.e. because of redirects)
|
||||
prevUrlMatchesStripUrl = True
|
||||
|
||||
# usually the index format help
|
||||
help = 'Sorry, no help for this comic yet.'
|
||||
|
||||
|
||||
def __init__(self, indexes=None):
|
||||
"""Initialize internal variables."""
|
||||
self.urls = set()
|
||||
|
|
|
@ -21,7 +21,12 @@ if os.name == 'nt':
|
|||
|
||||
has_curses = has_module("curses")
|
||||
|
||||
MAX_FILESIZE = 1024*1024*1 # 1MB
|
||||
# Maximum content size for HTML pages
|
||||
MaxContentBytes = 1024 * 1024 * 2 # 2 MB
|
||||
|
||||
# Maximum content size for images
|
||||
MaxImageBytes = 1024 * 1024 * 20 # 20 MB
|
||||
|
||||
|
||||
def tagre(tag, attribute, value, quote='"', before="", after=""):
|
||||
"""Return a regular expression matching the given HTML tag, attribute
|
||||
|
@ -71,9 +76,9 @@ def case_insensitive_re(name):
|
|||
|
||||
baseSearch = re.compile(tagre("base", "href", '([^"]*)'))
|
||||
|
||||
def getPageContent(url):
|
||||
def getPageContent(url, max_content_bytes=MaxContentBytes):
|
||||
# read page data
|
||||
page = urlopen(url)
|
||||
page = urlopen(url, max_content_bytes=max_content_bytes)
|
||||
data = page.text
|
||||
# determine base URL
|
||||
baseUrl = None
|
||||
|
@ -85,6 +90,11 @@ def getPageContent(url):
|
|||
return data, baseUrl
|
||||
|
||||
|
||||
def getImageObject(url, referrer, max_content_bytes=MaxImageBytes):
    """Open the given image URL and return the resulting response object.

    The referrer and the size limit are passed through to urlopen().
    """
    response = urlopen(url, referrer=referrer, max_content_bytes=max_content_bytes)
    return response
|
||||
|
||||
|
||||
def fetchUrl(url, urlSearch):
|
||||
data, baseUrl = getPageContent(url)
|
||||
match = urlSearch.search(data)
|
||||
|
@ -116,7 +126,6 @@ def fetchUrls(url, imageSearch, prevSearch=None):
|
|||
prevUrl = match.group(1)
|
||||
if not prevUrl:
|
||||
raise ValueError("Match empty previous URL at %s with pattern %s" % (url, prevSearch.pattern))
|
||||
out.write('matched previous URL %r' % prevUrl, 2)
|
||||
prevUrl = normaliseURL(urlparse.urljoin(baseUrl, prevUrl))
|
||||
else:
|
||||
out.write('no previous URL %s at %s' % (prevSearch.pattern, url), 2)
|
||||
|
@ -174,7 +183,7 @@ def normaliseURL(url):
|
|||
return urlparse.urlunparse(pu)
|
||||
|
||||
|
||||
def urlopen(url, referrer=None, retries=3, retry_wait_seconds=5):
|
||||
def urlopen(url, referrer=None, retries=3, retry_wait_seconds=5, max_content_bytes=None):
|
||||
out.write('Open URL %s' % url, 2)
|
||||
assert retries >= 0, 'invalid retry value %r' % retries
|
||||
assert retry_wait_seconds > 0, 'invalid retry seconds value %r' % retry_wait_seconds
|
||||
|
@ -183,7 +192,8 @@ def urlopen(url, referrer=None, retries=3, retry_wait_seconds=5):
|
|||
if referrer:
|
||||
headers['Referer'] = referrer
|
||||
try:
|
||||
req = requests.get(url, headers=headers, config=config)
|
||||
req = requests.get(url, headers=headers, config=config, prefetch=False)
|
||||
check_content_size(url, req.headers, max_content_bytes)
|
||||
req.raise_for_status()
|
||||
return req
|
||||
except requests.exceptions.RequestException as err:
|
||||
|
@ -191,6 +201,15 @@ def urlopen(url, referrer=None, retries=3, retry_wait_seconds=5):
|
|||
out.write(msg)
|
||||
raise IOError(msg)
|
||||
|
||||
def check_content_size(url, headers, max_content_bytes):
    """Raise IOError if the announced content length exceeds the limit.

    @param url: URL being fetched; only used in the error message.
    @param headers: response header mapping, looked up with the
        lowercase key 'content-length'.
    @param max_content_bytes: maximum allowed size in bytes; a falsy
        value (None or 0) disables the check.
    @raise IOError: if the header announces more than max_content_bytes.
    """
    if not max_content_bytes:
        return
    if 'content-length' not in headers:
        # No size announced, so there is nothing to compare against.
        return
    try:
        size = int(headers['content-length'])
    except (TypeError, ValueError):
        # A malformed header carries no usable size; treat it like a
        # missing header instead of failing with an uncaught ValueError.
        return
    if size > max_content_bytes:
        msg = 'URL content of %s with %d Bytes exceeds %d Bytes.' % (url, size, max_content_bytes)
        raise IOError(msg)
|
||||
|
||||
|
||||
def get_columns (fp):
|
||||
"""Return number of columns for given file."""
|
||||
|
|
|
@ -170,7 +170,7 @@ def handle_url(url, url_matcher, num_matcher, res):
|
|||
end = match.end(1)
|
||||
mo = num_matcher.search(data[end:])
|
||||
if not mo:
|
||||
print("ERROR:", repr(data[end:end+300], file=sys.stderr))
|
||||
print("ERROR:", repr(data[end:end+300]), file=sys.stderr)
|
||||
continue
|
||||
num = int(mo.group(1))
|
||||
res[name] = num
|
||||
|
|
File diff suppressed because one or more lines are too long
|
@ -47,23 +47,269 @@ exclude_comics = [
|
|||
"beerkada", # no images
|
||||
"BelovedLeader", # broken images
|
||||
"BigMouthComics", # page does not follow standard layout
|
||||
"", # page is gone
|
||||
"", # page is gone
|
||||
"", # page is gone
|
||||
"BilltheMagician", # page does not follow standard layout
|
||||
"BlackBlue", # page moved
|
||||
"BlackMagic", # page does not follow standard layout
|
||||
"BloodBound", # page moved
|
||||
"bloodofthedragon", # page does not follow standard layout
|
||||
"BloodWing", # broken images
|
||||
"BlueZombie", # broken page
|
||||
"BoomerExpress", # redirection to another page
|
||||
"BobOnline", # missing images
|
||||
"BottomFlavor", # page does not follow standard layout
|
||||
"BradTheVampire", # page does not follow standard layout
|
||||
"BreakpointCity", # page moved
|
||||
"Brinkerhoff", # page redirected
|
||||
"CampusSafari", # page moved
|
||||
"CapturetheMoment", # page moved
|
||||
"CaseyandAndy", # page moved
|
||||
"Catalyst", # page moved
|
||||
"Cats", # broken images
|
||||
"Chair", # page moved
|
||||
"ChildrenAtPlay", # page does not follow standard layout
|
||||
"chu", # broken images
|
||||
"CoACityofAscii", # only ascii images
|
||||
"ComicMischief", # page moved
|
||||
"ComputerGameAddicts", # page moved
|
||||
"Concession", # page moved
|
||||
"CorridorZ", # page does not follow standard layout
|
||||
"CrashBoomMagic", # page moved
|
||||
"CrazySlowlyGoing", # page has 403 forbidden
|
||||
"CrimsonWings", # page moved
|
||||
"DakotasRidge", # page moved
|
||||
"DATAROM", # broken images
|
||||
"DazeinaHaze", # page moved
|
||||
"DIABOLICA", # broken images
|
||||
"DIfIK", # page does not follow standard layout
|
||||
"DigitalWar", # page is gone
|
||||
"DimBulbComics", # page is gone
|
||||
"DIVE", # page is gone
|
||||
"DominicDeegan", # page moved
|
||||
"DungeonDamage", # page does not follow standard layout
|
||||
"Dylan", # page has 403 forbidden
|
||||
"EarthRiser", # redirects to a new page
|
||||
"EdgetheDevilhunter", # page is gone
|
||||
"EdibleDirt", # page moved
|
||||
"Einstien27sDesk", # page is gone
|
||||
"ElfOnlyInn", # page moved
|
||||
"Ensuing", # broken links
|
||||
"etch", # broken images
|
||||
"EternalCaffeineJunkie", # page does not follow standard layout
|
||||
"EternityComplex", # page does not follow standard layout
|
||||
"Evilish", # page moved
|
||||
"EvolBara", # page is gone
|
||||
"FaerieTales", # page does not follow standard layout
|
||||
"FairyTaleNewVillage", # missing images
|
||||
"Fate27sTear", # page moved
|
||||
"FaultyLogic", # page does not follow standard layout
|
||||
"FireontheMountain", # page does not follow standard layout
|
||||
"FiveBucksanHour", # page is gone
|
||||
"Flatwood", # page moved
|
||||
"FLEMComics", # page moved
|
||||
"FletchersCave", # page is broken
|
||||
"ForcesofGoodandEvil", # page does not follow standard layout
|
||||
"FurryBlackDevil", # page moved
|
||||
"Galacticus", # page has 403 forbidden
|
||||
"GeebasonParade", # page does not follow standard layout
|
||||
"geeks", # page moved
|
||||
"GeminiBright", # page does not follow standard layout
|
||||
"GemutationsPlague", # page does not follow standard layout
|
||||
"GeorgetheSecond", # page does not follow standard layout
|
||||
"Ghostz", # page does not follow standard layout
|
||||
"GODLIKE", # page has 403 forbidden
|
||||
"GoForIt", # page is gone
|
||||
"JuvenileDiversion", # page moved
|
||||
"GothBoy", # page moved
|
||||
"Grimage", # page moved
|
||||
"GrossePointeDogs", # page is broken
|
||||
"GUComics", # page moved
|
||||
"HardUnderbelly", # page does not follow standard layout
|
||||
"HazardousScience", # page is gone
|
||||
"HereThereBeDragons", # page moved
|
||||
"HighMaintenance", # missing images
|
||||
"HighSchoolRPG", # page does not follow standard layout
|
||||
"Horndog", # page moved
|
||||
"HorseshoesandHandgrenades", # missing images
|
||||
"HotelGrim", # missing images
|
||||
"IAlwaysWakeUpLazy", # page moved
|
||||
"ihatesteve", # page is gone
|
||||
"IllicitMiracles", # page does not follow standard layout
|
||||
"IndefensiblePositions", # page does not follow standard layout
|
||||
"InsanityFair", # page does not follow standard layout
|
||||
"InsideJoke", # page is gone
|
||||
"InsidetheBox", # page has 403 forbidden
|
||||
"InternationalHopeFoundation", # page does not follow standard layout
|
||||
"JamieandNick", # page moved
|
||||
"JasonLovesHisGrandpa", # page is gone
|
||||
"JavanteasFate", # page is gone
|
||||
"JBBcomics", # page is gone
|
||||
"JedandDark", # page does not follow standard layout
|
||||
"JoBeth", # page moved
|
||||
"Joyride", # page moved
|
||||
"JustAnotherEscape", # page moved
|
||||
"JustWeird", # page has 403 forbidden
|
||||
"Michikomonogatari", # page does not follow standard layout
|
||||
"MobileMadness", # page does not follow standard layout
|
||||
"JuvenileDiversion", # page moved
|
||||
"JWalkinAndapos", # missing images
|
||||
"KarmaSlave", # page moved
|
||||
"KeenLace", # page is gone
|
||||
"khaoskomic", # page moved
|
||||
"KillingTime", # page is gone
|
||||
"KnightsOfTheNexus", # page does not follow standard layout
|
||||
"KoFightClub", # page moved
|
||||
"LabGoatsInc", # page moved
|
||||
"LandofGreed", # page is gone
|
||||
"LeanOnMe", # page has 403 forbidden
|
||||
"LegendsofRovana", # page has 403 forbidden
|
||||
"LifeatBayside", # page does not follow standard layout
|
||||
"LifeinaNutshell", # page does not follow standard layout
|
||||
"Lifesuchasitis", # page has 403 forbidden
|
||||
"LinktotheBoards", # page does not follow standard layout
|
||||
"LinT", # page moved
|
||||
"LiterallySpeaking", # page does not follow standard layout
|
||||
"LoxieAndZoot", # page does not follow standard layout
|
||||
"Lunchtable", # missing images
|
||||
"MadWorld", # page has 403 forbidden
|
||||
"Magellan", # page does not follow standard layout
|
||||
"Marachan", # missing images
|
||||
"MassProduction", # page does not follow standard layout
|
||||
"MayIHelpYou", # page has 403 forbidden
|
||||
"Meiosis", # page moved
|
||||
"Michikomonogatari", # page does not follow standard layout
|
||||
"MidnorthFlourCo", # page has 403 forbidden
|
||||
"MintCondition", # page moved
|
||||
"MisadventuresinPhysics", # page has 403 forbidden
|
||||
"MobileMadness", # page does not follow standard layout
|
||||
"MyAngelYouAreAngel", # page is gone
|
||||
"MyBrainHurts", # page does not follow standard layout
|
||||
"NAFTANorthAmericanFreeToonAgreementalsoYankuckcanee", # page does not follow standard layout
|
||||
"NeglectedMarioCharacterComix", # page does not follow standard layout
|
||||
"Nemutionjewel", # page does not follow standard layout
|
||||
"Nerdgasm", # missing images
|
||||
"Nerdz", # page is gone
|
||||
"Nervillsaga", # page does not follow standard layout
|
||||
"NetherOakasuburbanadventure", # page does not follow standard layout
|
||||
"NoNeedForBushido", # page moved
|
||||
"nothingcomesnaturally", # page does not follow standard layout
|
||||
"NymphsoftheWest", # too few images
|
||||
"OffTheWall", # page does not follow standard layout
|
||||
"OneHourAxis", # page is gone
|
||||
"OnlyOne", # page is gone
|
||||
"OopsNevermind", # page is gone
|
||||
"PacoStand", # page has 403 forbidden
|
||||
"Pander", # page is gone
|
||||
"PANDORA", # page is missing pages
|
||||
"PhilosophyBites", # missing images
|
||||
"PhilosophyMonkey", # page is gone
|
||||
"PicpakDog", # page moved
|
||||
"PictureDiary", # page is gone
|
||||
"PillarsofFaith", # page does not follow standard layout
|
||||
"Pimpette", # page moved
|
||||
"PokC3A9Chow", # page has 403 forbidden
|
||||
"PolleninArabia", # page does not follow standard layout
|
||||
"PranMan", # page moved
|
||||
"QueensOfRandomness", # broken images
|
||||
"QuestionableTales", # page does not follow standard layout
|
||||
"RadioactiveFanboys", # page does not follow standard layout
|
||||
"RandomAssembly", # page is gone
|
||||
"RandomInk", # page is gone
|
||||
"ReceptorFatigue", # page does not follow standard layout
|
||||
"Remsi", # page does not follow standard layout
|
||||
"Reset", # page does not follow standard layout
|
||||
"ResistanceLine", # page does not follow standard layout
|
||||
"ReturntoDonnelly", # page is gone
|
||||
"Riboflavin", # page does not follow standard layout
|
||||
"RitualsandOfferings", # page is gone
|
||||
"RiverCityHigh", # page is gone
|
||||
"RM27sothercomics", # page does not follow standard layout
|
||||
"RogerAndDominic", # page does not follow standard layout
|
||||
"RoleoftheDie", # page is gone
|
||||
"RonnieRaccoon", # page moved
|
||||
"RosalarianAndapossRandomCreepyTales", # page is gone
|
||||
"RulesofMakeBelieve", # page is gone
|
||||
"Rveillerie", # page has 403 forbidden
|
||||
"SaintPeter27sCross", # page does not follow standard layout
|
||||
"Saturnalia", # page moved
|
||||
"SavageIslands", # page has 403 forbidden
|
||||
"SaveMeGebus", # page does not follow standard layout
|
||||
"Sawdust", # page has 403 forbidden
|
||||
"Scooterboy1234", # page has 403 forbidden
|
||||
"SecondNight", # page moved
|
||||
"Sempiternal", # page moved
|
||||
"Senioritis", # page has 403 forbidden
|
||||
"ShivaeStudios", # page moved
|
||||
"ShonenAiKudasai", # page is gone
|
||||
"ShootMeNow", # page does not follow standard layout
|
||||
"SidandLasker", # page moved
|
||||
"SillyConeV", # page is gone
|
||||
"Skunk", # page moved
|
||||
"SLAGIT", # missing images
|
||||
"SmithStone", # page has 403 forbidden
|
||||
"SnowflakeStudios", # page is gone
|
||||
"Sock27d", # page is gone
|
||||
"Soks", # page is gone
|
||||
"SoManyLevels", # page moved
|
||||
"SomethingSoft", # page is gone
|
||||
"Sorcery101", # page moved
|
||||
"SpellBinder", # page is gone
|
||||
"SPQRBlues", # page moved
|
||||
"StationV3", # page moved
|
||||
"SticksandStuff", # page does not follow standard layout
|
||||
"StickyFingers", # page does not follow standard layout
|
||||
"Stubble", # page moved
|
||||
"SurrealKins", # page is gone
|
||||
"SwirlyMarkYume", # page does not follow standard layout
|
||||
"SynapticMisfiring", # page is gone
|
||||
"TalesoftheQuestor", # page moved
|
||||
"TAVISION", # page moved
|
||||
"ThatWasMcPherson", # page moved
|
||||
"The6GUYSInMyHead", # page has 403 forbidden
|
||||
"TheAdventuresofCaptainMooki", # page moved
|
||||
"TheAdventuresofLi27lDenverPastrami", # page is gone
|
||||
"TheAdventuresofPeppyThePipingPirate", # page is gone
|
||||
"TheAmoeba", # page is gone
|
||||
"TheAvatar", # page does not follow standard layout
|
||||
"TheBessEffectGerman", # page moved
|
||||
"TheBestandtheBrightest", # page moved
|
||||
"TheDevilsPanties", # page moved
|
||||
"TheDoctorPepperShow", # page has 403 forbidden
|
||||
"TheGods27Pack", # page has 403 forbidden
|
||||
"TheMadBrothers", # page does not follow standard layout
|
||||
"TheMediocres", # missing images
|
||||
"TheNamelessStory", # page has 403 forbidden
|
||||
"Thenoob", # page moved
|
||||
"TheOrangeArrow", # page is gone
|
||||
"TheSailorNeopetsRPG", # page does not follow standard layout
|
||||
"TheWayoftheWorld", # page moved
|
||||
"TheWorldofUh", # broken images
|
||||
"TheWotch", # page does not follow standard layout
|
||||
"ThunderandLightning", # page moved
|
||||
"TinysWorld", # page does not follow standard layout
|
||||
"ToonPimp27sPalace", # page moved
|
||||
"Tossers", # page moved
|
||||
"Towner", # page does not follow standard layout
|
||||
"Townies", # page is gone
|
||||
"TracyandTristan", # page moved
|
||||
"TrialsintheLight", # page does not follow standard layout
|
||||
"ttskr", # page does not follow standard layout
|
||||
"twelvedragons", # page does not follow standard layout
|
||||
"TwoEvilScientists", # page moved
|
||||
"TwoLumps", # page moved
|
||||
"TwoSidesWide", # page moved
|
||||
"Vendetta", # page moved
|
||||
"VictimsoftheSystem", # page moved
|
||||
"Victor", # page moved
|
||||
"WARPZONEthinkwithinthecube", # page does not follow standard layout
|
||||
"WayoftheDodo", # page does not follow standard layout
|
||||
"Wedontgetiteither", # page moved
|
||||
"WeishauptScholars", # page does not follow standard layout
|
||||
"Werechild", # page has 403 forbidden
|
||||
"WhiskeyAndMelancholy", # missing pages
|
||||
"YellowMoon", # page has 403 forbidden
|
||||
"YouScrewedUp", # missing images
|
||||
"YUMEdream", # page moved
|
||||
"Zap", # page moved
|
||||
"ZebraGirl", # page moved
|
||||
"Zeek", # page moved
|
||||
"Zootz", # page is gone
|
||||
]
|
||||
|
||||
# links to last valid strips
|
||||
|
@ -72,8 +318,37 @@ url_overrides = {
|
|||
"AmazonSpaceRangers": "http://amazons.comicgenesis.com/d/20051015.html",
|
||||
"ArroganceinSimplicity": "http://arrogance.comicgenesis.com/d/20030217.html",
|
||||
"ATasteofEvil": "http://atasteofevil.comicgenesis.com/d/20050314.html",
|
||||
"": "",
|
||||
"": "",
|
||||
"CanYouKeepaSecret": "http://cykas.comicgenesis.com/d/20041035.html",
|
||||
"CapturetheMoment": "http://capturethemoment.comicgenesis.com/d/20100927.html",
|
||||
"CornerAlley13": "http://corneralley.comicgenesis.com/d/20101010.html",
|
||||
"Countyoursheep": "http://countyoursheep.keenspot.com/",
|
||||
"FreakU": "http://freaku.comicgenesis.com//d/20080827.html",
|
||||
"FreeParking": "http://freeparking.comicgenesis.com//d/20051029.html",
|
||||
"GamerPsychotica": "http://gp.comicgenesis.com/d/20060113.html",
|
||||
"GoneAstray": "http://goneastray.comicgenesis.com/d/20100305.html",
|
||||
"GoodnEvil": "http://gne.comicgenesis.com/d/20040814.html",
|
||||
"HalflightBreaking": "http://halflight.comicgenesis.com/d/20021031.html",
|
||||
"HealerOnFeatheredWings": "http://selsachronicles.comicgenesis.com/",
|
||||
"HowNottoRunAComic": "http://hownottorunacomic.comicgenesis.com/d/19950719.html",
|
||||
"HurricaneParty": "http://hurricaneparty.comicgenesis.com/d/20040123.html",
|
||||
"MacHall": "http://machall.comicgenesis.com/d/20020125.html",
|
||||
"MaryQuiteContrary": "http://marycontrary.comicgenesis.com/d/20070824.html",
|
||||
"MoonCrest24": "http://mooncrest.comicgenesis.com/d/20121117.html",
|
||||
"MrPinkBlob": "http://mrpinkblob.comicgenesis.com/d/100.html",
|
||||
"NekkoandJoruba": "http://nekkoandjoruba.comicgenesis.com/d/20050816.html",
|
||||
"No4thWalltoBreak": "http://no4thwalltobreak.comicgenesis.com/d/20041025.html",
|
||||
"OtakuKyokai": "http://otakukyokai.comicgenesis.com/d/20060818.html",
|
||||
"PandP": "http://pandpcomic.comicgenesis.com/d/20021002.html",
|
||||
"Paradigm": "http://paradigm.comicgenesis.com/d/20020716.html",
|
||||
"ParallelDementia": "http://paralleldementia.comicgenesis.com/d/20071221.html",
|
||||
"PET": "http://petcomic.comicgenesis.com/d/20070413.html",
|
||||
"PlanetsCollide": "http://ruthcomix.comicgenesis.com/d/20010706.html",
|
||||
"RuneMaster": "http://runemaster.comicgenesis.com/d/20050607.html",
|
||||
"ShinobiHigh": "http://shinobihigh.comicgenesis.com/d/20020118.html",
|
||||
"spacejams": "http://spacejams.comicgenesis.com/d/20020820.html",
|
||||
"TheAdventuresofVindibuddSuperheroInTraining": "http://vindibudd.comicgenesis.com/d/20070720.html",
|
||||
"TriumphantLosers": "http://triumphantlosers.comicgenesis.com/d/20081006.html",
|
||||
"Zortic": "http://zortic.comicgenesis.com/d/20030922.html",
|
||||
}
|
||||
|
||||
def handle_url(url, res):
|
||||
|
|
|
@ -17,12 +17,21 @@ htmltemplate = """
|
|||
<meta name="viewport" content="width=device-width">
|
||||
<link rel="stylesheet" href="css/normalize.css">
|
||||
<link rel="stylesheet" href="css/main.css">
|
||||
<link rel="stylesheet" href="css/dosage.css">
|
||||
<script src="js/masonry.min.js"></script>
|
||||
</head>
|
||||
<body>
|
||||
<p>Dosage test results from %(date)s</p>
|
||||
<ul>
|
||||
%(content)s
|
||||
</ul>
|
||||
<p>Dosage test results from %(date)s</p>
|
||||
<div id="container">
|
||||
%(content)s
|
||||
</div>
|
||||
<script>
|
||||
window.onload = function() {
|
||||
var wall = new Masonry( document.getElementById('container'), {
|
||||
columnWidth: 240
|
||||
});
|
||||
};
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
||||
"""
|
||||
|
@ -80,7 +89,7 @@ def get_content(filename):
|
|||
inner = '<a href="%s" class="%s">%s</a>' % (url, css, name)
|
||||
else:
|
||||
inner = '<span class="%s">%s</span>' % (css, name)
|
||||
res.append(' <li>%s</li>' % inner)
|
||||
res.append(' <div class="item">%s</div>' % inner)
|
||||
return os.linesep.join(res)
|
||||
|
||||
|
||||
|
|
|
@ -20,6 +20,20 @@ url_matcher = re.compile(r'<li><a href="(/comics/[^"]+)">([^<]+)</a>')
|
|||
|
||||
# names of comics to exclude
|
||||
exclude_comics = [
|
||||
"BusinessAndFinance", # not a comic
|
||||
"ComicPanel", # not a comic
|
||||
"ComicsAZ", # not a comic
|
||||
"ComicStrip", # not a comic
|
||||
"Espaol", # not a comic
|
||||
"Family", # not a comic
|
||||
"ForKids", # not a comic
|
||||
"JamesBond", # not a comic
|
||||
"Men", # not a comic
|
||||
"NEA", # not a comic
|
||||
"Pets", # not a comic
|
||||
"SundayOnly", # not a comic
|
||||
"WebExclusive", # not a comic
|
||||
"Women", # not a comic
|
||||
]
|
||||
|
||||
|
||||
|
|
|
@ -43,7 +43,7 @@ class _ComicTester(TestCase):
|
|||
self.check(images > 0, 'failed to find images at %s' % strip.stripUrl)
|
||||
if not self.scraperclass.multipleImagesPerStrip:
|
||||
self.check(images == 1, 'found %d instead of 1 image at %s' % (images, strip.stripUrl))
|
||||
if num > 0:
|
||||
if num > 0 and self.scraperclass.prevUrlMatchesStripUrl:
|
||||
self.check_stripurl(strip)
|
||||
num += 1
|
||||
if self.scraperclass.prevSearch:
|
||||
|
|
Loading…
Reference in a new issue