Fix more comics.
This commit is contained in:
parent
387dff79a9
commit
e5d9002f09
16 changed files with 366 additions and 44 deletions
2
Makefile
2
Makefile
|
@ -68,7 +68,7 @@ pyflakes:
|
||||||
pyflakes $(PY_FILES_DIRS)
|
pyflakes $(PY_FILES_DIRS)
|
||||||
|
|
||||||
count:
|
count:
|
||||||
@sloccount dosage dosagelib | grep "Total Physical Source Lines of Code"
|
@sloccount $(PY_FILES_DIRS) | grep "Total Physical Source Lines of Code"
|
||||||
|
|
||||||
clean:
|
clean:
|
||||||
find . -name \*.pyc -delete
|
find . -name \*.pyc -delete
|
||||||
|
|
|
@ -7,7 +7,7 @@ import rfc822
|
||||||
import time
|
import time
|
||||||
|
|
||||||
from .output import out
|
from .output import out
|
||||||
from .util import urlopen, normaliseURL, unquote, strsize
|
from .util import getImageObject, normaliseURL, unquote, strsize
|
||||||
from .events import getHandler
|
from .events import getHandler
|
||||||
|
|
||||||
class FetchComicError(IOError):
|
class FetchComicError(IOError):
|
||||||
|
@ -52,7 +52,7 @@ class ComicImage(object):
|
||||||
def connect(self):
|
def connect(self):
|
||||||
"""Connect to host and get meta information."""
|
"""Connect to host and get meta information."""
|
||||||
try:
|
try:
|
||||||
self.urlobj = urlopen(self.url, referrer=self.referrer)
|
self.urlobj = getImageObject(self.url, self.referrer)
|
||||||
except IOError as msg:
|
except IOError as msg:
|
||||||
raise FetchComicError('Unable to retrieve URL.', self.url, msg)
|
raise FetchComicError('Unable to retrieve URL.', self.url, msg)
|
||||||
|
|
||||||
|
|
|
@ -9,7 +9,10 @@ from ..util import tagre
|
||||||
|
|
||||||
_imageSearch = compile(tagre("img", "src", r'([^"]*/comics/[^"]+)'))
|
_imageSearch = compile(tagre("img", "src", r'([^"]*/comics/[^"]+)'))
|
||||||
_prevSearch = compile(tagre("a", "href", r'([^"]*/d/\d{8}\.html)') +
|
_prevSearch = compile(tagre("a", "href", r'([^"]*/d/\d{8}\.html)') +
|
||||||
'(?:Previous comic|'+tagre("img", "alt", "Previous comic")+')')
|
'(?:Previous comic' + '|' +
|
||||||
|
tagre("img", "alt", "Previous comic") + '|' +
|
||||||
|
tagre("img", "src", "images/back\.gif") +
|
||||||
|
')')
|
||||||
|
|
||||||
def add(name, url):
|
def add(name, url):
|
||||||
classname = 'KeenSpot_%s' % name
|
classname = 'KeenSpot_%s' % name
|
||||||
|
@ -17,7 +20,9 @@ def add(name, url):
|
||||||
@classmethod
|
@classmethod
|
||||||
def _prevUrlModifier(cls, prevUrl):
|
def _prevUrlModifier(cls, prevUrl):
|
||||||
if prevUrl:
|
if prevUrl:
|
||||||
return prevUrl.replace("keenspace", "comicgenesis"
|
return prevUrl.replace("keenspace.com", "comicgenesis.com"
|
||||||
|
).replace("keenspot.com", "comicgenesis.com"
|
||||||
|
).replace("toonspace.com", "comicgenesis.com"
|
||||||
).replace("comicgen.com", "comicgenesis.com")
|
).replace("comicgen.com", "comicgenesis.com")
|
||||||
|
|
||||||
globals()[classname] = make_scraper(classname,
|
globals()[classname] = make_scraper(classname,
|
||||||
|
|
|
@ -6,16 +6,17 @@ from re import compile
|
||||||
from ..scraper import make_scraper
|
from ..scraper import make_scraper
|
||||||
from ..util import tagre
|
from ..util import tagre
|
||||||
|
|
||||||
_imageSearch = compile(tagre("img", "src", r'(http://www\.nuklearpower\.com/comics/[^"]+)'))
|
_imageSearch = compile(tagre("img", "src", r'(http://v\.cdn\.nuklearpower\.com/comics/[^"]+)'))
|
||||||
_prevSearch = compile(tagre("a", "href", r'([^"]+)') + "Previous")
|
_prevSearch = compile(tagre("a", "href", r'([^"]+)') + "Previous")
|
||||||
|
|
||||||
def add(name, shortname):
|
def add(name, shortname):
|
||||||
baseUrl = 'http://www.nuklearpower.com/' + shortname + '/'
|
baseUrl = 'http://www.nuklearpower.com/'
|
||||||
|
latestUrl = baseUrl + shortname + '/'
|
||||||
classname = 'NuklearPower_%s' % name
|
classname = 'NuklearPower_%s' % name
|
||||||
|
|
||||||
globals()[classname] = make_scraper(classname,
|
globals()[classname] = make_scraper(classname,
|
||||||
name='NuklearPower/' + name,
|
name='NuklearPower/' + name,
|
||||||
latestUrl = baseUrl,
|
latestUrl = latestUrl,
|
||||||
stripUrl = baseUrl + '%s',
|
stripUrl = baseUrl + '%s',
|
||||||
imageSearch = _imageSearch,
|
imageSearch = _imageSearch,
|
||||||
prevSearch = _prevSearch,
|
prevSearch = _prevSearch,
|
||||||
|
|
|
@ -8,8 +8,8 @@ from ..util import tagre
|
||||||
|
|
||||||
_imageSearch = compile(tagre("img", "src", r'(http://(?:www|img2)\.smackjeeves\.com/images/uploaded/comics/[^"]+)'))
|
_imageSearch = compile(tagre("img", "src", r'(http://(?:www|img2)\.smackjeeves\.com/images/uploaded/comics/[^"]+)'))
|
||||||
_linkSearch = tagre("a", "href", r'([^"]*/comics/\d+/[^"]*)')
|
_linkSearch = tagre("a", "href", r'([^"]*/comics/\d+/[^"]*)')
|
||||||
_prevSearch = compile(_linkSearch + '(?:<img[^>]*alt="< Previous"|< Back)')
|
_prevSearch = compile(_linkSearch + '(?:<img[^>]*alt="< Previous"|< Back|. previous)')
|
||||||
_nextSearch = compile(_linkSearch + '(?:<img[^>]*alt="Next >"|Next >)')
|
_nextSearch = compile(_linkSearch + '(?:<img[^>]*alt="Next >"|Next >|next )')
|
||||||
|
|
||||||
def add(name):
|
def add(name):
|
||||||
classname = 'SmackJeeves/' + name
|
classname = 'SmackJeeves/' + name
|
||||||
|
@ -39,6 +39,3 @@ add('durian')
|
||||||
add('heard')
|
add('heard')
|
||||||
add('mpmcomic')
|
add('mpmcomic')
|
||||||
add('nlmo-project')
|
add('nlmo-project')
|
||||||
add('paranoidloyd')
|
|
||||||
add('thatdreamagain')
|
|
||||||
add('wowcomics')
|
|
||||||
|
|
|
@ -23,10 +23,8 @@ def add(name, host):
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
add('Grim', 'grim')
|
|
||||||
add('KOF', 'kof')
|
add('KOF', 'kof')
|
||||||
add('PowerPuffGirls', 'ppg')
|
add('PowerPuffGirls', 'ppg')
|
||||||
add('Snafu', 'www')
|
|
||||||
add('Tin', 'tin')
|
add('Tin', 'tin')
|
||||||
add('TW', 'tw')
|
add('TW', 'tw')
|
||||||
add('Sugar', 'sugar')
|
add('Sugar', 'sugar')
|
||||||
|
|
|
@ -18,7 +18,7 @@ _imageSearch = compile(tagre("img", "src", r'(http://assets\.amuniversal\.com/[^
|
||||||
|
|
||||||
def add(name, shortname):
|
def add(name, shortname):
|
||||||
latestUrl = 'http://www.universaluclick.com%s' % shortname
|
latestUrl = 'http://www.universaluclick.com%s' % shortname
|
||||||
classname = 'UClick_%s' % name
|
classname = 'Universal_%s' % name
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def namer(cls, imageUrl, pageUrl):
|
def namer(cls, imageUrl, pageUrl):
|
||||||
|
@ -34,7 +34,7 @@ def add(name, shortname):
|
||||||
return parse_strdate(strdate).strftime("%Y%m%d")
|
return parse_strdate(strdate).strftime("%Y%m%d")
|
||||||
|
|
||||||
globals()[classname] = make_scraper(classname,
|
globals()[classname] = make_scraper(classname,
|
||||||
name='UClick/' + name,
|
name='Universal/' + name,
|
||||||
latestUrl = latestUrl,
|
latestUrl = latestUrl,
|
||||||
stripUrl = latestUrl + '%s/',
|
stripUrl = latestUrl + '%s/',
|
||||||
imageSearch = _imageSearch,
|
imageSearch = _imageSearch,
|
||||||
|
|
|
@ -17,18 +17,16 @@ def add(name, subpath):
|
||||||
latestUrl = baseUrl + subpath,
|
latestUrl = baseUrl + subpath,
|
||||||
stripUrl = baseUrl + '?view=archive&chapter=%s',
|
stripUrl = baseUrl + '?view=archive&chapter=%s',
|
||||||
imageSearch = _imageSearch,
|
imageSearch = _imageSearch,
|
||||||
|
multipleImagesPerStrip = True,
|
||||||
prevSearch = _prevSearch,
|
prevSearch = _prevSearch,
|
||||||
|
# the prevSearch is a redirect
|
||||||
|
prevUrlMatchesStripUrl = False,
|
||||||
help = 'Index format: nnnn (non-contiguous)',
|
help = 'Index format: nnnn (non-contiguous)',
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
add('AgnesQuill', 'daveroman/agnes/')
|
add('AgnesQuill', 'daveroman/agnes/')
|
||||||
add('Elvenbaath', 'tdotodot2k/elvenbaath/')
|
|
||||||
add('IrrationalFears', 'uvernon/irrationalfears/')
|
|
||||||
add('KismetHuntersMoon', 'laylalawlor/huntersmoon/')
|
|
||||||
add('SaikoAndLavender', 'gc/saiko/')
|
|
||||||
add('MyMuse', 'gc/muse/')
|
add('MyMuse', 'gc/muse/')
|
||||||
add('NekkoAndJoruba', 'nekkoandjoruba/nekkoandjoruba/')
|
add('NekkoAndJoruba', 'nekkoandjoruba/nekkoandjoruba/')
|
||||||
add('JaxEpoch', 'johngreen/quicken/')
|
add('JaxEpoch', 'johngreen/quicken/')
|
||||||
add('QuantumRockOfAges', 'DreamchildNYC/quantum/')
|
|
||||||
add('ClownSamurai', 'qsamurai/clownsamurai/')
|
add('ClownSamurai', 'qsamurai/clownsamurai/')
|
||||||
|
|
|
@ -22,11 +22,17 @@ class _BasicScraper(object):
|
||||||
@cvar prevSearch: A compiled regex that will locate the URL for the
|
@cvar prevSearch: A compiled regex that will locate the URL for the
|
||||||
previous strip when applied to a strip page.
|
previous strip when applied to a strip page.
|
||||||
'''
|
'''
|
||||||
|
|
||||||
# if more than one image per URL is expected
|
# if more than one image per URL is expected
|
||||||
multipleImagesPerStrip = False
|
multipleImagesPerStrip = False
|
||||||
|
|
||||||
|
# set to False if previous URLs do not match the strip URL (i.e. because of redirects)
|
||||||
|
prevUrlMatchesStripUrl = True
|
||||||
|
|
||||||
# usually the index format help
|
# usually the index format help
|
||||||
help = 'Sorry, no help for this comic yet.'
|
help = 'Sorry, no help for this comic yet.'
|
||||||
|
|
||||||
|
|
||||||
def __init__(self, indexes=None):
|
def __init__(self, indexes=None):
|
||||||
"""Initialize internal variables."""
|
"""Initialize internal variables."""
|
||||||
self.urls = set()
|
self.urls = set()
|
||||||
|
|
|
@ -21,7 +21,12 @@ if os.name == 'nt':
|
||||||
|
|
||||||
has_curses = has_module("curses")
|
has_curses = has_module("curses")
|
||||||
|
|
||||||
MAX_FILESIZE = 1024*1024*1 # 1MB
|
# Maximum content size for HTML pages
|
||||||
|
MaxContentBytes = 1024 * 1024 * 2 # 2 MB
|
||||||
|
|
||||||
|
# Maximum content size for images
|
||||||
|
MaxImageBytes = 1024 * 1024 * 20 # 20 MB
|
||||||
|
|
||||||
|
|
||||||
def tagre(tag, attribute, value, quote='"', before="", after=""):
|
def tagre(tag, attribute, value, quote='"', before="", after=""):
|
||||||
"""Return a regular expression matching the given HTML tag, attribute
|
"""Return a regular expression matching the given HTML tag, attribute
|
||||||
|
@ -71,9 +76,9 @@ def case_insensitive_re(name):
|
||||||
|
|
||||||
baseSearch = re.compile(tagre("base", "href", '([^"]*)'))
|
baseSearch = re.compile(tagre("base", "href", '([^"]*)'))
|
||||||
|
|
||||||
def getPageContent(url):
|
def getPageContent(url, max_content_bytes=MaxContentBytes):
|
||||||
# read page data
|
# read page data
|
||||||
page = urlopen(url)
|
page = urlopen(url, max_content_bytes=max_content_bytes)
|
||||||
data = page.text
|
data = page.text
|
||||||
# determine base URL
|
# determine base URL
|
||||||
baseUrl = None
|
baseUrl = None
|
||||||
|
@ -85,6 +90,11 @@ def getPageContent(url):
|
||||||
return data, baseUrl
|
return data, baseUrl
|
||||||
|
|
||||||
|
|
||||||
|
def getImageObject(url, referrer, max_content_bytes=MaxImageBytes):
|
||||||
|
"""Get response object for given image URL."""
|
||||||
|
return urlopen(url, referrer=referrer, max_content_bytes=max_content_bytes)
|
||||||
|
|
||||||
|
|
||||||
def fetchUrl(url, urlSearch):
|
def fetchUrl(url, urlSearch):
|
||||||
data, baseUrl = getPageContent(url)
|
data, baseUrl = getPageContent(url)
|
||||||
match = urlSearch.search(data)
|
match = urlSearch.search(data)
|
||||||
|
@ -116,7 +126,6 @@ def fetchUrls(url, imageSearch, prevSearch=None):
|
||||||
prevUrl = match.group(1)
|
prevUrl = match.group(1)
|
||||||
if not prevUrl:
|
if not prevUrl:
|
||||||
raise ValueError("Match empty previous URL at %s with pattern %s" % (url, prevSearch.pattern))
|
raise ValueError("Match empty previous URL at %s with pattern %s" % (url, prevSearch.pattern))
|
||||||
out.write('matched previous URL %r' % prevUrl, 2)
|
|
||||||
prevUrl = normaliseURL(urlparse.urljoin(baseUrl, prevUrl))
|
prevUrl = normaliseURL(urlparse.urljoin(baseUrl, prevUrl))
|
||||||
else:
|
else:
|
||||||
out.write('no previous URL %s at %s' % (prevSearch.pattern, url), 2)
|
out.write('no previous URL %s at %s' % (prevSearch.pattern, url), 2)
|
||||||
|
@ -174,7 +183,7 @@ def normaliseURL(url):
|
||||||
return urlparse.urlunparse(pu)
|
return urlparse.urlunparse(pu)
|
||||||
|
|
||||||
|
|
||||||
def urlopen(url, referrer=None, retries=3, retry_wait_seconds=5):
|
def urlopen(url, referrer=None, retries=3, retry_wait_seconds=5, max_content_bytes=None):
|
||||||
out.write('Open URL %s' % url, 2)
|
out.write('Open URL %s' % url, 2)
|
||||||
assert retries >= 0, 'invalid retry value %r' % retries
|
assert retries >= 0, 'invalid retry value %r' % retries
|
||||||
assert retry_wait_seconds > 0, 'invalid retry seconds value %r' % retry_wait_seconds
|
assert retry_wait_seconds > 0, 'invalid retry seconds value %r' % retry_wait_seconds
|
||||||
|
@ -183,7 +192,8 @@ def urlopen(url, referrer=None, retries=3, retry_wait_seconds=5):
|
||||||
if referrer:
|
if referrer:
|
||||||
headers['Referer'] = referrer
|
headers['Referer'] = referrer
|
||||||
try:
|
try:
|
||||||
req = requests.get(url, headers=headers, config=config)
|
req = requests.get(url, headers=headers, config=config, prefetch=False)
|
||||||
|
check_content_size(url, req.headers, max_content_bytes)
|
||||||
req.raise_for_status()
|
req.raise_for_status()
|
||||||
return req
|
return req
|
||||||
except requests.exceptions.RequestException as err:
|
except requests.exceptions.RequestException as err:
|
||||||
|
@ -191,6 +201,15 @@ def urlopen(url, referrer=None, retries=3, retry_wait_seconds=5):
|
||||||
out.write(msg)
|
out.write(msg)
|
||||||
raise IOError(msg)
|
raise IOError(msg)
|
||||||
|
|
||||||
|
def check_content_size(url, headers, max_content_bytes):
|
||||||
|
if not max_content_bytes:
|
||||||
|
return
|
||||||
|
if 'content-length' in headers:
|
||||||
|
size = int(headers['content-length'])
|
||||||
|
if size > max_content_bytes:
|
||||||
|
msg = 'URL content of %s with %d Bytes exceeds %d Bytes.' % (url, size, max_content_bytes)
|
||||||
|
raise IOError(msg)
|
||||||
|
|
||||||
|
|
||||||
def get_columns (fp):
|
def get_columns (fp):
|
||||||
"""Return number of columns for given file."""
|
"""Return number of columns for given file."""
|
||||||
|
|
|
@ -170,7 +170,7 @@ def handle_url(url, url_matcher, num_matcher, res):
|
||||||
end = match.end(1)
|
end = match.end(1)
|
||||||
mo = num_matcher.search(data[end:])
|
mo = num_matcher.search(data[end:])
|
||||||
if not mo:
|
if not mo:
|
||||||
print("ERROR:", repr(data[end:end+300], file=sys.stderr))
|
print("ERROR:", repr(data[end:end+300]), file=sys.stderr)
|
||||||
continue
|
continue
|
||||||
num = int(mo.group(1))
|
num = int(mo.group(1))
|
||||||
res[name] = num
|
res[name] = num
|
||||||
|
|
File diff suppressed because one or more lines are too long
|
@ -47,23 +47,269 @@ exclude_comics = [
|
||||||
"beerkada", # no images
|
"beerkada", # no images
|
||||||
"BelovedLeader", # broken images
|
"BelovedLeader", # broken images
|
||||||
"BigMouthComics", # page does not follow standard layout
|
"BigMouthComics", # page does not follow standard layout
|
||||||
"", # page is gone
|
"BilltheMagician", # page does not follow standard layout
|
||||||
"", # page is gone
|
"BlackBlue", # page moved
|
||||||
"", # page is gone
|
"BlackMagic", # page does not follow standard layout
|
||||||
|
"BloodBound", # page moved
|
||||||
|
"bloodofthedragon", # page does not follow standard layout
|
||||||
|
"BloodWing", # broken images
|
||||||
"BlueZombie", # broken page
|
"BlueZombie", # broken page
|
||||||
"BoomerExpress", # redirection to another page
|
"BoomerExpress", # redirection to another page
|
||||||
|
"BobOnline", # missing images
|
||||||
|
"BottomFlavor", # page does not follow standard layout
|
||||||
|
"BradTheVampire", # page does not follow standard layout
|
||||||
|
"BreakpointCity", # page moved
|
||||||
|
"Brinkerhoff", # page redirected
|
||||||
|
"CampusSafari", # page moved
|
||||||
|
"CapturetheMoment", # page moved
|
||||||
|
"CaseyandAndy", # page moved
|
||||||
|
"Catalyst", # page moved
|
||||||
|
"Cats", # broken images
|
||||||
|
"Chair", # page moved
|
||||||
|
"ChildrenAtPlay", # page does not follow standard layout
|
||||||
|
"chu", # broken images
|
||||||
|
"CoACityofAscii", # only ascii images
|
||||||
|
"ComicMischief", # page moved
|
||||||
|
"ComputerGameAddicts", # page moved
|
||||||
|
"Concession", # page moved
|
||||||
|
"CorridorZ", # page does not follow standard layout
|
||||||
|
"CrashBoomMagic", # page moved
|
||||||
|
"CrazySlowlyGoing", # page has 403 forbidden
|
||||||
|
"CrimsonWings", # page moved
|
||||||
|
"DakotasRidge", # page moved
|
||||||
|
"DATAROM", # broken images
|
||||||
|
"DazeinaHaze", # page moved
|
||||||
|
"DIABOLICA", # broken images
|
||||||
|
"DIfIK", # page does not follow standard layout
|
||||||
|
"DigitalWar", # page is gone
|
||||||
|
"DimBulbComics", # page is gone
|
||||||
|
"DIVE", # page is gone
|
||||||
|
"DominicDeegan", # page moved
|
||||||
"DungeonDamage", # page does not follow standard layout
|
"DungeonDamage", # page does not follow standard layout
|
||||||
|
"Dylan", # page has 403 forbidden
|
||||||
"EarthRiser", # redirects to a new page
|
"EarthRiser", # redirects to a new page
|
||||||
|
"EdgetheDevilhunter", # page is gone
|
||||||
|
"EdibleDirt", # page moved
|
||||||
|
"Einstien27sDesk", # page is gone
|
||||||
|
"ElfOnlyInn", # page moved
|
||||||
|
"Ensuing", # broken links
|
||||||
|
"etch", # broken images
|
||||||
|
"EternalCaffeineJunkie", # page does not follow standard layout
|
||||||
|
"EternityComplex", # page does not follow standard layout
|
||||||
|
"Evilish", # page moved
|
||||||
|
"EvolBara", # page is gone
|
||||||
|
"FaerieTales", # page does not follow standard layout
|
||||||
|
"FairyTaleNewVillage", # missing images
|
||||||
|
"Fate27sTear", # page moved
|
||||||
"FaultyLogic", # page does not follow standard layout
|
"FaultyLogic", # page does not follow standard layout
|
||||||
|
"FireontheMountain", # page does not follow standard layout
|
||||||
|
"FiveBucksanHour", # page is gone
|
||||||
|
"Flatwood", # page moved
|
||||||
|
"FLEMComics", # page moved
|
||||||
|
"FletchersCave", # page is broken
|
||||||
|
"ForcesofGoodandEvil", # page does not follow standard layout
|
||||||
|
"FurryBlackDevil", # page moved
|
||||||
|
"Galacticus", # page has 403 forbidden
|
||||||
|
"GeebasonParade", # page does not follow standard layout
|
||||||
|
"geeks", # page moved
|
||||||
|
"GeminiBright", # page does not follow standard layout
|
||||||
|
"GemutationsPlague", # page does not follow standard layout
|
||||||
|
"GeorgetheSecond", # page does not follow standard layout
|
||||||
|
"Ghostz", # page does not follow standard layout
|
||||||
|
"GODLIKE", # page has 403 forbidden
|
||||||
"GoForIt", # page is gone
|
"GoForIt", # page is gone
|
||||||
"JuvenileDiversion", # page moved
|
"GothBoy", # page moved
|
||||||
|
"Grimage", # page moved
|
||||||
|
"GrossePointeDogs", # page is broken
|
||||||
|
"GUComics", # page moved
|
||||||
|
"HardUnderbelly", # page does not follow standard layout
|
||||||
|
"HazardousScience", # page is gone
|
||||||
|
"HereThereBeDragons", # page moved
|
||||||
|
"HighMaintenance", # missing images
|
||||||
|
"HighSchoolRPG", # page does not follow standard layout
|
||||||
|
"Horndog", # page moved
|
||||||
|
"HorseshoesandHandgrenades", # missing images
|
||||||
|
"HotelGrim", # missing images
|
||||||
|
"IAlwaysWakeUpLazy", # page moved
|
||||||
|
"ihatesteve", # page is gone
|
||||||
|
"IllicitMiracles", # page does not follow standard layout
|
||||||
|
"IndefensiblePositions", # page does not follow standard layout
|
||||||
|
"InsanityFair", # page does not follow standard layout
|
||||||
|
"InsideJoke", # page is gone
|
||||||
|
"InsidetheBox", # page has 403 forbidden
|
||||||
|
"InternationalHopeFoundation", # page does not follow standard layout
|
||||||
|
"JamieandNick", # page moved
|
||||||
|
"JasonLovesHisGrandpa", # page is gone
|
||||||
|
"JavanteasFate", # page is gone
|
||||||
|
"JBBcomics", # page is gone
|
||||||
|
"JedandDark", # page does not follow standard layout
|
||||||
|
"JoBeth", # page moved
|
||||||
|
"Joyride", # page moved
|
||||||
|
"JustAnotherEscape", # page moved
|
||||||
"JustWeird", # page has 403 forbidden
|
"JustWeird", # page has 403 forbidden
|
||||||
"Michikomonogatari", # page does not follow standard layout
|
"JuvenileDiversion", # page moved
|
||||||
"MobileMadness", # page does not follow standard layout
|
"JWalkinAndapos", # missing images
|
||||||
|
"KarmaSlave", # page moved
|
||||||
|
"KeenLace", # page is gone
|
||||||
|
"khaoskomic", # page moved
|
||||||
|
"KillingTime", # page is gone
|
||||||
"KnightsOfTheNexus", # page does not follow standard layout
|
"KnightsOfTheNexus", # page does not follow standard layout
|
||||||
|
"KoFightClub", # page moved
|
||||||
|
"LabGoatsInc", # page moved
|
||||||
|
"LandofGreed", # page is gone
|
||||||
|
"LeanOnMe", # page has 403 forbidden
|
||||||
|
"LegendsofRovana", # page has 403 forbidden
|
||||||
|
"LifeatBayside", # page does not follow standard layout
|
||||||
|
"LifeinaNutshell", # page does not follow standard layout
|
||||||
|
"Lifesuchasitis", # page has 403 forbidden
|
||||||
|
"LinktotheBoards", # page does not follow standard layout
|
||||||
|
"LinT", # page moved
|
||||||
|
"LiterallySpeaking", # page does not follow standard layout
|
||||||
|
"LoxieAndZoot", # page does not follow standard layout
|
||||||
|
"Lunchtable", # missing images
|
||||||
|
"MadWorld", # page has 403 forbidden
|
||||||
|
"Magellan", # page does not follow standard layout
|
||||||
|
"Marachan", # missing images
|
||||||
|
"MassProduction", # page does not follow standard layout
|
||||||
|
"MayIHelpYou", # page has 403 forbidden
|
||||||
|
"Meiosis", # page moved
|
||||||
|
"Michikomonogatari", # page does not follow standard layout
|
||||||
|
"MidnorthFlourCo", # page has 403 forbidden
|
||||||
|
"MintCondition", # page moved
|
||||||
|
"MisadventuresinPhysics", # page has 403 forbidden
|
||||||
|
"MobileMadness", # page does not follow standard layout
|
||||||
|
"MyAngelYouAreAngel", # page is gone
|
||||||
|
"MyBrainHurts", # page does not follow standard layout
|
||||||
|
"NAFTANorthAmericanFreeToonAgreementalsoYankuckcanee", # page does not follow standard layout
|
||||||
|
"NeglectedMarioCharacterComix", # page does not follow standard layout
|
||||||
|
"Nemutionjewel", # page does not follow standard layout
|
||||||
|
"Nerdgasm", # missing images
|
||||||
|
"Nerdz", # page is gone
|
||||||
|
"Nervillsaga", # page does not follow standard layout
|
||||||
|
"NetherOakasuburbanadventure", # page does not follow standard layout
|
||||||
|
"NoNeedForBushido", # page moved
|
||||||
|
"nothingcomesnaturally", # page does not follow standard layout
|
||||||
|
"NymphsoftheWest", # too few images
|
||||||
|
"OffTheWall", # page does not follow standard layout
|
||||||
|
"OneHourAxis", # page is gone
|
||||||
|
"OnlyOne", # page is gone
|
||||||
|
"OopsNevermind", # page is gone
|
||||||
|
"PacoStand", # page has 403 forbidden
|
||||||
|
"Pander", # page is gone
|
||||||
|
"PANDORA", # page is missing pages
|
||||||
|
"PhilosophyBites", # missing images
|
||||||
|
"PhilosophyMonkey", # page is gone
|
||||||
|
"PicpakDog", # page moved
|
||||||
|
"PictureDiary", # page is gone
|
||||||
|
"PillarsofFaith", # page does not follow standard layout
|
||||||
|
"Pimpette", # page moved
|
||||||
|
"PokC3A9Chow", # page has 403 forbidden
|
||||||
|
"PolleninArabia", # page does not follow standard layout
|
||||||
|
"PranMan", # page moved
|
||||||
|
"QueensOfRandomness", # broken images
|
||||||
|
"QuestionableTales", # page does not follow standard layout
|
||||||
|
"RadioactiveFanboys", # page does not follow standard layout
|
||||||
|
"RandomAssembly", # page is gone
|
||||||
|
"RandomInk", # page is gone
|
||||||
|
"ReceptorFatigue", # page does not follow standard layout
|
||||||
|
"Remsi", # page does not follow standard layout
|
||||||
|
"Reset", # page does not follow standard layout
|
||||||
|
"ResistanceLine", # page does not follow standard layout
|
||||||
|
"ReturntoDonnelly", # page is gone
|
||||||
|
"Riboflavin", # page does not follow standard layout
|
||||||
|
"RitualsandOfferings", # page is gone
|
||||||
|
"RiverCityHigh", # page is gone
|
||||||
|
"RM27sothercomics", # page does not follow standard layout
|
||||||
"RogerAndDominic", # page does not follow standard layout
|
"RogerAndDominic", # page does not follow standard layout
|
||||||
|
"RoleoftheDie", # page is gone
|
||||||
|
"RonnieRaccoon", # page moved
|
||||||
|
"RosalarianAndapossRandomCreepyTales", # page is gone
|
||||||
|
"RulesofMakeBelieve", # page is gone
|
||||||
|
"Rveillerie", # page has 403 forbidden
|
||||||
|
"SaintPeter27sCross", # page does not follow standard layout
|
||||||
|
"Saturnalia", # page moved
|
||||||
|
"SavageIslands", # page has 403 forbidden
|
||||||
"SaveMeGebus", # page does not follow standard layout
|
"SaveMeGebus", # page does not follow standard layout
|
||||||
|
"Sawdust", # page has 403 forbidden
|
||||||
|
"Scooterboy1234", # page has 403 forbidden
|
||||||
|
"SecondNight", # page moved
|
||||||
|
"Sempiternal", # page moved
|
||||||
|
"Senioritis", # page has 403 forbidden
|
||||||
|
"ShivaeStudios", # page moved
|
||||||
|
"ShonenAiKudasai", # page is gone
|
||||||
|
"ShootMeNow", # page does not follow standard layout
|
||||||
|
"SidandLasker", # page moved
|
||||||
|
"SillyConeV", # page is gone
|
||||||
|
"Skunk", # page moved
|
||||||
|
"SLAGIT", # missing images
|
||||||
|
"SmithStone", # page has 403 forbidden
|
||||||
|
"SnowflakeStudios", # page is gone
|
||||||
|
"Sock27d", # page is gone
|
||||||
|
"Soks", # page is gone
|
||||||
|
"SoManyLevels", # page moved
|
||||||
|
"SomethingSoft", # page is gone
|
||||||
|
"Sorcery101", # page moved
|
||||||
|
"SpellBinder", # page is gone
|
||||||
|
"SPQRBlues", # page moved
|
||||||
|
"StationV3", # page moved
|
||||||
|
"SticksandStuff", # page does not follow standard layout
|
||||||
|
"StickyFingers", # page does not follow standard layout
|
||||||
|
"Stubble", # page moved
|
||||||
|
"SurrealKins", # page is gone
|
||||||
|
"SwirlyMarkYume", # page does not follow standard layout
|
||||||
|
"SynapticMisfiring", # page is gone
|
||||||
|
"TalesoftheQuestor", # page moved
|
||||||
|
"TAVISION", # page moved
|
||||||
|
"ThatWasMcPherson", # page moved
|
||||||
|
"The6GUYSInMyHead", # page has 403 forbidden
|
||||||
|
"TheAdventuresofCaptainMooki", # page moved
|
||||||
|
"TheAdventuresofLi27lDenverPastrami", # page is gone
|
||||||
|
"TheAdventuresofPeppyThePipingPirate", # page is gone
|
||||||
|
"TheAmoeba", # page is gone
|
||||||
"TheAvatar", # page does not follow standard layout
|
"TheAvatar", # page does not follow standard layout
|
||||||
|
"TheBessEffectGerman", # page moved
|
||||||
|
"TheBestandtheBrightest", # page moved
|
||||||
|
"TheDevilsPanties", # page moved
|
||||||
|
"TheDoctorPepperShow", # page has 403 forbidden
|
||||||
|
"TheGods27Pack", # page has 403 forbidden
|
||||||
|
"TheMadBrothers", # page does not follow standard layout
|
||||||
|
"TheMediocres", # missing images
|
||||||
|
"TheNamelessStory", # page has 403 forbidden
|
||||||
|
"Thenoob", # page moved
|
||||||
|
"TheOrangeArrow", # page is gone
|
||||||
|
"TheSailorNeopetsRPG", # page does not follow standard layout
|
||||||
|
"TheWayoftheWorld", # page moved
|
||||||
|
"TheWorldofUh", # broken images
|
||||||
|
"TheWotch", # page does not follow standard layout
|
||||||
|
"ThunderandLightning", # page moved
|
||||||
|
"TinysWorld", # page does not follow standard layout
|
||||||
|
"ToonPimp27sPalace", # page moved
|
||||||
|
"Tossers", # page moved
|
||||||
|
"Towner", # page does not follow standard layout
|
||||||
|
"Townies", # page is gone
|
||||||
|
"TracyandTristan", # page moved
|
||||||
|
"TrialsintheLight", # page does not follow standard layout
|
||||||
|
"ttskr", # page does not follow standard layout
|
||||||
|
"twelvedragons", # page does not follow standard layout
|
||||||
|
"TwoEvilScientists", # page moved
|
||||||
|
"TwoLumps", # page moved
|
||||||
|
"TwoSidesWide", # page moved
|
||||||
|
"Vendetta", # page moved
|
||||||
|
"VictimsoftheSystem", # page moved
|
||||||
|
"Victor", # page moved
|
||||||
|
"WARPZONEthinkwithinthecube", # page does not follow standard layout
|
||||||
|
"WayoftheDodo", # page does not follow standard layout
|
||||||
|
"Wedontgetiteither", # page moved
|
||||||
|
"WeishauptScholars", # page does not follow standard layout
|
||||||
|
"Werechild", # page has 403 forbidden
|
||||||
|
"WhiskeyAndMelancholy", # missing pages
|
||||||
|
"YellowMoon", # page has 403 forbidden
|
||||||
|
"YouScrewedUp", # missing images
|
||||||
|
"YUMEdream", # page moved
|
||||||
|
"Zap", # page moved
|
||||||
|
"ZebraGirl", # page moved
|
||||||
|
"Zeek", # page moved
|
||||||
|
"Zootz", # page is gone
|
||||||
]
|
]
|
||||||
|
|
||||||
# links to last valid strips
|
# links to last valid strips
|
||||||
|
@ -72,8 +318,37 @@ url_overrides = {
|
||||||
"AmazonSpaceRangers": "http://amazons.comicgenesis.com/d/20051015.html",
|
"AmazonSpaceRangers": "http://amazons.comicgenesis.com/d/20051015.html",
|
||||||
"ArroganceinSimplicity": "http://arrogance.comicgenesis.com/d/20030217.html",
|
"ArroganceinSimplicity": "http://arrogance.comicgenesis.com/d/20030217.html",
|
||||||
"ATasteofEvil": "http://atasteofevil.comicgenesis.com/d/20050314.html",
|
"ATasteofEvil": "http://atasteofevil.comicgenesis.com/d/20050314.html",
|
||||||
"": "",
|
"CanYouKeepaSecret": "http://cykas.comicgenesis.com/d/20041035.html",
|
||||||
"": "",
|
"CapturetheMoment": "http://capturethemoment.comicgenesis.com/d/20100927.html",
|
||||||
|
"CornerAlley13": "http://corneralley.comicgenesis.com/d/20101010.html",
|
||||||
|
"Countyoursheep": "http://countyoursheep.keenspot.com/",
|
||||||
|
"FreakU": "http://freaku.comicgenesis.com//d/20080827.html",
|
||||||
|
"FreeParking": "http://freeparking.comicgenesis.com//d/20051029.html",
|
||||||
|
"GamerPsychotica": "http://gp.comicgenesis.com/d/20060113.html",
|
||||||
|
"GoneAstray": "http://goneastray.comicgenesis.com/d/20100305.html",
|
||||||
|
"GoodnEvil": "http://gne.comicgenesis.com/d/20040814.html",
|
||||||
|
"HalflightBreaking": "http://halflight.comicgenesis.com/d/20021031.html",
|
||||||
|
"HealerOnFeatheredWings": "http://selsachronicles.comicgenesis.com/",
|
||||||
|
"HowNottoRunAComic": "http://hownottorunacomic.comicgenesis.com/d/19950719.html",
|
||||||
|
"HurricaneParty": "http://hurricaneparty.comicgenesis.com/d/20040123.html",
|
||||||
|
"MacHall": "http://machall.comicgenesis.com/d/20020125.html",
|
||||||
|
"MaryQuiteContrary": "http://marycontrary.comicgenesis.com/d/20070824.html",
|
||||||
|
"MoonCrest24": "http://mooncrest.comicgenesis.com/d/20121117.html",
|
||||||
|
"MrPinkBlob": "http://mrpinkblob.comicgenesis.com/d/100.html",
|
||||||
|
"NekkoandJoruba": "http://nekkoandjoruba.comicgenesis.com/d/20050816.html",
|
||||||
|
"No4thWalltoBreak": "http://no4thwalltobreak.comicgenesis.com/d/20041025.html",
|
||||||
|
"OtakuKyokai": "http://otakukyokai.comicgenesis.com/d/20060818.html",
|
||||||
|
"PandP": "http://pandpcomic.comicgenesis.com/d/20021002.html",
|
||||||
|
"Paradigm": "http://paradigm.comicgenesis.com/d/20020716.html",
|
||||||
|
"ParallelDementia": "http://paralleldementia.comicgenesis.com/d/20071221.html",
|
||||||
|
"PET": "http://petcomic.comicgenesis.com/d/20070413.html",
|
||||||
|
"PlanetsCollide": "http://ruthcomix.comicgenesis.com/d/20010706.html",
|
||||||
|
"RuneMaster": "http://runemaster.comicgenesis.com/d/20050607.html",
|
||||||
|
"ShinobiHigh": "http://shinobihigh.comicgenesis.com/d/20020118.html",
|
||||||
|
"spacejams": "http://spacejams.comicgenesis.com/d/20020820.html",
|
||||||
|
"TheAdventuresofVindibuddSuperheroInTraining": "http://vindibudd.comicgenesis.com/d/20070720.html",
|
||||||
|
"TriumphantLosers": "http://triumphantlosers.comicgenesis.com/d/20081006.html",
|
||||||
|
"Zortic": "http://zortic.comicgenesis.com/d/20030922.html",
|
||||||
}
|
}
|
||||||
|
|
||||||
def handle_url(url, res):
|
def handle_url(url, res):
|
||||||
|
|
|
@ -17,12 +17,21 @@ htmltemplate = """
|
||||||
<meta name="viewport" content="width=device-width">
|
<meta name="viewport" content="width=device-width">
|
||||||
<link rel="stylesheet" href="css/normalize.css">
|
<link rel="stylesheet" href="css/normalize.css">
|
||||||
<link rel="stylesheet" href="css/main.css">
|
<link rel="stylesheet" href="css/main.css">
|
||||||
|
<link rel="stylesheet" href="css/dosage.css">
|
||||||
|
<script src="js/masonry.min.js"></script>
|
||||||
</head>
|
</head>
|
||||||
<body>
|
<body>
|
||||||
<p>Dosage test results from %(date)s</p>
|
<p>Dosage test results from %(date)s</p>
|
||||||
<ul>
|
<div id="container">
|
||||||
%(content)s
|
%(content)s
|
||||||
</ul>
|
</div>
|
||||||
|
<script>
|
||||||
|
window.onload = function() {
|
||||||
|
var wall = new Masonry( document.getElementById('container'), {
|
||||||
|
columnWidth: 240
|
||||||
|
});
|
||||||
|
};
|
||||||
|
</script>
|
||||||
</body>
|
</body>
|
||||||
</html>
|
</html>
|
||||||
"""
|
"""
|
||||||
|
@ -80,7 +89,7 @@ def get_content(filename):
|
||||||
inner = '<a href="%s" class="%s">%s</a>' % (url, css, name)
|
inner = '<a href="%s" class="%s">%s</a>' % (url, css, name)
|
||||||
else:
|
else:
|
||||||
inner = '<span class="%s">%s</span>' % (css, name)
|
inner = '<span class="%s">%s</span>' % (css, name)
|
||||||
res.append(' <li>%s</li>' % inner)
|
res.append(' <div class="item">%s</div>' % inner)
|
||||||
return os.linesep.join(res)
|
return os.linesep.join(res)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -20,6 +20,20 @@ url_matcher = re.compile(r'<li><a href="(/comics/[^"]+)">([^<]+)</a>')
|
||||||
|
|
||||||
# names of comics to exclude
|
# names of comics to exclude
|
||||||
exclude_comics = [
|
exclude_comics = [
|
||||||
|
"BusinessAndFinance", # not a comic
|
||||||
|
"ComicPanel", # not a comic
|
||||||
|
"ComicsAZ", # not a comic
|
||||||
|
"ComicStrip", # not a comic
|
||||||
|
"Espaol", # not a comic
|
||||||
|
"Family", # not a comic
|
||||||
|
"ForKids", # not a comic
|
||||||
|
"JamesBond", # not a comic
|
||||||
|
"Men", # not a comic
|
||||||
|
"NEA", # not a comic
|
||||||
|
"Pets", # not a comic
|
||||||
|
"SundayOnly", # not a comic
|
||||||
|
"WebExclusive", # not a comic
|
||||||
|
"Women", # not a comic
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -43,7 +43,7 @@ class _ComicTester(TestCase):
|
||||||
self.check(images > 0, 'failed to find images at %s' % strip.stripUrl)
|
self.check(images > 0, 'failed to find images at %s' % strip.stripUrl)
|
||||||
if not self.scraperclass.multipleImagesPerStrip:
|
if not self.scraperclass.multipleImagesPerStrip:
|
||||||
self.check(images == 1, 'found %d instead of 1 image at %s' % (images, strip.stripUrl))
|
self.check(images == 1, 'found %d instead of 1 image at %s' % (images, strip.stripUrl))
|
||||||
if num > 0:
|
if num > 0 and self.scraperclass.prevUrlMatchesStripUrl:
|
||||||
self.check_stripurl(strip)
|
self.check_stripurl(strip)
|
||||||
num += 1
|
num += 1
|
||||||
if self.scraperclass.prevSearch:
|
if self.scraperclass.prevSearch:
|
||||||
|
|
Loading…
Reference in a new issue