Various comics are fixed.

Bastian Kleineidam 2012-12-13 21:05:27 +01:00
parent de1b80fa4d
commit 5f9e5ae3ca
18 changed files with 2857 additions and 170 deletions

File diff suppressed because it is too large.


@@ -226,14 +226,6 @@ class Angband(_BasicScraper):
     help = 'Index format: yyyy-mm-dd'
-class ActionAthena(_BasicScraper):
-    latestUrl = 'http://actionathena.com/'
-    stripUrl = latestUrl + '2%s'
-    imageSearch = compile(r'<img src=\'(http://actionathena.com/comics/.+?)\'>')
-    prevSearch = compile(r'<a href="(http://actionathena.com/.+?)">&laquo; Previous</a>')
-    help = 'Index format: yyyy/mm/dd/strip-name'
 class AlsoBagels(_BasicScraper):
     latestUrl = 'http://alsobagels.com/'
     stripUrl = latestUrl + 'index.php/comic/%s/'
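For orientation: each scraper's stripUrl is a template that is filled with the index named
in help via %-formatting. A minimal illustration (the index value is made up):

    # Illustration of the stripUrl template pattern; the index value is made up.
    latestUrl = 'http://alsobagels.com/'
    stripUrl = latestUrl + 'index.php/comic/%s/'
    print(stripUrl % 'example-strip')
    # http://alsobagels.com/index.php/comic/example-strip/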


@@ -181,7 +181,7 @@ class CatAndGirl(_BasicScraper):
 class CyanideAndHappiness(_BasicScraper):
     latestUrl = 'http://www.explosm.net/comics/'
     stripUrl = latestUrl + '%s/'
-    imageSearch = compile(tagre("img", "src", r'(http:\/\/www\.explosm\.net/db/files/Comics/[^"]+)'))
+    imageSearch = compile(tagre("img", "src", r'(http://(?:www\.)?explosm\.net/db/files/Comics/[^"]+)'))
     prevSearch = compile(tagre("a", "href", r'(/comics/\d+/)', before="prev"))
     help = 'Index format: n (unpadded)'

@@ -234,14 +234,6 @@ class Chester5000XYV(_BasicScraper):
     help = 'Index format: nnn'
-class CalamitiesOfNature(_BasicScraper):
-    latestUrl = 'http://www.calamitiesofnature.com/'
-    stripUrl = latestUrl + 'archive/?c=%s'
-    imageSearch = compile(tagre("img", "src", r'(archive/\d+[^"]+|http://www\.calamitiesofnature\.com/archive/\d+[^"]+)'))
-    prevSearch = compile(r'<a id="previous" href="(http://www.calamitiesofnature.com/archive/\?c\=\d+)">')
-    help = 'Index format: nnn'
 class Champ2010(_BasicScraper):
     # the latest URL is hard coded since the comic is discontinued
     latestUrl = 'http://jedcollins.com/champ2010/champ-12-30-10.html'
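The tagre() helper seen in these matchers builds a regular expression for one HTML tag
with a given attribute value. The following is a plausible sketch inferred from its call
sites; the quote, before and after parameters are assumptions, and the real dosagelib
helper may differ:

    # Plausible sketch of tagre(), inferred from the call sites above.
    import re

    def tagre(tag, attribute, value, quote='"', before="", after=""):
        # Match <tag ... attribute="value" ...>; `before`/`after` anchor
        # extra text that must appear inside the tag.
        prefix = r"[^>]*%s[^>]*\s+" % before if before else r"(?:[^>]*\s+)?"
        suffix = r"[^>]*%s" % after if after else ""
        return r"<%s\s+%s%s\s*=\s*%s%s%s[^>]*%s>" % (
            tag, prefix, attribute, quote, value, quote, suffix)

    html = '<img alt="comic" src="http://explosm.net/db/files/Comics/c.png">'
    pattern = tagre("img", "src", r'(http://(?:www\.)?explosm\.net/db/files/Comics/[^"]+)')
    print(re.search(pattern, html).group(1))
    # http://explosm.net/db/files/Comics/c.png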


@@ -28,7 +28,8 @@ class Damonk(_BasicScraper):
     help = 'Index format: yyyymmdd'
-class DandyAndCompany(_BasicScraper):
+# XXX disallowed /search by robots.txt
+class _DandyAndCompany(_BasicScraper):
     latestUrl = 'http://www.dandyandcompany.com/'
     stripUrl = None
     multipleImagesPerStrip = True


@@ -68,7 +68,8 @@ class EmergencyExit(_BasicScraper):
     help = 'Index format: n'
-class ErrantStory(_BasicScraper):
+# XXX disallowed by robots.txt
+class _ErrantStory(_BasicScraper):
     latestUrl = 'http://www.errantstory.com/'
     stripUrl = latestUrl + '%s'
     imageSearch = compile(r'<img[^>]+?src="([^"]*?comics/.+?)"')
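Several scrapers in this commit are disabled by renaming the class with a leading
underscore because the sites' robots.txt forbids crawling. Whether a path is disallowed
can be checked with the standard library's robot parser; a small sketch, with the URLs
purely illustrative:

    # Check whether a URL may be fetched according to robots.txt;
    # the URLs here are illustrative only.
    try:
        from urllib.robotparser import RobotFileParser  # Python 3
    except ImportError:
        from robotparser import RobotFileParser  # Python 2, current at commit time

    rp = RobotFileParser()
    rp.set_url("http://www.errantstory.com/robots.txt")
    rp.read()
    print(rp.can_fetch("*", "http://www.errantstory.com/"))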


@@ -58,10 +58,19 @@ class KillerKomics(_BasicScraper):
     help = 'Index format: strip-name'
-class Kofightclub(_BasicScraper):
+# XXX disallowed by robots.txt
+class _Kofightclub(_BasicScraper):
     latestUrl = 'http://www.kofightclub.com/'
     stripUrl = latestUrl + 'd/%s.html'
     imageSearch = compile(tagre("img", "src", r'(\.\./images/\d+[^"]+)'))
     prevSearch = compile(tagre("a", "href", r'((?:http://www\.kofightclub\.com)?/d/\d+\.html)')
                          + tagre("img", "alt", "Previous comic"))
     help = 'Index format: yyyymmdd'
+class KuroShouri(_BasicScraper):
+    latestUrl = 'http://kuroshouri.com/'
+    stripUrl = latestUrl + '?webcomic_post=%s'
+    imageSearch = compile(tagre("img", "src", r"(http://kuroshouri\.com/wp-content/webcomic/kuroshouri/[^'\"]+)", quote="['\"]"))
+    prevSearch = compile(tagre("a", "href", r'(http://kuroshouri\.com/\?webcomic_post=[^"]+)', after="previous"))
+    help = 'Index format: chapter-n-page-m'


@@ -72,14 +72,6 @@ class Melonpool(_BasicScraper):
     help = 'Index format: n'
-class MintCondition(_BasicScraper):
-    latestUrl = 'http://www.mintconditioncomic.com/'
-    stripUrl = latestUrl + '%s/'
-    imageSearch = compile(tagre("img", "src", r'(http://www\.mintconditioncomic\.com/comics/[^"]+)'))
-    prevSearch = compile(tagre("a", "href", r'(http://www\.mintconditioncomic\.com/[^"]+)', after="prev"))
-    help = 'Index format: yyyy/mm/dd/stripname'
 class Misfile(_BasicScraper):
     latestUrl = 'http://www.misfile.com/'
     stripUrl = latestUrl + '?date=%s'


@@ -35,7 +35,7 @@ class OnTheEdge(_BasicScraper):
 class OneQuestion(_BasicScraper):
-    latestUrl = 'http://www.onequestioncomic.com/'
+    latestUrl = 'http://onequestioncomic.com/'
     stripUrl = latestUrl + 'comic.php?strip_id=%s'
     imageSearch = compile(tagre("img", "src", r'(istrip_files/strips/\d+\.jpg)'))
     prevSearch = compile(tagre("a", "href", r'(comic\.php\?strip_id=\d+)') + tagre("img", "src", r'img/arrow_prev\.jpg'))


@@ -142,7 +142,8 @@ class SPQRBlues(_BasicScraper):
     help = 'Index format: number'
-class StationV3(_BasicScraper):
+# XXX disallowed by robots.txt
+class _StationV3(_BasicScraper):
     latestUrl = 'http://www.stationv3.com/'
     stripUrl = latestUrl + 'd/%s.html'
     imageSearch = compile(tagre("img", "src", r'(http://www\.stationv3\.com/comics/[^"]+)'))

@@ -228,7 +229,8 @@ class Spamusement(_BasicScraper):
     starter = indirectStarter('http://spamusement.com/', prevSearch)
-class StrangeCandy(_BasicScraper):
+# XXX disallowed by robots.txt
+class _StrangeCandy(_BasicScraper):
     latestUrl = 'http://www.strangecandy.net/'
     stripUrl = latestUrl + 'd/%s.html'
     imageSearch = compile(tagre("img", "src", r'(/comics/\d+\.jpg)'))


@@ -13,12 +13,12 @@ _attrs = dict(
     next = case_insensitive_re("next"),
 )
 _prevSearch = compile(_linkSearch +
-    r'(?:<img[^>]+alt="[^"]*%(prev)s|<img[^>]+(?:button_previous|nav_prev4)\.|[^<]*%(back)s|\s*<<? (?:%(back)s|%(prev)s)|[^<]*%(prev)s)' % _attrs)
+    r'(?:<img[^>]+alt="[^"]*%(prev)s|<img[^>]+(?:button_previous|naviButtons_Previous|nav_prev4|prev|previous|webbuttonback|PrevArrow)\.|[^<]*%(back)s|\s*<<? (?:%(back)s|%(prev)s)|[^<]*%(prev)s)' % _attrs)
 _nextSearch = compile(_linkSearch +
-    r'(?:<img[^>]+alt="%(next)s|<img[^>]+(?:button_next|nav_next4)\.|\s*<?[^<]*%(next)s)' % _attrs)
+    r'(?:<img[^>]+alt="%(next)s|<img[^>]+(?:button_next|naviButtons_Next|nav_next4|next|webbuttonnext-1|NextArrow)\.|\s*<?[^<]*%(next)s)' % _attrs)

 def add(name, url, description, adult, bounce):
-    classname = 'SmackJeeves/' + name
+    classname = 'SmackJeeves_' + name

     def modifier(pageUrl):
         if adult:
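The %(prev)s placeholders above are filled from _attrs with old-style %-interpolation
before the pattern is compiled, and case_insensitive_re presumably expands a word into
per-letter character classes. A sketch under those assumptions:

    # Sketch of the interpolation used above; case_insensitive_re() is
    # inferred from its usage and may differ from the real helper.
    from re import compile

    def case_insensitive_re(word):
        # "prev" -> "[pP][rR][eE][vV]"
        return "".join("[%s%s]" % (c.lower(), c.upper()) for c in word)

    _attrs = dict(prev=case_insensitive_re("prev"))
    pattern = compile(r'alt="[^"]*%(prev)s' % _attrs)
    print(bool(pattern.search('<img alt="Go to PREV page">')))
    # True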
@@ -48,6 +48,7 @@ def add(name, url, description, adult, bounce):
         return "%s_%s" % (name, num)

     globals()[classname] = make_scraper(classname,
         name = 'SmackJeeves/' + name,
+        adult = adult,
         starter = _starter,
         prevUrlModifier = lambda cls, url: modifier(url),
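The globals()[classname] assignment generates one scraper class per comic at import
time, which is presumably why the key changed from 'SmackJeeves/' to 'SmackJeeves_': a
slash cannot appear in a Python identifier, so a class registered under it would be
unreachable by name. A minimal sketch of the pattern, with make_scraper() reduced to a
stand-in (the real factory takes many more attributes):

    # Minimal sketch of generating scraper classes at import time;
    # make_scraper() here is a stand-in for the real dosagelib factory.
    class _BasicScraper(object):
        pass

    def make_scraper(classname, **attributes):
        # type() builds a new class object with the given name and attributes.
        return type(str(classname), (_BasicScraper,), attributes)

    for name in ("Example_Comic",):  # made-up comic name
        classname = 'SmackJeeves_' + name
        globals()[classname] = make_scraper(classname,
            name='SmackJeeves/' + name,
            adult=False,
        )

    print(SmackJeeves_Example_Comic.name)
    # SmackJeeves/Example_Comic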


@@ -71,7 +71,8 @@ class TinyKittenTeeth(_BasicScraper):
     help = 'Index format: yyyy/mm/dd/stripname (unpadded)'
-class TwoLumps(_BasicScraper):
+# XXX disallowed by robots.txt
+class _TwoLumps(_BasicScraper):
     latestUrl = 'http://www.twolumps.net/'
     stripUrl = latestUrl + 'd/%s.html'
     imageSearch = compile(tagre("img", "src", r'(/comics/[^"]+)'))


@@ -26,7 +26,8 @@ class UnicornJelly(_BasicScraper):
     help = 'Index format: nnn'
-class UserFriendly(_BasicScraper):
+# XXX disallowed by robots.txt
+class _UserFriendly(_BasicScraper):
     starter = bounceStarter('http://ars.userfriendly.org/cartoons/?mode=classic', compile(r'<area shape="rect" href="(/cartoons/\?id=\d{8}&mode=classic)" coords="[\d, ]+?" alt="">'))
     stripUrl = 'http://ars.userfriendly.org/cartoons/?id=%s&mode=classic'
     imageSearch = compile(r'<img border="0" src="\s*(http://www.userfriendly.org/cartoons/archives/\d{2}\w{3}/.+?\.gif)"')


@@ -57,7 +57,8 @@ class WotNow(_BasicScraper):
     help = 'Index format: n (unpadded)'
-class WorldOfWarcraftEh(_BasicScraper):
+# XXX disallowed by robots.txt
+class _WorldOfWarcraftEh(_BasicScraper):
     latestUrl = 'http://woweh.com/'
     stripUrl = None
     imageSearch = compile(r'http://woweh.com/(comics/.+?)"')


@@ -30,6 +30,7 @@ exclude_comics = [
     "Apartment_408_Full_Size", # broken images
     "Apple_Valley", # broken images
     "Apt_408_Minis", # broken images
+    "Art_dump", # broken images
     "Atxs", # broken images
     "A_Word_Of_Wisdom", # broken images
     "Brathalla", # broken images

@@ -64,6 +65,7 @@ exclude_comics = [
     "Inside_OuT", # broken images
     "Journey_to_Raifina", # broken images
     "KALA_dan", # broken images
+    "Kuro_Shouri", # page moved
     "Live_to_tell", # start page requires login
     "Locoma", # broken images
     "London_Underworld", # broken images

@@ -141,7 +143,7 @@ exclude_comics = [
     "Weave", # broken images
     "Weirdlings", # template error
     "Welcome_To_Border_City", # broken images
-    "what_comes_first", # start page requires login
+    "What_comes_first", # start page requires login
     "Within_Shadows", # broken images
     "Xolta", # start page requires login
     "XTIN__The_Dragons_Dream_World", # start page requires login


@@ -38,6 +38,7 @@ exclude_comics = [
     "RichardsPoorAlmanac", # missing images
     "SherpaAid", # comic unavailable
     "SparComics", # comic unavailable
+    "SurvivingSingle", # comic unavailable
 ]


@@ -23,6 +23,9 @@ htmltemplate = """
 </head>
 <body>
 <p>Dosage test results from %(date)s</p>
+<p>Note that it is almost impossible to get a 100% OK test run
+due to temporary network failures or sites that are just updating
+the comic page.</p>
 <div id="container">
 %(content)s
 </div>

@@ -70,14 +73,16 @@ def get_content(filename):
     with open(filename, "r") as f:
         print("Tests parsed: 0", end=" ", file=sys.stderr)
         num_tests = 0
+        add_reason = False
         for line in f:
             if line.startswith((". ", "F ")) and "test_comics" in line:
+                add_reason = line.startswith("F ")
                 num_tests += 1
                 try:
                     tests.append(get_test(line))
-                    add_reason = line.startswith("F ")
                 except Exception as msg:
                     print("WARNING:", msg, file=sys.stderr)
                     continue
             elif add_reason and line.startswith(" E "):
                 reason = line[3:].strip()
                 tests[-1][-1] = reason
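The change above initializes add_reason before the loop, so the elif can never hit an
unbound name, and records pass/fail before get_test() can raise. A condensed, runnable
version of the corrected control flow; get_test() and the sample lines are stand-ins for
the real test-log format:

    # Condensed sketch of the corrected loop; get_test() and the sample
    # input lines are stand-ins for the real test-log format.
    from __future__ import print_function
    import sys

    def get_test(line):
        return [line.split()[1], None]  # last slot holds the failure reason

    def parse(lines):
        tests = []
        add_reason = False  # initialized so the elif below never sees an unbound name
        for line in lines:
            if line.startswith((". ", "F ")) and "test_comics" in line:
                add_reason = line.startswith("F ")  # set before get_test() can raise
                try:
                    tests.append(get_test(line))
                except Exception as msg:
                    print("WARNING:", msg, file=sys.stderr)
                    continue
            elif add_reason and line.startswith(" E "):
                tests[-1][-1] = line[3:].strip()  # attach the reason to the failed test
        return tests

    print(parse(["F test_comics.Foo", " E HTTP 404"]))
    # [['test_comics.Foo', 'HTTP 404']]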


@@ -40,6 +40,8 @@ exclude_comics = [
     "MylifewithFel", # does not follow standard layout
     "NegativeZen", # does not follow standard layout
     "NightShot", # does not follow standard layout
+    "NormalIsBoring", # does not follow standard layout
+    "Okamirai", # images are 403 forbidden
     "OmnisSpriteShowcase", # missing images
     "OpticalDisarray", # does not follow standard layout
     "PicturesofYou", # does not follow standard layout

@@ -49,6 +51,7 @@ exclude_comics = [
     "Ribon", # does not follow standard layout
     "SecretSanta2011", # missing images
     "ShinkaTheLastEevee", # does not follow standard layout
+    "SimplePixel", # does not follow standard layout
     "SJArtCollab", # missing images
     "SlightlyDifferent", # missing images
     "TheAfterSubtract", # does not follow standard layout


@@ -30,6 +30,7 @@ exclude_comics = [
     "JamesBond", # not a comic
     "Men", # not a comic
     "NEA", # not a comic
+    "PeanutsPortuguese", # not found
     "Pets", # not a comic
     "SundayOnly", # not a comic
     "WebExclusive", # not a comic