# -*- coding: iso-8859-1 -*-
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2013 Bastian Kleineidam

from re import compile, escape, MULTILINE, IGNORECASE, sub
from os.path import splitext

from ..scraper import _BasicScraper
from ..helpers import indirectStarter, bounceStarter
from ..util import tagre, getPageContent


class SabrinaOnline(_BasicScraper):
    """Scraper settings for http://sabrina-online.com/."""

    description = u'Skunks, computers and porn'
    url = 'http://sabrina-online.com/'
    imageSearch = compile(tagre("a", "href", r'(strips/[^"]*)'))
    prevSearch = compile(tagre("a", "href", r"(\d\d\d\d-\d\d.html)") +
                         tagre("img", "src", "b_back.gif"))
    help = 'Index format: n (unpadded)'
    adult = True
    multipleImagesPerStrip = True

    @classmethod
    def starter(cls):
        """Return the URL of the last archive page listed on archive.html.

        Fetches ``archive.html`` below ``cls.url``, collects every
        ``dddd-dd.html`` link in it and returns ``cls.url`` joined with the
        last one found. Raises IndexError when no such link is present.
        """
        archive = cls.url + 'archive.html'
        data = getPageContent(archive, cls.session)[0]
        search = compile(tagre("a", "href", r"(\d\d\d\d-\d\d.html)"))
        archivepages = search.findall(data)
        return cls.url + archivepages[-1]


class SailorsunOrg(_BasicScraper):
    """Scraper settings for http://sailorsun.org/."""

    url = 'http://sailorsun.org/'
    # Regex-escaped form of the base URL, shared by both patterns below.
    rurl = escape(url)
    stripUrl = url + '?p=%s'
    firstStripUrl = stripUrl % '21'
    imageSearch = compile(tagre("img", "src", r'(%scomics/[^"]+)' % rurl))
    prevSearch = compile(tagre("a", "href", r'(%s\?p=\d+)' % rurl, after="prev"))
    help = 'Index format: n (unpadded)'


class SamAndFuzzy(_BasicScraper):
    """Scraper settings for http://www.samandfuzzy.com/."""

    description = u"Serial about a cab driver and his bear-like friend by Sam Logan. Offers a reader's guide, forum, and frequently asked questions."
    url = 'http://www.samandfuzzy.com/'
    stripUrl = 'http://samandfuzzy.com/%s'
    firstStripUrl = stripUrl % '1'
    imageSearch = compile(r'(/comics/.+?)" alt')
    # NOTE(review): this pattern has no capture group for the previous-page
    # URL and looks truncated in this copy -- confirm against upstream.
    prevSearch = compile(r'"><<', IGNORECASE)
    help = 'Index format: nnn'
    # NOTE(review): the starter pattern mentions "SEXY LOSERS" and has no
    # capture group either; it may belong to a different scraper whose
    # definition was lost from this copy -- confirm against upstream.
    starter = indirectStarter(url,
        compile(r'SEXY LOSERS Latest SL Comic \(#\d+\)', IGNORECASE))

    @classmethod
    def namer(cls, imageUrl, pageUrl):
        """Return '<page basename>-<image basename>', both without extension.

        For each URL the last path segment is taken and cut at its first dot.
        """
        index = pageUrl.split('/')[-1].split('.')[0]
        title = imageUrl.split('/')[-1].split('.')[0]
        return index + '-' + title


# XXX site has been hacked
class _ShadowGirls(_BasicScraper):
    """Scraper settings for http://www.shadowgirlscomic.com/."""

    description = u"It's like H.P. Lovecraft meets the Gilmore Girls!"
    url = 'http://www.shadowgirlscomic.com/'
    stripUrl = url + 'comics/%s'
    firstStripUrl = stripUrl % 'book-1/chapter-1-broken-dreams/welcome'
    imageSearch = compile(tagre("img", "src", r'([^"]*/comics/[^"]*)'))
    prevSearch = compile(tagre("a", "href", r'([^"]*)', after='navi-prev'))
    help = 'Index format: custom'
    starter = indirectStarter(url,
        compile(tagre("a", "href", r'([^"]*/comics/[^"]+)')))


class Sheldon(_BasicScraper):
    """Scraper settings for http://www.sheldoncomics.com/."""

    description = u'The story of a software company tycoon billionaire ten-year-old, his grampa, his duck, his pug and a lizard.'
    url = 'http://www.sheldoncomics.com/'
    rurl = escape(url)
    stripUrl = url + 'archive/%s.html'
    firstStripUrl = stripUrl % '011130'
    imageSearch = compile(tagre("img", "src", r'(http://cdn\.sheldoncomics\.com/strips/[^"]+)'))
    prevSearch = compile(tagre("a", "href", r'(%sarchive/\d+\.html)' % rurl, after="sidenav-prev"))
    help = 'Index format: yymmdd'


class ShermansLagoon(_BasicScraper):
    """Scraper settings for http://shermanslagoon.com/."""

    description = u"Sherman's Lagoon by Jim Toomey"
    url = 'http://shermanslagoon.com/'
    rurl = escape(url)
    stripUrl = url + 'comics/%s'
    # No leading slash in the index: stripUrl already ends in 'comics/', and
    # a doubled slash would never equal a URL matched by prevSearch below.
    firstStripUrl = stripUrl % 'december-29-2003/'
    imageSearch = compile(tagre("img", "src", r'(http://safr\.kingfeatures\.com/idn/etv/zone/xml/content.php\?file=[^"]+)'))
    # NOTE(review): the literal below is non-ASCII and relies on the source
    # encoding declared at the top of the file.
    prevSearch = compile(tagre("a", "href", r'(%scomics/[^"]+/)' % rurl) + '« previous')
    starter = bounceStarter(url,
        compile(tagre("a", "href", r'(%scomics/[^"]+/)' % rurl, after="next")))

    @classmethod
    def namer(cls, imageUrl, pageUrl):
        """Return 'year-monthname-day' built from the page URL.

        The second-to-last '/'-separated part of ``pageUrl`` is expected to
        be ``monthname-day-year``; any other number of '-' parts raises
        ValueError from the unpacking.
        """
        name = pageUrl.split('/')[-2]
        # name is monthname-day-year
        month, day, year = name.split('-')
        return "%s-%s-%s" % (year, month, day)


class Shivae(_BasicScraper):
    """Scraper settings for http://shivae.net/."""

    url = 'http://shivae.net/'
    rurl = escape(url)
    stripUrl = url + 'blog/%s/'
    firstStripUrl = stripUrl % '2007/09/21/09212007'
    imageSearch = compile(tagre("img", "src", r'(%sfiles/comics/[^"]+)' % rurl))
    prevSearch = compile(tagre("a", "href", r'(%sblog/[^"]+)' % rurl, after="Previous"))
    help = 'Index format: yyyy/mm/dd/stripname'


# XXX disallowed by robots.txt
class _Shortpacked(_BasicScraper):
    """Scraper settings for http://www.shortpacked.com/."""

    url = 'http://www.shortpacked.com/'
    rurl = escape(url)
    stripUrl = url + '%s/'
    imageSearch = compile(tagre("img", "src", r'(%scomics/[^"]+)' % rurl))
    prevSearch = compile(tagre("a", "href", r'(%s\d+/comic/[^"]+)' % rurl, after="prev"))
    help = 'Index format: yyyy/comic/book-nn/mm-name1/name2'


class SinFest(_BasicScraper):
    """Scraper settings for http://www.sinfest.net/."""

    description = u'Strip dealing with contemporary issues and religion. Created by Tatsuya Ishida.'
    name = 'KeenSpot/SinFest'
    url = 'http://www.sinfest.net/'
    stripUrl = url + 'archive_page.php?comicID=%s'
    # NOTE(review): this pattern has no capture group and looks truncated in
    # this copy, and no prevSearch is defined in this class -- confirm both
    # against upstream.
    imageSearch = compile(r']+?>')
    help = 'Index format: yymmdd'


class SMBC(_BasicScraper):
    """Scraper settings for http://www.smbc-comics.com/."""

    description = u"Saturday Morning Breakfast Cereal"
    url = 'http://www.smbc-comics.com/'
    rurl = escape(url)
    stripUrl = url + '?id=%s'
    firstStripUrl = stripUrl % '1'
    imageSearch = compile(tagre("img", "src", r"(%scomics/\d{8}(?:\w2?|-\d)?\.\w{3})\s*" % rurl, quote="'"))
    prevSearch = compile(tagre("a", "href", r'([^"]+)#comic', after="backRollover"))
    help = 'Index format: nnnn'

    def shouldSkipUrl(self, url):
        """Skip promo or missing update pages."""
        return url in (
            self.stripUrl % '2865',
            self.stripUrl % '2653',
            self.stripUrl % '2424',
            self.stripUrl % '2226',
            self.stripUrl % '2069',
            self.stripUrl % '1895',
            self.stripUrl % '1896',
            self.stripUrl % '1589',
        )


class SnowFlakes(_BasicScraper):
    """Scraper settings for http://www.snowflakescomic.com/."""

    description = u'Snowflakes - A comic by James Ashby, Chris Jones and Zach Weiner.'
    url = 'http://www.snowflakescomic.com/'
    stripUrl = url + '?id=%s&sl=%s'
    firstStripUrl = stripUrl % ('103', '1')
    endOfLife = True
    # Two alternatives: a relative and an absolute image location.
    imageSearch = (
        compile(tagre("img", "src", r'(comics/[^"]+)')),
        compile(tagre("img", "src", r'(http://www.snowflakescomic.com/comics/[^"]+)')),
    )
    prevSearch = compile(tagre("a", "href", r'(/\?id=\d+\&sl=\d)', quote="") +
                         tagre("img", "src", r'images/nav_prior-ON\.gif'))
    help = 'Index format: number'

    @classmethod
    def starter(cls):
        """Return the fixed start URL (id 530, sl 5)."""
        return cls.stripUrl % ('530', '5')

    def getStripIndexUrl(self, index):
        """Return the strip URL for index, using its first character as 'sl'."""
        return self.stripUrl % (index, index[0])

    @classmethod
    def namer(cls, imageUrl, pageUrl):
        """Use strip index number for image name."""
        index = int(compile(r'id=(\d+)').search(pageUrl).group(1))
        ext = imageUrl.rsplit('.', 1)[1]
        return "SnowFlakes-%d.%s" % (index, ext)

    def shouldSkipUrl(self, url):
        """Skip pages without images."""
        return url in (
            self.stripUrl % ('279', '2'),  # no comic
            self.stripUrl % ('278', '2'),  # no comic
            self.stripUrl % ('277', '2'),  # no comic
            self.stripUrl % ('276', '2'),  # no comic
            self.stripUrl % ('275', '2'),  # no comic
            self.stripUrl % ('214', '2'),  # no comic
        )


class SnowFlame(_BasicScraper):
    """Scraper settings for http://www.snowflamecomic.com/."""

    description = u'The fan-comic series featuring "The Man Powered by Cocaine"'
    url = 'http://www.snowflamecomic.com/'
    rurl = escape(url)
    stripUrl = url + '?comic=snowflame-%s-%s'
    firstStripUrl = stripUrl % ('01', '01')
    imageSearch = compile(tagre("img", "src", r'(%swp-content/uploads/\d+/\d+/[^"]+)' % rurl, after="Snow[Ff]lame the fan made"))
    prevSearch = compile(tagre("span", "class", "mininav-prev") +
                         tagre("a", "href", r'(%s\?comic=snowflame[^"]+)' % rurl))
    starter = bounceStarter(url,
        compile(tagre("span", "class", "mininav-next") +
                tagre("a", "href", r'(%s\?comic=snowflame[^"]+)' % rurl)))
    help = 'Index format: chapter-page'

    def getStripIndexUrl(self, index):
        """Return the strip URL for a 'chapter-page' index.

        The index is split on '-' and the parts fill the two placeholders of
        ``stripUrl``. The parts must be passed as a tuple: the % operator
        treats a list as one single value and raises TypeError.
        """
        return self.stripUrl % tuple(index.split('-'))

    @classmethod
    def namer(cls, imageUrl, pageUrl):
        """Return '<chapter>-<page>-<image filename>'.

        Chapter and page are taken from the ``snowflame-<chapter>-<page>``
        part of ``pageUrl``; the filename is the last path segment of
        ``imageUrl``.
        """
        prefix, filename = imageUrl.rsplit('/', 1)
        ro = compile(r'snowflame-([^-]+)-([^-]+)')
        mo = ro.search(pageUrl)
        chapter = mo.group(1)
        page = mo.group(2)
        return "%s-%s-%s" % (chapter, page, filename)


class SodiumEyes(_BasicScraper):
    """Scraper settings for http://sodiumeyes.com/."""

    url = 'http://sodiumeyes.com/'
    rurl = escape(url)
    stripUrl = url + '%s/'
    firstStripUrl = stripUrl % '2007/11/08/damning-evidence'
    imageSearch = compile(tagre("img", "src", r'(%scomic/[^ ]+)' % rurl, quote=""))
    prevSearch = compile(tagre("a", "href", r'(%s[^"]+)' % rurl, after="prev"))
    help = 'Index format: yyyy/mm/dd/stripname'


class Sorcery101(_BasicScraper):
    """Scraper settings for http://www.sorcery101.net/sorcery-101/."""

    description = u'Welcome to the site of Kel McDonald, professional comic illustrator and writer.'
    baseUrl = 'http://www.sorcery101.net/'
    url = baseUrl + 'sorcery-101/'
    # Patterns are anchored on the site root, not on the comic sub-path.
    rurl = escape(baseUrl)
    stripUrl = url + '%s/'
    imageSearch = compile(tagre("img", "src", r'(%swp-content/uploads/\d+/\d+/[^"]+)' % rurl))
    prevSearch = compile(tagre("a", "href", r'(%ssorcery-101/[^"]+)' % rurl, after="previous-"))
    help = 'Index format: stripname'


class SpaceTrawler(_BasicScraper):
    """Scraper settings for http://spacetrawler.com/."""

    url = 'http://spacetrawler.com/'
    rurl = escape(url)
    stripUrl = url + '%s/'
    firstStripUrl = stripUrl % '2010/01/01/spacetrawler-4'
    imageSearch = compile(tagre("img", "src", r'(%scomics/[^"]+)' % rurl))
    prevSearch = compile(tagre("a", "href", r'(%s\d+/\d+/\d+/[^"]+)' % rurl, after="navi-prev"))
    help = 'Index format: yyyy/mm/dd/stripname'


class Spamusement(_BasicScraper):
    """Scraper settings for http://spamusement.com/."""

    description = u'Spamusement! Poorly-drawn cartoons inspired by actual spam subject lines!'
    url = 'http://spamusement.com/'
    rurl = escape(url)
    stripUrl = url + 'index.php/comics/view/%s'
    # NOTE(review): the pattern text looks truncated in this copy. As
    # written, r'' has no %s placeholder, so the % formatting raises
    # TypeError, and prevSearch (used by starter below) is never defined.
    # Restore both patterns from upstream before use.
    imageSearch = compile(r'' % rurl, IGNORECASE)
    help = 'Index format: n (unpadded)'
    starter = indirectStarter(url, prevSearch)


class SpareParts(_BasicScraper):
    """Scraper settings for http://www.sparepartscomics.com/."""

    description = u'Spare Parts by Terrence and Isabel Marks!'
    baseUrl = 'http://www.sparepartscomics.com/'
    url = baseUrl + 'comics/?date=20080328'
    stripUrl = baseUrl + 'comics/index.php?date=%s'
    firstStripUrl = stripUrl % '20031022'
    imageSearch = compile(tagre("img", "src", r'(http://www\.sparepartscomics\.com/comics/[^"]+)'))
    prevSearch = compile(tagre("a", "href", r'(index\.php\?date=\d+)', quote="'") + "Previous Comic")
    help = 'Index format: yyyymmdd'


class Spinnerette(_BasicScraper):
    """Scraper settings for http://www.spinnyverse.com/."""

    url = 'http://www.spinnyverse.com/'
    rurl = escape(url)
    stripUrl = url + '%s/'
    firstStripUrl = stripUrl % '2010/02/09/02092010'
    imageSearch = compile(tagre("img", "src", r'(%scomics/[^"]+)' % rurl))
    prevSearch = compile(tagre("a", "href", r'(%s[^"]+)' % rurl, before="Previous Comic"))
    help = 'Index format: number'


class SPQRBlues(_BasicScraper):
    """Scraper settings for http://spqrblues.com/IV/."""

    description = u"You can skip the next comic if you'd like to pass over the rest of this (very mildly) mature theme. I've tried to clarify the legalities as pointed out in the comments."
    url = 'http://spqrblues.com/IV/'
    rurl = escape(url)
    stripUrl = url + '?p=%s'
    firstStripUrl = stripUrl % '1467'
    imageSearch = compile(tagre("img", "src", r'(%scomics/\d+\.png)' % rurl))
    prevSearch = compile(tagre("a", "href", r'(%s\?p=\d+)' % rurl, after="prev"))
    help = 'Index format: number'


# XXX disallowed by robots.txt
class _StationV3(_BasicScraper):
    """Scraper settings for http://www.stationv3.com/."""

    url = 'http://www.stationv3.com/'
    rurl = escape(url)
    stripUrl = url + 'd/%s.html'
    imageSearch = compile(tagre("img", "src", r'(%scomics/[^"]+)' % rurl))
    prevSearch = compile(tagre("a", "href", r'(%sd/\d+\.html)' % rurl) +
                         tagre("img", "src", r'http://www\.stationv3\.com/images/previous\.gif'))
    help = 'Index format: yyyymmdd'


class StickyDillyBuns(_BasicScraper):
    """Scraper settings for http://www.stickydillybuns.com/."""

    url = 'http://www.stickydillybuns.com/'
    stripUrl = url + 'strips-sdb/%s'
    firstStripUrl = stripUrl % 'awesome_leading_man'
    imageSearch = compile(tagre("img", "src", r'([^"]*/comics/[^"]+)'))
    prevSearch = compile(tagre("a", "href", r'([^"]*/strips-sdb/[^"]+)', before="cn[id]prev"))
    help = 'Index format: name'


class Stubble(_BasicScraper):
    """Scraper settings for http://stubblecomics.com/."""

    url = 'http://stubblecomics.com/'
    rurl = escape(url)
    stripUrl = url + '?p=%s'
    firstStripUrl = stripUrl % '4'
    imageSearch = compile(
        tagre("img", "src", r'(%scomics/[^"]+)' % rurl))
    prevSearch = compile(
        tagre("a", "href", r'(%s\?p=\d+)' % rurl, after="navi-prev"))
    help = 'Index format: number'


class StuffNoOneToldMe(_BasicScraper):
    """Scraper settings for http://www.snotm.com/."""

    description = u"Everyday's life advices in the shape of witty and humorous cartoons."
    url = 'http://www.snotm.com/'
    stripUrl = url + '%s.html'
    firstStripUrl = stripUrl % '2010/05/01'
    # Link pattern shared by the starter and the previous-page search.
    olderHref = r"(http://www\.snotm\.com/\d+/\d+/[^']+\.html)"
    starter = indirectStarter(url,
        compile(tagre("a", "href", olderHref, quote="'")))
    # NOTE(review): the suffixes appended to the three image patterns look
    # truncated in this copy (empty alternatives, an empty string) -- confirm
    # against upstream. The first suffix contains a literal line break in the
    # source text, written here as "\n" so the pattern value is unchanged.
    imageSearch = (
        compile(tagre("img", "src", r'(http://i\.imgur\.com/[^"]+)') + "(?:|\n)"),
        compile(tagre("img", "src", r'(http://\d+\.bp\.blogspot\.com/[^"]+)') + r"(?:(?: )?|)"),
        compile(tagre("img", "src", r'(https://lh\d+\.googleusercontent\.com/[^"]+)') + r""),
    )
    prevSearch = compile(
        tagre("a", "href", olderHref, quote="'", before="older-link"))
    multipleImagesPerStrip = True
    help = 'Index format: yyyy/mm/stripname'

    @classmethod
    def namer(cls, imageUrl, pageUrl):
        """Build 'year-month-stripname-imagename' from the two URLs.

        Year, month and strip name are the last three path segments of
        ``pageUrl`` (strip name without its extension); the image name is the
        last path segment of ``imageUrl``.
        """
        _, year, month, page = pageUrl.rsplit('/', 3)
        stripname = page.rsplit('.', 1)[0]
        _, imagename = imageUrl.rsplit('/', 1)
        return '-'.join((year, month, stripname, imagename))

    def shouldSkipUrl(self, url):
        """Tell whether url is one of the known pages without a comic image."""
        skipped = (
            self.stripUrl % '2012/08/self-rant',  # no comic
            self.stripUrl % '2012/06/if-you-wonder-where-ive-been',  # video
            self.stripUrl % '2011/10/i-didnt-make-this-nor-have-anything-to',  # video
            self.stripUrl % '2010/12/first-snotm-fans-in-sao-paulo',  # no comic
            self.stripUrl % '2010/11/ear-infection',  # no comic
        )
        return url in skipped


class StrawberryDeathCake(_BasicScraper):
    """Scraper settings for http://strawberrydeathcake.com/."""

    description = u"Update2 I'm alive and still working on the comic, but progress has been slow. I'm inching my way through sketches. Update-A little break from the comic."
    url = 'http://strawberrydeathcake.com/'
    rurl = escape(url)
    stripUrl = url + 'archive/%s/'
    imageSearch = compile(
        tagre("img", "src", r'(%swp-content/webcomic/[^"]+)' % rurl))
    prevSearch = compile(
        tagre("a", "href", r'(%sarchive/[^"]+)' % rurl, after="previous"))
    help = 'Index format: stripname'


class SuburbanTribe(_BasicScraper):
    """Scraper settings for http://www.pixelwhip.com/."""

    url = 'http://www.pixelwhip.com/'
    rurl = escape(url)
    stripUrl = url + '?p=%s'
    imageSearch = compile(
        tagre("img", "src", r'(%scomics/[^"]+)' % rurl))
    prevSearch = compile(
        tagre("a", "href", r'(%s\?p=\d+)' % rurl, after="prev"))
    help = 'Index format: nnnn'


class SomethingPositive(_BasicScraper):
    """Scraper settings for http://www.somethingpositive.net/."""

    url = 'http://www.somethingpositive.net/'
    stripUrl = url + 'sp%s.shtml'
    # Two alternative image file names.
    imageSearch = (
        compile(tagre("img", "src", r'(sp\d+\.png)')),
        compile(tagre("img", "src", r'(twither\.gif)')),
    )
    # A strip link followed by either the "previous" image or the plain word.
    prevSearch = compile(
        tagre("a", "href", r'(sp\d+\.shtml)') +
        "(?:" + tagre("img", "src", r'images/previous\.gif') + "|Previous)")
    help = 'Index format: mmddyyyy'


class StarCrossdDestiny(_BasicScraper):
    """Scraper settings for http://www.starcrossd.net/comic.html."""

    description = u'Furturistic fantasy. A group of outcasts fight to survive in a world that shuns them as freaks.'
    baseUrl = 'http://www.starcrossd.net/'
    rurl = escape(baseUrl)
    url = baseUrl + 'comic.html'
    stripUrl = baseUrl + 'archives/%s.html'
    firstStripUrl = stripUrl % '00000001'
    imageSearch = compile(
        tagre("img", "src", r'(http://(?:www\.)?starcrossd\.net/(?:ch1|strips|book2)/[^"]+)'))
    # NOTE(review): this pattern looks truncated in this copy; it contains no
    # %s placeholder although it is formatted with rurl -- confirm against
    # upstream.
    prevSearch = compile(r']*"[^"]*"[^>]*>prev' % rurl, IGNORECASE)
    help = 'Index format: nnnnnnnn'

    @classmethod
    def namer(cls, imageUrl, pageUrl):
        """Return '<directory>-<file stem>' for the image URL.

        URLs without 'ch1' get 'strips' / 'images' rewritten to 'book1';
        URLs with 'ch1' that also contain 'strips' lose the 'strips/' part.
        The last two path segments are then joined, dropping the extension.
        """
        if 'ch1' not in imageUrl:
            # Images first lived in a strips/ directory; that changed when
            # book2 was introduced.
            imageUrl = sub('(?:strips)|(?:images)', 'book1', imageUrl)
        elif 'strips' in imageUrl:
            imageUrl = imageUrl.replace('strips/', '')
        directory, filename = imageUrl.split('/')[-2:]
        stem = splitext(filename)[0]
        return directory + '-' + stem


# XXX disallowed by robots.txt
class _StrangeCandy(_BasicScraper):
    """Scraper settings for http://www.strangecandy.net/."""

    url = 'http://www.strangecandy.net/'
    stripUrl = url + 'd/%s.html'
    imageSearch = compile(
        tagre("img", "src", r'(/comics/\d+\.jpg)'))
    prevSearch = compile(
        tagre("a", "href", r'(/d/\d+\.html)') +
        tagre("img", "alt", "Previous comic"))
    help = 'Index format: yyyyddmm'


class SupernormalStep(_BasicScraper):
    """Scraper settings for http://supernormalstep.com/."""

    description = u'Supernormal Step - Magic, Face Punching, and a Robot or Two'
    url = 'http://supernormalstep.com/'
    rurl = escape(url)
    stripUrl = url + '?p=%s'
    firstStripUrl = stripUrl % '8'
    imageSearch = compile(
        tagre("img", "src", r'(%scomics/[^"]+)' % rurl))
    prevSearch = compile(
        tagre("a", "href", r'(%s\?p=\d+)' % rurl, after="prev"))
    help = 'Index format: number'