# -*- coding: iso-8859-1 -*-
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2013 Bastian Kleineidam

import os
import time

import requests

from . import loader, configuration
from .util import (fetchUrl, fetchUrls, fetchText, getPageContent,
    makeSequence, get_system_uid, urlopen, getDirname, unescape)
from .comic import ComicStrip
from .output import out
from .events import getHandler


class Genre:
    """Genre of a comic strip."""
    adventure = u"Adventure"
    crazy = u"Crazy"
    drama = u"Drama"
    fantasy = u"Fantasy"
    gaming = u"Gaming"
    humor = u"Humor"
    reallife = u"Real life"
    scifi = u"Sci-fi"
    other = u"Other"


class _BasicScraper(object):
    '''Base class with scrape functions for comics.'''

    # The URL for the comic strip
    url = None

    # A string that is interpolated with the strip index to yield the URL
    # for a particular strip.
    stripUrl = None

    # Stop searching for previous URLs at this URL
    firstStripUrl = None

    # if more than one image per URL is expected
    multipleImagesPerStrip = False

    # set to False if previous URLs do not match the strip URL (i.e. because of redirects)
    prevUrlMatchesStripUrl = True

    # set to True if this comic contains adult content
    adult = False

    # set to True if this comic will not get updated anymore
    endOfLife = False

    # a description of the comic contents
    description = u''

    # language of the comic (two-letter ISO 639-1 code)
    lang = 'en'

    # list of genres for this comic strip
    genres = (Genre.other,)

    # compiled regular expression that will locate the URL for the previous strip in a page;
    # this can also be a list or tuple of compiled regular expressions
    prevSearch = None

    # compiled regular expression that will locate the strip image URLs in a page;
    # this can also be a list or tuple of compiled regular expressions
    imageSearch = None

    # compiled regular expression to store a text together with the image;
    # sometimes comic strips have additional text info for each comic
    textSearch = None

    # help text, usually describing the index format
    help = ''

    # wait time in seconds between downloading comic strips
    waitSeconds = 0

    # HTTP session storing cookies
    session = requests.session()
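
    # A concrete comic module usually just fills in the class attributes
    # above. Illustrative sketch only (hypothetical comic, regular
    # expressions invented for this comment):
    #
    #   import re
    #
    #   class HypotheticalComic(_BasicScraper):
    #       url = 'http://comic.example.com/'
    #       stripUrl = url + 'strips/%s.html'
    #       firstStripUrl = stripUrl % '1'
    #       imageSearch = re.compile(r'<img src="(/strips/[^"]+)"')
    #       prevSearch = re.compile(r'<a href="(/strips/[^"]+)">Previous')
    #       help = 'Index format: number'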

    def __init__(self, indexes=None):
        """Initialize internal variables."""
        self.urls = set()
        if indexes:
            self.indexes = tuple(sorted(indexes))
        else:
            self.indexes = tuple()
        self.skippedUrls = set()
        self.hitFirstStripUrl = False

    def __cmp__(self, other):
        """Compare scraper by name and index list."""
        if not isinstance(other, _BasicScraper):
            return 1
        # first, order by name
        d = cmp(self.getName(), other.getName())
        if d != 0:
            return d
        # then by indexes
        return cmp(self.indexes, other.indexes)

    def __hash__(self):
        """Get hash value from name and index list."""
        return hash((self.getName(), self.indexes))

    def shouldSkipUrl(self, url):
        """Determine if search for images in given URL should be skipped."""
        return False
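
    # Subclasses may override shouldSkipUrl() to skip known non-comic pages.
    # Illustrative sketch only (hypothetical guest-week indexes):
    #
    #   def shouldSkipUrl(self, url):
    #       """Skip pages without comic images."""
    #       return url in (self.stripUrl % 'guest1', self.stripUrl % 'guest2')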

    def getComicStrip(self, url, data, baseUrl):
        """Get comic strip downloader for given URL and data."""
        imageUrls = fetchUrls(url, data, baseUrl, self.imageSearch)
        # map modifier function on image URLs
        imageUrls = [self.imageUrlModifier(x, data) for x in imageUrls]
        # remove duplicate URLs
        imageUrls = set(imageUrls)
        if len(imageUrls) > 1 and not self.multipleImagesPerStrip:
            patterns = [x.pattern for x in makeSequence(self.imageSearch)]
            out.warn(u"found %d images instead of 1 at %s with patterns %s" % (len(imageUrls), url, patterns))
            image = sorted(imageUrls)[0]
            out.warn(u"choosing image %s" % image)
            imageUrls = (image,)
        elif not imageUrls:
            patterns = [x.pattern for x in makeSequence(self.imageSearch)]
            out.warn(u"found no images at %s with patterns %s" % (url, patterns))
        if self.textSearch:
            text = fetchText(url, data, self.textSearch)
            if text:
                text = unescape(text).strip()
        else:
            text = None
        return ComicStrip(self.getName(), url, imageUrls, self.namer, self.session, text=text)

    def getStrips(self, maxstrips=None):
        """Get comic strips."""
        if maxstrips:
            word = u"strip" if maxstrips == 1 else u"strips"
            msg = u'Retrieving %d %s' % (maxstrips, word)
        else:
            msg = u'Retrieving all strips'
        if self.indexes:
            if len(self.indexes) == 1:
                msg += u" for index %s" % self.indexes[0]
            else:
                msg += u" for indexes %s" % self.indexes
            # Always call starter() since it might initialize cookies.
            # See for example the Oglaf comic.
            self.starter()
            urls = [self.getIndexStripUrl(index) for index in self.indexes]
        else:
            urls = [self.getLatestUrl()]
        if self.adult:
            msg += u" (including adult content)"
        out.info(msg)
        for url in urls:
            for strip in self.getStripsFor(url, maxstrips):
                yield strip

    def getStripsFor(self, url, maxstrips):
        """Get comic strips for a URL. If maxstrips is a positive number, stop after
        retrieving the given number of strips."""
        self.hitFirstStripUrl = False
        seen_urls = set()
        while url:
            out.info(u'Get strip URL %s' % url, level=1)
            data, baseUrl = getPageContent(url, self.session)
            if self.shouldSkipUrl(url):
                out.info(u'Skipping URL %s' % url)
                self.skippedUrls.add(url)
            else:
                try:
                    yield self.getComicStrip(url, data, baseUrl)
                except ValueError as msg:
                    # image not found
                    out.exception(msg)
            if self.firstStripUrl == url:
                out.debug(u"Stop at first URL %s" % url)
                self.hitFirstStripUrl = True
                break
            if maxstrips is not None:
                maxstrips -= 1
                if maxstrips <= 0:
                    break
            prevUrl = self.getPrevUrl(url, data, baseUrl)
            seen_urls.add(url)
            if prevUrl in seen_urls:
                # avoid recursive URL loops
                out.warn(u"Already seen previous URL %r" % prevUrl)
                break
            url = prevUrl
            if url and self.waitSeconds:
                time.sleep(self.waitSeconds)

    def getPrevUrl(self, url, data, baseUrl):
        """Find previous URL."""
        prevUrl = None
        if self.prevSearch:
            try:
                prevUrl = fetchUrl(url, data, baseUrl, self.prevSearch)
            except ValueError as msg:
                # assume there is no previous URL, but print a warning
                out.warn(u"%s Assuming no previous comic strips exist." % msg)
            else:
                prevUrl = self.prevUrlModifier(prevUrl)
                out.debug(u"Matched previous URL %s" % prevUrl)
                getHandler().comicPageLink(self.getName(), url, prevUrl)
        return prevUrl

    def getIndexStripUrl(self, index):
        """Get comic strip URL from index."""
        return self.stripUrl % index
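
    # getIndexStripUrl() simply interpolates the index into stripUrl. For
    # example, with the hypothetical values from the sketch above:
    #
    #   stripUrl = 'http://comic.example.com/strips/%s.html'
    #   scraper.getIndexStripUrl('42')
    #   # -> 'http://comic.example.com/strips/42.html'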

    @classmethod
    def getName(cls):
        """Get scraper name."""
        if hasattr(cls, 'name'):
            return cls.name
        return cls.__name__

    @classmethod
    def starter(cls):
        """Get starter URL from where to scrape comic strips."""
        return cls.url

    @classmethod
    def namer(cls, imageUrl, pageUrl):
        """Return filename for given image and page URL."""
        return None
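
    # Subclasses can override namer() when image filenames on the server are
    # not unique. Illustrative sketch only (hypothetical URL layout):
    #
    #   @classmethod
    #   def namer(cls, imageUrl, pageUrl):
    #       """Prefix the image name with the strip index from the page URL."""
    #       index = pageUrl.rsplit('/', 1)[-1]
    #       image = imageUrl.rsplit('/', 1)[-1]
    #       return '%s_%s' % (index, image)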

    @classmethod
    def prevUrlModifier(cls, prevUrl):
        """Optional modification of parsed previous URLs. Useful if
        there are domain redirects. The default implementation does
        not modify the URL.
        """
        return prevUrl
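
    # Illustrative sketch of a prevUrlModifier override for a comic whose
    # pages still link to an old domain that redirects (hypothetical domains):
    #
    #   @classmethod
    #   def prevUrlModifier(cls, prevUrl):
    #       if prevUrl:
    #           return prevUrl.replace('old.example.com', 'www.example.com')
    #       return prevUrl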

    @classmethod
    def imageUrlModifier(cls, imageUrl, data):
        """Optional modification of parsed image URLs. Useful if the URL
        needs to be fixed before usage. The default implementation does
        not modify the URL. The given data is the URL page data.
        """
        return imageUrl
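
    # Illustrative sketch of an imageUrlModifier override that picks a
    # high-resolution variant when the page data mentions one (entirely
    # hypothetical markup):
    #
    #   @classmethod
    #   def imageUrlModifier(cls, imageUrl, data):
    #       if imageUrl and 'hires' in data:
    #           return imageUrl.replace('/low/', '/hires/')
    #       return imageUrl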

    def getLatestUrl(self):
        """Get starter URL from where to scrape comic strips."""
        return self.starter()

    @classmethod
    def vote(cls):
        """Cast a public vote for this comic."""
        url = configuration.VoteUrl + 'count/'
        uid = get_system_uid()
        data = {"name": cls.getName().replace('/', '_'), "uid": uid}
        page = urlopen(url, cls.session, data=data)
        return page.text

    def getCompleteFile(self, basepath):
        """Get filename indicating all comics are downloaded."""
        dirname = getDirname(self.getName())
        return os.path.join(basepath, dirname, "complete.txt")

    def isComplete(self, basepath):
        """Check if all comics are downloaded."""
        return os.path.isfile(self.getCompleteFile(basepath))

    def setComplete(self, basepath):
        """Set the complete flag for this comic, i.e. mark all comics as downloaded."""
        if self.endOfLife:
            filename = self.getCompleteFile(basepath)
            if not os.path.exists(filename):
                with open(filename, 'w') as f:
                    f.write('All comics should be downloaded here.')
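

# Driving the class directly (illustrative sketch; HypotheticalComic is the
# invented subclass from the comment near the class attributes above, and the
# getImages()/save() calls assume the ComicStrip API from .comic):
#
#   scraper = HypotheticalComic(indexes=['41', '42'])
#   for strip in scraper.getStrips():
#       for image in strip.getImages():
#           image.save('/tmp/comics')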


def find_scraperclasses(comic, multiple_allowed=False):
    """Get a list of comic scraper classes. Can return more than one entry if
    multiple_allowed is True, else it raises a ValueError if multiple
    modules match. The match is a case-insensitive substring search."""
    if not comic:
        raise ValueError("empty comic name")
    candidates = []
    cname = comic.lower()
    for scraperclass in get_scraperclasses():
        lname = scraperclass.getName().lower()
        if lname == cname:
            # perfect match
            if not multiple_allowed:
                return [scraperclass]
            else:
                candidates.append(scraperclass)
        elif cname in lname:
            candidates.append(scraperclass)
    if len(candidates) > 1 and not multiple_allowed:
        comics = ", ".join(x.getName() for x in candidates)
        raise ValueError('multiple comics found: %s' % comics)
    elif not candidates:
        raise ValueError('comic %r not found' % comic)
    return candidates
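
# Example lookups (hypothetical scraper names, shown only to illustrate the
# matching rules):
#
#   find_scraperclasses('examplecomic')
#   # -> [ExampleComic]  (exact name match wins immediately)
#   find_scraperclasses('example', multiple_allowed=True)
#   # -> every scraper class whose name contains 'example'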


_scraperclasses = None

def get_scraperclasses():
    """Find all comic scraper classes in the plugins directory.
    The result is cached.
    @return: list of _BasicScraper classes
    @rtype: list of _BasicScraper
    """
    global _scraperclasses
    if _scraperclasses is None:
        out.debug(u"Loading comic modules...")
        modules = loader.get_modules('plugins')
        plugins = loader.get_plugins(modules, _BasicScraper)
        _scraperclasses = list(plugins)
        check_scrapers()
        out.debug(u"... %d modules loaded." % len(_scraperclasses))
    return _scraperclasses


def check_scrapers():
    """Check for duplicate scraper class names."""
    d = {}
    for scraperclass in _scraperclasses:
        name = scraperclass.getName().lower()
        if name in d:
            name1 = scraperclass.getName()
            name2 = d[name].getName()
            raise ValueError('duplicate scrapers %s and %s found' % (name1, name2))
        d[name] = scraperclass


def make_scraper(classname, **attributes):
    """Make a new scraper class with given name and attributes."""
    return type(classname, (_BasicScraper,), attributes)
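
# make_scraper() is handy for generating many similar scraper classes from a
# template, e.g. one class per comic hosted on the same site. Illustrative
# sketch (hypothetical site, regular expressions invented for this comment):
#
#   import re
#
#   def add(name):
#       classname = 'Example_%s' % name
#       url = 'http://comics.example.com/%s/' % name
#       globals()[classname] = make_scraper(classname,
#           name=classname,
#           url=url,
#           stripUrl=url + '%s/',
#           imageSearch=re.compile(r'<img src="([^"]+/strips/[^"]+)"'),
#           prevSearch=re.compile(r'<a href="([^"]+)" rel="prev"'),
#           help='Index format: number',
#       )
#
#   for name in ('alpha', 'beta'):
#       add(name)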