# -*- coding: utf-8 -*-
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2020 Tobias Gruetzmacher
import html
import os
import re
from urllib.parse import urljoin

import lxml
from lxml.html.defs import link_attrs as html_link_attrs

try:
    import cssselect
except ImportError:
    cssselect = None

try:
    import pycountry
except ImportError:
    pycountry = None

from . import configuration, http, languages, loader
from .util import (get_page, makeSequence, get_system_uid, tagre, normaliseURL,
                   prettyMatcherList, uniq)
from .comic import ComicStrip
from .output import out
from .events import getHandler


ARCHIVE_ORG_URL = re.compile(r'https?://web\.archive\.org/web/[^/]*/')
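# A quick illustration of what this pattern is used for (the Wayback Machine
# URL below is invented for the example): stripping the changing snapshot
# prefix so two captures of the same page compare equal.
#
#   ARCHIVE_ORG_URL.sub('', 'https://web.archive.org/web/20200412000000/https://example.com/comic/1/')
#   # -> 'https://example.com/comic/1/'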


class Scraper(object):
    '''Base class for all comic scrapers, but without a specific scrape
    implementation.'''

    # The URL for the comic strip
    url = None

    # A string that is interpolated with the strip index to yield the URL for a
    # particular strip.
    stripUrl = None

    # Stop searching for previous URLs at this URL
    firstStripUrl = None

    # if more than one image per URL is expected
    multipleImagesPerStrip = False

    # set to True if this comic contains adult content
    adult = False

    # set to True if this comic will not get updated anymore
    endOfLife = False

    # language of the comic (two-letter ISO 639-1 code)
    lang = 'en'

    # an expression that will locate the URL for the previous strip in a page
    # this can also be a list or tuple
    prevSearch = None

    # an expression that will locate the strip image URLs in a page
    # this can also be a list or tuple
    imageSearch = None

    # an expression to store a text together with the image
    # sometimes comic strips have additional text info for each comic
    textSearch = None

    # Is the additional text required or optional? When it is required (the
    # default), you see an error message whenever a comic page is encountered
    # that does not have the text
    textOptional = False

    # usually the index format help
    help = ''

    # Specifies a list of HTTP error codes which should be handled as a
    # successful request. This is a workaround for some comics which return
    # regular pages with strange HTTP codes. By default, all HTTP errors raise
    # exceptions.
    allow_errors = ()

    # HTTP session for configuration & cookies
    session = http.default_session
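
    # A hedged sketch of how a comic module typically fills in these class
    # attributes (the module name, URLs and expressions below are invented for
    # illustration, not taken from the bundled plugins):
    #
    #   class FooComic(_ParserScraper):
    #       url = 'https://foocomic.example.com/'
    #       stripUrl = url + 'comic/%s'
    #       firstStripUrl = stripUrl % '1'
    #       imageSearch = '//div[@id="comic"]//img'
    #       prevSearch = '//a[@rel="prev"]'
    #       help = 'Index format: number'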

    @classmethod
    def getmodules(cls):
        name = cls.__name__
        if hasattr(cls, 'name'):
            name = cls.name
        return [cls(name)]

    @property
    def indexes(self):
        return self._indexes

    @indexes.setter
    def indexes(self, val):
        if val:
            self._indexes = tuple(sorted(val))

    def __init__(self, name):
        """Initialize internal variables."""
        self.name = name
        self.urls = set()
        self._indexes = tuple()
        self.skippedUrls = set()
        self.hitFirstStripUrl = False

    def __hash__(self):
        """Get hash value from name and index list."""
        return hash((self.name, self.indexes))

    def shouldSkipUrl(self, url, data):
        """Determine if search for images in given URL should be skipped."""
        return False

    def getComicStrip(self, url, data):
        """Get comic strip downloader for given URL and data."""
        imageUrls = self.fetchUrls(url, data, self.imageSearch)
        # map modifier function on image URLs
        imageUrls = [self.imageUrlModifier(x, data) for x in imageUrls]
        # remove duplicate URLs
        imageUrls = uniq(imageUrls)
        if len(imageUrls) > 1 and not self.multipleImagesPerStrip:
            out.warn(
                u"Found %d images instead of 1 at %s with expressions %s" %
                (len(imageUrls), url, prettyMatcherList(self.imageSearch)))
            image = imageUrls[0]
            out.warn(u"Choosing image %s" % image)
            imageUrls = (image,)
        elif not imageUrls:
            out.warn(u"Found no images at %s with expressions %s" % (url,
                     prettyMatcherList(self.imageSearch)))
        if self.textSearch:
            text = self.fetchText(url, data, self.textSearch,
                                  optional=self.textOptional)
        else:
            text = None
        return ComicStrip(self, url, imageUrls, text=text)

    def getStrips(self, maxstrips=None):
        """Get comic strips."""
        if maxstrips:
            word = u"strip" if maxstrips == 1 else "strips"
            msg = u'Retrieving %d %s' % (maxstrips, word)
        else:
            msg = u'Retrieving all strips'
        if self.indexes:
            if len(self.indexes) == 1:
                msg += u" for index %s" % self.indexes[0]
            else:
                msg += u" for indexes %s" % self.indexes
            # Always call starter() since it might initialize cookies.
            # See for example Oglaf comic.
            self.starter()
            urls = [self.getIndexStripUrl(index) for index in self.indexes]
        else:
            urls = [self.starter()]
        if self.adult:
            msg += u" (including adult content)"
        out.info(msg)
        for url in urls:
            for strip in self.getStripsFor(url, maxstrips):
                yield strip
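
    # Traversal sketch for the method below: starting from starter() (usually
    # the newest strip), getStripsFor() downloads a page, yields its
    # ComicStrip, then follows prevSearch to the previous page until it
    # reaches firstStripUrl, exhausts maxstrips, or sees a URL twice.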

    def getStripsFor(self, url, maxstrips):
        """Get comic strips for a URL. If maxstrips is a positive number,
        stop after retrieving the given number of strips."""
        self.hitFirstStripUrl = False
        seen_urls = set()
        while url:
            out.info(u'Get strip URL %s' % url, level=1)
            data = self.getPage(url)
            if self.shouldSkipUrl(url, data):
                out.info(u'Skipping URL %s' % url)
                self.skippedUrls.add(url)
            else:
                try:
                    yield self.getComicStrip(url, data)
                except ValueError as msg:
                    # image not found
                    out.exception(msg)
            if self.isfirststrip(url):
                out.debug(u"Stop at first URL %s" % url)
                self.hitFirstStripUrl = True
                break
            if maxstrips is not None:
                maxstrips -= 1
                if maxstrips <= 0:
                    break
            prevUrl = self.getPrevUrl(url, data)
            seen_urls.add(url)
            if prevUrl in seen_urls:
                # avoid recursive URL loops
                out.warn(u"Already seen previous URL %r" % prevUrl)
                break
            url = prevUrl

    def isfirststrip(self, url):
        """Check if the specified URL is the first strip of a comic. This is
        especially relevant for comics taken from archive.org, since the base
        URL of archive.org changes whenever pages are taken from a different
        snapshot."""
        if not self.firstStripUrl:
            return False
        firsturl = ARCHIVE_ORG_URL.sub('', self.firstStripUrl)
        currenturl = ARCHIVE_ORG_URL.sub('', url)
        return firsturl == currenturl

    def getPrevUrl(self, url, data):
        """Find previous URL."""
        prevUrl = None
        if self.prevSearch:
            try:
                prevUrl = self.fetchUrl(url, data, self.prevSearch)
            except ValueError as msg:
                # assume there is no previous URL, but print a warning
                out.warn(u"%s Assuming no previous comic strips exist." % msg)
            else:
                prevUrl = self.link_modifier(url, prevUrl)
                out.debug(u"Found previous URL %s" % prevUrl)
                getHandler().comicPageLink(self, url, prevUrl)
        return prevUrl

    def getIndexStripUrl(self, index):
        """Get comic strip URL from index."""
        return self.stripUrl % index

    def starter(self):
        """Get starter URL from where to scrape comic strips."""
        return self.url

    def namer(self, image_url, page_url):
        """Return filename for given image and page URL."""
        return None
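
    # A purely illustrative override a comic module might ship to build more
    # stable file names (the URL layout is hypothetical):
    #
    #   def namer(self, image_url, page_url):
    #       # page .../comic/2020-04-12 plus image .../uploads/abc123.png
    #       # becomes '2020-04-12_abc123.png'
    #       return '%s_%s' % (page_url.rsplit('/', 1)[-1],
    #                         image_url.rsplit('/', 1)[-1])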

    def link_modifier(self, fromurl, tourl):
        """Optional modification of parsed link (previous/back/latest) URLs.
        Useful if there are domain redirects. The default implementation does
        not modify the URL.
        """
        return tourl

    def imageUrlModifier(self, image_url, data):
        """Optional modification of parsed image URLs. Useful if the URL
        needs to be fixed before usage. The default implementation does
        not modify the URL. The given data is the URL page data.
        """
        return image_url

    def vote(self):
        """Cast a public vote for this comic."""
        uid = get_system_uid()
        data = {"name": self.name.replace('/', '_'), "uid": uid}
        response = self.session.post(configuration.VoteUrl, data=data)
        response.raise_for_status()

    def get_download_dir(self, basepath):
        """Try to find the correct download directory, ignoring case
        differences."""
        path = basepath
        for part in self.name.split('/'):
            done = False
            if (os.path.isdir(path) and
               not os.path.isdir(os.path.join(path, part))):
                for entry in os.listdir(path):
                    if (entry.lower() == part.lower() and
                       os.path.isdir(os.path.join(path, entry))):
                        path = os.path.join(path, entry)
                        done = True
                        break
            if not done:
                path = os.path.join(path, part)
        return path
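
    # Illustration with hypothetical paths: for a scraper named 'Foo/Bar' and
    # an existing directory tree 'Comics/foo/bar', the method above returns
    # 'Comics/foo/bar' instead of creating 'Comics/Foo/Bar' next to it.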

    def getCompleteFile(self, basepath):
        """Get filename indicating all comics are downloaded."""
        dirname = self.get_download_dir(basepath)
        return os.path.join(dirname, "complete.txt")

    def isComplete(self, basepath):
        """Check if all comics are downloaded."""
        return os.path.isfile(self.getCompleteFile(basepath))

    def setComplete(self, basepath):
        """Set complete flag for this comic, i.e. all comics are downloaded."""
        if self.endOfLife:
            filename = self.getCompleteFile(basepath)
            if not os.path.exists(filename):
                with open(filename, 'w') as f:
                    f.write('All comics should be downloaded here.')

    def getPage(self, url):
        """
        Fetch a page and return the opaque representation for the data
        parameter of fetchUrls and fetchText.

        Implementation note: While this base class does not restrict how the
        returned data is structured, subclasses (specific scrapers) should
        specify how this data works, since the structure is passed into
        methods which can be overridden by comic modules, and those methods
        should be able to use the data if they so desire. (Affected methods:
        shouldSkipUrl, imageUrlModifier)
        """
        return get_page(url, self.session, allow_errors=self.allow_errors)

    def fetchUrls(self, url, data, urlsearch):
        raise ValueError("No implementation for fetchUrls!")

    def fetchUrl(self, url, data, urlsearch):
        return self.fetchUrls(url, data, urlsearch)[0]

    def fetchText(self, url, data, textsearch, optional):
        raise ValueError("No implementation for fetchText!")

    def getDisabledReasons(self):
        """
        Get a dict of reasons why this comic module is disabled. The key is a
        short (unique) identifier, the value is a string explaining why the
        module is deactivated. If the module is not disabled, just return an
        empty dict.
        """
        return {}

    def language(self):
        """
        Return language of the comic as a human-readable language name instead
        of a 2-character ISO 639-1 code.
        """
        lang = 'Unknown (%s)' % self.lang
        if pycountry is None:
            if self.lang in languages.Languages:
                lang = languages.Languages[self.lang]
        else:
            try:
                lang = pycountry.languages.get(alpha_2=self.lang).name
            except KeyError:
                try:
                    lang = pycountry.languages.get(alpha2=self.lang).name
                except KeyError:
                    pass
        return lang


class _BasicScraper(Scraper):
    """
    Scraper base class that matches regular expressions against HTML pages.

    Subclasses of this scraper should use compiled regular expressions as
    values for prevSearch, imageSearch and textSearch.

    Implementation note: The return value of getPage is a tuple: the first
    element is the raw HTML page text, the second element is the base URL (if
    any).
    """

    BASE_SEARCH = re.compile(tagre("base", "href", '([^"]*)'))
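
    # A hedged example of the kind of compiled expressions a regex-based
    # module would supply (the URLs below are invented for the example):
    #
    #   imageSearch = re.compile(tagre("img", "src",
    #                                  r'(https://example\.com/strips/[^"]+)'))
    #   prevSearch = re.compile(tagre("a", "href", r'(/comic/\d+)'))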

    def getPage(self, url):
        content = super(_BasicScraper, self).getPage(url).text
        # determine base URL
        baseUrl = None
        match = self.BASE_SEARCH.search(content)
        if match:
            baseUrl = match.group(1)
        else:
            baseUrl = url
        return (content, baseUrl)

    def fetchUrls(self, url, data, urlSearch):
        """Search all entries for given URL pattern(s) in an HTML page."""
        searchUrls = []
        searches = makeSequence(urlSearch)
        for search in searches:
            for match in search.finditer(data[0]):
                searchUrl = match.group(1)
                if not searchUrl:
                    raise ValueError("Pattern %s matched empty URL at %s." %
                                     (search.pattern, url))
                out.debug(u'matched URL %r with pattern %s' %
                          (searchUrl, search.pattern))
                searchUrls.append(normaliseURL(urljoin(data[1], searchUrl)))
            if searchUrls:
                # do not search other links if one pattern matched
                break
        if not searchUrls:
            patterns = [x.pattern for x in searches]
            raise ValueError("Patterns %s not found at URL %s." %
                             (patterns, url))
        return searchUrls

    def fetchText(self, url, data, textSearch, optional):
        """Search text entry for given text pattern in an HTML page."""
        if textSearch:
            match = textSearch.search(data[0])
            if match:
                text = match.group(1)
                out.debug(u'matched text %r with pattern %s' %
                          (text, textSearch.pattern))
                return html.unescape(text).strip()
            if optional:
                return None
            else:
                raise ValueError("Pattern %s not found at URL %s." %
                                 (textSearch.pattern, url))
        else:
            return None


class _ParserScraper(Scraper):
    """
    Scraper base class that uses an HTML parser and XPath expressions.

    All links are resolved before XPath searches are applied, so all URLs are
    absolute!

    Subclasses of this class should use XPath expressions as values for
    prevSearch, imageSearch and textSearch. When the XPath directly selects an
    attribute, it is used as the output.

    All those searches try to do something intelligent when they match a
    complete HTML element: prevSearch and imageSearch try to find a "link
    attribute" and use that as URL. textSearch strips all tags from the content
    of the HTML element and returns that.
    """

    BROKEN_NOT_OPEN_TAGS = re.compile(r'(<+)([ =0-9])')

    # Taken directly from LXML
    XML_DECL = re.compile(
        r'^(<\?xml[^>]+)\s+encoding\s*=\s*["\'][^"\']*["\'](\s*\?>|)', re.U)

    NS = {
        "re": "http://exslt.org/regular-expressions"
    }

    # Switch between CSS and XPath selectors for this class. Since CSS needs
    # another Python module, XPath is the default for now.
    css = False

    # Activate a workaround for unescaped < characters on libxml versions older
    # than 2.9.3. This is disabled by default since most sites are not THAT
    # broken ;)
    broken_html_bugfix = False
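
    # If a site is easier to address with CSS selectors, a module can opt in
    # (sketch; the selectors and class name are made up for illustration):
    #
    #   class BarComic(_ParserScraper):
    #       css = True
    #       imageSearch = 'div#comic img'
    #       prevSearch = 'a.prev'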

    def getPage(self, url):
        page = super(_ParserScraper, self).getPage(url)
        if page.encoding:
            # Requests figured out the encoding, so we can deliver Unicode to
            # LXML. Unfortunately, LXML feels betrayed if there is still an XML
            # declaration with (probably wrong!) encoding at the top of the
            # document. Web browsers ignore such if the encoding was specified
            # in the HTTP header and so do we.
            text = self.XML_DECL.sub(r'\1\2', page.text, count=1)
            tree = self._parse_page(text)
        else:
            tree = self._parse_page(page.content)
        tree.make_links_absolute(url)
        return tree

    def _parse_page(self, data):
        if self.broken_html_bugfix and lxml.etree.LIBXML_VERSION < (2, 9, 3):
            def fix_not_open_tags(match):
                fix = (len(match.group(1)) * '&lt;') + match.group(2)
                out.warn("Found possibly broken HTML '%s', fixing as '%s'" % (
                    match.group(0), fix), level=2)
                return fix
            data = self.BROKEN_NOT_OPEN_TAGS.sub(fix_not_open_tags, data)

        tree = lxml.html.document_fromstring(data)
        return tree

    def fetchUrls(self, url, data, urlSearch):
        """Search all entries for given XPath in an HTML page."""
        searchUrls = []
        for match, search in self._matchPattern(data, urlSearch):
            searchUrl = None
            try:
                for attrib in html_link_attrs:
                    if attrib in match.attrib:
                        searchUrl = match.get(attrib)
            except AttributeError:
                searchUrl = str(match)
            out.debug(u'Matched URL %r with pattern %s' % (searchUrl, search))
            if searchUrl is not None:
                searchUrls.append(searchUrl)

        if not searchUrls:
            raise ValueError("XPath %s not found at URL %s." %
                             (urlSearch, url))
        return searchUrls

    def fetchText(self, url, data, textSearch, optional):
        """Search text entry for given text XPath in an HTML page."""
        if not textSearch:
            return None
        text = []
        for match, search in self._matchPattern(data, textSearch):
            try:
                text.append(match.text_content())
            except AttributeError:
                text.append(match)
            out.debug(u'Matched text %r with XPath %s' % (text, search))
        text = u' '.join(text)
        if text.strip() == '':
            if optional:
                return None
            else:
                raise ValueError("XPath %s did not match anything at URL %s." %
                                 (textSearch, url))
        return text.strip()

    def _matchPattern(self, data, patterns):
        if self.css:
            searchFun = data.cssselect
        else:
            def searchFun(s):
                return data.xpath(s, namespaces=self.NS)
        patterns = makeSequence(patterns)
        for search in patterns:
            matched = False
            for match in searchFun(search):
                matched = True
                yield match, search

            if matched and not self.multipleImagesPerStrip:
                # do not search other links if one pattern matched
                break

    def getDisabledReasons(self):
        res = {}
        if self.css and cssselect is None:
            res['css'] = (u"This module needs the cssselect " +
                          u"(python-cssselect) python module which is " +
                          u"not installed.")
        return res


def find_scrapers(comic, multiple_allowed=False):
    """Get a list of comic scraper objects.

    Can return more than one entry if multiple_allowed is True, else it raises
    a ValueError if multiple modules match. The match is a case-insensitive
    substring search.
    """
    if not comic:
        raise ValueError("empty comic name")
    candidates = []
    cname = comic.lower()
    for scrapers in get_scrapers(include_removed=True):
        lname = scrapers.name.lower()
        if lname == cname:
            # perfect match
            if not multiple_allowed:
                return [scrapers]
            else:
                candidates.append(scrapers)
        elif cname in lname and scrapers.url:
            candidates.append(scrapers)
    if len(candidates) > 1 and not multiple_allowed:
        comics = ", ".join(x.name for x in candidates)
        raise ValueError('multiple comics found: %s' % comics)
    elif not candidates:
        raise ValueError('comic %r not found' % comic)
    return candidates
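
# A hedged usage sketch (which names actually resolve depends on the installed
# plugin modules):
#
#   find_scrapers('xkcd')                        # exact, case-insensitive match
#   find_scrapers('land', multiple_allowed=True) # every module containing 'land'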


_scrapers = None


def get_scrapers(include_removed=False):
    """Find all comic scraper classes in the plugins directory.
    The result is cached.
    @return: list of Scraper classes
    @rtype: list of Scraper
    """
    global _scrapers
    if _scrapers is None:
        out.debug(u"Loading comic modules...")
        modules = loader.get_modules('plugins')
        plugins = list(loader.get_plugins(modules, Scraper))
        _scrapers = sorted([m for x in plugins for m in x.getmodules()],
                           key=lambda p: p.name)
        check_scrapers()
        out.debug(u"... %d modules loaded from %d classes." % (
            len(_scrapers), len(plugins)))
    if include_removed:
        return _scrapers
    else:
        return [x for x in _scrapers if x.url]
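
# Note: get_scrapers() only returns modules that still have a url set;
# get_scrapers(include_removed=True) also includes retired modules whose url
# attribute is unset.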


def check_scrapers():
    """Check for duplicate scraper names."""
    d = {}
    for scraper in _scrapers:
        name = scraper.name.lower()
        if name in d:
            name1 = scraper.name
            name2 = d[name].name
            raise ValueError('duplicate scrapers %s and %s found' %
                             (name1, name2))
        d[name] = scraper