Refactor: Move regular expression scraping into a new class.
- This also makes "<base href>" handling an internal detail of the regular expression scraper, future scrapers might not need that or handle it in another way.
This commit is contained in:
parent
3a929ceea6
commit
4265053846
2 changed files with 80 additions and 28 deletions
|
@ -421,7 +421,7 @@ class CyanideAndHappiness(_BasicScraper):
|
||||||
|
|
||||||
def shouldSkipUrl(self, url, data):
|
def shouldSkipUrl(self, url, data):
|
||||||
"""Skip pages without images."""
|
"""Skip pages without images."""
|
||||||
return "/comics/play-button.png" in data
|
return "/comics/play-button.png" in data[0]
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def namer(cls, imageUrl, pageUrl):
|
def namer(cls, imageUrl, pageUrl):
|
||||||
|
|
|
@ -5,9 +5,8 @@ import requests
|
||||||
import time
|
import time
|
||||||
import random
|
import random
|
||||||
import os
|
import os
|
||||||
from . import loader, configuration
|
from . import loader, configuration, util
|
||||||
from .util import (fetchUrl, fetchUrls, fetchText, getPageContent,
|
from .util import (makeSequence, get_system_uid, urlopen, getDirname)
|
||||||
makeSequence, get_system_uid, urlopen, getDirname, unescape)
|
|
||||||
from .comic import ComicStrip
|
from .comic import ComicStrip
|
||||||
from .output import out
|
from .output import out
|
||||||
from .events import getHandler
|
from .events import getHandler
|
||||||
|
@ -26,8 +25,8 @@ class Genre:
|
||||||
other = u"Other"
|
other = u"Other"
|
||||||
|
|
||||||
|
|
||||||
class _BasicScraper(object):
|
class Scraper(object):
|
||||||
'''Base class with scrape functions for comics.'''
|
'''Base class for all comic scraper, but without a specific scrape implementation.'''
|
||||||
|
|
||||||
# The URL for the comic strip
|
# The URL for the comic strip
|
||||||
url = None
|
url = None
|
||||||
|
@ -59,15 +58,15 @@ class _BasicScraper(object):
|
||||||
# list of genres for this comic strip
|
# list of genres for this comic strip
|
||||||
genres = (Genre.other,)
|
genres = (Genre.other,)
|
||||||
|
|
||||||
# compiled regular expression that will locate the URL for the previous strip in a page
|
# an expression that will locate the URL for the previous strip in a page
|
||||||
# this can also be a list or tuple of compiled regular expressions
|
# this can also be a list or tuple
|
||||||
prevSearch = None
|
prevSearch = None
|
||||||
|
|
||||||
# compiled regular expression that will locate the strip image URLs strip in a page
|
# an expression that will locate the strip image URLs strip in a page
|
||||||
# this can also be a list or tuple of compiled regular expressions
|
# this can also be a list or tuple
|
||||||
imageSearch = None
|
imageSearch = None
|
||||||
|
|
||||||
# compiled regular expression to store a text together with the image
|
# an expression to store a text together with the image
|
||||||
# sometimes comic strips have additional text info for each comic
|
# sometimes comic strips have additional text info for each comic
|
||||||
textSearch = None
|
textSearch = None
|
||||||
|
|
||||||
|
@ -94,7 +93,7 @@ class _BasicScraper(object):
|
||||||
|
|
||||||
def __cmp__(self, other):
|
def __cmp__(self, other):
|
||||||
"""Compare scraper by name and index list."""
|
"""Compare scraper by name and index list."""
|
||||||
if not isinstance(other, _BasicScraper):
|
if not isinstance(other, Scraper):
|
||||||
return 1
|
return 1
|
||||||
# first, order by name
|
# first, order by name
|
||||||
d = cmp(self.getName(), other.getName())
|
d = cmp(self.getName(), other.getName())
|
||||||
|
@ -111,26 +110,24 @@ class _BasicScraper(object):
|
||||||
"""Determine if search for images in given URL should be skipped."""
|
"""Determine if search for images in given URL should be skipped."""
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def getComicStrip(self, url, data, baseUrl):
|
def getComicStrip(self, url, data):
|
||||||
"""Get comic strip downloader for given URL and data."""
|
"""Get comic strip downloader for given URL and data."""
|
||||||
imageUrls = fetchUrls(url, data, baseUrl, self.imageSearch)
|
imageUrls = self.fetchUrls(url, data, self.imageSearch)
|
||||||
# map modifier function on image URLs
|
# map modifier function on image URLs
|
||||||
imageUrls = [self.imageUrlModifier(x, data) for x in imageUrls]
|
imageUrls = [self.imageUrlModifier(x, data) for x in imageUrls]
|
||||||
# remove duplicate URLs
|
# remove duplicate URLs
|
||||||
imageUrls = set(imageUrls)
|
imageUrls = set(imageUrls)
|
||||||
if len(imageUrls) > 1 and not self.multipleImagesPerStrip:
|
if len(imageUrls) > 1 and not self.multipleImagesPerStrip:
|
||||||
patterns = [x.pattern for x in makeSequence(self.imageSearch)]
|
patterns = [x.pattern for x in makeSequence(self.imageSearch)]
|
||||||
out.warn(u"found %d images instead of 1 at %s with patterns %s" % (len(imageUrls), url, patterns))
|
out.warn(u"found %d images instead of 1 at %s with expressions %s" % (len(imageUrls), url, patterns))
|
||||||
image = sorted(imageUrls)[0]
|
image = sorted(imageUrls)[0]
|
||||||
out.warn(u"choosing image %s" % image)
|
out.warn(u"choosing image %s" % image)
|
||||||
imageUrls = (image,)
|
imageUrls = (image,)
|
||||||
elif not imageUrls:
|
elif not imageUrls:
|
||||||
patterns = [x.pattern for x in makeSequence(self.imageSearch)]
|
patterns = [x.pattern for x in makeSequence(self.imageSearch)]
|
||||||
out.warn(u"found no images at %s with patterns %s" % (url, patterns))
|
out.warn(u"found no images at %s with expressions %s" % (url, patterns))
|
||||||
if self.textSearch:
|
if self.textSearch:
|
||||||
text = fetchText(url, data, self.textSearch, optional=self.textOptional)
|
text = self.fetchText(url, data, self.textSearch, optional=self.textOptional)
|
||||||
if text:
|
|
||||||
text = unescape(text).strip()
|
|
||||||
else:
|
else:
|
||||||
text = None
|
text = None
|
||||||
return ComicStrip(self.getName(), url, imageUrls, self.namer, self.session, text=text)
|
return ComicStrip(self.getName(), url, imageUrls, self.namer, self.session, text=text)
|
||||||
|
@ -167,13 +164,13 @@ class _BasicScraper(object):
|
||||||
seen_urls = set()
|
seen_urls = set()
|
||||||
while url:
|
while url:
|
||||||
out.info(u'Get strip URL %s' % url, level=1)
|
out.info(u'Get strip URL %s' % url, level=1)
|
||||||
data, baseUrl = getPageContent(url, self.session)
|
data = self.getPage(url)
|
||||||
if self.shouldSkipUrl(url, data):
|
if self.shouldSkipUrl(url, data):
|
||||||
out.info(u'Skipping URL %s' % url)
|
out.info(u'Skipping URL %s' % url)
|
||||||
self.skippedUrls.add(url)
|
self.skippedUrls.add(url)
|
||||||
else:
|
else:
|
||||||
try:
|
try:
|
||||||
yield self.getComicStrip(url, data, baseUrl)
|
yield self.getComicStrip(url, data)
|
||||||
except ValueError as msg:
|
except ValueError as msg:
|
||||||
# image not found
|
# image not found
|
||||||
out.exception(msg)
|
out.exception(msg)
|
||||||
|
@ -185,7 +182,7 @@ class _BasicScraper(object):
|
||||||
maxstrips -= 1
|
maxstrips -= 1
|
||||||
if maxstrips <= 0:
|
if maxstrips <= 0:
|
||||||
break
|
break
|
||||||
prevUrl = self.getPrevUrl(url, data, baseUrl)
|
prevUrl = self.getPrevUrl(url, data)
|
||||||
seen_urls.add(url)
|
seen_urls.add(url)
|
||||||
if prevUrl in seen_urls:
|
if prevUrl in seen_urls:
|
||||||
# avoid recursive URL loops
|
# avoid recursive URL loops
|
||||||
|
@ -196,18 +193,18 @@ class _BasicScraper(object):
|
||||||
# wait up to 2 seconds for next URL
|
# wait up to 2 seconds for next URL
|
||||||
time.sleep(1.0 + random.random())
|
time.sleep(1.0 + random.random())
|
||||||
|
|
||||||
def getPrevUrl(self, url, data, baseUrl):
|
def getPrevUrl(self, url, data):
|
||||||
"""Find previous URL."""
|
"""Find previous URL."""
|
||||||
prevUrl = None
|
prevUrl = None
|
||||||
if self.prevSearch:
|
if self.prevSearch:
|
||||||
try:
|
try:
|
||||||
prevUrl = fetchUrl(url, data, baseUrl, self.prevSearch)
|
prevUrl = self.fetchUrl(url, data, self.prevSearch)
|
||||||
except ValueError as msg:
|
except ValueError as msg:
|
||||||
# assume there is no previous URL, but print a warning
|
# assume there is no previous URL, but print a warning
|
||||||
out.warn(u"%s Assuming no previous comic strips exist." % msg)
|
out.warn(u"%s Assuming no previous comic strips exist." % msg)
|
||||||
else:
|
else:
|
||||||
prevUrl = self.prevUrlModifier(prevUrl)
|
prevUrl = self.prevUrlModifier(prevUrl)
|
||||||
out.debug(u"Matched previous URL %s" % prevUrl)
|
out.debug(u"Found previous URL %s" % prevUrl)
|
||||||
getHandler().comicPageLink(self.getName(), url, prevUrl)
|
getHandler().comicPageLink(self.getName(), url, prevUrl)
|
||||||
return prevUrl
|
return prevUrl
|
||||||
|
|
||||||
|
@ -278,6 +275,61 @@ class _BasicScraper(object):
|
||||||
with open(filename, 'w') as f:
|
with open(filename, 'w') as f:
|
||||||
f.write('All comics should be downloaded here.')
|
f.write('All comics should be downloaded here.')
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def getPage(cls, url):
|
||||||
|
"""
|
||||||
|
Fetch a page and return the opaque representation for the data parameter
|
||||||
|
of fetchUrls and fetchText.
|
||||||
|
|
||||||
|
Implementation notes: While this base class does not restrict how the
|
||||||
|
returned data is structured, subclasses (specific scrapers) should specify
|
||||||
|
how this data works, since the structure is passed into different methods
|
||||||
|
which can be defined by comic modules and these methods should be able to
|
||||||
|
use the data if they so desire... (Affected methods: shouldSkipUrl,
|
||||||
|
imageUrlModifier)
|
||||||
|
"""
|
||||||
|
raise ValueError("No implementation for getPage!")
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def fetchUrls(cls, url, data, urlSearch):
|
||||||
|
raise ValueError("No implementation for fetchUrls!")
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def fetchUrl(cls, url, data, urlSearch):
|
||||||
|
return cls.fetchUrls(url, data, urlSearch)[0]
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def fetchText(cls, url, data, textSearch, optional):
|
||||||
|
raise ValueError("No implementation for fetchText!")
|
||||||
|
|
||||||
|
|
||||||
|
class _BasicScraper(Scraper):
|
||||||
|
"""
|
||||||
|
Scraper base class that matches regular expressions against HTML pages.
|
||||||
|
|
||||||
|
Subclasses of this scraper should use compiled regular expressions as
|
||||||
|
values for prevSearch, imageSearch and textSearch.
|
||||||
|
|
||||||
|
Implementation note: The return value of getPage is a tuple: the first
|
||||||
|
element is the raw HTML page text, the second element is the base URL (if
|
||||||
|
any).
|
||||||
|
"""
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def getPage(cls, url):
|
||||||
|
content, baseUrl = util.getPageContent(url, cls.session)
|
||||||
|
return (content, baseUrl)
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def fetchUrls(cls, url, data, urlSearch):
|
||||||
|
"""Search all entries for given URL pattern(s) in a HTML page."""
|
||||||
|
return util.fetchUrls(url, data[0], data[1], urlSearch)
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def fetchText(cls, url, data, textSearch, optional):
|
||||||
|
"""Search text entry for given text pattern in a HTML page."""
|
||||||
|
return util.fetchText(url, data[0], textSearch, optional)
|
||||||
|
|
||||||
|
|
||||||
def find_scraperclasses(comic, multiple_allowed=False):
|
def find_scraperclasses(comic, multiple_allowed=False):
|
||||||
"""Get a list comic scraper classes. Can return more than one entries if
|
"""Get a list comic scraper classes. Can return more than one entries if
|
||||||
|
@ -309,14 +361,14 @@ _scraperclasses = None
|
||||||
def get_scraperclasses():
|
def get_scraperclasses():
|
||||||
"""Find all comic scraper classes in the plugins directory.
|
"""Find all comic scraper classes in the plugins directory.
|
||||||
The result is cached.
|
The result is cached.
|
||||||
@return: list of _BasicScraper classes
|
@return: list of Scraper classes
|
||||||
@rtype: list of _BasicScraper
|
@rtype: list of Scraper
|
||||||
"""
|
"""
|
||||||
global _scraperclasses
|
global _scraperclasses
|
||||||
if _scraperclasses is None:
|
if _scraperclasses is None:
|
||||||
out.debug(u"Loading comic modules...")
|
out.debug(u"Loading comic modules...")
|
||||||
modules = loader.get_modules('plugins')
|
modules = loader.get_modules('plugins')
|
||||||
plugins = loader.get_plugins(modules, _BasicScraper)
|
plugins = loader.get_plugins(modules, Scraper)
|
||||||
_scraperclasses = list(plugins)
|
_scraperclasses = list(plugins)
|
||||||
check_scrapers()
|
check_scrapers()
|
||||||
out.debug(u"... %d modules loaded." % len(_scraperclasses))
|
out.debug(u"... %d modules loaded." % len(_scraperclasses))
|
||||||
|
|
Loading…
Reference in a new issue