Add text search in comic strips.
This commit is contained in:
parent
468b34034b
commit
0eaf9a3139
3 changed files with 35 additions and 7 deletions
|
@ -2,6 +2,7 @@
|
|||
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
||||
# Copyright (C) 2012-2013 Bastian Kleineidam
|
||||
import os
|
||||
import codecs
|
||||
|
||||
from .output import out
|
||||
from .util import getImageObject, normaliseURL, unquote, strsize, getDirname, getFilename
|
||||
|
@ -10,13 +11,14 @@ from .events import getHandler
|
|||
class ComicStrip(object):
|
||||
"""A list of comic image URLs."""
|
||||
|
||||
def __init__(self, name, stripUrl, imageUrls, namer, session):
|
||||
def __init__(self, name, stripUrl, imageUrls, namer, session, text=None):
|
||||
"""Store the image URL list."""
|
||||
self.name = name
|
||||
self.stripUrl = stripUrl
|
||||
self.imageUrls = imageUrls
|
||||
self.namer = namer
|
||||
self.session = session
|
||||
self.text = text
|
||||
|
||||
def getImages(self):
|
||||
"""Get a list of image downloaders."""
|
||||
|
@ -29,7 +31,7 @@ class ComicStrip(object):
|
|||
if filename is None:
|
||||
filename = url.rsplit('/', 1)[1]
|
||||
dirname = getDirname(self.name)
|
||||
return ComicImage(self.name, url, self.stripUrl, dirname, filename, self.session)
|
||||
return ComicImage(self.name, url, self.stripUrl, dirname, filename, self.session, text=self.text)
|
||||
|
||||
|
||||
class ComicImage(object):
|
||||
|
@ -37,7 +39,7 @@ class ComicImage(object):
|
|||
|
||||
ChunkBytes = 1024 * 100 # 100KB
|
||||
|
||||
def __init__(self, name, url, referrer, dirname, filename, session):
|
||||
def __init__(self, name, url, referrer, dirname, filename, session, text=None):
|
||||
"""Set URL and filename."""
|
||||
self.name = name
|
||||
self.referrer = referrer
|
||||
|
@ -46,6 +48,7 @@ class ComicImage(object):
|
|||
filename = getFilename(filename)
|
||||
self.filename, self.ext = os.path.splitext(filename)
|
||||
self.session = session
|
||||
self.text = text
|
||||
|
||||
def connect(self):
|
||||
"""Connect to host and get meta information."""
|
||||
|
@ -99,4 +102,9 @@ class ComicImage(object):
|
|||
else:
|
||||
out.info(u"Saved %s (%s)." % (fn, strsize(size)))
|
||||
getHandler().comicDownloaded(self, fn)
|
||||
if self.text:
|
||||
fntext = os.path.join(comicDir, "%s.txt" % self.filename)
|
||||
out.debug(u'Writing comic text to file %s...' % fntext)
|
||||
with codecs.open(fntext, 'w', 'utf-8') as textOut:
|
||||
textOut.write(self.text)
|
||||
return fn, True
|
||||
|
|
|
@ -5,8 +5,8 @@ import requests
|
|||
import time
|
||||
import os
|
||||
from . import loader, configuration
|
||||
from .util import (fetchUrl, fetchUrls, getPageContent, makeSequence,
|
||||
get_system_uid, urlopen, getDirname)
|
||||
from .util import (fetchUrl, fetchUrls, fetchText, getPageContent,
|
||||
makeSequence, get_system_uid, urlopen, getDirname, unescape)
|
||||
from .comic import ComicStrip
|
||||
from .output import out
|
||||
from .events import getHandler
|
||||
|
@ -66,6 +66,10 @@ class _BasicScraper(object):
|
|||
# this can also be a list or tuple of compiled regular expressions
|
||||
imageSearch = None
|
||||
|
||||
# compiled regular expression whose first group captures text to store together with the image
|
||||
# sometimes comic strips have additional text info for each comic
|
||||
textSearch = None
|
||||
|
||||
# usually the index format help
|
||||
help = ''
|
||||
|
||||
|
@ -117,7 +121,13 @@ class _BasicScraper(object):
|
|||
elif not imageUrls:
|
||||
patterns = [x.pattern for x in makeSequence(self.imageSearch)]
|
||||
out.warn(u"found no images at %s with patterns %s" % (url, patterns))
|
||||
return ComicStrip(self.getName(), url, imageUrls, self.namer, self.session)
|
||||
if self.textSearch:
|
||||
text = fetchText(url, data, self.textSearch)
|
||||
if text:
|
||||
text = unescape(text)
|
||||
else:
|
||||
text = None
|
||||
return ComicStrip(self.getName(), url, imageUrls, self.namer, self.session, text=text)
|
||||
|
||||
def getStrips(self, maxstrips=None):
|
||||
"""Get comic strips."""
|
||||
|
|
|
@ -246,10 +246,20 @@ def fetchUrls(url, data, baseUrl, urlSearch):
|
|||
|
||||
|
||||
def fetchUrl(url, data, baseUrl, urlSearch):
    """Return the first entry that fetchUrls() yields for the URL pattern.

    All arguments are passed through unchanged to fetchUrls(); only the
    element at index 0 of its result is returned.
    """
    foundUrls = fetchUrls(url, data, baseUrl, urlSearch)
    return foundUrls[0]
|
||||
|
||||
|
||||
def fetchText(url, data, textSearch):
    """Return the text captured by the first group of textSearch in data.

    @param url: page URL, used only in the error message
    @param data: page content that is searched
    @param textSearch: compiled regular expression with at least one group
    @return: content of the first capture group of the first match
    @raises ValueError: if the pattern does not match anywhere in data
    """
    found = textSearch.search(data)
    if found is None:
        # Fail loudly so a non-matching pattern is noticed by the caller.
        raise ValueError("Pattern %s not found at URL %s." % (textSearch.pattern, url))
    captured = found.group(1)
    out.debug(u'matched text %r with pattern %s' % (captured, textSearch.pattern))
    return captured
|
||||
|
||||
|
||||
_htmlparser = HTMLParser()
|
||||
def unescape(text):
|
||||
"""Replace HTML entities and character references."""
|
||||
|
|
Loading…
Reference in a new issue