Add text search in comic strips.

This commit is contained in:
Bastian Kleineidam 2013-11-29 20:26:49 +01:00
parent 468b34034b
commit 0eaf9a3139
3 changed files with 35 additions and 7 deletions

View file

@ -2,6 +2,7 @@
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2013 Bastian Kleineidam
import os
import codecs
from .output import out
from .util import getImageObject, normaliseURL, unquote, strsize, getDirname, getFilename
@ -10,13 +11,14 @@ from .events import getHandler
class ComicStrip(object):
"""A list of comic image URLs."""
def __init__(self, name, stripUrl, imageUrls, namer, session, text=None):
    """Store the strip name, URLs, namer, session and optional text.

    @param name: name of the comic
    @param stripUrl: URL of the page the strip was found on
    @param imageUrls: image URLs belonging to this strip
    @param namer: stored as-is on the instance
    @param session: stored as-is on the instance
    @param text: optional text accompanying the strip; defaults to None
        so callers using the previous five-argument form keep working
    """
    self.name = name
    self.stripUrl = stripUrl
    self.imageUrls = imageUrls
    self.namer = namer
    self.session = session
    self.text = text
def getImages(self):
"""Get a list of image downloaders."""
@ -29,7 +31,7 @@ class ComicStrip(object):
if filename is None:
filename = url.rsplit('/', 1)[1]
dirname = getDirname(self.name)
return ComicImage(self.name, url, self.stripUrl, dirname, filename, self.session)
return ComicImage(self.name, url, self.stripUrl, dirname, filename, self.session, text=self.text)
class ComicImage(object):
@ -37,7 +39,7 @@ class ComicImage(object):
ChunkBytes = 1024 * 100 # 100KB
def __init__(self, name, url, referrer, dirname, filename, session):
def __init__(self, name, url, referrer, dirname, filename, session, text=None):
"""Set URL and filename."""
self.name = name
self.referrer = referrer
@ -46,6 +48,7 @@ class ComicImage(object):
filename = getFilename(filename)
self.filename, self.ext = os.path.splitext(filename)
self.session = session
self.text = text
def connect(self):
"""Connect to host and get meta information."""
@ -99,4 +102,9 @@ class ComicImage(object):
else:
out.info(u"Saved %s (%s)." % (fn, strsize(size)))
getHandler().comicDownloaded(self, fn)
if self.text:
fntext = os.path.join(comicDir, "%s.txt" % self.filename)
out.debug(u'Writing comic text to file %s...' % fntext)
with codecs.open(fntext, 'w', 'utf-8') as textOut:
textOut.write(self.text)
return fn, True

View file

@ -5,8 +5,8 @@ import requests
import time
import os
from . import loader, configuration
from .util import (fetchUrl, fetchUrls, getPageContent, makeSequence,
get_system_uid, urlopen, getDirname)
from .util import (fetchUrl, fetchUrls, fetchText, getPageContent,
makeSequence, get_system_uid, urlopen, getDirname, unescape)
from .comic import ComicStrip
from .output import out
from .events import getHandler
@ -66,6 +66,10 @@ class _BasicScraper(object):
# this can also be a list or tuple of compiled regular expressions
imageSearch = None
# compiled regular expression whose first group captures text to store
# together with the image; some comic strips have additional text info
# for each comic
textSearch = None
# usually the index format help
help = ''
@ -117,7 +121,13 @@ class _BasicScraper(object):
elif not imageUrls:
patterns = [x.pattern for x in makeSequence(self.imageSearch)]
out.warn(u"found no images at %s with patterns %s" % (url, patterns))
return ComicStrip(self.getName(), url, imageUrls, self.namer, self.session)
if self.textSearch:
text = fetchText(url, data, self.textSearch)
if text:
text = unescape(text)
else:
text = None
return ComicStrip(self.getName(), url, imageUrls, self.namer, self.session, text=text)
def getStrips(self, maxstrips=None):
"""Get comic strips."""

View file

@ -246,10 +246,20 @@ def fetchUrls(url, data, baseUrl, urlSearch):
def fetchUrl(url, data, baseUrl, urlSearch):
    """Search first URL entry for given URL pattern in a HTML page.

    Passes all arguments through to fetchUrls() and returns the first
    element of its result.
    """
    return fetchUrls(url, data, baseUrl, urlSearch)[0]
def fetchText(url, data, textSearch):
    """Search text entry for given text pattern in a HTML page.

    @param url: URL the page content came from; only used in the error
        message
    @param data: page content to search
    @param textSearch: compiled regular expression; its first group is
        the text to return
    @return: the text captured by group 1 of the first match
    @raise ValueError: if the pattern does not match anywhere in data
    """
    match = textSearch.search(data)
    if match is None:
        # Fail loudly so a broken pattern is noticed by the caller.
        raise ValueError("Pattern %s not found at URL %s." % (textSearch.pattern, url))
    text = match.group(1)
    out.debug(u'matched text %r with pattern %s' % (text, textSearch.pattern))
    return text
# Module-level HTMLParser instance, created once at import time.
# NOTE(review): presumably used by unescape() below -- confirm.
_htmlparser = HTMLParser()
def unescape(text):
"""Replace HTML entities and character references."""