Add text search in comic strips.

This commit is contained in:
Bastian Kleineidam 2013-11-29 20:26:49 +01:00
parent 468b34034b
commit 0eaf9a3139
3 changed files with 35 additions and 7 deletions

View file

@ -2,6 +2,7 @@
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2013 Bastian Kleineidam # Copyright (C) 2012-2013 Bastian Kleineidam
import os import os
import codecs
from .output import out from .output import out
from .util import getImageObject, normaliseURL, unquote, strsize, getDirname, getFilename from .util import getImageObject, normaliseURL, unquote, strsize, getDirname, getFilename
@ -10,13 +11,14 @@ from .events import getHandler
class ComicStrip(object): class ComicStrip(object):
"""A list of comic image URLs.""" """A list of comic image URLs."""
def __init__(self, name, stripUrl, imageUrls, namer, session): def __init__(self, name, stripUrl, imageUrls, namer, session, text=None):
"""Store the image URL list.""" """Store the image URL list."""
self.name = name self.name = name
self.stripUrl = stripUrl self.stripUrl = stripUrl
self.imageUrls = imageUrls self.imageUrls = imageUrls
self.namer = namer self.namer = namer
self.session = session self.session = session
self.text = text
def getImages(self): def getImages(self):
"""Get a list of image downloaders.""" """Get a list of image downloaders."""
@ -29,7 +31,7 @@ class ComicStrip(object):
if filename is None: if filename is None:
filename = url.rsplit('/', 1)[1] filename = url.rsplit('/', 1)[1]
dirname = getDirname(self.name) dirname = getDirname(self.name)
return ComicImage(self.name, url, self.stripUrl, dirname, filename, self.session) return ComicImage(self.name, url, self.stripUrl, dirname, filename, self.session, text=self.text)
class ComicImage(object): class ComicImage(object):
@ -37,7 +39,7 @@ class ComicImage(object):
ChunkBytes = 1024 * 100 # 100KB ChunkBytes = 1024 * 100 # 100KB
def __init__(self, name, url, referrer, dirname, filename, session): def __init__(self, name, url, referrer, dirname, filename, session, text=None):
"""Set URL and filename.""" """Set URL and filename."""
self.name = name self.name = name
self.referrer = referrer self.referrer = referrer
@ -46,6 +48,7 @@ class ComicImage(object):
filename = getFilename(filename) filename = getFilename(filename)
self.filename, self.ext = os.path.splitext(filename) self.filename, self.ext = os.path.splitext(filename)
self.session = session self.session = session
self.text = text
def connect(self): def connect(self):
"""Connect to host and get meta information.""" """Connect to host and get meta information."""
@ -99,4 +102,9 @@ class ComicImage(object):
else: else:
out.info(u"Saved %s (%s)." % (fn, strsize(size))) out.info(u"Saved %s (%s)." % (fn, strsize(size)))
getHandler().comicDownloaded(self, fn) getHandler().comicDownloaded(self, fn)
if self.text:
fntext = os.path.join(comicDir, "%s.txt" % self.filename)
out.debug(u'Writing comic text to file %s...' % fntext)
with codecs.open(fntext, 'w', 'utf-8') as textOut:
textOut.write(self.text)
return fn, True return fn, True

View file

@ -5,8 +5,8 @@ import requests
import time import time
import os import os
from . import loader, configuration from . import loader, configuration
from .util import (fetchUrl, fetchUrls, getPageContent, makeSequence, from .util import (fetchUrl, fetchUrls, fetchText, getPageContent,
get_system_uid, urlopen, getDirname) makeSequence, get_system_uid, urlopen, getDirname, unescape)
from .comic import ComicStrip from .comic import ComicStrip
from .output import out from .output import out
from .events import getHandler from .events import getHandler
@ -66,6 +66,10 @@ class _BasicScraper(object):
# this can also be a list or tuple of compiled regular expressions # this can also be a list or tuple of compiled regular expressions
imageSearch = None imageSearch = None
# compiled regular expression to find text that is stored together with the image
# (some comic strips provide additional text info for each comic)
textSearch = None
# usually the index format help # usually the index format help
help = '' help = ''
@ -117,7 +121,13 @@ class _BasicScraper(object):
elif not imageUrls: elif not imageUrls:
patterns = [x.pattern for x in makeSequence(self.imageSearch)] patterns = [x.pattern for x in makeSequence(self.imageSearch)]
out.warn(u"found no images at %s with patterns %s" % (url, patterns)) out.warn(u"found no images at %s with patterns %s" % (url, patterns))
return ComicStrip(self.getName(), url, imageUrls, self.namer, self.session) if self.textSearch:
text = fetchText(url, data, self.textSearch)
if text:
text = unescape(text)
else:
text = None
return ComicStrip(self.getName(), url, imageUrls, self.namer, self.session, text=text)
def getStrips(self, maxstrips=None): def getStrips(self, maxstrips=None):
"""Get comic strips.""" """Get comic strips."""

View file

@ -246,10 +246,20 @@ def fetchUrls(url, data, baseUrl, urlSearch):
def fetchUrl(url, data, baseUrl, urlSearch):
    """Search first URL entry for given URL pattern in a HTML page.

    All arguments are passed unchanged to fetchUrls(); the first element
    of its result is returned.
    """
    return fetchUrls(url, data, baseUrl, urlSearch)[0]
def fetchText(url, data, textSearch):
    """Search text entry for given text pattern in a HTML page.

    url is only used for the error message, data is the page content
    that is searched, and textSearch is a compiled regular expression
    whose first group captures the wanted text.

    Returns the content of group 1 of the first match.
    Raises ValueError if the pattern does not match the page content.
    """
    match = textSearch.search(data)
    if not match:
        # Fail loudly so a broken text pattern is noticed by the caller.
        raise ValueError("Pattern %s not found at URL %s." % (textSearch.pattern, url))
    text = match.group(1)
    out.debug(u'matched text %r with pattern %s' % (text, textSearch.pattern))
    return text
_htmlparser = HTMLParser() _htmlparser = HTMLParser()
def unescape(text): def unescape(text):
"""Replace HTML entities and character references.""" """Replace HTML entities and character references."""