From 0eaf9a3139681fa6b91e1ce76bfd56fe18cf2384 Mon Sep 17 00:00:00 2001 From: Bastian Kleineidam Date: Fri, 29 Nov 2013 20:26:49 +0100 Subject: [PATCH] Add text search in comic strips. --- dosagelib/comic.py | 14 +++++++++++--- dosagelib/scraper.py | 16 +++++++++++++--- dosagelib/util.py | 12 +++++++++++- 3 files changed, 35 insertions(+), 7 deletions(-) diff --git a/dosagelib/comic.py b/dosagelib/comic.py index 96bcbae67..4a18348c6 100644 --- a/dosagelib/comic.py +++ b/dosagelib/comic.py @@ -2,6 +2,7 @@ # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2012-2013 Bastian Kleineidam import os +import codecs from .output import out from .util import getImageObject, normaliseURL, unquote, strsize, getDirname, getFilename @@ -10,13 +11,14 @@ from .events import getHandler class ComicStrip(object): """A list of comic image URLs.""" - def __init__(self, name, stripUrl, imageUrls, namer, session): + def __init__(self, name, stripUrl, imageUrls, namer, session, text=None): """Store the image URL list.""" self.name = name self.stripUrl = stripUrl self.imageUrls = imageUrls self.namer = namer self.session = session + self.text = text def getImages(self): """Get a list of image downloaders.""" @@ -29,7 +31,7 @@ class ComicStrip(object): if filename is None: filename = url.rsplit('/', 1)[1] dirname = getDirname(self.name) - return ComicImage(self.name, url, self.stripUrl, dirname, filename, self.session) + return ComicImage(self.name, url, self.stripUrl, dirname, filename, self.session, text=self.text) class ComicImage(object): @@ -37,7 +39,7 @@ class ComicImage(object): ChunkBytes = 1024 * 100 # 100KB - def __init__(self, name, url, referrer, dirname, filename, session): + def __init__(self, name, url, referrer, dirname, filename, session, text=None): """Set URL and filename.""" self.name = name self.referrer = referrer @@ -46,6 +48,7 @@ class ComicImage(object): filename = getFilename(filename) self.filename, self.ext = os.path.splitext(filename) 
self.session = session + self.text = text def connect(self): """Connect to host and get meta information.""" @@ -99,4 +102,9 @@ class ComicImage(object): else: out.info(u"Saved %s (%s)." % (fn, strsize(size))) getHandler().comicDownloaded(self, fn) + if self.text: + fntext = os.path.join(comicDir, "%s.txt" % self.filename) + out.debug(u'Writing comic text to file %s...' % fntext) + with codecs.open(fntext, 'w', 'utf-8') as textOut: + textOut.write(self.text) return fn, True diff --git a/dosagelib/scraper.py b/dosagelib/scraper.py index 533d706fe..234db1b8f 100644 --- a/dosagelib/scraper.py +++ b/dosagelib/scraper.py @@ -5,8 +5,8 @@ import requests import time import os from . import loader, configuration -from .util import (fetchUrl, fetchUrls, getPageContent, makeSequence, - get_system_uid, urlopen, getDirname) +from .util import (fetchUrl, fetchUrls, fetchText, getPageContent, + makeSequence, get_system_uid, urlopen, getDirname, unescape) from .comic import ComicStrip from .output import out from .events import getHandler @@ -66,6 +66,10 @@ class _BasicScraper(object): # this can also be a list or tuple of compiled regular expressions imageSearch = None + # compiled regular expression to store a text together with the image + # sometimes comic strips have additional text info for each comic + textSearch = None + # usually the index format help help = '' @@ -117,7 +121,13 @@ class _BasicScraper(object): elif not imageUrls: patterns = [x.pattern for x in makeSequence(self.imageSearch)] out.warn(u"found no images at %s with patterns %s" % (url, patterns)) - return ComicStrip(self.getName(), url, imageUrls, self.namer, self.session) + if self.textSearch: + text = fetchText(url, data, self.textSearch) + if text: + text = unescape(text) + else: + text = None + return ComicStrip(self.getName(), url, imageUrls, self.namer, self.session, text=text) def getStrips(self, maxstrips=None): """Get comic strips.""" diff --git a/dosagelib/util.py b/dosagelib/util.py index 
a813f93e1..f208759f3 100644 --- a/dosagelib/util.py +++ b/dosagelib/util.py @@ -246,10 +246,20 @@ def fetchUrls(url, data, baseUrl, urlSearch): def fetchUrl(url, data, baseUrl, urlSearch): - """Search first entry for given URL pattern in a HTML page.""" + """Search first URL entry for given URL pattern in a HTML page.""" return fetchUrls(url, data, baseUrl, urlSearch)[0] +def fetchText(url, data, textSearch): + """Search text entry for given text pattern in a HTML page.""" + match = textSearch.search(data) + if match: + text = match.group(1) + out.debug(u'matched text %r with pattern %s' % (text, textSearch.pattern)) + return text + raise ValueError("Pattern %s not found at URL %s." % (textSearch.pattern, url)) + + _htmlparser = HTMLParser() def unescape(text): """Replace HTML entities and character references."""