Add text search in comic strips.
This commit is contained in:
parent
468b34034b
commit
0eaf9a3139
3 changed files with 35 additions and 7 deletions
dosagelib
|
@ -2,6 +2,7 @@
|
||||||
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
||||||
# Copyright (C) 2012-2013 Bastian Kleineidam
|
# Copyright (C) 2012-2013 Bastian Kleineidam
|
||||||
import os
|
import os
|
||||||
|
import codecs
|
||||||
|
|
||||||
from .output import out
|
from .output import out
|
||||||
from .util import getImageObject, normaliseURL, unquote, strsize, getDirname, getFilename
|
from .util import getImageObject, normaliseURL, unquote, strsize, getDirname, getFilename
|
||||||
|
@ -10,13 +11,14 @@ from .events import getHandler
|
||||||
class ComicStrip(object):
|
class ComicStrip(object):
|
||||||
"""A list of comic image URLs."""
|
"""A list of comic image URLs."""
|
||||||
|
|
||||||
def __init__(self, name, stripUrl, imageUrls, namer, session):
|
def __init__(self, name, stripUrl, imageUrls, namer, session, text=None):
|
||||||
"""Store the image URL list."""
|
"""Store the image URL list."""
|
||||||
self.name = name
|
self.name = name
|
||||||
self.stripUrl = stripUrl
|
self.stripUrl = stripUrl
|
||||||
self.imageUrls = imageUrls
|
self.imageUrls = imageUrls
|
||||||
self.namer = namer
|
self.namer = namer
|
||||||
self.session = session
|
self.session = session
|
||||||
|
self.text = text
|
||||||
|
|
||||||
def getImages(self):
|
def getImages(self):
|
||||||
"""Get a list of image downloaders."""
|
"""Get a list of image downloaders."""
|
||||||
|
@ -29,7 +31,7 @@ class ComicStrip(object):
|
||||||
if filename is None:
|
if filename is None:
|
||||||
filename = url.rsplit('/', 1)[1]
|
filename = url.rsplit('/', 1)[1]
|
||||||
dirname = getDirname(self.name)
|
dirname = getDirname(self.name)
|
||||||
return ComicImage(self.name, url, self.stripUrl, dirname, filename, self.session)
|
return ComicImage(self.name, url, self.stripUrl, dirname, filename, self.session, text=self.text)
|
||||||
|
|
||||||
|
|
||||||
class ComicImage(object):
|
class ComicImage(object):
|
||||||
|
@ -37,7 +39,7 @@ class ComicImage(object):
|
||||||
|
|
||||||
ChunkBytes = 1024 * 100 # 100KB
|
ChunkBytes = 1024 * 100 # 100KB
|
||||||
|
|
||||||
def __init__(self, name, url, referrer, dirname, filename, session):
|
def __init__(self, name, url, referrer, dirname, filename, session, text=None):
|
||||||
"""Set URL and filename."""
|
"""Set URL and filename."""
|
||||||
self.name = name
|
self.name = name
|
||||||
self.referrer = referrer
|
self.referrer = referrer
|
||||||
|
@ -46,6 +48,7 @@ class ComicImage(object):
|
||||||
filename = getFilename(filename)
|
filename = getFilename(filename)
|
||||||
self.filename, self.ext = os.path.splitext(filename)
|
self.filename, self.ext = os.path.splitext(filename)
|
||||||
self.session = session
|
self.session = session
|
||||||
|
self.text = text
|
||||||
|
|
||||||
def connect(self):
|
def connect(self):
|
||||||
"""Connect to host and get meta information."""
|
"""Connect to host and get meta information."""
|
||||||
|
@ -99,4 +102,9 @@ class ComicImage(object):
|
||||||
else:
|
else:
|
||||||
out.info(u"Saved %s (%s)." % (fn, strsize(size)))
|
out.info(u"Saved %s (%s)." % (fn, strsize(size)))
|
||||||
getHandler().comicDownloaded(self, fn)
|
getHandler().comicDownloaded(self, fn)
|
||||||
|
if self.text:
|
||||||
|
fntext = os.path.join(comicDir, "%s.txt" % self.filename)
|
||||||
|
out.debug(u'Writing comic text to file %s...' % fntext)
|
||||||
|
with codecs.open(fntext, 'w', 'utf-8') as textOut:
|
||||||
|
textOut.write(self.text)
|
||||||
return fn, True
|
return fn, True
|
||||||
|
|
|
@ -5,8 +5,8 @@ import requests
|
||||||
import time
|
import time
|
||||||
import os
|
import os
|
||||||
from . import loader, configuration
|
from . import loader, configuration
|
||||||
from .util import (fetchUrl, fetchUrls, getPageContent, makeSequence,
|
from .util import (fetchUrl, fetchUrls, fetchText, getPageContent,
|
||||||
get_system_uid, urlopen, getDirname)
|
makeSequence, get_system_uid, urlopen, getDirname, unescape)
|
||||||
from .comic import ComicStrip
|
from .comic import ComicStrip
|
||||||
from .output import out
|
from .output import out
|
||||||
from .events import getHandler
|
from .events import getHandler
|
||||||
|
@ -66,6 +66,10 @@ class _BasicScraper(object):
|
||||||
# this can also be a list or tuple of compiled regular expressions
|
# this can also be a list or tuple of compiled regular expressions
|
||||||
imageSearch = None
|
imageSearch = None
|
||||||
|
|
||||||
|
# compiled regular expression matching a text to store together with the image
|
||||||
|
# sometimes comic strips have additional text info for each comic
|
||||||
|
textSearch = None
|
||||||
|
|
||||||
# usually the index format help
|
# usually the index format help
|
||||||
help = ''
|
help = ''
|
||||||
|
|
||||||
|
@ -117,7 +121,13 @@ class _BasicScraper(object):
|
||||||
elif not imageUrls:
|
elif not imageUrls:
|
||||||
patterns = [x.pattern for x in makeSequence(self.imageSearch)]
|
patterns = [x.pattern for x in makeSequence(self.imageSearch)]
|
||||||
out.warn(u"found no images at %s with patterns %s" % (url, patterns))
|
out.warn(u"found no images at %s with patterns %s" % (url, patterns))
|
||||||
return ComicStrip(self.getName(), url, imageUrls, self.namer, self.session)
|
if self.textSearch:
|
||||||
|
text = fetchText(url, data, self.textSearch)
|
||||||
|
if text:
|
||||||
|
text = unescape(text)
|
||||||
|
else:
|
||||||
|
text = None
|
||||||
|
return ComicStrip(self.getName(), url, imageUrls, self.namer, self.session, text=text)
|
||||||
|
|
||||||
def getStrips(self, maxstrips=None):
|
def getStrips(self, maxstrips=None):
|
||||||
"""Get comic strips."""
|
"""Get comic strips."""
|
||||||
|
|
|
@ -246,10 +246,20 @@ def fetchUrls(url, data, baseUrl, urlSearch):
|
||||||
|
|
||||||
|
|
||||||
def fetchUrl(url, data, baseUrl, urlSearch):
|
def fetchUrl(url, data, baseUrl, urlSearch):
|
||||||
"""Search first entry for given URL pattern in a HTML page."""
|
"""Search first URL entry for given URL pattern in a HTML page."""
|
||||||
return fetchUrls(url, data, baseUrl, urlSearch)[0]
|
return fetchUrls(url, data, baseUrl, urlSearch)[0]
|
||||||
|
|
||||||
|
|
||||||
|
def fetchText(url, data, textSearch):
    """Return the text captured by group 1 of textSearch in the page data.

    url is only used to build the error message; data is the content
    that gets searched with the compiled pattern textSearch.
    Raises ValueError if the pattern does not match.
    """
    found = textSearch.search(data)
    if not found:
        # No match: report which pattern failed on which page.
        raise ValueError("Pattern %s not found at URL %s." % (textSearch.pattern, url))
    captured = found.group(1)
    out.debug(u'matched text %r with pattern %s' % (captured, textSearch.pattern))
    return captured
|
||||||
|
|
||||||
|
|
||||||
_htmlparser = HTMLParser()
|
_htmlparser = HTMLParser()
|
||||||
def unescape(text):
|
def unescape(text):
|
||||||
"""Replace HTML entities and character references."""
|
"""Replace HTML entities and character references."""
|
||||||
|
|
Loading…
Add table
Reference in a new issue