Ignore case for comic download directories.

Since we already match comic names case-insensitively on the command
line, this was a logical step, even if it meant changing quite a bit of
code that all tried to resolve the "comic directory" in slightly
different ways...
This commit is contained in:
Tobias Gruetzmacher 2016-06-05 23:55:54 +02:00
parent 215d597573
commit 64c8e502ca
5 changed files with 89 additions and 67 deletions

View file

@ -12,7 +12,7 @@ import contextlib
from datetime import datetime
from .output import out
from .util import unquote, getDirname, getFilename, urlopen, strsize
from .util import unquote, getFilename, urlopen, strsize
from .events import getHandler
@ -25,13 +25,11 @@ RFC_1123_DT_STR = "%a, %d %b %Y %H:%M:%S GMT"
class ComicStrip(object):
"""A list of comic image URLs."""
def __init__(self, scraper, strip_url, image_urls, text=None):
    """Store the image URL list.

    Parameters:
    scraper: the Scraper instance that produced this strip (gives access
             to namer, session and download directory)
    strip_url: URL of the comic page the images came from
    image_urls: list of image URLs found on that page
    text: optional extracted comic text
    """
    self.scraper = scraper
    self.strip_url = strip_url
    self.image_urls = image_urls
    self.text = text
def getImages(self):
@ -41,12 +39,11 @@ class ComicStrip(object):
def getDownloader(self, url):
    """Get an image downloader for the given image URL.

    Asks the scraper's namer for a filename; falls back to the last URL
    path component when the namer returns None.
    """
    filename = self.scraper.namer(url, self.strip_url)
    if filename is None:
        # No custom name: use the basename part of the image URL.
        filename = url.rsplit('/', 1)[1]
    return ComicImage(self.scraper, url, self.strip_url, filename,
                      text=self.text)
class ComicImage(object):
@ -54,16 +51,13 @@ class ComicImage(object):
ChunkBytes = 1024 * 100 # 100KB
def __init__(self, scraper, url, referrer, filename, text=None):
    """Set URL and filename.

    Parameters:
    scraper: the Scraper instance (provides session and download dir)
    url: the image URL to download
    referrer: the comic page URL (sent as HTTP referrer)
    filename: target filename (sanitized; split into base and extension)
    text: optional comic text to store next to the image
    """
    self.scraper = scraper
    self.referrer = referrer
    self.url = url
    # Strip dangerous/incompatible characters before splitting extension.
    filename = getFilename(filename)
    self.filename, self.ext = os.path.splitext(filename)
    self.text = text
def connect(self, lastchange=None):
@ -71,7 +65,8 @@ class ComicImage(object):
headers = {}
if lastchange:
headers['If-Modified-Since'] = lastchange.strftime(RFC_1123_DT_STR)
self.urlobj = urlopen(self.url, self.session, referrer=self.referrer,
self.urlobj = urlopen(self.url, self.scraper.session,
referrer=self.referrer,
max_content_bytes=MaxImageBytes, stream=True,
headers=headers)
if self.urlobj.status_code == 304: # Not modified
@ -97,7 +92,7 @@ class ComicImage(object):
def save(self, basepath):
"""Save comic URL to filename on disk."""
comicdir = os.path.join(basepath, self.dirname)
comicdir = self.scraper.get_download_dir(basepath)
if not os.path.isdir(comicdir):
os.makedirs(comicdir)
fnbase = os.path.join(comicdir, self.filename)
@ -125,7 +120,7 @@ class ComicImage(object):
out.debug(u'Writing comic text to file %s...' % fntext)
with self.fileout(fntext, encoding='utf-8') as f:
f.write(self.text)
getHandler().comicDownloaded(self, fn, text=self.text)
getHandler().comicDownloaded(self, fn)
return fn, True
@contextlib.contextmanager

View file

@ -13,7 +13,6 @@ from six.moves.urllib.parse import urlparse
from .output import out
from . import events, scraper
from .util import getDirname
class ComicQueue(Queue):
@ -196,11 +195,8 @@ def getScrapers(comics, basepath=None, adult=True, multiple_allowed=False, listi
# only scrapers whose directory already exists
if len(comics) > 1:
out.warn(u"using '@' as comic name ignores all other specified comics.")
for scraperobj in scraper.get_scrapers(include_removed=True):
dirname = getDirname(scraperobj.name)
if os.path.isdir(os.path.join(basepath, dirname)):
if shouldRunScraper(scraperobj, adult, listing):
yield scraperobj
for comic in get_existing_comics(basepath, adult, listing):
yield comic
else:
# get only selected comic scrapers
# store them in a set to eliminate duplicates
@ -228,6 +224,14 @@ def getScrapers(comics, basepath=None, adult=True, multiple_allowed=False, listi
yield scraperobj
def get_existing_comics(basepath=None, adult=True, listing=False):
    """Yield scrapers whose download directory already exists under basepath.

    Includes removed scrapers so that previously downloaded comics are
    still found; each candidate is filtered through shouldRunScraper.
    """
    for scraperobj in scraper.get_scrapers(include_removed=True):
        # get_download_dir resolves the directory case-insensitively.
        dirname = scraperobj.get_download_dir(basepath)
        if os.path.isdir(dirname):
            if shouldRunScraper(scraperobj, adult, listing):
                yield scraperobj
def shouldRunScraper(scraperobj, adult=True, listing=False):
if listing:
return True

View file

@ -48,12 +48,23 @@ class EventHandler(object):
"""Emit a start event. Should be overridden in subclass."""
pass
def comicDownloaded(self, comic, filename):
    """Emit a comic downloaded event. Should be overridden in subclass.

    Parameters are:
    comic: The ComicImage class calling this event
    filename: The target filename
    """
    pass
def comicPageLink(self, scraper, url, prevUrl):
    """Emit an event to inform the handler about links between comic pages.

    Should be overridden in subclass. Parameters are:
    scraper: The Scraper class calling this event
    url: The current page url
    prevUrl: The previous page url
    """
    pass
def end(self):
@ -88,20 +99,20 @@ class RSSEventHandler(EventHandler):
self.newfile = True
self.rss = rss.Feed('Daily Dosage', link, 'Comics for %s' % time.strftime('%Y/%m/%d', today))
def comicDownloaded(self, comic, filename, text=None):
def comicDownloaded(self, comic, filename):
"""Write RSS entry for downloaded comic."""
imageUrl = self.getUrlFromFilename(filename)
size = None
if self.allowdownscale:
size = getDimensionForImage(filename, MaxImageSize)
title = '%s - %s' % (comic.name, os.path.basename(filename))
title = '%s - %s' % (comic.scraper.name, os.path.basename(filename))
pageUrl = comic.referrer
description = '<img src="%s"' % imageUrl
if size:
description += ' width="%d" height="%d"' % size
description += '/>'
if text:
description += '<br/>%s' % text
if comic.text:
description += '<br/>%s' % comic.text
description += '<br/><a href="%s">View Comic Online</a>' % pageUrl
args = (
title,
@ -202,7 +213,7 @@ class HtmlEventHandler(EventHandler):
def comicDownloaded(self, comic, filename, text=None):
"""Write HTML entry for downloaded comic."""
if self.lastComic != comic.name:
if self.lastComic != comic.scraper.name:
self.newComic(comic)
size = None
if self.allowdownscale:
@ -217,7 +228,7 @@ class HtmlEventHandler(EventHandler):
self.html.write('/>\n')
if text:
self.html.write(u'<br/>%s\n' % text)
self.lastComic = comic.name
self.lastComic = comic.scraper.name
self.lastUrl = pageUrl
def newComic(self, comic):
@ -226,7 +237,7 @@ class HtmlEventHandler(EventHandler):
self.html.write(u'</li>\n')
if self.lastComic is not None:
self.html.write(u'</ul>\n')
self.html.write(u'<li>%s</li>\n' % comic.name)
self.html.write(u'<li>%s</li>\n' % comic.scraper.name)
self.html.write(u'<ul>\n')
def end(self):
@ -250,44 +261,44 @@ class JSONEventHandler(EventHandler):
"""Start with empty data."""
self.data = {}
def jsonFn(self, scraper):
    """Get the absolute filename of the JSON file for a comic.

    The file lives inside the scraper's (case-insensitively resolved)
    download directory under self.basepath.
    """
    fn = os.path.join(scraper.get_download_dir(self.basepath), 'dosage.json')
    fn = os.path.abspath(fn)
    return fn
def getComicData(self, scraper):
    """Return dictionary with comic info, loading it from disk on first use.

    Results are cached per scraper in self.data; a missing JSON file
    yields a fresh structure with an empty 'pages' dict.
    """
    if scraper not in self.data:
        if os.path.exists(self.jsonFn(scraper)):
            with codecs.open(self.jsonFn(scraper), 'r', self.encoding) as f:
                self.data[scraper] = json.load(f)
        else:
            self.data[scraper] = {'pages': {}}
    return self.data[scraper]
def getPageInfo(self, scraper, url):
    """Return dictionary with comic page info for the given page URL.

    Creates and inserts an empty entry ({'images': {}}) on first access.
    """
    comicData = self.getComicData(scraper)
    if url not in comicData['pages']:
        comicData['pages'][url] = {'images': {}}
    return comicData['pages'][url]
def comicDownloaded(self, comic, filename):
    """Add URL-to-filename mapping into the JSON page info.

    Only the basename is stored, since the JSON file lives in the same
    directory as the images.
    """
    pageInfo = self.getPageInfo(comic.scraper, comic.referrer)
    pageInfo['images'][comic.url] = os.path.basename(filename)
def comicPageLink(self, scraper, url, prevUrl):
    """Write the previous-page link for *url* into the JSON page info."""
    pageInfo = self.getPageInfo(scraper, url)
    pageInfo['prev'] = prevUrl
def end(self):
    """Write all collected JSON data to their per-comic files."""
    for scraper in self.data:
        with codecs.open(self.jsonFn(scraper), 'w', self.encoding) as f:
            json.dump(self.data[scraper], f, indent=2,
                      separators=(',', ': '), sort_keys=True)
_handler_classes = {}
@ -327,15 +338,15 @@ class MultiHandler(object):
for handler in _handlers:
handler.start()
def comicDownloaded(self, comic, filename):
    """Emit comic downloaded events for all registered handlers."""
    for handler in _handlers:
        handler.comicDownloaded(comic, filename)
def comicPageLink(self, scraper, url, prevUrl):
    """Forward a comic page link event to all registered handlers."""
    for handler in _handlers:
        handler.comicPageLink(scraper, url, prevUrl)
def end(self):
"""Emit end events for handlers."""

View file

@ -25,7 +25,7 @@ except ImportError:
pycountry = None
from . import loader, configuration, languages
from .util import (get_page, makeSequence, get_system_uid, urlopen, getDirname,
from .util import (get_page, makeSequence, get_system_uid, urlopen,
unescape, tagre, normaliseURL, prettyMatcherList,
requests_session)
from .comic import ComicStrip
@ -147,8 +147,7 @@ class Scraper(object):
optional=self.textOptional)
else:
text = None
return ComicStrip(self.name, url, imageUrls, self.namer,
self.session, text=text)
return ComicStrip(self, url, imageUrls, text=text)
def getStrips(self, maxstrips=None):
"""Get comic strips."""
@ -223,7 +222,7 @@ class Scraper(object):
else:
prevUrl = self.prevUrlModifier(prevUrl)
out.debug(u"Found previous URL %s" % prevUrl)
getHandler().comicPageLink(self.name, url, prevUrl)
getHandler().comicPageLink(self, url, prevUrl)
return prevUrl
def getIndexStripUrl(self, index):
@ -260,10 +259,28 @@ class Scraper(object):
page = urlopen(url, self.session, data=data)
return page.text
def get_download_dir(self, basepath):
    """Try to find the correct download directory, ignoring case
    differences.

    Walks each '/'-separated component of the comic name below
    *basepath*; when a component does not exist with its exact casing
    but a case-insensitive match does, the existing directory is reused
    instead of creating a differently-cased duplicate. Components with
    no match at all keep their original casing.
    """
    path = basepath
    for part in self.name.split('/'):
        done = False
        if (os.path.isdir(path) and
                not os.path.isdir(os.path.join(path, part))):
            for entry in os.listdir(path):
                if (entry.lower() == part.lower() and
                        os.path.isdir(os.path.join(path, entry))):
                    path = os.path.join(path, entry)
                    done = True
                    break
        if not done:
            # No case-insensitive match: fall back to the exact name.
            path = os.path.join(path, part)
    return path
def getCompleteFile(self, basepath):
    """Get filename indicating all comics are downloaded.

    The marker file lives inside the case-insensitively resolved
    download directory for this scraper.
    """
    dirname = self.get_download_dir(basepath)
    return os.path.join(dirname, "complete.txt")
def isComplete(self, basepath):
"""Check if all comics are downloaded."""

View file

@ -453,11 +453,6 @@ def strsize(b):
return "%.1fGB" % (float(b) / (1024 * 1024 * 1024))
def getDirname(name):
    """Replace slashes with path separator of name."""
    return os.sep.join(name.split('/'))
def getFilename(name):
"""Get a filename from given name without dangerous or incompatible
characters."""