Ignore case for comic download directories.

Since we already match comic names case-insensitively on the command
line, this was a logical next step, even though it means changing quite
a bit of code that previously each resolved the "comic directory" in a
slightly different way.
This commit is contained in:
Tobias Gruetzmacher 2016-06-05 23:55:54 +02:00
parent 215d597573
commit 64c8e502ca
5 changed files with 89 additions and 67 deletions

View file

@ -12,7 +12,7 @@ import contextlib
from datetime import datetime from datetime import datetime
from .output import out from .output import out
from .util import unquote, getDirname, getFilename, urlopen, strsize from .util import unquote, getFilename, urlopen, strsize
from .events import getHandler from .events import getHandler
@ -25,13 +25,11 @@ RFC_1123_DT_STR = "%a, %d %b %Y %H:%M:%S GMT"
class ComicStrip(object): class ComicStrip(object):
"""A list of comic image URLs.""" """A list of comic image URLs."""
def __init__(self, name, strip_url, image_urls, namer, session, text=None): def __init__(self, scraper, strip_url, image_urls, text=None):
"""Store the image URL list.""" """Store the image URL list."""
self.name = name self.scraper = scraper
self.strip_url = strip_url self.strip_url = strip_url
self.image_urls = image_urls self.image_urls = image_urls
self.namer = namer
self.session = session
self.text = text self.text = text
def getImages(self): def getImages(self):
@ -41,12 +39,11 @@ class ComicStrip(object):
def getDownloader(self, url): def getDownloader(self, url):
"""Get an image downloader.""" """Get an image downloader."""
filename = self.namer(url, self.strip_url) filename = self.scraper.namer(url, self.strip_url)
if filename is None: if filename is None:
filename = url.rsplit('/', 1)[1] filename = url.rsplit('/', 1)[1]
dirname = getDirname(self.name) return ComicImage(self.scraper, url, self.strip_url, filename,
return ComicImage(self.name, url, self.strip_url, dirname, filename, text=self.text)
self.session, text=self.text)
class ComicImage(object): class ComicImage(object):
@ -54,16 +51,13 @@ class ComicImage(object):
ChunkBytes = 1024 * 100 # 100KB ChunkBytes = 1024 * 100 # 100KB
def __init__(self, name, url, referrer, dirname, filename, session, def __init__(self, scraper, url, referrer, filename, text=None):
text=None):
"""Set URL and filename.""" """Set URL and filename."""
self.name = name self.scraper = scraper
self.referrer = referrer self.referrer = referrer
self.url = url self.url = url
self.dirname = dirname
filename = getFilename(filename) filename = getFilename(filename)
self.filename, self.ext = os.path.splitext(filename) self.filename, self.ext = os.path.splitext(filename)
self.session = session
self.text = text self.text = text
def connect(self, lastchange=None): def connect(self, lastchange=None):
@ -71,7 +65,8 @@ class ComicImage(object):
headers = {} headers = {}
if lastchange: if lastchange:
headers['If-Modified-Since'] = lastchange.strftime(RFC_1123_DT_STR) headers['If-Modified-Since'] = lastchange.strftime(RFC_1123_DT_STR)
self.urlobj = urlopen(self.url, self.session, referrer=self.referrer, self.urlobj = urlopen(self.url, self.scraper.session,
referrer=self.referrer,
max_content_bytes=MaxImageBytes, stream=True, max_content_bytes=MaxImageBytes, stream=True,
headers=headers) headers=headers)
if self.urlobj.status_code == 304: # Not modified if self.urlobj.status_code == 304: # Not modified
@ -97,7 +92,7 @@ class ComicImage(object):
def save(self, basepath): def save(self, basepath):
"""Save comic URL to filename on disk.""" """Save comic URL to filename on disk."""
comicdir = os.path.join(basepath, self.dirname) comicdir = self.scraper.get_download_dir(basepath)
if not os.path.isdir(comicdir): if not os.path.isdir(comicdir):
os.makedirs(comicdir) os.makedirs(comicdir)
fnbase = os.path.join(comicdir, self.filename) fnbase = os.path.join(comicdir, self.filename)
@ -125,7 +120,7 @@ class ComicImage(object):
out.debug(u'Writing comic text to file %s...' % fntext) out.debug(u'Writing comic text to file %s...' % fntext)
with self.fileout(fntext, encoding='utf-8') as f: with self.fileout(fntext, encoding='utf-8') as f:
f.write(self.text) f.write(self.text)
getHandler().comicDownloaded(self, fn, text=self.text) getHandler().comicDownloaded(self, fn)
return fn, True return fn, True
@contextlib.contextmanager @contextlib.contextmanager

View file

@ -13,7 +13,6 @@ from six.moves.urllib.parse import urlparse
from .output import out from .output import out
from . import events, scraper from . import events, scraper
from .util import getDirname
class ComicQueue(Queue): class ComicQueue(Queue):
@ -196,11 +195,8 @@ def getScrapers(comics, basepath=None, adult=True, multiple_allowed=False, listi
# only scrapers whose directory already exists # only scrapers whose directory already exists
if len(comics) > 1: if len(comics) > 1:
out.warn(u"using '@' as comic name ignores all other specified comics.") out.warn(u"using '@' as comic name ignores all other specified comics.")
for scraperobj in scraper.get_scrapers(include_removed=True): for comic in get_existing_comics(basepath, adult, listing):
dirname = getDirname(scraperobj.name) yield comic
if os.path.isdir(os.path.join(basepath, dirname)):
if shouldRunScraper(scraperobj, adult, listing):
yield scraperobj
else: else:
# get only selected comic scrapers # get only selected comic scrapers
# store them in a set to eliminate duplicates # store them in a set to eliminate duplicates
@ -228,6 +224,14 @@ def getScrapers(comics, basepath=None, adult=True, multiple_allowed=False, listi
yield scraperobj yield scraperobj
def get_existing_comics(basepath=None, adult=True, listing=False):
    """Yield every scraper whose download directory already exists below
    basepath (including removed scrapers), filtered through
    shouldRunScraper()."""
    for scraperobj in scraper.get_scrapers(include_removed=True):
        comicdir = scraperobj.get_download_dir(basepath)
        if os.path.isdir(comicdir) and shouldRunScraper(scraperobj, adult, listing):
            yield scraperobj
def shouldRunScraper(scraperobj, adult=True, listing=False): def shouldRunScraper(scraperobj, adult=True, listing=False):
if listing: if listing:
return True return True

View file

@ -48,12 +48,23 @@ class EventHandler(object):
"""Emit a start event. Should be overridden in subclass.""" """Emit a start event. Should be overridden in subclass."""
pass pass
def comicDownloaded(self, comic, filename, text=None): def comicDownloaded(self, comic, filename):
"""Emit a comic downloaded event. Should be overridden in subclass.""" """Emit a comic downloaded event. Should be overridden in subclass.
Parameters are:
comic: The ComicImage class calling this event
filename: The target filename
"""
pass pass
def comicPageLink(self, comic, url, prevUrl): def comicPageLink(self, scraper, url, prevUrl):
"""Emit an event to inform the handler about links between comic pages. Should be overridden in subclass.""" """Emit an event to inform the handler about links between comic pages.
Should be overridden in subclass. Parameters are:
scraper: The Scraper class calling this event
url: The current page url
prevUrl: The previous page url
"""
pass pass
def end(self): def end(self):
@ -88,20 +99,20 @@ class RSSEventHandler(EventHandler):
self.newfile = True self.newfile = True
self.rss = rss.Feed('Daily Dosage', link, 'Comics for %s' % time.strftime('%Y/%m/%d', today)) self.rss = rss.Feed('Daily Dosage', link, 'Comics for %s' % time.strftime('%Y/%m/%d', today))
def comicDownloaded(self, comic, filename, text=None): def comicDownloaded(self, comic, filename):
"""Write RSS entry for downloaded comic.""" """Write RSS entry for downloaded comic."""
imageUrl = self.getUrlFromFilename(filename) imageUrl = self.getUrlFromFilename(filename)
size = None size = None
if self.allowdownscale: if self.allowdownscale:
size = getDimensionForImage(filename, MaxImageSize) size = getDimensionForImage(filename, MaxImageSize)
title = '%s - %s' % (comic.name, os.path.basename(filename)) title = '%s - %s' % (comic.scraper.name, os.path.basename(filename))
pageUrl = comic.referrer pageUrl = comic.referrer
description = '<img src="%s"' % imageUrl description = '<img src="%s"' % imageUrl
if size: if size:
description += ' width="%d" height="%d"' % size description += ' width="%d" height="%d"' % size
description += '/>' description += '/>'
if text: if comic.text:
description += '<br/>%s' % text description += '<br/>%s' % comic.text
description += '<br/><a href="%s">View Comic Online</a>' % pageUrl description += '<br/><a href="%s">View Comic Online</a>' % pageUrl
args = ( args = (
title, title,
@ -202,7 +213,7 @@ class HtmlEventHandler(EventHandler):
def comicDownloaded(self, comic, filename, text=None): def comicDownloaded(self, comic, filename, text=None):
"""Write HTML entry for downloaded comic.""" """Write HTML entry for downloaded comic."""
if self.lastComic != comic.name: if self.lastComic != comic.scraper.name:
self.newComic(comic) self.newComic(comic)
size = None size = None
if self.allowdownscale: if self.allowdownscale:
@ -217,7 +228,7 @@ class HtmlEventHandler(EventHandler):
self.html.write('/>\n') self.html.write('/>\n')
if text: if text:
self.html.write(u'<br/>%s\n' % text) self.html.write(u'<br/>%s\n' % text)
self.lastComic = comic.name self.lastComic = comic.scraper.name
self.lastUrl = pageUrl self.lastUrl = pageUrl
def newComic(self, comic): def newComic(self, comic):
@ -226,7 +237,7 @@ class HtmlEventHandler(EventHandler):
self.html.write(u'</li>\n') self.html.write(u'</li>\n')
if self.lastComic is not None: if self.lastComic is not None:
self.html.write(u'</ul>\n') self.html.write(u'</ul>\n')
self.html.write(u'<li>%s</li>\n' % comic.name) self.html.write(u'<li>%s</li>\n' % comic.scraper.name)
self.html.write(u'<ul>\n') self.html.write(u'<ul>\n')
def end(self): def end(self):
@ -250,44 +261,44 @@ class JSONEventHandler(EventHandler):
"""Start with empty data.""" """Start with empty data."""
self.data = {} self.data = {}
def jsonFn(self, comic): def jsonFn(self, scraper):
"""Get filename for the JSON file for a comic.""" """Get filename for the JSON file for a comic."""
fn = os.path.join(self.basepath, comic, 'dosage.json') fn = os.path.join(scraper.get_download_dir(self.basepath), 'dosage.json')
fn = os.path.abspath(fn) fn = os.path.abspath(fn)
return fn return fn
def getComicData(self, comic): def getComicData(self, scraper):
"""Return dictionary with comic info.""" """Return dictionary with comic info."""
if comic not in self.data: if scraper not in self.data:
if os.path.exists(self.jsonFn(comic)): if os.path.exists(self.jsonFn(scraper)):
with codecs.open(self.jsonFn(comic), 'r', self.encoding) as f: with codecs.open(self.jsonFn(scraper), 'r', self.encoding) as f:
self.data[comic] = json.load(f) self.data[scraper] = json.load(f)
else: else:
self.data[comic] = {'pages': {}} self.data[scraper] = {'pages': {}}
return self.data[comic] return self.data[scraper]
def getPageInfo(self, comic, url): def getPageInfo(self, scraper, url):
"""Return dictionary with comic page info.""" """Return dictionary with comic page info."""
comicData = self.getComicData(comic) comicData = self.getComicData(scraper)
if url not in comicData['pages']: if url not in comicData['pages']:
comicData['pages'][url] = {'images': {}} comicData['pages'][url] = {'images': {}}
return comicData['pages'][url] return comicData['pages'][url]
def comicDownloaded(self, comic, filename, text=None): def comicDownloaded(self, comic, filename):
"""Add URL-to-filename mapping into JSON.""" """Add URL-to-filename mapping into JSON."""
pageInfo = self.getPageInfo(comic.name, comic.referrer) pageInfo = self.getPageInfo(comic.scraper, comic.referrer)
pageInfo['images'][comic.url] = os.path.basename(filename) pageInfo['images'][comic.url] = os.path.basename(filename)
def comicPageLink(self, comic, url, prevUrl): def comicPageLink(self, scraper, url, prevUrl):
"""Write previous link into JSON.""" """Write previous link into JSON."""
pageInfo = self.getPageInfo(comic, url) pageInfo = self.getPageInfo(scraper, url)
pageInfo['prev'] = prevUrl pageInfo['prev'] = prevUrl
def end(self): def end(self):
"""Write all JSON data to files.""" """Write all JSON data to files."""
for comic in self.data: for scraper in self.data:
with codecs.open(self.jsonFn(comic), 'w', self.encoding) as f: with codecs.open(self.jsonFn(scraper), 'w', self.encoding) as f:
json.dump(self.data[comic], f, indent=2, separators=(',', ': '), sort_keys=True) json.dump(self.data[scraper], f, indent=2, separators=(',', ': '), sort_keys=True)
_handler_classes = {} _handler_classes = {}
@ -327,15 +338,15 @@ class MultiHandler(object):
for handler in _handlers: for handler in _handlers:
handler.start() handler.start()
def comicDownloaded(self, comic, filename, text=None): def comicDownloaded(self, comic, filename):
"""Emit comic downloaded events for handlers.""" """Emit comic downloaded events for handlers."""
for handler in _handlers: for handler in _handlers:
handler.comicDownloaded(comic, filename, text=text) handler.comicDownloaded(comic, filename)
def comicPageLink(self, comic, url, prevUrl): def comicPageLink(self, scraper, url, prevUrl):
"""Emit an event to inform the handler about links between comic pages. Should be overridden in subclass.""" """Emit an event to inform the handler about links between comic pages. Should be overridden in subclass."""
for handler in _handlers: for handler in _handlers:
handler.comicPageLink(comic, url, prevUrl) handler.comicPageLink(scraper, url, prevUrl)
def end(self): def end(self):
"""Emit end events for handlers.""" """Emit end events for handlers."""

View file

@ -25,7 +25,7 @@ except ImportError:
pycountry = None pycountry = None
from . import loader, configuration, languages from . import loader, configuration, languages
from .util import (get_page, makeSequence, get_system_uid, urlopen, getDirname, from .util import (get_page, makeSequence, get_system_uid, urlopen,
unescape, tagre, normaliseURL, prettyMatcherList, unescape, tagre, normaliseURL, prettyMatcherList,
requests_session) requests_session)
from .comic import ComicStrip from .comic import ComicStrip
@ -147,8 +147,7 @@ class Scraper(object):
optional=self.textOptional) optional=self.textOptional)
else: else:
text = None text = None
return ComicStrip(self.name, url, imageUrls, self.namer, return ComicStrip(self, url, imageUrls, text=text)
self.session, text=text)
def getStrips(self, maxstrips=None): def getStrips(self, maxstrips=None):
"""Get comic strips.""" """Get comic strips."""
@ -223,7 +222,7 @@ class Scraper(object):
else: else:
prevUrl = self.prevUrlModifier(prevUrl) prevUrl = self.prevUrlModifier(prevUrl)
out.debug(u"Found previous URL %s" % prevUrl) out.debug(u"Found previous URL %s" % prevUrl)
getHandler().comicPageLink(self.name, url, prevUrl) getHandler().comicPageLink(self, url, prevUrl)
return prevUrl return prevUrl
def getIndexStripUrl(self, index): def getIndexStripUrl(self, index):
@ -260,10 +259,28 @@ class Scraper(object):
page = urlopen(url, self.session, data=data) page = urlopen(url, self.session, data=data)
return page.text return page.text
def get_download_dir(self, basepath):
    """Try to find the correct download directory, ignoring case
    differences.

    Each '/'-separated component of the comic name is resolved below
    *basepath* in turn: if a component does not exist with its exact
    case but a directory differing only in case does, that existing
    directory is reused. Components without any (case-insensitive)
    match are appended verbatim, so the returned path may refer to a
    directory that does not exist yet.
    """
    path = basepath
    for part in self.name.split('/'):
        done = False
        if (os.path.isdir(path) and
                not os.path.isdir(os.path.join(path, part))):
            # Exact-case directory is missing; scan the existing
            # entries for one that matches ignoring case. The first
            # such directory wins.
            for entry in os.listdir(path):
                if (entry.lower() == part.lower() and
                        os.path.isdir(os.path.join(path, entry))):
                    path = os.path.join(path, entry)
                    done = True
                    break
        if not done:
            path = os.path.join(path, part)
    return path
def getCompleteFile(self, basepath): def getCompleteFile(self, basepath):
"""Get filename indicating all comics are downloaded.""" """Get filename indicating all comics are downloaded."""
dirname = getDirname(self.name) dirname = self.get_download_dir(basepath)
return os.path.join(basepath, dirname, "complete.txt") return os.path.join(dirname, "complete.txt")
def isComplete(self, basepath): def isComplete(self, basepath):
"""Check if all comics are downloaded.""" """Check if all comics are downloaded."""

View file

@ -453,11 +453,6 @@ def strsize(b):
return "%.1fGB" % (float(b) / (1024 * 1024 * 1024)) return "%.1fGB" % (float(b) / (1024 * 1024 * 1024))
def getDirname(name):
"""Replace slashes with path separator of name."""
return name.replace('/', os.sep)
def getFilename(name): def getFilename(name):
"""Get a filename from given name without dangerous or incompatible """Get a filename from given name without dangerous or incompatible
characters.""" characters."""