Ignore case for comic download directories.
Since we already match comic names case-insensitively on the command line, this was a logical next step, even though it meant changing quite a bit of code, each piece of which resolved the "comic directory" in a slightly different way.
This commit is contained in: parent 215d597573, commit 64c8e502ca
5 changed files with 89 additions and 67 deletions
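
For context, a minimal standalone sketch (not part of the commit) of the case-insensitive lookup behaviour that the new Scraper.get_download_dir method in the diff below implements; the helper name resolve_download_dir and the example directory names are made up for illustration.

import os
import tempfile


def resolve_download_dir(basepath, comic_name):
    """Walk comic_name part by part, reusing an existing directory that
    matches ignoring case before falling back to the exact spelling."""
    path = basepath
    for part in comic_name.split('/'):
        candidate = os.path.join(path, part)
        if os.path.isdir(path) and not os.path.isdir(candidate):
            # Look for an existing entry that differs only in case.
            for entry in os.listdir(path):
                if (entry.lower() == part.lower() and
                        os.path.isdir(os.path.join(path, entry))):
                    candidate = os.path.join(path, entry)
                    break
        path = candidate
    return path


if __name__ == '__main__':
    base = tempfile.mkdtemp()
    os.makedirs(os.path.join(base, 'calvinandhobbes'))
    # Reuses the existing lower-case directory instead of creating a new one.
    print(resolve_download_dir(base, 'CalvinAndHobbes'))
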
@@ -12,7 +12,7 @@ import contextlib
 from datetime import datetime
 
 from .output import out
-from .util import unquote, getDirname, getFilename, urlopen, strsize
+from .util import unquote, getFilename, urlopen, strsize
 from .events import getHandler
 
 
@@ -25,13 +25,11 @@ RFC_1123_DT_STR = "%a, %d %b %Y %H:%M:%S GMT"
 class ComicStrip(object):
     """A list of comic image URLs."""
 
-    def __init__(self, name, strip_url, image_urls, namer, session, text=None):
+    def __init__(self, scraper, strip_url, image_urls, text=None):
         """Store the image URL list."""
-        self.name = name
+        self.scraper = scraper
         self.strip_url = strip_url
         self.image_urls = image_urls
-        self.namer = namer
-        self.session = session
         self.text = text
 
     def getImages(self):
@@ -41,12 +39,11 @@ class ComicStrip(object):
 
     def getDownloader(self, url):
         """Get an image downloader."""
-        filename = self.namer(url, self.strip_url)
+        filename = self.scraper.namer(url, self.strip_url)
         if filename is None:
             filename = url.rsplit('/', 1)[1]
-        dirname = getDirname(self.name)
-        return ComicImage(self.name, url, self.strip_url, dirname, filename,
-                          self.session, text=self.text)
+        return ComicImage(self.scraper, url, self.strip_url, filename,
+                          text=self.text)
 
 
 class ComicImage(object):
@@ -54,16 +51,13 @@ class ComicImage(object):
 
     ChunkBytes = 1024 * 100 # 100KB
 
-    def __init__(self, name, url, referrer, dirname, filename, session,
-                 text=None):
+    def __init__(self, scraper, url, referrer, filename, text=None):
         """Set URL and filename."""
-        self.name = name
+        self.scraper = scraper
         self.referrer = referrer
         self.url = url
-        self.dirname = dirname
         filename = getFilename(filename)
         self.filename, self.ext = os.path.splitext(filename)
-        self.session = session
         self.text = text
 
     def connect(self, lastchange=None):
@@ -71,7 +65,8 @@ class ComicImage(object):
         headers = {}
         if lastchange:
             headers['If-Modified-Since'] = lastchange.strftime(RFC_1123_DT_STR)
-        self.urlobj = urlopen(self.url, self.session, referrer=self.referrer,
+        self.urlobj = urlopen(self.url, self.scraper.session,
+                              referrer=self.referrer,
                               max_content_bytes=MaxImageBytes, stream=True,
                               headers=headers)
         if self.urlobj.status_code == 304: # Not modified
@@ -97,7 +92,7 @@ class ComicImage(object):
 
     def save(self, basepath):
         """Save comic URL to filename on disk."""
-        comicdir = os.path.join(basepath, self.dirname)
+        comicdir = self.scraper.get_download_dir(basepath)
        if not os.path.isdir(comicdir):
             os.makedirs(comicdir)
         fnbase = os.path.join(comicdir, self.filename)
@@ -125,7 +120,7 @@ class ComicImage(object):
            out.debug(u'Writing comic text to file %s...' % fntext)
            with self.fileout(fntext, encoding='utf-8') as f:
                f.write(self.text)
-        getHandler().comicDownloaded(self, fn, text=self.text)
+        getHandler().comicDownloaded(self, fn)
         return fn, True
 
     @contextlib.contextmanager

@@ -13,7 +13,6 @@ from six.moves.urllib.parse import urlparse
 
 from .output import out
 from . import events, scraper
-from .util import getDirname
 
 
 class ComicQueue(Queue):
@@ -196,11 +195,8 @@ def getScrapers(comics, basepath=None, adult=True, multiple_allowed=False, listi
         # only scrapers whose directory already exists
         if len(comics) > 1:
             out.warn(u"using '@' as comic name ignores all other specified comics.")
-        for scraperobj in scraper.get_scrapers(include_removed=True):
-            dirname = getDirname(scraperobj.name)
-            if os.path.isdir(os.path.join(basepath, dirname)):
-                if shouldRunScraper(scraperobj, adult, listing):
-                    yield scraperobj
+        for comic in get_existing_comics(basepath, adult, listing):
+            yield comic
     else:
         # get only selected comic scrapers
         # store them in a set to eliminate duplicates
@@ -228,6 +224,14 @@ def getScrapers(comics, basepath=None, adult=True, multiple_allowed=False, listi
             yield scraperobj
 
 
+def get_existing_comics(basepath=None, adult=True, listing=False):
+    for scraperobj in scraper.get_scrapers(include_removed=True):
+        dirname = scraperobj.get_download_dir(basepath)
+        if os.path.isdir(dirname):
+            if shouldRunScraper(scraperobj, adult, listing):
+                yield scraperobj
+
+
 def shouldRunScraper(scraperobj, adult=True, listing=False):
     if listing:
         return True

@@ -48,12 +48,23 @@ class EventHandler(object):
         """Emit a start event. Should be overridden in subclass."""
         pass
 
-    def comicDownloaded(self, comic, filename, text=None):
-        """Emit a comic downloaded event. Should be overridden in subclass."""
+    def comicDownloaded(self, comic, filename):
+        """Emit a comic downloaded event. Should be overridden in subclass.
+        Parameters are:
+
+        comic: The ComicImage class calling this event
+        filename: The target filename
+        """
         pass
 
-    def comicPageLink(self, comic, url, prevUrl):
-        """Emit an event to inform the handler about links between comic pages. Should be overridden in subclass."""
+    def comicPageLink(self, scraper, url, prevUrl):
+        """Emit an event to inform the handler about links between comic pages.
+        Should be overridden in subclass. Parameters are:
+
+        scraper: The Scraper class calling this event
+        url: The current page url
+        prevUrl: The previous page url
+        """
         pass
 
     def end(self):
@@ -88,20 +99,20 @@ class RSSEventHandler(EventHandler):
         self.newfile = True
         self.rss = rss.Feed('Daily Dosage', link, 'Comics for %s' % time.strftime('%Y/%m/%d', today))
 
-    def comicDownloaded(self, comic, filename, text=None):
+    def comicDownloaded(self, comic, filename):
         """Write RSS entry for downloaded comic."""
         imageUrl = self.getUrlFromFilename(filename)
         size = None
         if self.allowdownscale:
             size = getDimensionForImage(filename, MaxImageSize)
-        title = '%s - %s' % (comic.name, os.path.basename(filename))
+        title = '%s - %s' % (comic.scraper.name, os.path.basename(filename))
         pageUrl = comic.referrer
         description = '<img src="%s"' % imageUrl
         if size:
             description += ' width="%d" height="%d"' % size
         description += '/>'
-        if text:
-            description += '<br/>%s' % text
+        if comic.text:
+            description += '<br/>%s' % comic.text
         description += '<br/><a href="%s">View Comic Online</a>' % pageUrl
         args = (
             title,
@@ -202,7 +213,7 @@ class HtmlEventHandler(EventHandler):
 
     def comicDownloaded(self, comic, filename, text=None):
         """Write HTML entry for downloaded comic."""
-        if self.lastComic != comic.name:
+        if self.lastComic != comic.scraper.name:
             self.newComic(comic)
         size = None
         if self.allowdownscale:
@@ -217,7 +228,7 @@ class HtmlEventHandler(EventHandler):
         self.html.write('/>\n')
         if text:
             self.html.write(u'<br/>%s\n' % text)
-        self.lastComic = comic.name
+        self.lastComic = comic.scraper.name
         self.lastUrl = pageUrl
 
     def newComic(self, comic):
@@ -226,7 +237,7 @@ class HtmlEventHandler(EventHandler):
             self.html.write(u'</li>\n')
         if self.lastComic is not None:
             self.html.write(u'</ul>\n')
-        self.html.write(u'<li>%s</li>\n' % comic.name)
+        self.html.write(u'<li>%s</li>\n' % comic.scraper.name)
         self.html.write(u'<ul>\n')
 
     def end(self):
@@ -250,44 +261,44 @@ class JSONEventHandler(EventHandler):
         """Start with empty data."""
         self.data = {}
 
-    def jsonFn(self, comic):
+    def jsonFn(self, scraper):
         """Get filename for the JSON file for a comic."""
-        fn = os.path.join(self.basepath, comic, 'dosage.json')
+        fn = os.path.join(scraper.get_download_dir(self.basepath), 'dosage.json')
         fn = os.path.abspath(fn)
         return fn
 
-    def getComicData(self, comic):
+    def getComicData(self, scraper):
         """Return dictionary with comic info."""
-        if comic not in self.data:
-            if os.path.exists(self.jsonFn(comic)):
-                with codecs.open(self.jsonFn(comic), 'r', self.encoding) as f:
-                    self.data[comic] = json.load(f)
+        if scraper not in self.data:
+            if os.path.exists(self.jsonFn(scraper)):
+                with codecs.open(self.jsonFn(scraper), 'r', self.encoding) as f:
+                    self.data[scraper] = json.load(f)
             else:
-                self.data[comic] = {'pages': {}}
-        return self.data[comic]
+                self.data[scraper] = {'pages': {}}
+        return self.data[scraper]
 
-    def getPageInfo(self, comic, url):
+    def getPageInfo(self, scraper, url):
         """Return dictionary with comic page info."""
-        comicData = self.getComicData(comic)
+        comicData = self.getComicData(scraper)
         if url not in comicData['pages']:
             comicData['pages'][url] = {'images': {}}
         return comicData['pages'][url]
 
-    def comicDownloaded(self, comic, filename, text=None):
+    def comicDownloaded(self, comic, filename):
         """Add URL-to-filename mapping into JSON."""
-        pageInfo = self.getPageInfo(comic.name, comic.referrer)
+        pageInfo = self.getPageInfo(comic.scraper, comic.referrer)
         pageInfo['images'][comic.url] = os.path.basename(filename)
 
-    def comicPageLink(self, comic, url, prevUrl):
+    def comicPageLink(self, scraper, url, prevUrl):
         """Write previous link into JSON."""
-        pageInfo = self.getPageInfo(comic, url)
+        pageInfo = self.getPageInfo(scraper, url)
         pageInfo['prev'] = prevUrl
 
     def end(self):
         """Write all JSON data to files."""
-        for comic in self.data:
-            with codecs.open(self.jsonFn(comic), 'w', self.encoding) as f:
-                json.dump(self.data[comic], f, indent=2, separators=(',', ': '), sort_keys=True)
+        for scraper in self.data:
+            with codecs.open(self.jsonFn(scraper), 'w', self.encoding) as f:
+                json.dump(self.data[scraper], f, indent=2, separators=(',', ': '), sort_keys=True)
 
 
 _handler_classes = {}
@@ -327,15 +338,15 @@ class MultiHandler(object):
         for handler in _handlers:
             handler.start()
 
-    def comicDownloaded(self, comic, filename, text=None):
+    def comicDownloaded(self, comic, filename):
         """Emit comic downloaded events for handlers."""
         for handler in _handlers:
-            handler.comicDownloaded(comic, filename, text=text)
+            handler.comicDownloaded(comic, filename)
 
-    def comicPageLink(self, comic, url, prevUrl):
+    def comicPageLink(self, scraper, url, prevUrl):
         """Emit an event to inform the handler about links between comic pages. Should be overridden in subclass."""
         for handler in _handlers:
-            handler.comicPageLink(comic, url, prevUrl)
+            handler.comicPageLink(scraper, url, prevUrl)
 
     def end(self):
         """Emit end events for handlers."""

@@ -25,7 +25,7 @@ except ImportError:
     pycountry = None
 
 from . import loader, configuration, languages
-from .util import (get_page, makeSequence, get_system_uid, urlopen, getDirname,
+from .util import (get_page, makeSequence, get_system_uid, urlopen,
                    unescape, tagre, normaliseURL, prettyMatcherList,
                    requests_session)
 from .comic import ComicStrip
@@ -147,8 +147,7 @@ class Scraper(object):
                                   optional=self.textOptional)
         else:
             text = None
-        return ComicStrip(self.name, url, imageUrls, self.namer,
-                          self.session, text=text)
+        return ComicStrip(self, url, imageUrls, text=text)
 
     def getStrips(self, maxstrips=None):
         """Get comic strips."""
@@ -223,7 +222,7 @@ class Scraper(object):
             else:
                 prevUrl = self.prevUrlModifier(prevUrl)
                 out.debug(u"Found previous URL %s" % prevUrl)
-            getHandler().comicPageLink(self.name, url, prevUrl)
+            getHandler().comicPageLink(self, url, prevUrl)
         return prevUrl
 
     def getIndexStripUrl(self, index):
@@ -260,10 +259,28 @@ class Scraper(object):
         page = urlopen(url, self.session, data=data)
         return page.text
 
+    def get_download_dir(self, basepath):
+        """Try to find the corect download directory, ignoring case
+        differences."""
+        path = basepath
+        for part in self.name.split('/'):
+            done = False
+            if (os.path.isdir(path) and
+                    not os.path.isdir(os.path.join(path, part))):
+                for entry in os.listdir(path):
+                    if (entry.lower() == part.lower() and
+                            os.path.isdir(os.path.join(path, entry))):
+                        path = os.path.join(path, entry)
+                        done = True
+                        break
+            if not done:
+                path = os.path.join(path, part)
+        return path
+
     def getCompleteFile(self, basepath):
         """Get filename indicating all comics are downloaded."""
-        dirname = getDirname(self.name)
-        return os.path.join(basepath, dirname, "complete.txt")
+        dirname = self.get_download_dir(basepath)
+        return os.path.join(dirname, "complete.txt")
 
     def isComplete(self, basepath):
         """Check if all comics are downloaded."""

@@ -453,11 +453,6 @@ def strsize(b):
     return "%.1fGB" % (float(b) / (1024 * 1024 * 1024))
 
 
-def getDirname(name):
-    """Replace slashes with path separator of name."""
-    return name.replace('/', os.sep)
-
-
 def getFilename(name):
     """Get a filename from given name without dangerous or incompatible
     characters."""