Ignore case for comic download directories.
Since we already match comics case-insensitively on the command line, this was a logical step, even though it means changing quite a bit of code that tries to resolve the "comic directory", each place in a slightly different way...
This commit is contained in:
parent
215d597573
commit
64c8e502ca
5 changed files with 89 additions and 67 deletions
|
@ -12,7 +12,7 @@ import contextlib
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
|
||||||
from .output import out
|
from .output import out
|
||||||
from .util import unquote, getDirname, getFilename, urlopen, strsize
|
from .util import unquote, getFilename, urlopen, strsize
|
||||||
from .events import getHandler
|
from .events import getHandler
|
||||||
|
|
||||||
|
|
||||||
|
@ -25,13 +25,11 @@ RFC_1123_DT_STR = "%a, %d %b %Y %H:%M:%S GMT"
|
||||||
class ComicStrip(object):
|
class ComicStrip(object):
|
||||||
"""A list of comic image URLs."""
|
"""A list of comic image URLs."""
|
||||||
|
|
||||||
def __init__(self, name, strip_url, image_urls, namer, session, text=None):
|
def __init__(self, scraper, strip_url, image_urls, text=None):
|
||||||
"""Store the image URL list."""
|
"""Store the image URL list."""
|
||||||
self.name = name
|
self.scraper = scraper
|
||||||
self.strip_url = strip_url
|
self.strip_url = strip_url
|
||||||
self.image_urls = image_urls
|
self.image_urls = image_urls
|
||||||
self.namer = namer
|
|
||||||
self.session = session
|
|
||||||
self.text = text
|
self.text = text
|
||||||
|
|
||||||
def getImages(self):
|
def getImages(self):
|
||||||
|
@ -41,12 +39,11 @@ class ComicStrip(object):
|
||||||
|
|
||||||
def getDownloader(self, url):
|
def getDownloader(self, url):
|
||||||
"""Get an image downloader."""
|
"""Get an image downloader."""
|
||||||
filename = self.namer(url, self.strip_url)
|
filename = self.scraper.namer(url, self.strip_url)
|
||||||
if filename is None:
|
if filename is None:
|
||||||
filename = url.rsplit('/', 1)[1]
|
filename = url.rsplit('/', 1)[1]
|
||||||
dirname = getDirname(self.name)
|
return ComicImage(self.scraper, url, self.strip_url, filename,
|
||||||
return ComicImage(self.name, url, self.strip_url, dirname, filename,
|
text=self.text)
|
||||||
self.session, text=self.text)
|
|
||||||
|
|
||||||
|
|
||||||
class ComicImage(object):
|
class ComicImage(object):
|
||||||
|
@ -54,16 +51,13 @@ class ComicImage(object):
|
||||||
|
|
||||||
ChunkBytes = 1024 * 100 # 100KB
|
ChunkBytes = 1024 * 100 # 100KB
|
||||||
|
|
||||||
def __init__(self, name, url, referrer, dirname, filename, session,
|
def __init__(self, scraper, url, referrer, filename, text=None):
|
||||||
text=None):
|
|
||||||
"""Set URL and filename."""
|
"""Set URL and filename."""
|
||||||
self.name = name
|
self.scraper = scraper
|
||||||
self.referrer = referrer
|
self.referrer = referrer
|
||||||
self.url = url
|
self.url = url
|
||||||
self.dirname = dirname
|
|
||||||
filename = getFilename(filename)
|
filename = getFilename(filename)
|
||||||
self.filename, self.ext = os.path.splitext(filename)
|
self.filename, self.ext = os.path.splitext(filename)
|
||||||
self.session = session
|
|
||||||
self.text = text
|
self.text = text
|
||||||
|
|
||||||
def connect(self, lastchange=None):
|
def connect(self, lastchange=None):
|
||||||
|
@ -71,7 +65,8 @@ class ComicImage(object):
|
||||||
headers = {}
|
headers = {}
|
||||||
if lastchange:
|
if lastchange:
|
||||||
headers['If-Modified-Since'] = lastchange.strftime(RFC_1123_DT_STR)
|
headers['If-Modified-Since'] = lastchange.strftime(RFC_1123_DT_STR)
|
||||||
self.urlobj = urlopen(self.url, self.session, referrer=self.referrer,
|
self.urlobj = urlopen(self.url, self.scraper.session,
|
||||||
|
referrer=self.referrer,
|
||||||
max_content_bytes=MaxImageBytes, stream=True,
|
max_content_bytes=MaxImageBytes, stream=True,
|
||||||
headers=headers)
|
headers=headers)
|
||||||
if self.urlobj.status_code == 304: # Not modified
|
if self.urlobj.status_code == 304: # Not modified
|
||||||
|
@ -97,7 +92,7 @@ class ComicImage(object):
|
||||||
|
|
||||||
def save(self, basepath):
|
def save(self, basepath):
|
||||||
"""Save comic URL to filename on disk."""
|
"""Save comic URL to filename on disk."""
|
||||||
comicdir = os.path.join(basepath, self.dirname)
|
comicdir = self.scraper.get_download_dir(basepath)
|
||||||
if not os.path.isdir(comicdir):
|
if not os.path.isdir(comicdir):
|
||||||
os.makedirs(comicdir)
|
os.makedirs(comicdir)
|
||||||
fnbase = os.path.join(comicdir, self.filename)
|
fnbase = os.path.join(comicdir, self.filename)
|
||||||
|
@ -125,7 +120,7 @@ class ComicImage(object):
|
||||||
out.debug(u'Writing comic text to file %s...' % fntext)
|
out.debug(u'Writing comic text to file %s...' % fntext)
|
||||||
with self.fileout(fntext, encoding='utf-8') as f:
|
with self.fileout(fntext, encoding='utf-8') as f:
|
||||||
f.write(self.text)
|
f.write(self.text)
|
||||||
getHandler().comicDownloaded(self, fn, text=self.text)
|
getHandler().comicDownloaded(self, fn)
|
||||||
return fn, True
|
return fn, True
|
||||||
|
|
||||||
@contextlib.contextmanager
|
@contextlib.contextmanager
|
||||||
|
|
|
@ -13,7 +13,6 @@ from six.moves.urllib.parse import urlparse
|
||||||
|
|
||||||
from .output import out
|
from .output import out
|
||||||
from . import events, scraper
|
from . import events, scraper
|
||||||
from .util import getDirname
|
|
||||||
|
|
||||||
|
|
||||||
class ComicQueue(Queue):
|
class ComicQueue(Queue):
|
||||||
|
@ -196,11 +195,8 @@ def getScrapers(comics, basepath=None, adult=True, multiple_allowed=False, listi
|
||||||
# only scrapers whose directory already exists
|
# only scrapers whose directory already exists
|
||||||
if len(comics) > 1:
|
if len(comics) > 1:
|
||||||
out.warn(u"using '@' as comic name ignores all other specified comics.")
|
out.warn(u"using '@' as comic name ignores all other specified comics.")
|
||||||
for scraperobj in scraper.get_scrapers(include_removed=True):
|
for comic in get_existing_comics(basepath, adult, listing):
|
||||||
dirname = getDirname(scraperobj.name)
|
yield comic
|
||||||
if os.path.isdir(os.path.join(basepath, dirname)):
|
|
||||||
if shouldRunScraper(scraperobj, adult, listing):
|
|
||||||
yield scraperobj
|
|
||||||
else:
|
else:
|
||||||
# get only selected comic scrapers
|
# get only selected comic scrapers
|
||||||
# store them in a set to eliminate duplicates
|
# store them in a set to eliminate duplicates
|
||||||
|
@ -228,6 +224,14 @@ def getScrapers(comics, basepath=None, adult=True, multiple_allowed=False, listi
|
||||||
yield scraperobj
|
yield scraperobj
|
||||||
|
|
||||||
|
|
||||||
|
def get_existing_comics(basepath=None, adult=True, listing=False):
|
||||||
|
for scraperobj in scraper.get_scrapers(include_removed=True):
|
||||||
|
dirname = scraperobj.get_download_dir(basepath)
|
||||||
|
if os.path.isdir(dirname):
|
||||||
|
if shouldRunScraper(scraperobj, adult, listing):
|
||||||
|
yield scraperobj
|
||||||
|
|
||||||
|
|
||||||
def shouldRunScraper(scraperobj, adult=True, listing=False):
|
def shouldRunScraper(scraperobj, adult=True, listing=False):
|
||||||
if listing:
|
if listing:
|
||||||
return True
|
return True
|
||||||
|
|
|
@ -48,12 +48,23 @@ class EventHandler(object):
|
||||||
"""Emit a start event. Should be overridden in subclass."""
|
"""Emit a start event. Should be overridden in subclass."""
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def comicDownloaded(self, comic, filename, text=None):
|
def comicDownloaded(self, comic, filename):
|
||||||
"""Emit a comic downloaded event. Should be overridden in subclass."""
|
"""Emit a comic downloaded event. Should be overridden in subclass.
|
||||||
|
Parameters are:
|
||||||
|
|
||||||
|
comic: The ComicImage class calling this event
|
||||||
|
filename: The target filename
|
||||||
|
"""
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def comicPageLink(self, comic, url, prevUrl):
|
def comicPageLink(self, scraper, url, prevUrl):
|
||||||
"""Emit an event to inform the handler about links between comic pages. Should be overridden in subclass."""
|
"""Emit an event to inform the handler about links between comic pages.
|
||||||
|
Should be overridden in subclass. Parameters are:
|
||||||
|
|
||||||
|
scraper: The Scraper class calling this event
|
||||||
|
url: The current page url
|
||||||
|
prevUrl: The previous page url
|
||||||
|
"""
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def end(self):
|
def end(self):
|
||||||
|
@ -88,20 +99,20 @@ class RSSEventHandler(EventHandler):
|
||||||
self.newfile = True
|
self.newfile = True
|
||||||
self.rss = rss.Feed('Daily Dosage', link, 'Comics for %s' % time.strftime('%Y/%m/%d', today))
|
self.rss = rss.Feed('Daily Dosage', link, 'Comics for %s' % time.strftime('%Y/%m/%d', today))
|
||||||
|
|
||||||
def comicDownloaded(self, comic, filename, text=None):
|
def comicDownloaded(self, comic, filename):
|
||||||
"""Write RSS entry for downloaded comic."""
|
"""Write RSS entry for downloaded comic."""
|
||||||
imageUrl = self.getUrlFromFilename(filename)
|
imageUrl = self.getUrlFromFilename(filename)
|
||||||
size = None
|
size = None
|
||||||
if self.allowdownscale:
|
if self.allowdownscale:
|
||||||
size = getDimensionForImage(filename, MaxImageSize)
|
size = getDimensionForImage(filename, MaxImageSize)
|
||||||
title = '%s - %s' % (comic.name, os.path.basename(filename))
|
title = '%s - %s' % (comic.scraper.name, os.path.basename(filename))
|
||||||
pageUrl = comic.referrer
|
pageUrl = comic.referrer
|
||||||
description = '<img src="%s"' % imageUrl
|
description = '<img src="%s"' % imageUrl
|
||||||
if size:
|
if size:
|
||||||
description += ' width="%d" height="%d"' % size
|
description += ' width="%d" height="%d"' % size
|
||||||
description += '/>'
|
description += '/>'
|
||||||
if text:
|
if comic.text:
|
||||||
description += '<br/>%s' % text
|
description += '<br/>%s' % comic.text
|
||||||
description += '<br/><a href="%s">View Comic Online</a>' % pageUrl
|
description += '<br/><a href="%s">View Comic Online</a>' % pageUrl
|
||||||
args = (
|
args = (
|
||||||
title,
|
title,
|
||||||
|
@ -202,7 +213,7 @@ class HtmlEventHandler(EventHandler):
|
||||||
|
|
||||||
def comicDownloaded(self, comic, filename, text=None):
|
def comicDownloaded(self, comic, filename, text=None):
|
||||||
"""Write HTML entry for downloaded comic."""
|
"""Write HTML entry for downloaded comic."""
|
||||||
if self.lastComic != comic.name:
|
if self.lastComic != comic.scraper.name:
|
||||||
self.newComic(comic)
|
self.newComic(comic)
|
||||||
size = None
|
size = None
|
||||||
if self.allowdownscale:
|
if self.allowdownscale:
|
||||||
|
@ -217,7 +228,7 @@ class HtmlEventHandler(EventHandler):
|
||||||
self.html.write('/>\n')
|
self.html.write('/>\n')
|
||||||
if text:
|
if text:
|
||||||
self.html.write(u'<br/>%s\n' % text)
|
self.html.write(u'<br/>%s\n' % text)
|
||||||
self.lastComic = comic.name
|
self.lastComic = comic.scraper.name
|
||||||
self.lastUrl = pageUrl
|
self.lastUrl = pageUrl
|
||||||
|
|
||||||
def newComic(self, comic):
|
def newComic(self, comic):
|
||||||
|
@ -226,7 +237,7 @@ class HtmlEventHandler(EventHandler):
|
||||||
self.html.write(u'</li>\n')
|
self.html.write(u'</li>\n')
|
||||||
if self.lastComic is not None:
|
if self.lastComic is not None:
|
||||||
self.html.write(u'</ul>\n')
|
self.html.write(u'</ul>\n')
|
||||||
self.html.write(u'<li>%s</li>\n' % comic.name)
|
self.html.write(u'<li>%s</li>\n' % comic.scraper.name)
|
||||||
self.html.write(u'<ul>\n')
|
self.html.write(u'<ul>\n')
|
||||||
|
|
||||||
def end(self):
|
def end(self):
|
||||||
|
@ -250,44 +261,44 @@ class JSONEventHandler(EventHandler):
|
||||||
"""Start with empty data."""
|
"""Start with empty data."""
|
||||||
self.data = {}
|
self.data = {}
|
||||||
|
|
||||||
def jsonFn(self, comic):
|
def jsonFn(self, scraper):
|
||||||
"""Get filename for the JSON file for a comic."""
|
"""Get filename for the JSON file for a comic."""
|
||||||
fn = os.path.join(self.basepath, comic, 'dosage.json')
|
fn = os.path.join(scraper.get_download_dir(self.basepath), 'dosage.json')
|
||||||
fn = os.path.abspath(fn)
|
fn = os.path.abspath(fn)
|
||||||
return fn
|
return fn
|
||||||
|
|
||||||
def getComicData(self, comic):
|
def getComicData(self, scraper):
|
||||||
"""Return dictionary with comic info."""
|
"""Return dictionary with comic info."""
|
||||||
if comic not in self.data:
|
if scraper not in self.data:
|
||||||
if os.path.exists(self.jsonFn(comic)):
|
if os.path.exists(self.jsonFn(scraper)):
|
||||||
with codecs.open(self.jsonFn(comic), 'r', self.encoding) as f:
|
with codecs.open(self.jsonFn(scraper), 'r', self.encoding) as f:
|
||||||
self.data[comic] = json.load(f)
|
self.data[scraper] = json.load(f)
|
||||||
else:
|
else:
|
||||||
self.data[comic] = {'pages': {}}
|
self.data[scraper] = {'pages': {}}
|
||||||
return self.data[comic]
|
return self.data[scraper]
|
||||||
|
|
||||||
def getPageInfo(self, comic, url):
|
def getPageInfo(self, scraper, url):
|
||||||
"""Return dictionary with comic page info."""
|
"""Return dictionary with comic page info."""
|
||||||
comicData = self.getComicData(comic)
|
comicData = self.getComicData(scraper)
|
||||||
if url not in comicData['pages']:
|
if url not in comicData['pages']:
|
||||||
comicData['pages'][url] = {'images': {}}
|
comicData['pages'][url] = {'images': {}}
|
||||||
return comicData['pages'][url]
|
return comicData['pages'][url]
|
||||||
|
|
||||||
def comicDownloaded(self, comic, filename, text=None):
|
def comicDownloaded(self, comic, filename):
|
||||||
"""Add URL-to-filename mapping into JSON."""
|
"""Add URL-to-filename mapping into JSON."""
|
||||||
pageInfo = self.getPageInfo(comic.name, comic.referrer)
|
pageInfo = self.getPageInfo(comic.scraper, comic.referrer)
|
||||||
pageInfo['images'][comic.url] = os.path.basename(filename)
|
pageInfo['images'][comic.url] = os.path.basename(filename)
|
||||||
|
|
||||||
def comicPageLink(self, comic, url, prevUrl):
|
def comicPageLink(self, scraper, url, prevUrl):
|
||||||
"""Write previous link into JSON."""
|
"""Write previous link into JSON."""
|
||||||
pageInfo = self.getPageInfo(comic, url)
|
pageInfo = self.getPageInfo(scraper, url)
|
||||||
pageInfo['prev'] = prevUrl
|
pageInfo['prev'] = prevUrl
|
||||||
|
|
||||||
def end(self):
|
def end(self):
|
||||||
"""Write all JSON data to files."""
|
"""Write all JSON data to files."""
|
||||||
for comic in self.data:
|
for scraper in self.data:
|
||||||
with codecs.open(self.jsonFn(comic), 'w', self.encoding) as f:
|
with codecs.open(self.jsonFn(scraper), 'w', self.encoding) as f:
|
||||||
json.dump(self.data[comic], f, indent=2, separators=(',', ': '), sort_keys=True)
|
json.dump(self.data[scraper], f, indent=2, separators=(',', ': '), sort_keys=True)
|
||||||
|
|
||||||
|
|
||||||
_handler_classes = {}
|
_handler_classes = {}
|
||||||
|
@ -327,15 +338,15 @@ class MultiHandler(object):
|
||||||
for handler in _handlers:
|
for handler in _handlers:
|
||||||
handler.start()
|
handler.start()
|
||||||
|
|
||||||
def comicDownloaded(self, comic, filename, text=None):
|
def comicDownloaded(self, comic, filename):
|
||||||
"""Emit comic downloaded events for handlers."""
|
"""Emit comic downloaded events for handlers."""
|
||||||
for handler in _handlers:
|
for handler in _handlers:
|
||||||
handler.comicDownloaded(comic, filename, text=text)
|
handler.comicDownloaded(comic, filename)
|
||||||
|
|
||||||
def comicPageLink(self, comic, url, prevUrl):
|
def comicPageLink(self, scraper, url, prevUrl):
|
||||||
"""Emit an event to inform the handler about links between comic pages. Should be overridden in subclass."""
|
"""Emit an event to inform the handler about links between comic pages. Should be overridden in subclass."""
|
||||||
for handler in _handlers:
|
for handler in _handlers:
|
||||||
handler.comicPageLink(comic, url, prevUrl)
|
handler.comicPageLink(scraper, url, prevUrl)
|
||||||
|
|
||||||
def end(self):
|
def end(self):
|
||||||
"""Emit end events for handlers."""
|
"""Emit end events for handlers."""
|
||||||
|
|
|
@ -25,7 +25,7 @@ except ImportError:
|
||||||
pycountry = None
|
pycountry = None
|
||||||
|
|
||||||
from . import loader, configuration, languages
|
from . import loader, configuration, languages
|
||||||
from .util import (get_page, makeSequence, get_system_uid, urlopen, getDirname,
|
from .util import (get_page, makeSequence, get_system_uid, urlopen,
|
||||||
unescape, tagre, normaliseURL, prettyMatcherList,
|
unescape, tagre, normaliseURL, prettyMatcherList,
|
||||||
requests_session)
|
requests_session)
|
||||||
from .comic import ComicStrip
|
from .comic import ComicStrip
|
||||||
|
@ -147,8 +147,7 @@ class Scraper(object):
|
||||||
optional=self.textOptional)
|
optional=self.textOptional)
|
||||||
else:
|
else:
|
||||||
text = None
|
text = None
|
||||||
return ComicStrip(self.name, url, imageUrls, self.namer,
|
return ComicStrip(self, url, imageUrls, text=text)
|
||||||
self.session, text=text)
|
|
||||||
|
|
||||||
def getStrips(self, maxstrips=None):
|
def getStrips(self, maxstrips=None):
|
||||||
"""Get comic strips."""
|
"""Get comic strips."""
|
||||||
|
@ -223,7 +222,7 @@ class Scraper(object):
|
||||||
else:
|
else:
|
||||||
prevUrl = self.prevUrlModifier(prevUrl)
|
prevUrl = self.prevUrlModifier(prevUrl)
|
||||||
out.debug(u"Found previous URL %s" % prevUrl)
|
out.debug(u"Found previous URL %s" % prevUrl)
|
||||||
getHandler().comicPageLink(self.name, url, prevUrl)
|
getHandler().comicPageLink(self, url, prevUrl)
|
||||||
return prevUrl
|
return prevUrl
|
||||||
|
|
||||||
def getIndexStripUrl(self, index):
|
def getIndexStripUrl(self, index):
|
||||||
|
@ -260,10 +259,28 @@ class Scraper(object):
|
||||||
page = urlopen(url, self.session, data=data)
|
page = urlopen(url, self.session, data=data)
|
||||||
return page.text
|
return page.text
|
||||||
|
|
||||||
|
def get_download_dir(self, basepath):
|
||||||
|
"""Try to find the correct download directory, ignoring case
|
||||||
|
differences."""
|
||||||
|
path = basepath
|
||||||
|
for part in self.name.split('/'):
|
||||||
|
done = False
|
||||||
|
if (os.path.isdir(path) and
|
||||||
|
not os.path.isdir(os.path.join(path, part))):
|
||||||
|
for entry in os.listdir(path):
|
||||||
|
if (entry.lower() == part.lower() and
|
||||||
|
os.path.isdir(os.path.join(path, entry))):
|
||||||
|
path = os.path.join(path, entry)
|
||||||
|
done = True
|
||||||
|
break
|
||||||
|
if not done:
|
||||||
|
path = os.path.join(path, part)
|
||||||
|
return path
|
||||||
|
|
||||||
def getCompleteFile(self, basepath):
|
def getCompleteFile(self, basepath):
|
||||||
"""Get filename indicating all comics are downloaded."""
|
"""Get filename indicating all comics are downloaded."""
|
||||||
dirname = getDirname(self.name)
|
dirname = self.get_download_dir(basepath)
|
||||||
return os.path.join(basepath, dirname, "complete.txt")
|
return os.path.join(dirname, "complete.txt")
|
||||||
|
|
||||||
def isComplete(self, basepath):
|
def isComplete(self, basepath):
|
||||||
"""Check if all comics are downloaded."""
|
"""Check if all comics are downloaded."""
|
||||||
|
|
|
@ -453,11 +453,6 @@ def strsize(b):
|
||||||
return "%.1fGB" % (float(b) / (1024 * 1024 * 1024))
|
return "%.1fGB" % (float(b) / (1024 * 1024 * 1024))
|
||||||
|
|
||||||
|
|
||||||
def getDirname(name):
|
|
||||||
"""Replace slashes with path separator of name."""
|
|
||||||
return name.replace('/', os.sep)
|
|
||||||
|
|
||||||
|
|
||||||
def getFilename(name):
|
def getFilename(name):
|
||||||
"""Get a filename from given name without dangerous or incompatible
|
"""Get a filename from given name without dangerous or incompatible
|
||||||
characters."""
|
characters."""
|
||||||
|
|
Loading…
Reference in a new issue