dosage/dosage

505 lines
17 KiB
Text
Raw Normal View History

2012-06-20 19:58:13 +00:00
#!/usr/bin/env python
# -*- coding: iso-8859-1 -*-
2012-06-20 19:58:13 +00:00
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
2014-01-05 15:50:57 +00:00
# Copyright (C) 2012-2014 Bastian Kleineidam
2013-03-07 17:19:20 +00:00
# ___
# / \___ ___ __ _ __ _ ___
# / /\ / _ \/ __|/ _` |/ _` |/ _ \
# / /_// (_) \__ \ (_| | (_| | __/
# /___,' \___/|___/\__,_|\__, |\___|
# |___/
from __future__ import division, print_function
2012-06-20 19:58:13 +00:00
import sys
import os
2013-02-22 17:29:58 +00:00
import argparse
import pydoc
2014-01-05 15:01:11 +00:00
import threading
2013-04-04 16:30:04 +00:00
from io import StringIO
2014-01-05 15:01:11 +00:00
try:
from Queue import Queue, Empty
except ImportError:
from queue import Queue, Empty
try:
from urllib.parse import urlparse
except ImportError:
from urlparse import urlparse
2012-06-20 19:58:13 +00:00
from dosagelib import events, scraper, configuration, singleton
2012-06-20 19:58:13 +00:00
from dosagelib.output import out
2013-03-08 21:33:05 +00:00
from dosagelib.util import internal_error, getDirname, strlimit, getLangName
2012-12-12 16:41:29 +00:00
from dosagelib.ansicolor import get_columns
2012-06-20 19:58:13 +00:00
class ArgumentParser(argparse.ArgumentParser):
    """Argument parser whose help output is paged on terminals."""

    def print_help(self, file=None):
        """Emit the formatted help message.

        The message goes to ``file`` (``sys.stdout`` when ``file`` is None).
        If that stream reports being a TTY the text is handed to
        ``pydoc.pager``; otherwise it is printed to the stream.
        """
        text = self.format_help()
        stream = sys.stdout if file is None else file
        if hasattr(stream, "isatty") and stream.isatty():
            pydoc.pager(text)
            return
        print(text, file=stream)
# Usage examples; setupOptions() passes this text as the parser epilog.
Examples = """\
EXAMPLES
List available comics (ca. 3000 at the moment):
dosage -l
Get the latest comic of for example CalvinAndHobbes and save it in the "Comics"
directory:
dosage CalvinAndHobbes
If you already have downloaded several comics and want to get the latest
strips of all of them:
dosage --continue @
"""
2012-06-20 19:58:13 +00:00
def setupOptions():
    """Construct option parser.

    Registers all command line options and the positional ``comic``
    arguments. If the optional ``argcomplete`` package is importable,
    shell completion is enabled for the parser.

    @return: new option parser
    @rtype argparse.ArgumentParser
    """
    parser = ArgumentParser(
        description="A comic downloader and archiver.",
        epilog=Examples,
        # keep the line layout of the epilog as written
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        '-v', '--verbose', action='count', default=0,
        help='provides verbose output, use multiple times for more verbosity')
    parser.add_argument(
        '-n', '--numstrips', action='store', type=int, default=0,
        help='traverse and retrieve the given number of comic strips; '
             'use --all to retrieve all comic strips')
    parser.add_argument(
        '-a', '--all', action='store_true',
        help='traverse and retrieve all comic strips')
    parser.add_argument(
        '-c', '--continue', action='store_true', dest='cont',
        help='traverse and retrieve comic strips until an existing one is found')
    parser.add_argument(
        '-b', '--basepath', action='store', default='Comics', metavar='PATH',
        help='set the path to create individual comic directories in, '
             'default is Comics')
    # The help text names the real option (--basepath), not "--base-path".
    parser.add_argument(
        '--baseurl', action='store', metavar='PATH',
        help='the base URL of your comics directory (for RSS, HTML, etc.); '
             'this should correspond to --basepath')
    parser.add_argument(
        '-l', '--list', action='store_true',
        help='list available comic modules')
    parser.add_argument(
        '--singlelist', action='store_true',
        help='list available comic modules in a single list')
    parser.add_argument(
        '--version', action='store_true',
        help='display the version number')
    parser.add_argument(
        '--vote', action='store_true',
        help='vote for the selected comics')
    parser.add_argument(
        '-m', '--modulehelp', action='store_true',
        help='display help for comic modules')
    parser.add_argument(
        '-t', '--timestamps', action='store_true',
        help='print timestamps for all output at any info level')
    parser.add_argument(
        '-o', '--output', action='append', dest='handler',
        choices=events.getHandlerNames(),
        help='sets output handlers for downloaded comics')
    parser.add_argument(
        '--adult', action='store_true',
        help='confirms that you are old enough to view adult content')
    # used for development testing prev/next matching
    parser.add_argument('--dry-run', action='store_true', help=argparse.SUPPRESS)
    # multimatch is only used for development, eg. testing if all comics
    # of a scripted plugin are working
    parser.add_argument('--multimatch', action='store_true', help=argparse.SUPPRESS)
    parser.add_argument(
        'comic', nargs='*',
        help='comic module name (including case insensitive substrings)')
    try:
        # optional dependency: silently skipped when not installed
        import argcomplete
        argcomplete.autocomplete(parser)
    except ImportError:
        pass
    return parser
2012-09-26 14:47:39 +00:00
2013-04-08 18:17:02 +00:00
def displayVersion(verbose):
    """Print application name, copyright, license and support URL.

    When ``verbose`` is true, ``check_update()`` is additionally queried
    and its outcome (newer release, local/development version, or an
    error) is printed. Always returns 0.
    """
    for line in (configuration.App, configuration.Copyright,
                 configuration.Freeware):
        print(line)
    print("For support see", configuration.SupportUrl)
    if not verbose:
        return 0
    # search for updates
    from dosagelib.updater import check_update
    result, value = check_update()
    if not result:
        # the check itself failed; value carries the error (or None)
        error = 'invalid update file syntax' if value is None else value
        template = ('An error occured while checking for an '
                    'update of %(app)s: %(error)s.')
        print(template % dict(error=error, app=configuration.AppName))
    elif value:
        version, url = value
        if url is None:
            # current version is newer than online version
            template = ('Detected local or development version %(currentversion)s. '
                        'Available version of %(app)s is %(version)s.')
        else:
            # display update link
            template = ('A new version %(version)s of %(app)s is '
                        'available at %(url)s.')
        print(template % dict(version=version, app=configuration.AppName,
                              url=url, currentversion=configuration.Version))
    return 0
2012-09-26 14:47:39 +00:00
2012-10-11 10:03:12 +00:00
def setOutputInfo(options):
    """Apply the verbosity and timestamp options to the global output object."""
    verbosity = options.verbose
    out.level = 0
    out.level += verbosity
    out.timestamps = options.timestamps
    # debug urllib3
    #from requests.packages.urllib3 import add_stderr_logger
    #add_stderr_logger()
2012-06-20 19:58:13 +00:00
2013-04-10 19:47:05 +00:00
def saveComicStrip(strip, basepath, dryrun):
    """Save every image of a comic strip below ``basepath``.

    Returns a tuple ``(errors, allskipped)``: the number of images whose
    saving raised an exception, and whether no image at all was reported
    as saved. With ``dryrun`` set nothing is saved, so no image counts
    as saved.
    """
    failures = 0
    nothing_saved = True
    for img in strip.getImages():
        try:
            if dryrun:
                was_saved = False
            else:
                # save() yields (filename, saved); only the flag is needed
                _, was_saved = img.save(basepath)
            if was_saved:
                nothing_saved = False
        except Exception as msg:
            out.exception('Could not save image at %s to %s: %r' % (img.referrer, img.filename, msg))
            failures += 1
    return failures, nothing_saved
2012-10-11 10:03:12 +00:00
2013-04-29 05:36:08 +00:00
def displayHelp(options):
    """Print help for all comics selected on the command line.

    Returns the summed results of displayComicHelp(), or 2 when a
    ValueError is raised while the scrapers are being processed.
    """
    try:
        return sum(
            displayComicHelp(scraperobj)
            for scraperobj in getScrapers(options.comic, options.basepath)
        )
    except ValueError as msg:
        out.exception(msg)
        return 2
2012-10-11 10:03:12 +00:00
2012-12-07 23:45:18 +00:00
2012-12-12 16:41:29 +00:00
def displayComicHelp(scraperobj):
    """Print URL, description, language, genres and help text of a comic.

    The output context is set to the tagged scraper name while printing
    and reset to an empty string afterwards. Returns 0 on success and 1
    when a ValueError was raised while printing.
    """
    out.context = getScraperName(scraperobj)
    try:
        out.info(u"URL: " + scraperobj.url)
        # the remaining fields are optional and only shown when set
        if scraperobj.description:
            out.info(u"Description: " + scraperobj.description)
        if scraperobj.lang:
            out.info(u"Language: " + getLangName(scraperobj.lang))
        if scraperobj.genres:
            out.info(u"Genres: " + ", ".join(scraperobj.genres))
        if scraperobj.help:
            for help_line in scraperobj.help.splitlines():
                out.info(help_line)
    except ValueError as msg:
        out.exception(msg)
        return 1
    else:
        return 0
    finally:
        out.context = u''
2012-12-12 16:41:29 +00:00
2014-01-05 15:01:11 +00:00
# the comic scraper job queue: filled by getComics() and drained by the
# ComicGetter worker threads
jobs = Queue()
# ensure threads download only from one host at a time
# (maps a lowercased hostname to its threading.Lock, see get_host_lock())
host_locks = {}
def get_hostname(url):
    """Return the lowercased network location part of ``url``."""
    return urlparse(url).netloc.lower()
# Global lock; ComicGetter.run() holds it while calling get_host_lock()
# and while updating the comic_errors counter.
lock = threading.Lock()


def get_host_lock(url):
    """Return the lock registered for the host of ``url``.

    A new lock is stored in ``host_locks`` the first time a hostname is
    seen; later calls for the same hostname return that same lock.
    """
    host = get_hostname(url)
    return host_locks.setdefault(host, threading.Lock())
# Total number of errors reported by all ComicGetter threads; updated
# only while holding ``lock``.
comic_errors = 0


class ComicGetter(threading.Thread):
    """Get all strips of a comic in a thread."""

    def __init__(self, options):
        """Store options and remember the original thread name."""
        super(ComicGetter, self).__init__()
        self.options = options
        # ``name`` is the property form of the deprecated getName()/setName()
        self.origname = self.name

    def run(self):
        """Process scrapers from the job queue until it is empty.

        Every job taken from the queue is acknowledged with
        ``jobs.task_done()`` even if processing it fails; otherwise the
        ``jobs.join()`` call waiting for the workers would block forever.
        Unexpected exceptions are logged and counted as one error.
        """
        global comic_errors
        while True:
            try:
                scraperobj = jobs.get(False)
            except Empty:
                break
            errors = 0
            try:
                # name the thread after the comic it is working on
                self.name = scraperobj.getName()
                with lock:
                    host_lock = get_host_lock(scraperobj.url)
                # only one thread at a time downloads from a given host
                with host_lock:
                    errors = getStrips(scraperobj, self.options)
            except Exception as msg:
                out.exception(msg)
                errors = 1
            finally:
                with lock:
                    comic_errors += errors
                jobs.task_done()
                self.name = self.origname
2013-02-22 17:29:58 +00:00
def getComics(options):
    """Retrieve the selected comics using a pool of worker threads.

    Registers the requested output handlers, queues one job per scraper,
    starts between 1 and 10 ComicGetter threads and waits until the queue
    has been processed. Returns the number of errors, including those
    counted by the worker threads.
    """
    for handler_name in set(options.handler or ()):
        events.addHandler(handler_name, options.basepath, options.baseurl)
    events.getHandler().start()
    errors = 0
    try:
        selected = getScrapers(options.comic, options.basepath,
                               options.adult, options.multimatch)
        for scraperobj in selected:
            jobs.put(scraperobj)
        # start threads
        worker_count = max(min(jobs.qsize(), 10), 1)
        for _ in range(worker_count):
            ComicGetter(options).start()
        # wait for threads to finish
        jobs.join()
    except ValueError as msg:
        out.exception(msg)
        errors += 1
    finally:
        events.getHandler().end()
    return errors + comic_errors
def voteComics(options):
    """Vote for every selected comic and return the number of errors.

    A ValueError raised while the scrapers are being processed is logged
    and adds one error to the count gathered so far.
    """
    failures = 0
    try:
        selected = getScrapers(options.comic, options.basepath,
                               options.adult, options.multimatch)
        for scraperobj in selected:
            failures += voteComic(scraperobj)
    except ValueError as msg:
        out.exception(msg)
        failures += 1
    return failures
2014-01-05 15:01:11 +00:00
def voteComic(scraperobj):
    """Submit a vote for one comic scraper and report the answer.

    Returns 1 if an exception was raised while voting, else 0. The output
    context is set to the tagged scraper name for the duration of the
    call and cleared afterwards.
    """
    failed = 0
    out.context = getScraperName(scraperobj)
    try:
        name = scraperobj.getName()
        answer = scraperobj.vote()
        out.debug(u'Vote answer %r' % answer)
        if answer == 'counted':
            page = 'comics/%s.html' % name.replace('/', '_')
            url = configuration.Url + page
            out.info(u'Vote submitted. Votes are updated regularly at %s.' % url)
        elif answer == 'no':
            out.info(u'Vote not submitted - your vote has already been submitted before.')
        elif answer == 'noname':
            out.warn(u'The comic %s cannot be voted.' % name)
        else:
            # any other answer is reported as-is
            out.warn(u'Error submitting vote parameters: %r' % answer)
    except Exception as msg:
        out.exception(msg)
        failed += 1
    finally:
        out.context = u''
    return failed
2012-12-12 16:41:29 +00:00
def getStrips(scraperobj, options):
    """Download the strips of one scraper and return the number of errors.

    The number of strips is unlimited (None) for --all and --continue,
    otherwise --numstrips if given, otherwise just the current strip.
    With --continue, retrieval stops at the first strip for which no
    image was saved. After an error-free --all run (without dry-run,
    --continue or indexes) the scraper is marked complete.
    """
    if options.all or options.cont:
        limit = None
    else:
        # fall back to the current strip only
        limit = options.numstrips or 1
    failures = 0
    try:
        if scraperobj.isComplete(options.basepath):
            out.info(u"All comics are already downloaded.")
            return 0
        for strip in scraperobj.getStrips(limit):
            strip_errors, skipped = saveComicStrip(strip, options.basepath, options.dry_run)
            failures += strip_errors
            if options.cont and skipped:
                # stop when retrieval skipped an image for one comic strip
                out.info(u"Stop retrieval because image file already exists")
                break
        blocked = (failures or options.dry_run or
                   options.cont or scraperobj.indexes)
        if options.all and not blocked:
            scraperobj.setComplete(options.basepath)
    except Exception as msg:
        out.exception(msg)
        failures += 1
    return failures
2013-02-22 17:29:58 +00:00
def run(options):
    """Dispatch to the command selected by the parsed options.

    Returns the exit status of the executed command; 1 when a command
    that needs comics was requested without any comic name.
    """
    setOutputInfo(options)
    # ensure only one instance of dosage is running
    me = singleton.SingleInstance()
    if options.version:
        return displayVersion(options.verbose)
    if options.list:
        return doList()
    if options.singlelist:
        return doList(columnList=False, verbose=options.verbose)
    # after this a list of comic strips is needed
    if not options.comic:
        out.warn(u'No comics specified, bailing out!')
        return 1
    if options.modulehelp:
        command = displayHelp
    elif options.vote:
        command = voteComics
    else:
        command = getComics
    return command(options)
2012-10-11 10:03:12 +00:00
2013-03-08 21:33:05 +00:00
def doList(columnList=True, verbose=False):
    """List all available comic scrapers, sorted by name.

    When standard output is a TTY the listing is collected in a string
    buffer and shown through ``pydoc.pager``. ``columnList`` selects the
    multi-column layout; otherwise one scraper is listed per entry, with
    full help when ``verbose`` is set. Always returns 0.
    """
    use_pager = hasattr(sys.stdout, "isatty") and sys.stdout.isatty()
    if use_pager:
        buf = StringIO(u'')
        out.setStream(buf)
    out.info(u'Available comic scrapers:')
    out.info(u'Comics tagged with [%s] require age confirmation with the --adult option.' % TAG_ADULT)
    out.info(u'Non-english comics are tagged with [%s].' % TAG_LANG)
    scrapers = list(getScrapers(['@@']))
    scrapers.sort(key=lambda s: s.getName())
    if columnList:
        count = doColumnList(scrapers)
    else:
        count = doSingleList(scrapers, verbose=verbose)
    out.info(u'%d supported comics.' % count)
    if use_pager:
        pydoc.pager(buf.getvalue())
    return 0
2013-03-08 21:33:05 +00:00
def doSingleList(scrapers, verbose=False):
    """Print the scrapers one per entry and return how many were listed.

    With ``verbose`` set the full help of every scraper is shown instead
    of only its tagged name. Returns 0 for an empty ``scrapers`` iterable.
    """
    # Start at 0 so an empty iterable is handled, and count from 1 so the
    # result is the number of scrapers rather than the last index.
    num = 0
    for num, scraperobj in enumerate(scrapers, 1):
        if verbose:
            displayComicHelp(scraperobj)
        else:
            out.info(getScraperName(scraperobj))
    return num
2012-10-11 10:03:12 +00:00
def doColumnList(scrapers):
    """Print the scraper names in columns and return how many were listed.

    Names are left-justified to the longest name and as many as fit the
    screen width are put on one line (at least one). Returns 0 without
    printing anything when ``scrapers`` is empty.
    """
    scrapers = list(scrapers)
    if not scrapers:
        # max() below would raise ValueError on an empty sequence
        return 0
    screenWidth = get_columns(sys.stdout)
    # limit name length so at least two columns are there
    limit = (screenWidth // 2) - 8
    names = [getScraperName(scraperobj, limit=limit) for scraperobj in scrapers]
    num = len(names)
    maxlen = max(len(name) for name in names)
    namesPerLine = max(screenWidth // (maxlen + 1), 1)
    for start in range(0, num, namesPerLine):
        row = names[start:start + namesPerLine]
        out.info(u''.join(name.ljust(maxlen) for name in row))
    return num
2012-10-11 10:03:12 +00:00
2013-03-18 17:16:05 +00:00
# Tag appended by getScraperName() to comics flagged as adult.
TAG_ADULT = "adult"
# Tag prefix used by getScraperName() for comics whose language is not "en".
TAG_LANG = "lang"
2012-10-11 10:03:12 +00:00
2012-12-12 16:41:29 +00:00
def getScraperName(scraperobj, limit=None):
    """Return the scraper name followed by its tags, if any.

    Tags are the adult marker and, for languages other than "en", the
    language marker; they are appended as `` [tag, tag]``. When ``limit``
    is given, the name (not the tag suffix) is shortened with strlimit().
    """
    labels = []
    if scraperobj.adult:
        labels.append(TAG_ADULT)
    if scraperobj.lang != "en":
        labels.append("%s:%s" % (TAG_LANG, scraperobj.lang))
    suffix = " [%s]" % ", ".join(labels) if labels else ""
    name = scraperobj.getName()
    if limit is not None:
        name = strlimit(name, limit)
    return name + suffix
2013-02-13 21:18:05 +00:00
def getScrapers(comics, basepath=None, adult=True, multiple_allowed=False):
    """Get scraper objects for the given comics.

    This is a generator yielding one scraper object per selected comic.

    @param comics: comic names; the special name '@' selects all scrapers
        whose directory exists below basepath, '@@' selects all scrapers.
        A name may carry indexes as 'name:index1,index2' and may be a
        path below basepath (e.g. from shell completion).
    @param basepath: base directory of the downloaded comics
    @param adult: if false, adult comics are skipped with a warning
    @param multiple_allowed: passed on to scraper.find_scraperclasses()
    @note: callers in this file catch ValueError around the iteration;
        presumably raised by the scraper lookup for unknown names.
    """
    if '@' in comics:
        # only scrapers whose directory already exists
        if len(comics) > 1:
            out.warn(u"using '@' as comic name ignores all other specified comics.")
        for scraperclass in scraper.get_scraperclasses():
            dirname = getDirname(scraperclass.getName())
            if os.path.isdir(os.path.join(basepath, dirname)):
                if not adult and scraperclass.adult:
                    warn_adult(scraperclass)
                    continue
                yield scraperclass()
    elif '@@' in comics:
        # all scrapers
        for scraperclass in scraper.get_scraperclasses():
            if not adult and scraperclass.adult:
                warn_adult(scraperclass)
                continue
            yield scraperclass()
    else:
        # get only selected comic scrapers
        # store them in a set to eliminate duplicates
        scrapers = set()
        for comic in comics:
            # Helpful when using shell completion to pick comics to get.
            # str.rstrip() returns a new string, so the result must be
            # assigned for the trailing separator to actually be removed.
            comic = comic.rstrip(os.path.sep)
            if basepath and comic.startswith(basepath):
                # make the following command work:
                # find Comics -type d | xargs -n1 -P10 dosage -b Comics
                comic = comic[len(basepath):].lstrip(os.sep)
            if ':' in comic:
                name, index = comic.split(':', 1)
                indexes = index.split(',')
            else:
                name = comic
                indexes = None
            scraperclasses = scraper.find_scraperclasses(name, multiple_allowed=multiple_allowed)
            for scraperclass in scraperclasses:
                if not adult and scraperclass.adult:
                    warn_adult(scraperclass)
                    continue
                scraperobj = scraperclass(indexes=indexes)
                if scraperobj not in scrapers:
                    scrapers.add(scraperobj)
                    yield scraperobj
2012-12-07 23:45:18 +00:00
2012-06-20 19:58:13 +00:00
2012-12-12 16:41:29 +00:00
def warn_adult(scraperclass):
    """Warn that an adult comic is skipped unless --adult is given."""
    comic_name = scraperclass.getName()
    out.warn(u"skipping adult comic %s; use the --adult option to confirm your age" % comic_name)
2012-12-12 16:41:29 +00:00
2012-06-20 19:58:13 +00:00
def main():
    """Parse the command line, run the command and return the exit status.

    Returns the status of run(), 1 after a keyboard interrupt and 2 after
    any other exception (which is reported through internal_error()).
    """
    try:
        parser = setupOptions()
        return run(parser.parse_args())
    except KeyboardInterrupt:
        print("Aborted.")
        return 1
    except Exception:
        internal_error()
        return 2
def profile():
    """Profile loading all scraper classes; stats go to ``dosage.prof``."""
    import cProfile
    statement = "scraper.get_scraperclasses()"
    cProfile.run(statement, "dosage.prof")
def viewprof():
    """Print the top 100 entries of ``dosage.prof`` by cumulative time."""
    import pstats
    stats = pstats.Stats("dosage.prof")
    stats.strip_dirs()
    stats.sort_stats("cumulative")
    stats.print_stats(100)
2012-06-20 19:58:13 +00:00
if __name__ == '__main__':
    # exit with the status code returned by main()
    sys.exit(main())
    # development helpers, disabled by default:
    #profile()
    #viewprof()