Make threads interruptable.

Bastian Kleineidam 2014-03-04 18:38:46 +01:00
parent 1a96b63137
commit 15ef59262a
3 changed files with 251 additions and 190 deletions


@@ -1,10 +1,12 @@
 Dosage 2.13 (released xx.xx.2014)

 Features:
-- comics: Added OhJoySexToy, TheGentlemansArmchair.
+- comics: Added OhJoySexToy, TheGentlemansArmchair, Underling, DongeonsAndDenizens,
+  GrimTalesFromDownBelow, TheLandscaper, DieFruehreifen, MonsieurLeChien.

 Fixes:
 - comics: Fixed EvilInc, FredoAndPidjin.
+- cmdline: Make download threads interruptable with Ctrl-C

 Dosage 2.12 (released 24.1.2014)
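
The core of the change is a worker-thread pattern that stays responsive to Ctrl-C. As a rough standalone sketch (illustrative names, not Dosage's actual API): each thread polls the job queue without blocking and checks a stop flag between jobs, so the main thread can ask workers to wind down instead of waiting on them indefinitely.

    # Minimal sketch of an interruptible worker; names here are illustrative.
    import threading
    try:
        from queue import Queue, Empty   # Python 3
    except ImportError:
        from Queue import Queue, Empty   # Python 2

    jobs = Queue()

    class Worker(threading.Thread):
        def __init__(self):
            super(Worker, self).__init__()
            self.stopped = False

        def run(self):
            while not self.stopped:
                try:
                    # non-blocking get: raises Empty once the queue is drained
                    job = jobs.get(False)
                except Empty:
                    break
                try:
                    print("processing %s" % job)
                finally:
                    jobs.task_done()

        def stop(self):
            # checked between jobs, so the thread exits promptly
            self.stopped = True

    if __name__ == "__main__":
        for i in range(3):
            jobs.put(i)
        w = Worker()
        w.start()
        w.join()

After a Ctrl-C, the main thread then only has to call stop() on each worker and drain the queue, which is what finish() in the new dosagelib/director.py below does.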

dosage (195 lines changed)

@@ -14,20 +14,10 @@ import sys
 import os
 import argparse
 import pydoc
-import threading
 from io import StringIO
-try:
-    from Queue import Queue, Empty
-except ImportError:
-    from queue import Queue, Empty
-try:
-    from urllib.parse import urlparse
-except ImportError:
-    from urlparse import urlparse
-from dosagelib import events, scraper, configuration, singleton
+from dosagelib import events, configuration, singleton, director
 from dosagelib.output import out
-from dosagelib.util import internal_error, getDirname, strlimit, getLangName
+from dosagelib.util import internal_error, strlimit, getLangName
 from dosagelib.ansicolor import get_columns
@@ -142,29 +132,11 @@ def setOutputInfo(options):
     #add_stderr_logger()


-def saveComicStrip(strip, basepath, dryrun):
-    """Save a comic strip which can consist of multiple images."""
-    errors = 0
-    allskipped = True
-    for image in strip.getImages():
-        try:
-            if dryrun:
-                filename, saved = "", False
-            else:
-                filename, saved = image.save(basepath)
-            if saved:
-                allskipped = False
-        except Exception as msg:
-            out.exception('Could not save image at %s to %s: %r' % (image.referrer, image.filename, msg))
-            errors += 1
-    return errors, allskipped
-
-
 def displayHelp(options):
     """Print help for comic strips."""
     errors = 0
     try:
-        for scraperobj in getScrapers(options.comic, options.basepath):
+        for scraperobj in director.getScrapers(options.comic, options.basepath):
             errors += displayComicHelp(scraperobj)
     except ValueError as msg:
         out.exception(msg)
@@ -195,83 +167,11 @@ def displayComicHelp(scraperobj):
         out.context = orig_context


-# the comic scraper job queue
-jobs = Queue()
-# ensure threads download only from one host at a time
-host_locks = {}
-
-
-def get_hostname(url):
-    """Get hostname from URL."""
-    return list(urlparse(url))[1].lower()
-
-
-lock = threading.Lock()
-
-def get_host_lock(url):
-    """Get lock object for given URL host."""
-    hostname = get_hostname(url)
-    return host_locks.setdefault(hostname, threading.Lock())
-
-
-comic_errors = 0
-
-
-class ComicGetter(threading.Thread):
-    """Get all strips of a comic in a thread."""
-
-    def __init__(self, options):
-        """Store options."""
-        super(ComicGetter, self).__init__()
-        self.options = options
-        self.origname = self.getName()
-
-    def run(self):
-        """Process from queue until it is empty."""
-        global comic_errors
-        while True:
-            try:
-                scraperobj = jobs.get(False)
-                self.setName(scraperobj.getName())
-                with lock:
-                    host_lock = get_host_lock(scraperobj.url)
-                with host_lock:
-                    errors = getStrips(scraperobj, self.options)
-                with lock:
-                    comic_errors += errors
-                jobs.task_done()
-                self.setName(self.origname)
-            except Empty:
-                break
-
-
-def getComics(options):
-    """Retrieve comics."""
-    if options.handler:
-        for name in set(options.handler):
-            events.addHandler(name, options.basepath, options.baseurl)
-    events.getHandler().start()
-    errors = 0
-    try:
-        for scraperobj in getScrapers(options.comic, options.basepath, options.adult, options.multimatch):
-            jobs.put(scraperobj)
-        # start threads
-        num_threads = max(1, min(10, jobs.qsize()))
-        for i in range(num_threads):
-            ComicGetter(options).start()
-        # wait for threads to finish
-        jobs.join()
-    except ValueError as msg:
-        out.exception(msg)
-        errors += 1
-    finally:
-        events.getHandler().end()
-    return errors + comic_errors
-
-
 def voteComics(options):
     """Vote for comics."""
     errors = 0
     try:
-        for scraperobj in getScrapers(options.comic, options.basepath, options.adult, options.multimatch):
+        for scraperobj in director.getScrapers(options.comic, options.basepath, options.adult, options.multimatch):
             errors += voteComic(scraperobj)
     except ValueError as msg:
         out.exception(msg)
@@ -305,36 +205,6 @@ def voteComic(scraperobj):
     return errors


-def getStrips(scraperobj, options):
-    """Get all strips from a scraper."""
-    errors = 0
-    if options.all or options.cont:
-        numstrips = None
-    elif options.numstrips:
-        numstrips = options.numstrips
-    else:
-        # get current strip
-        numstrips = 1
-    try:
-        if scraperobj.isComplete(options.basepath):
-            out.info(u"All comics are already downloaded.")
-            return 0
-        for strip in scraperobj.getStrips(numstrips):
-            _errors, skipped = saveComicStrip(strip, options.basepath, options.dry_run)
-            errors += _errors
-            if skipped and options.cont:
-                # stop when retrieval skipped an image for one comic strip
-                out.info(u"Stop retrieval because image file already exists")
-                break
-        if options.all and not (errors or options.dry_run or
-                                options.cont or scraperobj.indexes):
-            scraperobj.setComplete(options.basepath)
-    except Exception as msg:
-        out.exception(msg)
-        errors += 1
-    return errors
-
-
 def run(options):
     """Execute comic commands."""
     setOutputInfo(options)
@@ -354,7 +224,7 @@ def run(options):
         return displayHelp(options)
     if options.vote:
         return voteComics(options)
-    return getComics(options)
+    return director.getComics(options)


 def doList(columnList=True, verbose=False):
@@ -369,7 +239,7 @@ def doList(columnList=True, verbose=False):
     out.info(u'Available comic scrapers:')
     out.info(u'Comics tagged with [%s] require age confirmation with the --adult option.' % TAG_ADULT)
     out.info(u'Non-english comics are tagged with [%s].' % TAG_LANG)
-    scrapers = sorted(getScrapers(['@@']), key=lambda s: s.getName())
+    scrapers = sorted(director.getAllScrapers(), key=lambda s: s.getName())
     if columnList:
         num = doColumnList(scrapers)
     else:
@@ -426,59 +296,6 @@ def getScraperName(scraperobj, limit=None):
     return name + suffix


-def getScrapers(comics, basepath=None, adult=True, multiple_allowed=False):
-    """Get scraper objects for the given comics."""
-    if '@' in comics:
-        # only scrapers whose directory already exists
-        if len(comics) > 1:
-            out.warn(u"using '@' as comic name ignores all other specified comics.")
-        for scraperclass in scraper.get_scraperclasses():
-            dirname = getDirname(scraperclass.getName())
-            if os.path.isdir(os.path.join(basepath, dirname)):
-                if not adult and scraperclass.adult:
-                    warn_adult(scraperclass)
-                    continue
-                yield scraperclass()
-    elif '@@' in comics:
-        # all scrapers
-        for scraperclass in scraper.get_scraperclasses():
-            if not adult and scraperclass.adult:
-                warn_adult(scraperclass)
-                continue
-            yield scraperclass()
-    else:
-        # get only selected comic scrapers
-        # store them in a set to eliminate duplicates
-        scrapers = set()
-        for comic in comics:
-            # Helpful when using shell completion to pick comics to get
-            comic.rstrip(os.path.sep)
-            if basepath and comic.startswith(basepath):
-                # make the following command work:
-                # find Comics -type d | xargs -n1 -P10 dosage -b Comics
-                comic = comic[len(basepath):].lstrip(os.sep)
-            if ':' in comic:
-                name, index = comic.split(':', 1)
-                indexes = index.split(',')
-            else:
-                name = comic
-                indexes = None
-            scraperclasses = scraper.find_scraperclasses(name, multiple_allowed=multiple_allowed)
-            for scraperclass in scraperclasses:
-                if not adult and scraperclass.adult:
-                    warn_adult(scraperclass)
-                    continue
-                scraperobj = scraperclass(indexes=indexes)
-                if scraperobj not in scrapers:
-                    scrapers.add(scraperobj)
-                    yield scraperobj
-
-
-def warn_adult(scraperclass):
-    """Print warning about adult content."""
-    out.warn(u"skipping adult comic %s; use the --adult option to confirm your age" % scraperclass.getName())
-
-
 def main():
     """Parse options and execute commands."""
     try:
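
Both the code removed above and its replacement in director.py serialize downloads per host with two levels of locking: a global lock guards the dict of host locks, and each host lock ensures only one thread talks to a given host at a time. A condensed sketch of that technique (get_host_lock and the hostname trick are from the source; the rest is illustrative scaffolding, and the global lock is folded inside get_host_lock here rather than taken at the call site as Dosage does):

    # Per-host serialization: one lock per hostname, plus a global lock
    # protecting the registry of host locks itself.
    import threading
    try:
        from urllib.parse import urlparse   # Python 3
    except ImportError:
        from urlparse import urlparse       # Python 2

    lock = threading.Lock()
    host_locks = {}

    def get_host_lock(url):
        """Get lock object for given URL host, creating it on first use."""
        hostname = list(urlparse(url))[1].lower()
        with lock:
            return host_locks.setdefault(hostname, threading.Lock())

    def download(url):
        with get_host_lock(url):
            # at most one thread downloads from any given host at a time
            print("downloading %s" % url)

    download("http://example.com/comic/1")

Folding the registry lock into get_host_lock keeps callers simpler; the behavior is the same as the two-step locking in ComicGetter.getStrips below.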

dosagelib/director.py (new file, 242 lines)

@@ -0,0 +1,242 @@
+# -*- coding: iso-8859-1 -*-
+# Copyright (C) 2014 Bastian Kleineidam
+import threading
+import thread
+import os
+try:
+    from Queue import Queue, Empty
+except ImportError:
+    from queue import Queue, Empty
+try:
+    from urllib.parse import urlparse
+except ImportError:
+    from urlparse import urlparse
+from .output import out
+from . import events, scraper
+from .util import getDirname
+
+
+class ComicQueue(Queue):
+    """The comic scraper job queue."""
+
+    def join(self, timeout=None):
+        """Blocks until all items in the Queue have been gotten and processed.
+
+        The count of unfinished tasks goes up whenever an item is added to the
+        queue. The count goes down whenever a consumer thread calls task_done()
+        to indicate the item was retrieved and all work on it is complete.
+        When the count of unfinished tasks drops to zero, join() unblocks.
+        """
+        self.all_tasks_done.acquire()
+        try:
+            while self.unfinished_tasks:
+                self.all_tasks_done.wait(timeout)
+        finally:
+            self.all_tasks_done.release()
+
+    def clear(self):
+        """Remove all queue entries."""
+        self.mutex.acquire()
+        self.queue.clear()
+        self.mutex.release()
+
+
+# ensure threads download only from one host at a time
+host_locks = {}
+
+
+def get_hostname(url):
+    """Get hostname from URL."""
+    return list(urlparse(url))[1].lower()
+
+
+lock = threading.Lock()
+
+def get_host_lock(url):
+    """Get lock object for given URL host."""
+    hostname = get_hostname(url)
+    return host_locks.setdefault(hostname, threading.Lock())
+
+
+class ComicGetter(threading.Thread):
+    """Get all strips of a comic in a thread."""
+
+    def __init__(self, options):
+        """Store options."""
+        super(ComicGetter, self).__init__()
+        self.options = options
+        self.origname = self.getName()
+        self.stopped = False
+        self.errors = 0
+
+    def run(self):
+        """Process from queue until it is empty."""
+        try:
+            while not self.stopped:
+                scraperobj = jobs.get(False)
+                self.setName(scraperobj.getName())
+                try:
+                    self.getStrips(scraperobj)
+                finally:
+                    jobs.task_done()
+                    self.setName(self.origname)
+        except Empty:
+            pass
+        except KeyboardInterrupt:
+            thread.interrupt_main()
+
+    def getStrips(self, scraperobj):
+        """Download comic strips."""
+        with lock:
+            host_lock = get_host_lock(scraperobj.url)
+        with host_lock:
+            self._getStrips(scraperobj)
+
+    def _getStrips(self, scraperobj):
+        """Get all strips from a scraper."""
+        if self.options.all or self.options.cont:
+            numstrips = None
+        elif self.options.numstrips:
+            numstrips = self.options.numstrips
+        else:
+            # get current strip
+            numstrips = 1
+        try:
+            if scraperobj.isComplete(self.options.basepath):
+                out.info(u"All comics are already downloaded.")
+                return 0
+            for strip in scraperobj.getStrips(numstrips):
+                skipped = self.saveComicStrip(strip)
+                if skipped and self.options.cont:
+                    # stop when retrieval skipped an image for one comic strip
+                    out.info(u"Stop retrieval because image file already exists")
+                    break
+                if self.stopped:
+                    break
+            if self.options.all and not (self.errors or self.options.dry_run or
+                    self.options.cont or scraperobj.indexes):
+                scraperobj.setComplete(self.options.basepath)
+        except Exception as msg:
+            out.exception(msg)
+            self.errors += 1
+
+    def saveComicStrip(self, strip):
+        """Save a comic strip which can consist of multiple images."""
+        allskipped = True
+        for image in strip.getImages():
+            try:
+                if self.options.dry_run:
+                    filename, saved = "", False
+                else:
+                    filename, saved = image.save(self.options.basepath)
+                if saved:
+                    allskipped = False
+                if self.stopped:
+                    break
+            except Exception as msg:
+                out.exception('Could not save image at %s to %s: %r' % (image.referrer, image.filename, msg))
+                self.errors += 1
+        return allskipped
+
+    def stop(self):
+        self.stopped = True
+
+
+jobs = ComicQueue()
+threads = []
+
+
+def getComics(options):
+    """Retrieve comics."""
+    if options.handler:
+        for name in set(options.handler):
+            events.addHandler(name, options.basepath, options.baseurl)
+    events.getHandler().start()
+    errors = 0
+    try:
+        for scraperobj in getScrapers(options.comic, options.basepath, options.adult, options.multimatch):
+            jobs.put(scraperobj)
+        # start threads
+        num_threads = 1  # XXX max(1, min(10, jobs.qsize()))
+        for i in range(num_threads):
+            t = ComicGetter(options)
+            threads.append(t)
+            t.start()
+        # wait for threads to finish
+        jobs.join(1)
+        for t in threads:
+            errors += t.errors
+    except ValueError as msg:
+        out.exception(msg)
+        errors += 1
+    except KeyboardInterrupt:
+        finish()
+    finally:
+        events.getHandler().end()
+    return errors
+
+
+def finish():
+    out.warn("Interrupted!")
+    for t in threads:
+        t.stop()
+    jobs.clear()
+    out.warn("Waiting for download threads to finish.")
+
+
+def getAllScrapers():
+    """Get all scrapers."""
+    return getScrapers(['@@'])
+
+
+def getScrapers(comics, basepath=None, adult=True, multiple_allowed=False):
+    """Get scraper objects for the given comics."""
+    if '@' in comics:
+        # only scrapers whose directory already exists
+        if len(comics) > 1:
+            out.warn(u"using '@' as comic name ignores all other specified comics.")
+        for scraperclass in scraper.get_scraperclasses():
+            dirname = getDirname(scraperclass.getName())
+            if os.path.isdir(os.path.join(basepath, dirname)):
+                if not adult and scraperclass.adult:
+                    warn_adult(scraperclass)
+                    continue
+                yield scraperclass()
+    elif '@@' in comics:
+        # all scrapers
+        for scraperclass in scraper.get_scraperclasses():
+            if not adult and scraperclass.adult:
+                warn_adult(scraperclass)
+                continue
+            yield scraperclass()
+    else:
+        # get only selected comic scrapers
+        # store them in a set to eliminate duplicates
+        scrapers = set()
+        for comic in comics:
+            # Helpful when using shell completion to pick comics to get
+            comic.rstrip(os.path.sep)
+            if basepath and comic.startswith(basepath):
+                # make the following command work:
+                # find Comics -type d | xargs -n1 -P10 dosage -b Comics
+                comic = comic[len(basepath):].lstrip(os.sep)
+            if ':' in comic:
+                name, index = comic.split(':', 1)
+                indexes = index.split(',')
+            else:
+                name = comic
+                indexes = None
+            scraperclasses = scraper.find_scraperclasses(name, multiple_allowed=multiple_allowed)
+            for scraperclass in scraperclasses:
+                if not adult and scraperclass.adult:
+                    warn_adult(scraperclass)
+                    continue
+                scraperobj = scraperclass(indexes=indexes)
+                if scraperobj not in scrapers:
+                    scrapers.add(scraperobj)
+                    yield scraperobj
+
+
+def warn_adult(scraperclass):
+    """Print warning about adult content."""
+    out.warn(u"skipping adult comic %s; use the --adult option to confirm your age" % scraperclass.getName())
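
A note on why the ComicQueue.join(timeout) override above matters: on Python 2, the stock Queue.join() waits on its condition variable without a timeout, and such a wait cannot be broken by Ctrl-C, so the main thread would hang until every download finished. Waiting in a loop with a timeout wakes the main thread periodically and lets a pending KeyboardInterrupt through; getComics() exploits this with jobs.join(1). A standalone sketch of the idea (the class name is illustrative):

    # A queue whose join() polls, keeping the main thread interruptible;
    # mirrors ComicQueue above.
    try:
        from queue import Queue   # Python 3
    except ImportError:
        from Queue import Queue   # Python 2

    class InterruptibleQueue(Queue):
        def join(self, timeout=None):
            self.all_tasks_done.acquire()
            try:
                while self.unfinished_tasks:
                    # wake up every `timeout` seconds instead of blocking forever
                    self.all_tasks_done.wait(timeout)
            finally:
                self.all_tasks_done.release()

        def clear(self):
            # drop pending jobs, e.g. after an interrupt
            self.mutex.acquire()
            try:
                self.queue.clear()
            finally:
                self.mutex.release()

    jobs = InterruptibleQueue()
    try:
        jobs.join(1)   # worker threads would call jobs.task_done() per job
    except KeyboardInterrupt:
        jobs.clear()   # as finish() does above

On the worker side, a KeyboardInterrupt raised inside a thread is forwarded to the main thread via thread.interrupt_main(). Note that the bare `import thread` is the Python 2 module name (Python 3 renamed it `_thread`), so the file as committed is Python 2 specific despite its guarded Queue and urlparse imports.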