dosage/dosagelib/director.py

256 lines
8.4 KiB
Python
Raw Permalink Normal View History

# SPDX-License-Identifier: MIT
2016-10-28 22:21:41 +00:00
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
2022-05-28 17:33:16 +00:00
# Copyright (C) 2015-2022 Tobias Gruetzmacher
# Copyright (C) 2019-2020 Daniel Ring
2014-03-04 17:38:46 +00:00
import os
import re
2014-03-04 19:50:34 +00:00
import threading
import _thread
from queue import Queue, Empty
from typing import Collection, Dict
from urllib.parse import urlparse
2014-03-04 17:38:46 +00:00
from .output import out
2022-06-04 08:56:25 +00:00
from .scraper import scrapers as scrapercache
from . import events
2014-03-04 17:38:46 +00:00
class ComicQueue(Queue):
    """The comic scraper job queue."""

    def join(self, timeout=None):
        """Block until all items in the queue have been gotten and processed.

        The count of unfinished tasks goes up whenever an item is added to
        the queue and goes down whenever a consumer calls task_done().
        When the count drops to zero, join() returns.

        ``timeout`` is handed to every individual wait on the condition
        variable; it does not limit the total time spent in join(). After
        each wait the task count is simply checked again.
        """
        with self.all_tasks_done:
            while self.unfinished_tasks:
                self.all_tasks_done.wait(timeout)

    def clear(self):
        """Remove all queue entries.

        Entries that were still waiting in the queue will never be handed to
        a consumer, so nobody will call task_done() for them. Their share of
        the unfinished-task count is released here; tasks that were already
        fetched stay counted until their consumer calls task_done().
        """
        # self.mutex is the lock underlying all_tasks_done and not_full, so
        # holding it makes it legal to touch the counter and notify waiters.
        with self.mutex:
            dropped = len(self.queue)
            self.queue.clear()
            self.unfinished_tasks = max(self.unfinished_tasks - dropped, 0)
            if not self.unfinished_tasks:
                # wake up anyone blocked in join()
                self.all_tasks_done.notify_all()
            # the queue is empty now, so blocked producers may continue
            self.not_full.notify_all()
# ensure threads download only from one host at a time
2022-05-28 17:33:16 +00:00
# Registry of per-host locks, keyed by the lowercased host part of a URL
# (see get_hostname); entries are created on demand by get_host_lock().
host_locks: Dict[str, threading.Lock] = {}
2014-03-04 17:38:46 +00:00
def get_hostname(url):
    """Return the lowercased network location of ``url``.

    This is the ``netloc`` component as split by urlparse(), so a port or
    user information contained in the URL is part of the result. URLs
    without a network location yield an empty string.
    """
    parsed = urlparse(url)
    return parsed.netloc.lower()
# Serializes access to host_locks: ComicGetter.getStrips holds this lock
# while it looks up (or creates) the lock object for a host.
lock = threading.Lock()
2014-03-04 17:38:46 +00:00
def get_host_lock(url):
    """Return the lock object shared by all URLs on the host of ``url``.

    The lock is looked up in the module-level ``host_locks`` registry and
    registered there first if the host has not been seen yet.
    """
    # setdefault keeps "look up or register" a single dictionary operation
    return host_locks.setdefault(get_hostname(url), threading.Lock())
class ComicGetter(threading.Thread):
    """Worker thread that takes scrapers off a job queue and downloads them."""

    def __init__(self, options, jobs):
        """Remember the options and the job queue to work on.

        ``options`` is the object whose attributes (numstrips, cont, all,
        basepath, dry_run) steer the download; ``jobs`` is the queue the
        scraper objects are fetched from.
        """
        super().__init__()
        self.options = options
        self.jobs = jobs
        # the thread is renamed after the comic it is working on; keep the
        # initial name so that it can be restored between jobs
        self.origname = self.name
        self.stopped = False
        # number of errors this thread ran into
        self.errors = 0

    def run(self):
        """Work through the job queue until it is empty or stop() was called."""
        try:
            while not self.stopped:
                # non-blocking: an empty queue raises Empty and ends the loop
                job = self.jobs.get(block=False)
                self.name = job.name
                try:
                    self.getStrips(job)
                finally:
                    # always account for the job, even if it failed
                    self.jobs.task_done()
                    self.name = self.origname
        except Empty:
            # nothing left to do
            return
        except KeyboardInterrupt:
            # hand the interrupt over to the main thread
            _thread.interrupt_main()

    def getStrips(self, scraperobj):
        """Download the strips of one scraper while holding its host lock."""
        # the registry lock is only held for the lookup itself
        with lock:
            per_host = get_host_lock(scraperobj.url)
        # only one thread at a time downloads from a given host
        with per_host:
            self._getStrips(scraperobj)

    def _getStrips(self, scraperobj):
        """Fetch and store the strips of one scraper.

        The number of strips is ``options.numstrips`` if set, unlimited
        (None) for ``options.cont`` or ``options.all``, and 1 otherwise.
        Any exception is logged and counted in ``self.errors``.
        Returns 0 if the comic was already complete, otherwise None.
        """
        opts = self.options
        if opts.numstrips:
            limit = opts.numstrips
        elif opts.cont or opts.all:
            limit = None
        else:
            # only the current strip
            limit = 1
        try:
            if scraperobj.isComplete(opts.basepath):
                out.info(u"All comics are already downloaded.")
                return 0
            for strip in scraperobj.getStrips(limit):
                nothing_saved = self.saveComicStrip(strip)
                if nothing_saved and opts.cont:
                    # in continue mode an already existing strip ends the run
                    out.info(u"Stop retrieval because image file already exists")
                    break
                if self.stopped:
                    break
            # mark the comic complete only after a full, real run without
            # errors that was not restricted to selected indexes
            if ((opts.all or opts.cont)
                    and not self.errors
                    and not opts.dry_run
                    and not scraperobj.indexes):
                scraperobj.setComplete(opts.basepath)
        except Exception as err:
            out.exception(err)
            self.errors += 1

    def saveComicStrip(self, strip):
        """Save all images of a comic strip.

        Returns True if no image was saved (in a dry run nothing is ever
        saved), False as soon as at least one image was. A failure to save
        an image is logged and counted in ``self.errors``; the remaining
        images are still tried.
        """
        nothing_saved = True
        for image in strip.getImages():
            try:
                if self.options.dry_run:
                    saved = False
                else:
                    _filename, saved = image.save(self.options.basepath)
                if saved:
                    nothing_saved = False
                if self.stopped:
                    break
            except Exception as err:
                out.exception('Could not save image at {} to {}: {!r}'.format(
                    image.referrer, image.filename, err))
                self.errors += 1
        return nothing_saved

    def stop(self):
        """Ask the thread to finish; checked between jobs, strips and images."""
        self.stopped = True
def getComics(options):
    """Retrieve comics.

    Registers the event handlers named in ``options.handler``, queues one
    job per scraper selected by ``options.comic`` and lets up to
    ``options.parallel`` ComicGetter threads work through the queue.

    Returns the number of errors: the sum of the workers' error counters,
    plus one if a ValueError was raised while setting up or waiting. After
    a KeyboardInterrupt the workers' counters are not added.
    """
    threads = []
    jobs = ComicQueue()
    if options.handler:
        # a set, so that a handler named twice is only registered once
        for name in set(options.handler):
            events.addHandler(name, options.basepath, options.baseurl, options.allowdownscale)
    events.getHandler().start()
    errors = 0
    try:
        for scraperobj in getScrapers(options.comic, options.basepath,
                                      options.adult):
            jobs.put(scraperobj)
        # Start threads: never more than there are jobs, but at least one
        # as long as there is work. A value of zero or less for
        # options.parallel would otherwise start no worker at all and the
        # join() below would wait forever for the queued jobs.
        num_threads = min(max(options.parallel, 1), jobs.qsize())
        for _i in range(num_threads):
            t = ComicGetter(options, jobs)
            threads.append(t)
            t.start()
        # Wait for threads to finish; the wait is done in one-second slices
        # and the task count re-checked after each of them.
        jobs.join(1)
        for t in threads:
            errors += t.errors
    except ValueError as msg:
        out.exception(msg)
        errors += 1
    except KeyboardInterrupt:
        out.warn("Interrupted! Waiting for download threads to finish.")
    finally:
        # tell the workers to stop and take away whatever is still queued
        for t in threads:
            t.stop()
        jobs.clear()
        events.getHandler().end()
        events.clear_handlers()
    return errors
def getScrapers(comics: Collection[str], basepath: str, adult=True, listing=False):
    """Yield the scraper objects selected by the given comic names.

    If ``comics`` contains '@', every comic that get_existing_comics()
    reports for ``basepath`` is yielded and all other names are ignored.
    Otherwise each entry is a comic name, optionally given as a path below
    ``basepath`` and optionally followed by ':' and a comma-separated list
    of indexes. Each scraper is yielded at most once, and only if
    shouldRunScraper() accepts it for the given ``adult``/``listing`` flags.
    """
    if '@' in comics:
        # only scrapers whose directory already exists
        if len(comics) > 1:
            out.warn(u"using '@' as comic name ignores all other specified comics.")
        yield from get_existing_comics(basepath, adult, listing)
        return

    # explicitly selected scrapers; remember them to eliminate duplicates
    seen = set()
    below_base = re.compile(r'^' + re.escape(basepath) + r'[/\\]')
    for spec in comics:
        # a trailing path separator is typically left by shell completion
        spec = spec.rstrip(os.path.sep)
        if below_base.match(spec):
            # strip the base path so that this command works:
            # find Comics -type d | xargs -n1 -P10 dosage -b Comics
            spec = spec[len(basepath) + 1:].lstrip(os.sep)
        name, colon, index = spec.partition(':')
        indexes = index.split(',') if colon else None
        scraper = scrapercache.find(name)
        if not shouldRunScraper(scraper, adult, listing):
            continue
        # FIXME: Find a better way to work with indexes
        scraper.indexes = indexes
        if scraper in seen:
            continue
        seen.add(scraper)
        yield scraper
def get_existing_comics(basepath=None, adult=True, listing=False):
    """Yield the scrapers whose download directory below ``basepath`` exists.

    All scrapers known to the scraper cache are considered, removed ones
    included; each must also be accepted by shouldRunScraper().
    """
    for candidate in scrapercache.all(include_removed=True):
        target_dir = candidate.get_download_dir(basepath)
        if not os.path.isdir(target_dir):
            continue
        if shouldRunScraper(candidate, adult, listing):
            yield candidate
def shouldRunScraper(scraperobj, adult=True, listing=False):
    """Tell whether the given scraper should be used.

    When ``listing`` is true every scraper is accepted. Otherwise adult
    comics are rejected unless ``adult`` is true, and so are scrapers that
    report reasons for being disabled; a warning is printed for each
    rejected scraper.
    """
    if not listing:
        if not adult and scraperobj.adult:
            warn_adult(scraperobj)
            return False
        disabled = scraperobj.getDisabledReasons()
        if disabled:
            warn_disabled(scraperobj, disabled)
            return False
    return True
2014-03-04 17:38:46 +00:00
def warn_adult(scraperobj):
    """Warn that the given comic is skipped because of its adult content."""
    template = (u"skipping adult comic {};"
                " use the --adult option to confirm your age")
    out.warn(template.format(scraperobj.name))
def warn_disabled(scraperobj, reasons):
    """Warn that the given comic is skipped because its module is disabled.

    ``reasons`` is a mapping; its values are the explanations that are
    joined with spaces into the warning text.
    """
    comic = scraperobj.name
    explanation = ' '.join(reasons.values())
    out.warn(u"Skipping comic {}: {}".format(comic, explanation))