From 9b95171f373cb610893649412b1548e8f1143ebc Mon Sep 17 00:00:00 2001 From: Tobias Gruetzmacher Date: Sat, 28 May 2022 19:33:16 +0200 Subject: [PATCH] Add some basic type annotations --- dosagelib/director.py | 5 +++-- dosagelib/plugins/c.py | 5 +++-- dosagelib/plugins/common.py | 6 ++++-- dosagelib/plugins/j.py | 4 ++-- dosagelib/scraper.py | 33 +++++++++++++++++---------------- 5 files changed, 29 insertions(+), 24 deletions(-) diff --git a/dosagelib/director.py b/dosagelib/director.py index e8dc4bc16..5c0516967 100644 --- a/dosagelib/director.py +++ b/dosagelib/director.py @@ -1,12 +1,13 @@ # SPDX-License-Identifier: MIT # Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2012-2014 Bastian Kleineidam -# Copyright (C) 2015-2020 Tobias Gruetzmacher +# Copyright (C) 2015-2022 Tobias Gruetzmacher # Copyright (C) 2019-2020 Daniel Ring import os import threading import _thread from queue import Queue, Empty +from typing import Dict from urllib.parse import urlparse from .output import out @@ -41,7 +42,7 @@ class ComicQueue(Queue): # ensure threads download only from one host at a time -host_locks = {} +host_locks: Dict[str, threading.Lock] = {} def get_hostname(url): diff --git a/dosagelib/plugins/c.py b/dosagelib/plugins/c.py index 7f26ce0bf..0bc432023 100644 --- a/dosagelib/plugins/c.py +++ b/dosagelib/plugins/c.py @@ -1,9 +1,10 @@ # SPDX-License-Identifier: MIT # Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2012-2014 Bastian Kleineidam -# Copyright (C) 2015-2021 Tobias Gruetzmacher +# Copyright (C) 2015-2022 Tobias Gruetzmacher # Copyright (C) 2019-2020 Daniel Ring from re import compile, escape +from typing import List from ..scraper import _BasicScraper, _ParserScraper from ..helpers import bounceStarter, indirectStarter @@ -132,7 +133,7 @@ class CatenaManor(_ParserScraper): imageSearch = '//img[@class="comicthumbnail"]' multipleImagesPerStrip = True endOfLife = True - strips = [] + strips: List[str] = [] def starter(self): # Retrieve archive links and select valid range diff --git a/dosagelib/plugins/common.py b/dosagelib/plugins/common.py index 27d9caf30..00afc71a2 100644 --- a/dosagelib/plugins/common.py +++ b/dosagelib/plugins/common.py @@ -1,8 +1,10 @@ # SPDX-License-Identifier: MIT # Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2012-2014 Bastian Kleineidam -# Copyright (C) 2015-2020 Tobias Gruetzmacher +# Copyright (C) 2015-2022 Tobias Gruetzmacher # Copyright (C) 2019-2020 Daniel Ring +from typing import Sequence, Union + from ..scraper import _ParserScraper # Common base classes for comics with the same structure (same hosting @@ -39,7 +41,7 @@ class _WPWebcomic(_ParserScraper): class _ComicControlScraper(_ParserScraper): - imageSearch = '//img[@id="cc-comic"]' + imageSearch: Union[Sequence[str], str] = '//img[@id="cc-comic"]' prevSearch = '//a[@rel="prev"]' nextSearch = '//a[@rel="next"]' latestSearch = '//a[@rel="last"]' diff --git a/dosagelib/plugins/j.py b/dosagelib/plugins/j.py index 0454a1672..c7be98daf 100644 --- a/dosagelib/plugins/j.py +++ b/dosagelib/plugins/j.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: MIT # Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2012-2014 Bastian Kleineidam -# Copyright (C) 2015-2021 Tobias Gruetzmacher +# Copyright (C) 2015-2022 Tobias Gruetzmacher from re import compile, escape from ..scraper import _BasicScraper @@ -42,5 +42,5 @@ class JoeAndMonkey(_BasicScraper): class JohnnyWander(_ComicControlScraper): imageSearch = ('//ul[d:class("cc-showbig")]/li/@data-src', - _ComicControlScraper.imageSearch) + '//img[@id="cc-comic"]') url = 'http://www.johnnywander.com/' diff --git a/dosagelib/scraper.py b/dosagelib/scraper.py index a14076c1f..acdbeb385 100644 --- a/dosagelib/scraper.py +++ b/dosagelib/scraper.py @@ -1,12 +1,13 @@ # SPDX-License-Identifier: MIT # Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2012-2014 Bastian Kleineidam -# Copyright (C) 2015-2021 Tobias Gruetzmacher +# Copyright (C) 2015-2022 Tobias Gruetzmacher import html import os import re import warnings from urllib.parse import urljoin +from typing import Optional, Union, Pattern, Sequence import lxml from lxml.html.defs import link_attrs as html_link_attrs @@ -42,60 +43,60 @@ class GeoblockedException(IOError): super().__init__('It seems your current location is geo-blocked.') -class Scraper(object): +class Scraper: '''Base class for all comic scraper, but without a specific scrape implementation.''' # The URL for the comic strip - url = None + url: Optional[str] = None # A string that is interpolated with the strip index to yield the URL for a # particular strip. - stripUrl = None + stripUrl: Optional[str] = None # Stop search for previous URLs at this URL - firstStripUrl = None + firstStripUrl: Optional[str] = None # if more than one image per URL is expected - multipleImagesPerStrip = False + multipleImagesPerStrip: bool = False # set to True if this comic contains adult content - adult = False + adult: bool = False # set to True if this comic will not get updated anymore - endOfLife = False + endOfLife: bool = False # langauge of the comic (two-letter ISO 639-1 code) - lang = 'en' + lang: str = 'en' # an expression that will locate the URL for the previous strip in a page # this can also be a list or tuple - prevSearch = None + prevSearch: Optional[Union[Sequence[Union[str, Pattern]], str, Pattern]] = None # an expression that will locate the strip image URLs strip in a page # this can also be a list or tuple - imageSearch = None + imageSearch: Optional[Union[Sequence[Union[str, Pattern]], str, Pattern]] = None # an expression to store a text together with the image # sometimes comic strips have additional text info for each comic - textSearch = None + textSearch: Optional[Union[Sequence[Union[str, Pattern]], str, Pattern]] = None # Is the additional text required or optional? When it is required (the # default), you see an error message whenever a comic page is encountered # that does not have the text - textOptional = False + textOptional: bool = False # usually the index format help - help = '' + help: str = '' # Specifing a list of HTTP error codes which should be handled as a # successful request. This is a workaround for some comics which return # regular pages with strange HTTP codes. By default, all HTTP errors raise # exceptions. - allow_errors = () + allow_errors: Sequence[int] = () # HTTP session for configuration & cookies - session = http.default_session + session: http.Session = http.default_session @classmethod def getmodules(cls):