Add some more type annotations
parent ee22169cc5
commit 7b9ca867fb
2 changed files with 42 additions and 39 deletions
@@ -1,12 +1,15 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
-# Copyright (C) 2012-2014 Bastian Kleineidam
-# Copyright (C) 2015-2016 Tobias Gruetzmacher
+# SPDX-FileCopyrightText: © 2004 Tristan Seligmann and Jonathan Jacobs
+# SPDX-FileCopyrightText: © 2012 Bastian Kleineidam
+# SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher
+from __future__ import annotations
+
 import os
 import glob
 import codecs
 import contextlib
 from datetime import datetime
+from typing import Iterator
 
 from .output import out
 from .util import unquote, getFilename, urlopen, strsize
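The newly added `from __future__ import annotations` (PEP 563) postpones evaluation of annotations, which is what lets the hunks below spell types as `str | None` and `set[str]` without requiring Python 3.10/3.9 at runtime. A minimal illustrative sketch, not taken from the dosage code:

```python
from __future__ import annotations  # annotations become lazy strings, never evaluated


def find_title(html: str) -> str | None:
    """PEP 604 union syntax in the annotation; harmless even on Python 3.7/3.8,
    because the postponed annotation is never executed at import time."""
    title = html.partition("<title>")[2].partition("</title>")[0]
    return title or None


print(find_title("<title>xkcd</title>"))  # 'xkcd'
print(find_title("no title element"))     # None
```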
@@ -14,27 +17,27 @@ from .events import getHandler
 
 
 # Maximum content size for images
-MaxImageBytes = 1024 * 1024 * 20  # 20 MB
+MAX_IMAGE_BYTES = 1024 * 1024 * 20  # 20 MB
 # RFC 1123 format, as preferred by RFC 2616
 RFC_1123_DT_STR = "%a, %d %b %Y %H:%M:%S GMT"
 
 
-class ComicStrip(object):
+class ComicStrip:
     """A list of comic image URLs."""
 
-    def __init__(self, scraper, strip_url, image_urls, text=None):
+    def __init__(self, scraper, strip_url: str, image_urls: str, text=None) -> None:
         """Store the image URL list."""
         self.scraper = scraper
         self.strip_url = strip_url
         self.image_urls = image_urls
         self.text = text
 
-    def getImages(self):
+    def getImages(self) -> Iterator[ComicImage]:
         """Get a list of image downloaders."""
         for image_url in self.image_urls:
             yield self.getDownloader(image_url)
 
-    def getDownloader(self, url):
+    def getDownloader(self, url: str) -> ComicImage:
         """Get an image downloader."""
         filename = self.scraper.namer(url, self.strip_url)
         if filename is None:
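`getImages()` above is a generator, so its return type is written as the iterator of what it yields, and `Iterator[ComicImage]` is a forward reference to a class defined later in the same module; the postponed evaluation enabled in the previous hunk keeps that reference from failing at class-definition time. A self-contained sketch of the same pattern, with stand-in classes rather than the real dosage ones:

```python
from __future__ import annotations

from typing import Iterator


class Strip:
    def __init__(self, image_urls: list[str]) -> None:
        self.image_urls = image_urls

    def images(self) -> Iterator[Image]:  # forward reference: Image is defined below
        for url in self.image_urls:
            yield Image(url)


class Image:
    def __init__(self, url: str) -> None:
        self.url = url


strip = Strip(["https://example.com/a.png", "https://example.com/b.png"])
print([image.url for image in strip.images()])
```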
@@ -43,7 +46,7 @@ class ComicStrip(object):
             text=self.text)
 
 
-class ComicImage(object):
+class ComicImage:
     """A comic image downloader."""
 
     ChunkBytes = 1024 * 100  # 100KB
@@ -64,7 +67,7 @@ class ComicImage(object):
             headers['If-Modified-Since'] = lastchange.strftime(RFC_1123_DT_STR)
         self.urlobj = urlopen(self.url, self.scraper.session,
                               referrer=self.referrer,
-                              max_content_bytes=MaxImageBytes, stream=True,
+                              max_content_bytes=MAX_IMAGE_BYTES, stream=True,
                               headers=headers)
         if self.urlobj.status_code == 304:  # Not modified
             return
@@ -119,45 +119,45 @@ class Scraper:
         if val:
             self._indexes = tuple(sorted(val))
 
-    def __init__(self, name):
+    def __init__(self, name: str) -> None:
         """Initialize internal variables."""
         self.name = name
-        self.urls = set()
+        self.urls: set[str] = set()
         self._indexes = ()
-        self.skippedUrls = set()
+        self.skippedUrls: set[str] = set()
         self.hitFirstStripUrl = False
 
-    def __hash__(self):
+    def __hash__(self) -> int:
         """Get hash value from name and index list."""
         return hash((self.name, self.indexes))
 
-    def shouldSkipUrl(self, url, data):
+    def shouldSkipUrl(self, url: str, data) -> bool:
         """Determine if search for images in given URL should be skipped."""
         return False
 
-    def getComicStrip(self, url, data):
+    def getComicStrip(self, url, data) -> ComicStrip:
         """Get comic strip downloader for given URL and data."""
-        imageUrls = self.extract_image_urls(url, data)
+        urls = self.extract_image_urls(url, data)
         # map modifier function on image URLs
-        imageUrls = [self.imageUrlModifier(x, data) for x in imageUrls]
+        urls = [self.imageUrlModifier(x, data) for x in urls]
         # remove duplicate URLs
-        imageUrls = uniq(imageUrls)
-        if len(imageUrls) > 1 and not self.multipleImagesPerStrip:
+        urls = uniq(urls)
+        if len(urls) > 1 and not self.multipleImagesPerStrip:
             out.warn(
                 u"Found %d images instead of 1 at %s with expressions %s" %
-                (len(imageUrls), url, prettyMatcherList(self.imageSearch)))
-            image = imageUrls[0]
-            out.warn(u"Choosing image %s" % image)
-            imageUrls = (image,)
-        elif not imageUrls:
-            out.warn(u"Found no images at %s with expressions %s" % (url,
+                (len(urls), url, prettyMatcherList(self.imageSearch)))
+            image = urls[0]
+            out.warn("Choosing image %s" % image)
+            urls = (image,)
+        elif not urls:
+            out.warn("Found no images at %s with expressions %s" % (url,
                      prettyMatcherList(self.imageSearch)))
         if self.textSearch:
             text = self.fetchText(url, data, self.textSearch,
                                   optional=self.textOptional)
         else:
             text = None
-        return ComicStrip(self, url, imageUrls, text=text)
+        return ComicStrip(self, url, urls, text=text)
 
     def getStrips(self, maxstrips=None):
         """Get comic strips."""
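The attribute annotations introduced in `Scraper.__init__` use the built-in generic `set[str]` (PEP 585); with the `__future__` import this parses on older interpreters too, and it tells a type checker the element type of the otherwise-empty sets (a checker such as mypy typically asks for an explicit annotation on a bare `set()`). A short sketch of the idea, using a hypothetical class rather than the real `Scraper`:

```python
from __future__ import annotations


class Crawler:
    def __init__(self, name: str) -> None:
        self.name = name
        # Annotating the empty sets gives a checker the element type,
        # so later .add(...) calls can be verified as str-only.
        self.urls: set[str] = set()
        self.skipped_urls: set[str] = set()

    def visit(self, url: str) -> None:
        self.urls.add(url)


c = Crawler("demo")
c.visit("https://example.com/")
print(sorted(c.urls))
```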
@@ -217,7 +217,7 @@ class Scraper:
                 break
             url = prevUrl
 
-    def isfirststrip(self, url):
+    def isfirststrip(self, url: str) -> bool:
         """Check if the specified URL is the first strip of a comic. This is
         specially for comics taken from archive.org, since the base URL of
         archive.org changes whenever pages are taken from a different
@@ -228,7 +228,7 @@ class Scraper:
         currenturl = ARCHIVE_ORG_URL.sub('', url)
         return firsturl == currenturl
 
-    def getPrevUrl(self, url, data):
+    def getPrevUrl(self, url: str, data) -> str | None:
         """Find previous URL."""
         prevUrl = None
         if self.prevSearch:
@@ -243,40 +243,40 @@ class Scraper:
                 getHandler().comicPageLink(self, url, prevUrl)
         return prevUrl
 
-    def getIndexStripUrl(self, index):
+    def getIndexStripUrl(self, index: str) -> str:
         """Get comic strip URL from index."""
         return self.stripUrl % index
 
-    def starter(self):
+    def starter(self) -> str:
         """Get starter URL from where to scrape comic strips."""
         return self.url
 
-    def namer(self, image_url, page_url):
+    def namer(self, image_url: str, page_url: str) -> str | None:
         """Return filename for given image and page URL."""
         return
 
-    def link_modifier(self, fromurl, tourl):
+    def link_modifier(self, fromurl: str, tourl: str) -> str:
         """Optional modification of parsed link (previous/back/latest) URLs.
         Useful if there are domain redirects. The default implementation does
         not modify the URL.
         """
         return tourl
 
-    def imageUrlModifier(self, image_url, data):
+    def imageUrlModifier(self, image_url: str, data) -> str:
         """Optional modification of parsed image URLs. Useful if the URL
         needs to be fixed before usage. The default implementation does
         not modify the URL. The given data is the URL page data.
         """
         return image_url
 
-    def vote(self):
+    def vote(self) -> None:
         """Cast a public vote for this comic."""
         uid = get_system_uid()
         data = {"name": self.name.replace('/', '_'), "uid": uid}
         response = self.session.post(configuration.VoteUrl, data=data)
         response.raise_for_status()
 
-    def get_download_dir(self, basepath):
+    def get_download_dir(self, basepath: str) -> str:
         """Try to find the corect download directory, ignoring case
         differences."""
         path = basepath
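`namer()` and `getPrevUrl()` now advertise `str | None`: the default `namer` body is a bare `return`, which yields `None`, and callers (such as `getDownloader` in the first file, which checks `if filename is None:`) are expected to handle that case. A small sketch of how such an optional-returning hook is typically consumed; the helper names and the fallback rule here are illustrative only, not the dosage implementation:

```python
from __future__ import annotations


def namer(image_url: str, page_url: str) -> str | None:
    """Default hook: no custom name, signalled by returning None."""
    return None


def pick_filename(image_url: str, page_url: str) -> str:
    """Fall back to the last URL segment when the hook declines."""
    filename = namer(image_url, page_url)
    if filename is None:
        filename = image_url.rsplit("/", 1)[-1]
    return filename


print(pick_filename("https://example.com/strips/2021-05-01.png",
                    "https://example.com/2021/05/01"))
```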
@@ -294,16 +294,16 @@ class Scraper:
             path = os.path.join(path, part)
         return path
 
-    def getCompleteFile(self, basepath):
+    def getCompleteFile(self, basepath: str) -> str:
         """Get filename indicating all comics are downloaded."""
         dirname = self.get_download_dir(basepath)
         return os.path.join(dirname, "complete.txt")
 
-    def isComplete(self, basepath):
+    def isComplete(self, basepath: str) -> bool:
         """Check if all comics are downloaded."""
         return os.path.isfile(self.getCompleteFile(basepath))
 
-    def setComplete(self, basepath):
+    def setComplete(self, basepath: str) -> None:
         """Set complete flag for this comic, ie. all comics are downloaded."""
         if self.endOfLife:
             filename = self.getCompleteFile(basepath)