dosage/dosagelib/helpers.py

75 lines
2.7 KiB
Python
Raw Permalink Normal View History

# SPDX-License-Identifier: MIT
# SPDX-FileCopyrightText: © 2004 Tristan Seligmann and Jonathan Jacobs
# SPDX-FileCopyrightText: © 2012 Bastian Kleineidam
# SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher
# SPDX-FileCopyrightText: © 2019 Daniel Ring
from __future__ import annotations
from typing import Protocol
from .util import getQueryParams
from .scraper import Scraper
class Namer(Protocol):
    """Structural type of the callbacks used to name web comic images.

    A namer is called with a scraper, the URL of an image and the URL of the
    page, and returns either a name or ``None``.
    """

    def __call__(
        _,
        self: Scraper,
        image_url: str,
        page_url: str,
    ) -> str | None:
        # The protocol object itself is bound to ``_`` so that the first
        # explicit argument can keep the name ``self``, matching the
        # signature of the namer functions defined in this module.
        ...
2012-06-20 19:58:13 +00:00
def queryNamer(param, use_page_url=False) -> Namer:
    """Build a namer that reads the name from a URL query parameter.

    ``param`` is the query key to look up. The image URL is inspected unless
    ``use_page_url`` is true, in which case the page URL is used instead.
    """

    def _namer(self, image_url: str, page_url: str) -> str | None:
        """Return the first value of the configured query parameter."""
        if use_page_url:
            source = page_url
        else:
            source = image_url
        values = getQueryParams(source)[param]
        return values[0]

    return _namer
def regexNamer(regex, use_page_url=False) -> Namer:
    """Build a namer that extracts the name with a regular expression.

    ``regex`` must provide a ``search`` method (for example a compiled
    pattern). The image URL is searched unless ``use_page_url`` is true, in
    which case the page URL is searched instead.
    """

    def _namer(self, image_url: str, page_url: str) -> str | None:
        """Return group 1 of the match, or None when nothing matched."""
        if use_page_url:
            target = page_url
        else:
            target = image_url
        found = regex.search(target)
        if not found:
            return None
        return found.group(1)

    return _namer
def joinPathPartsNamer(pageparts=(), imageparts=(), joinchar='_') -> Namer:
    """Build a namer that joins selected URL path parts with ``joinchar``.

    ``pageparts`` and ``imageparts`` are indexes into the '/'-separated
    pieces of the page URL and the image URL respectively. The selected page
    parts come first, followed by the selected image parts.
    """

    def _namer(self: Scraper, image_url: str, page_url: str) -> str | None:
        # For a URL such as 'http://host/a/b' the first three pieces of the
        # split are 'http:', '' and 'host'; only what follows is indexable.
        page_path = page_url.split('/')[3:]
        image_path = image_url.split('/')[3:]
        chosen = [page_path[idx] for idx in pageparts]
        chosen.extend(image_path[idx] for idx in imageparts)
        return joinchar.join(chosen)

    return _namer
def bounceStarter(self):
    """Get the start URL by "bouncing" back and forth one time.

    The page at ``url`` is fetched and searched with ``prevSearch``; the page
    found that way is fetched and searched with ``nextSearch``. Both found
    links are passed through ``link_modifier``.

    This needs the url, prevSearch and nextSearch properties to be defined
    on the class.
    """
    start = self.url
    start_page = self.getPage(start)
    previous = self.fetchUrl(start, start_page, self.prevSearch)
    previous = self.link_modifier(start, previous)
    previous_page = self.getPage(previous)
    bounced = self.fetchUrl(previous, previous_page, self.nextSearch)
    return self.link_modifier(previous, bounced)
2012-06-20 19:58:13 +00:00
def indirectStarter(self):
    """Get the start URL by indirection.

    Useful for comics whose latest strip can't be reached at a stable URL.
    If the class has an attribute 'startUrl', that page is fetched first;
    otherwise the page at 'url' is fetched. The attribute 'latestSearch' is
    then used on the page content to find the latest strip, and the found
    link is passed through 'link_modifier'.
    """
    if hasattr(self, "startUrl"):
        entry = self.startUrl
    else:
        entry = self.url
    page = self.getPage(entry)
    latest = self.fetchUrl(entry, page, self.latestSearch)
    return self.link_modifier(entry, latest)