Update joinPathPartsNamer: Remove defaults

This commit is contained in:
Tobias Gruetzmacher 2024-02-18 18:02:02 +01:00
parent 15423eab21
commit 3722fbe7e4
No known key found for this signature in database
7 changed files with 41 additions and 34 deletions

View file

@ -1,39 +1,49 @@
# SPDX-License-Identifier: MIT
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2020 Tobias Gruetzmacher
# Copyright (C) 2019-2020 Daniel Ring
# SPDX-FileCopyrightText: © 2004 Tristan Seligmann and Jonathan Jacobs
# SPDX-FileCopyrightText: © 2012 Bastian Kleineidam
# SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher
# SPDX-FileCopyrightText: © 2019 Daniel Ring
from __future__ import annotations
from typing import Protocol
from .util import getQueryParams
from .scraper import Scraper
class Namer(Protocol):
    """A protocol for generic callbacks to name web comic images.

    A namer is called with the scraper it belongs to, the URL of the image
    and the URL of the page the image was found on. It returns the name to
    use for the image as a string, or ``None``.
    """

    # The first parameter is the callback object itself; it is spelled ``_``
    # so that ``self`` can name the scraper argument.
    def __call__(_, self: Scraper, image_url: str, page_url: str) -> str | None:
        ...
def queryNamer(param, use_page_url=False) -> Namer:
    """Get name from URL query part.

    Build a namer that looks up the query parameter ``param`` in the image
    URL (or in the page URL when ``use_page_url`` is true) and returns the
    first value listed for it.
    """
    def _namer(self, image_url: str, page_url: str) -> str | None:
        """Get URL query part."""
        url = page_url if use_page_url else image_url
        return getQueryParams(url)[param][0]
    return _namer
def regexNamer(regex, use_page_url=False) -> Namer:
    """Get name from regular expression.

    Build a namer that searches the image URL (or the page URL when
    ``use_page_url`` is true) with the compiled pattern ``regex`` and
    returns the first capture group, or ``None`` when the pattern does not
    match.
    """
    def _namer(self, image_url: str, page_url: str) -> str | None:
        """Get first regular expression group."""
        url = page_url if use_page_url else image_url
        mo = regex.search(url)
        return mo.group(1) if mo else None
    return _namer
def joinPathPartsNamer(pageparts=(), imageparts=(), joinchar='_') -> Namer:
    """Get name by mashing path parts together.

    ``pageparts`` and ``imageparts`` are iterables of indexes into the path
    segments of the page URL and the image URL. The selected page segments
    come first, then the selected image segments, all joined with
    ``joinchar`` (an underscore by default). Both selections default to
    empty, so callers name exactly the parts they want.
    """
    def _namer(self: Scraper, image_url: str, page_url: str) -> str | None:
        # Split and drop host name: the first three pieces of an absolute
        # URL split on '/' are the scheme, an empty string and the host.
        pagesplit = page_url.split('/')[3:]
        imagesplit = image_url.split('/')[3:]
        joinparts = ([pagesplit[i] for i in pageparts] +
            [imagesplit[i] for i in imageparts])
        return joinchar.join(joinparts)
    return _namer

View file

@ -404,7 +404,7 @@ class CrossTimeCafe(_ParserScraper):
class CSectionComics(WordPressScraper):
url = 'https://www.csectioncomics.com/'
firstStripUrl = url + 'comics/one-day-in-country'
namer = joinPathPartsNamer((), (-3, -2, -1))
namer = joinPathPartsNamer(imageparts=(-3, -2, -1))
multipleImagesPerStrip = True
@ -466,7 +466,7 @@ class CyanideAndHappiness(ParserScraper):
prevSearch = '//div[@type="comic"]//a[*[local-name()="svg" and @rotate="180deg"]]'
nextSearch = '//div[@type="comic"]//a[*[local-name()="svg" and @rotate="0deg"]]'
starter = bounceStarter
namer = joinPathPartsNamer((), range(-4, 0))
namer = joinPathPartsNamer(imageparts=range(-4, 0))
class CynWolf(_ParserScraper):

View file

@ -1,11 +1,6 @@
# SPDX-License-Identifier: MIT
# SPDX-FileCopyrightText: © 2019 Tobias Gruetzmacher
# SPDX-FileCopyrightText: © 2019 Thomas W. Littauer
try:
from importlib_resources import as_file, files
except ImportError:
from importlib.resources import as_file, files
from ..helpers import bounceStarter, joinPathPartsNamer
from ..scraper import ParserScraper
@ -15,7 +10,7 @@ class ComicsKingdom(ParserScraper):
prevSearch = '//a[./img[contains(@alt, "Previous")]]'
nextSearch = '//a[./img[contains(@alt, "Next")]]'
starter = bounceStarter
namer = joinPathPartsNamer((-2, -1), ())
namer = joinPathPartsNamer(pageparts=(-2, -1))
help = 'Index format: yyyy-mm-dd'
def __init__(self, name, path, lang=None):

View file

@ -171,7 +171,7 @@ class Fragile(_ParserScraper):
endOfLife = True
class FredoAndPidjin(_ParserScraper):
class FredoAndPidjin(ParserScraper):
url = 'https://www.pidjin.net/'
stripUrl = url + '%s/'
firstStripUrl = stripUrl % '2006/02/19/goofy-monday'
@ -180,7 +180,7 @@ class FredoAndPidjin(_ParserScraper):
prevSearch = '//span[d:class("prev")]/a'
latestSearch = '//section[d:class("latest")]//a'
starter = indirectStarter
namer = joinPathPartsNamer((0, 1, 2))
namer = joinPathPartsNamer(pageparts=(0, 1, 2), imageparts=(-1,))
class Freefall(_ParserScraper):

View file

@ -272,7 +272,7 @@ class ToonHole(ParserScraper):
prevSearch = '//a[@rel="prev"]'
latestSearch = '//a[@rel="bookmark"]'
starter = indirectStarter
namer = joinPathPartsNamer((), (-3, -2, -1))
namer = joinPathPartsNamer(imageparts=(-3, -2, -1))
class TrippingOverYou(_BasicScraper):

View file

@ -23,7 +23,7 @@ class Zapiro(ParserScraper):
imageSearch = '//div[@id="cartoon"]/img'
prevSearch = '//a[d:class("left")]'
nextSearch = '//a[d:class("right")]'
namer = joinPathPartsNamer((-1,), ())
namer = joinPathPartsNamer(pageparts=(-1,))
class ZenPencils(WordPressNavi):
@ -60,7 +60,7 @@ class Zwarwald(BasicScraper):
tagre("img", "src",
r'http://zwarwald\.de/images/prev\.jpg',
quote="'"))
namer = joinPathPartsNamer((), (-3, -2, -1))
namer = joinPathPartsNamer(imageparts=(-3, -2, -1))
help = 'Index format: number'
def shouldSkipUrl(self, url, data):

View file

@ -1,9 +1,9 @@
# SPDX-License-Identifier: MIT
# Copyright (C) 2019 Tobias Gruetzmacher
# SPDX-FileCopyrightText: © 2019 Tobias Gruetzmacher
from dosagelib.helpers import joinPathPartsNamer, queryNamer
class TestNamer(object):
class TestNamer:
"""
Tests for comic namer.
"""
@ -16,6 +16,8 @@ class TestNamer(object):
def test_joinPathPartsNamer(self):
    """Check joinPathPartsNamer with explicit page and image part selections."""
    imgurl = 'https://HOST/wp-content/uploads/2019/02/tennis5wp-1.png'
    pageurl = 'https://HOST/2019/03/11/12450/'
    # Both selections are passed by keyword: neither has a non-empty
    # default, so the image file name must be requested explicitly.
    assert joinPathPartsNamer(pageparts=(0, 1, 2), imageparts=(-1,))(self,
        imgurl, pageurl) == '2019_03_11_tennis5wp-1.png'
    assert joinPathPartsNamer(pageparts=(0, 1, 2), imageparts=(-1,), joinchar='-')(self,
        imgurl, pageurl) == '2019-03-11-tennis5wp-1.png'
    # The page URL ends in '/', so index -2 is its last non-empty segment.
    assert joinPathPartsNamer(pageparts=(0, -2))(self, imgurl, pageurl) == '2019_12450'