Merge remote-tracking branch 'vendor/master'

This commit is contained in:
D. Moonfire 2024-07-02 18:57:33 -05:00
commit bf9e7d2760
44 changed files with 517 additions and 568 deletions

View file

@ -13,7 +13,7 @@ jobs:
runs-on: ubuntu-latest runs-on: ubuntu-latest
strategy: strategy:
matrix: matrix:
python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"] python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
steps: steps:
- uses: actions/checkout@v4 - uses: actions/checkout@v4
@ -32,7 +32,7 @@ jobs:
if: ${{ matrix.python-version != env.DEFAULT_PYTHON }} if: ${{ matrix.python-version != env.DEFAULT_PYTHON }}
- name: Test with tox (and upload coverage) - name: Test with tox (and upload coverage)
uses: paambaati/codeclimate-action@v5.0.0 uses: paambaati/codeclimate-action@v8.0.0
if: ${{ matrix.python-version == env.DEFAULT_PYTHON }} if: ${{ matrix.python-version == env.DEFAULT_PYTHON }}
env: env:
CC_TEST_REPORTER_ID: 2a411f596959fc32f5d73f3ba7cef8cc4d5733299d742dbfc97fd6c190b9010c CC_TEST_REPORTER_ID: 2a411f596959fc32f5d73f3ba7cef8cc4d5733299d742dbfc97fd6c190b9010c
@ -42,6 +42,6 @@ jobs:
${{ github.workspace }}/.tox/reports/*/coverage.xml:coverage.py ${{ github.workspace }}/.tox/reports/*/coverage.xml:coverage.py
prefix: ${{ github.workspace }}/.tox/py39/lib/python3.9/site-packages prefix: ${{ github.workspace }}/.tox/py39/lib/python3.9/site-packages
- uses: codecov/codecov-action@v3 - uses: codecov/codecov-action@v4
with: with:
directory: '.tox/reports' directory: '.tox/reports'

View file

@ -5,12 +5,19 @@ on:
push: push:
branches: branches:
- master - master
workflow_dispatch:
permissions: permissions:
contents: write contents: read
pages: write
id-token: write
concurrency:
group: "pages"
cancel-in-progress: false
jobs: jobs:
deploy: build:
runs-on: ubuntu-latest runs-on: ubuntu-latest
steps: steps:
- uses: actions/checkout@v4 - uses: actions/checkout@v4
@ -28,10 +35,24 @@ jobs:
pip install wheel pip install wheel
pip install git+https://github.com/spanezz/staticsite.git@v2.3 pip install git+https://github.com/spanezz/staticsite.git@v2.3
ssite build --output public ssite build --output public
cd public
rm -rf Jenkinsfile dosagelib scripts tests
- name: Deploy - name: Setup Pages
uses: peaceiris/actions-gh-pages@v3 id: pages
uses: actions/configure-pages@v5
- name: Upload artifact
uses: actions/upload-pages-artifact@v3
with: with:
cname: dosage.rocks path: public
github_token: ${{ secrets.GITHUB_TOKEN }}
exclude_assets: 'Jenkinsfile,dosagelib,scripts,setup.*,tests,*.ini' deploy:
environment:
name: github-pages
url: ${{ steps.deployment.outputs.page_url }}
runs-on: ubuntu-latest
needs: build
steps:
- name: Deploy to GitHub Pages
id: deployment
uses: actions/deploy-pages@v4

View file

@ -1,6 +1,6 @@
Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
Copyright (C) 2012-2014 Bastian Kleineidam Copyright (C) 2012-2014 Bastian Kleineidam
Copyright (C) 2015-2022 Tobias Gruetzmacher Copyright (C) 2015-2024 Tobias Gruetzmacher
Permission is hereby granted, free of charge, to any person obtaining Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the a copy of this software and associated documentation files (the

3
Jenkinsfile vendored
View file

@ -4,7 +4,6 @@ def pys = [
[name: 'Python 3.10', docker: '3.10-bookworm', tox:'py310', main: false], [name: 'Python 3.10', docker: '3.10-bookworm', tox:'py310', main: false],
[name: 'Python 3.9', docker: '3.9-bookworm', tox:'py39', main: false], [name: 'Python 3.9', docker: '3.9-bookworm', tox:'py39', main: false],
[name: 'Python 3.8', docker: '3.8-bookworm', tox:'py38', main: false], [name: 'Python 3.8', docker: '3.8-bookworm', tox:'py38', main: false],
[name: 'Python 3.7', docker: '3.7-bookworm', tox:'py37', main: false],
] ]
properties([ properties([
@ -75,7 +74,7 @@ pys.each { py ->
parallel(tasks) parallel(tasks)
parallel modern: { parallel modern: {
stage('Modern Windows binary') { stage('Modern Windows binary') {
windowsBuild('3.11', 'dosage.exe') windowsBuild('3.12', 'dosage.exe')
} }
}, },
legacy: { legacy: {

View file

@ -1,9 +1,9 @@
# Dosage # Dosage
[![Tests](https://github.com/webcomics/dosage/actions/workflows/test.yml/badge.svg)](https://github.com/webcomics/dosage/actions/workflows/test.yml) [![CI](https://github.com/webcomics/dosage/actions/workflows/ci.yaml/badge.svg)](https://github.com/webcomics/dosage/actions/workflows/ci.yaml)
[![Code Climate](https://codeclimate.com/github/webcomics/dosage/badges/gpa.svg)](https://codeclimate.com/github/webcomics/dosage) [![Code Climate](https://codeclimate.com/github/webcomics/dosage/badges/gpa.svg)](https://codeclimate.com/github/webcomics/dosage)
[![codecov](https://codecov.io/gh/webcomics/dosage/branch/master/graph/badge.svg)](https://codecov.io/gh/webcomics/dosage) [![codecov](https://codecov.io/gh/webcomics/dosage/branch/master/graph/badge.svg)](https://codecov.io/gh/webcomics/dosage)
![Maintenance](https://img.shields.io/maintenance/yes/2023.svg) ![Maintenance](https://img.shields.io/maintenance/yes/2024.svg)
![License](https://img.shields.io/github/license/webcomics/dosage) ![License](https://img.shields.io/github/license/webcomics/dosage)
Dosage is designed to keep a local copy of specific webcomics and other Dosage is designed to keep a local copy of specific webcomics and other
@ -72,7 +72,7 @@ are old enough to view them.
### Dependencies ### Dependencies
Since dosage is written in [Python](http://www.python.org/), a Python Since dosage is written in [Python](http://www.python.org/), a Python
installation is required: Dosage needs at least Python 3.7. Dosage requires installation is required: Dosage needs at least Python 3.8. Dosage requires
some Python modules from PyPI, so installation with `pip` is recommended. some Python modules from PyPI, so installation with `pip` is recommended.
### Using the Windows binary ### Using the Windows binary

View file

@ -1,7 +1,7 @@
# SPDX-License-Identifier: MIT # SPDX-License-Identifier: MIT
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs # SPDX-FileCopyrightText: © 2004 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam # SPDX-FileCopyrightText: © 2012 Bastian Kleineidam
# Copyright (C) 2015-2019 Tobias Gruetzmacher # SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher
""" """
Automated comic downloader. Dosage traverses comic websites in Automated comic downloader. Dosage traverses comic websites in
order to download each strip of the comic. The intended use is for order to download each strip of the comic. The intended use is for
@ -14,14 +14,11 @@ The primary interface is the 'dosage' commandline script.
Comic modules for each comic are located in L{dosagelib.plugins}. Comic modules for each comic are located in L{dosagelib.plugins}.
""" """
try: from importlib.metadata import version, PackageNotFoundError
from importlib.metadata import version, PackageNotFoundError
except ImportError:
from importlib_metadata import version, PackageNotFoundError
from .output import out from .output import out
AppName = u'dosage' AppName = 'dosage'
try: try:
__version__ = version(AppName) # PEP 396 __version__ = version(AppName) # PEP 396
except PackageNotFoundError: except PackageNotFoundError:

View file

@ -1,12 +1,15 @@
# SPDX-License-Identifier: MIT # SPDX-License-Identifier: MIT
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs # SPDX-FileCopyrightText: © 2004 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam # SPDX-FileCopyrightText: © 2012 Bastian Kleineidam
# Copyright (C) 2015-2016 Tobias Gruetzmacher # SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher
from __future__ import annotations
import os import os
import glob import glob
import codecs import codecs
import contextlib import contextlib
from datetime import datetime from datetime import datetime
from typing import Iterator
from .output import out from .output import out
from .util import unquote, getFilename, urlopen, strsize from .util import unquote, getFilename, urlopen, strsize
@ -14,27 +17,27 @@ from .events import getHandler
# Maximum content size for images # Maximum content size for images
MaxImageBytes = 1024 * 1024 * 20 # 20 MB MAX_IMAGE_BYTES = 1024 * 1024 * 20 # 20 MB
# RFC 1123 format, as preferred by RFC 2616 # RFC 1123 format, as preferred by RFC 2616
RFC_1123_DT_STR = "%a, %d %b %Y %H:%M:%S GMT" RFC_1123_DT_STR = "%a, %d %b %Y %H:%M:%S GMT"
class ComicStrip(object): class ComicStrip:
"""A list of comic image URLs.""" """A list of comic image URLs."""
def __init__(self, scraper, strip_url, image_urls, text=None): def __init__(self, scraper, strip_url: str, image_urls: str, text=None) -> None:
"""Store the image URL list.""" """Store the image URL list."""
self.scraper = scraper self.scraper = scraper
self.strip_url = strip_url self.strip_url = strip_url
self.image_urls = image_urls self.image_urls = image_urls
self.text = text self.text = text
def getImages(self): def getImages(self) -> Iterator[ComicImage]:
"""Get a list of image downloaders.""" """Get a list of image downloaders."""
for image_url in self.image_urls: for image_url in self.image_urls:
yield self.getDownloader(image_url) yield self.getDownloader(image_url)
def getDownloader(self, url): def getDownloader(self, url: str) -> ComicImage:
"""Get an image downloader.""" """Get an image downloader."""
filename = self.scraper.namer(url, self.strip_url) filename = self.scraper.namer(url, self.strip_url)
if filename is None: if filename is None:
@ -43,7 +46,7 @@ class ComicStrip(object):
text=self.text) text=self.text)
class ComicImage(object): class ComicImage:
"""A comic image downloader.""" """A comic image downloader."""
ChunkBytes = 1024 * 100 # 100KB ChunkBytes = 1024 * 100 # 100KB
@ -64,7 +67,7 @@ class ComicImage(object):
headers['If-Modified-Since'] = lastchange.strftime(RFC_1123_DT_STR) headers['If-Modified-Since'] = lastchange.strftime(RFC_1123_DT_STR)
self.urlobj = urlopen(self.url, self.scraper.session, self.urlobj = urlopen(self.url, self.scraper.session,
referrer=self.referrer, referrer=self.referrer,
max_content_bytes=MaxImageBytes, stream=True, max_content_bytes=MAX_IMAGE_BYTES, stream=True,
headers=headers) headers=headers)
if self.urlobj.status_code == 304: # Not modified if self.urlobj.status_code == 304: # Not modified
return return

View file

@ -1,39 +1,49 @@
# SPDX-License-Identifier: MIT # SPDX-License-Identifier: MIT
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs # SPDX-FileCopyrightText: © 2004 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam # SPDX-FileCopyrightText: © 2012 Bastian Kleineidam
# Copyright (C) 2015-2020 Tobias Gruetzmacher # SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher
# Copyright (C) 2019-2020 Daniel Ring # SPDX-FileCopyrightText: © 2019 Daniel Ring
from __future__ import annotations
from typing import Protocol
from .util import getQueryParams from .util import getQueryParams
from .scraper import Scraper
def queryNamer(param, use_page_url=False): class Namer(Protocol):
"""A protocol for generic callbacks to name web comic images."""
def __call__(_, self: Scraper, image_url: str, page_url: str) -> str | None:
...
def queryNamer(param, use_page_url=False) -> Namer:
"""Get name from URL query part.""" """Get name from URL query part."""
def _namer(self, image_url, page_url): def _namer(self, image_url: str, page_url: str) -> str | None:
"""Get URL query part.""" """Get URL query part."""
url = page_url if use_page_url else image_url url = page_url if use_page_url else image_url
return getQueryParams(url)[param][0] return getQueryParams(url)[param][0]
return _namer return _namer
def regexNamer(regex, use_page_url=False): def regexNamer(regex, use_page_url=False) -> Namer:
"""Get name from regular expression.""" """Get name from regular expression."""
def _namer(self, image_url, page_url): def _namer(self, image_url: str, page_url: str) -> str | None:
"""Get first regular expression group.""" """Get first regular expression group."""
url = page_url if use_page_url else image_url url = page_url if use_page_url else image_url
mo = regex.search(url) mo = regex.search(url)
if mo: return mo.group(1) if mo else None
return mo.group(1)
return _namer return _namer
def joinPathPartsNamer(pageurlparts, imageurlparts=(-1,), joinchar='_'): def joinPathPartsNamer(pageparts=(), imageparts=(), joinchar='_') -> Namer:
"""Get name by mashing path parts together with underscores.""" """Get name by mashing path parts together with underscores."""
def _namer(self, imageurl, pageurl): def _namer(self: Scraper, image_url: str, page_url: str) -> str | None:
# Split and drop host name # Split and drop host name
pageurlsplit = pageurl.split('/')[3:] pagesplit = page_url.split('/')[3:]
imageurlsplit = imageurl.split('/')[3:] imagesplit = image_url.split('/')[3:]
joinparts = ([pageurlsplit[i] for i in pageurlparts] + joinparts = ([pagesplit[i] for i in pageparts] +
[imageurlsplit[i] for i in imageurlparts]) [imagesplit[i] for i in imageparts])
return joinchar.join(joinparts) return joinchar.join(joinparts)
return _namer return _namer

View file

@ -1,18 +1,18 @@
# SPDX-License-Identifier: MIT # SPDX-License-Identifier: MIT
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs # SPDX-FileCopyrightText: © 2004 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam # SPDX-FileCopyrightText: © 2012 Bastian Kleineidam
# Copyright (C) 2015-2022 Tobias Gruetzmacher # SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher
# Copyright (C) 2019-2020 Daniel Ring # SPDX-FileCopyrightText: © 2019 Daniel Ring
from re import compile, escape, sub, MULTILINE from re import compile, escape, sub, MULTILINE
from ..util import tagre from ..util import tagre
from ..scraper import BasicScraper, ParserScraper, _BasicScraper, _ParserScraper from ..scraper import ParserScraper, _BasicScraper, _ParserScraper
from ..helpers import regexNamer, bounceStarter, indirectStarter from ..helpers import joinPathPartsNamer, bounceStarter, indirectStarter
from .common import WordPressScraper, WordPressNavi, WordPressWebcomic from .common import WordPressScraper, WordPressNavi, WordPressWebcomic
class AbstruseGoose(_ParserScraper): class AbstruseGoose(ParserScraper):
url = 'https://abstrusegoose.com/' url = 'https://web.archive.org/web/20230930172141/https://abstrusegoose.com/'
starter = bounceStarter starter = bounceStarter
stripUrl = url + '%s' stripUrl = url + '%s'
firstStripUrl = stripUrl % '1' firstStripUrl = stripUrl % '1'
@ -41,24 +41,16 @@ class AbsurdNotions(_BasicScraper):
help = 'Index format: n (unpadded)' help = 'Index format: n (unpadded)'
class AcademyVale(_BasicScraper): class Achewood(ParserScraper):
url = 'http://www.imagerie.com/vale/' baseUrl = 'https://achewood.com/'
stripUrl = url + 'avarch.cgi?%s' stripUrl = baseUrl + '%s/title.html'
firstStripUrl = stripUrl % '001' url = stripUrl % '2016/12/25'
imageSearch = compile(tagre('img', 'src', r'(avale\d{4}-\d{2}\.gif)')) firstStripUrl = stripUrl % '2001/10/01'
prevSearch = compile(tagre('a', 'href', r'(avarch[^">]+)', quote="") + imageSearch = '//img[d:class("comicImage")]'
tagre('img', 'src', r'AVNavBack\.gif')) prevSearch = '//a[d:class("comic_prev")]'
help = 'Index format: nnn' namer = joinPathPartsNamer(pageparts=range(0, 2))
help = 'Index format: yyyy/mm/dd'
endOfLife = True
class Achewood(_ParserScraper):
url = 'https://www.achewood.com/'
stripUrl = url + 'index.php?date=%s'
firstStripUrl = stripUrl % '10012001'
imageSearch = '//p[@id="comic_body"]//img'
prevSearch = '//span[d:class("left")]/a[d:class("dateNav")]'
help = 'Index format: mmddyyyy'
namer = regexNamer(compile(r'date=(\d+)'))
class AdventuresOfFifne(_ParserScraper): class AdventuresOfFifne(_ParserScraper):
@ -117,12 +109,8 @@ class AhoiPolloi(_ParserScraper):
help = 'Index format: yyyymmdd' help = 'Index format: yyyymmdd'
class AhoyEarth(WordPressNavi):
url = 'http://www.ahoyearth.com/'
class AirForceBlues(WordPressScraper): class AirForceBlues(WordPressScraper):
url = 'http://farvatoons.com/' url = 'https://web.archive.org/web/20210102113825/http://farvatoons.com/'
firstStripUrl = url + 'comic/in-texas-there-are-texans/' firstStripUrl = url + 'comic/in-texas-there-are-texans/'
@ -235,14 +223,11 @@ class AltermetaOld(_ParserScraper):
help = 'Index format: n (unpadded)' help = 'Index format: n (unpadded)'
class AmazingSuperPowers(_BasicScraper): class AmazingSuperPowers(WordPressNavi):
url = 'http://www.amazingsuperpowers.com/' url = 'https://www.amazingsuperpowers.com/'
rurl = escape(url)
stripUrl = url + '%s/' stripUrl = url + '%s/'
firstStripUrl = stripUrl % '2007/09/heredity' firstStripUrl = stripUrl % '2007/09/heredity'
imageSearch = compile(tagre("img", "src", r'(%scomics/[^"]+)' % rurl)) imageSearch = '//div[d:class("comicpane")]/img'
prevSearch = compile(tagre("a", "href", r'(%s[^"]+)' % rurl, after="prev"))
help = 'Index format: yyyy/mm/name'
def shouldSkipUrl(self, url, data): def shouldSkipUrl(self, url, data):
"""Skip pages without images.""" """Skip pages without images."""
@ -271,19 +256,7 @@ class Amya(WordPressScraper):
url = 'http://www.amyachronicles.com/' url = 'http://www.amyachronicles.com/'
class Anaria(_ParserScraper): class Angband(ParserScraper):
url = 'https://www.leahbriere.com/anaria-the-witchs-dream/'
firstStripUrl = url
imageSearch = '//div[contains(@class, "gallery")]//a'
multipleImagesPerStrip = True
endOfLife = True
def namer(self, imageUrl, pageUrl):
filename = imageUrl.rsplit('/', 1)[-1]
return filename.replace('00.jpg', 'new00.jpg').replace('new', '1')
class Angband(_ParserScraper):
url = 'http://angband.calamarain.net/' url = 'http://angband.calamarain.net/'
stripUrl = url + '%s' stripUrl = url + '%s'
imageSearch = '//img' imageSearch = '//img'
@ -292,7 +265,7 @@ class Angband(_ParserScraper):
def starter(self): def starter(self):
page = self.getPage(self.url) page = self.getPage(self.url)
self.pages = page.xpath('//p/a[not(contains(@href, "cast"))]/@href') self.pages = self.match(page, '//p/a[not(contains(@href, "cast"))]/@href')
self.firstStripUrl = self.pages[0] self.firstStripUrl = self.pages[0]
return self.pages[-1] return self.pages[-1]
@ -300,14 +273,6 @@ class Angband(_ParserScraper):
return self.pages[self.pages.index(url) - 1] return self.pages[self.pages.index(url) - 1]
class Angels2200(_BasicScraper):
url = 'http://www.janahoffmann.com/angels/'
stripUrl = url + '%s'
imageSearch = compile(tagre("img", "src", r"(http://www\.janahoffmann\.com/angels/comics/[^']+)", quote="'"))
prevSearch = compile(tagre("a", "href", r'([^"]+)') + "« Previous")
help = 'Index format: yyyy/mm/dd/part-<n>-comic-<n>'
class Annyseed(_ParserScraper): class Annyseed(_ParserScraper):
baseUrl = ('https://web.archive.org/web/20190511031451/' baseUrl = ('https://web.archive.org/web/20190511031451/'
'http://www.mirrorwoodcomics.com/') 'http://www.mirrorwoodcomics.com/')
@ -330,7 +295,7 @@ class Annyseed(_ParserScraper):
return tourl return tourl
class AntiheroForHire(_ParserScraper): class AntiheroForHire(ParserScraper):
stripUrl = 'https://www.giantrobot.club/antihero-for-hire/%s' stripUrl = 'https://www.giantrobot.club/antihero-for-hire/%s'
firstStripUrl = stripUrl % '2016/6/8/entrance-vigil' firstStripUrl = stripUrl % '2016/6/8/entrance-vigil'
url = firstStripUrl url = firstStripUrl
@ -341,7 +306,7 @@ class AntiheroForHire(_ParserScraper):
def starter(self): def starter(self):
# Build list of chapters for navigation # Build list of chapters for navigation
page = self.getPage(self.url) page = self.getPage(self.url)
self.chapters = page.xpath('//ul[@class="archive-group-list"]//a[contains(@class, "archive-item-link")]/@href') self.chapters = self.match(page, '//ul[d:class("archive-group-list")]//a[d:class("archive-item-link")]/@href')
return self.chapters[0] return self.chapters[0]
def getPrevUrl(self, url, data): def getPrevUrl(self, url, data):
@ -377,7 +342,7 @@ class ArtificialIncident(WordPressWebcomic):
firstStripUrl = stripUrl % 'issue-one-life-changing' firstStripUrl = stripUrl % 'issue-one-life-changing'
class AstronomyPOTD(_ParserScraper): class AstronomyPOTD(ParserScraper):
baseUrl = 'http://apod.nasa.gov/apod/' baseUrl = 'http://apod.nasa.gov/apod/'
url = baseUrl + 'astropix.html' url = baseUrl + 'astropix.html'
starter = bounceStarter starter = bounceStarter
@ -391,7 +356,7 @@ class AstronomyPOTD(_ParserScraper):
def shouldSkipUrl(self, url, data): def shouldSkipUrl(self, url, data):
"""Skip pages without images.""" """Skip pages without images."""
return data.xpath('//iframe') # videos return self.match(data, '//iframe') # videos
def namer(self, image_url, page_url): def namer(self, image_url, page_url):
return '%s-%s' % (page_url.split('/')[-1].split('.')[0][2:], return '%s-%s' % (page_url.split('/')[-1].split('.')[0][2:],

View file

@ -34,11 +34,11 @@ class CaptainSNES(_BasicScraper):
help = 'Index format: yyyy/mm/dd/nnn-stripname' help = 'Index format: yyyy/mm/dd/nnn-stripname'
class CarryOn(_ParserScraper): class CarryOn(ParserScraper):
url = 'http://www.hirezfox.com/km/co/' url = 'http://www.hirezfox.com/km/co/'
stripUrl = url + 'd/%s.html' stripUrl = url + 'd/%s.html'
firstStripUrl = stripUrl % '20040701' firstStripUrl = stripUrl % '20040701'
imageSearch = '//div[@class="strip"]/img' imageSearch = '//div[d:class("strip")]/img'
prevSearch = '//a[text()="Previous Day"]' prevSearch = '//a[text()="Previous Day"]'
multipleImagesPerStrip = True multipleImagesPerStrip = True
@ -122,13 +122,13 @@ class CatAndGirl(_ParserScraper):
prevSearch = '//a[d:class("pager--prev")]' prevSearch = '//a[d:class("pager--prev")]'
class CatenaManor(_ParserScraper): class CatenaManor(ParserScraper):
baseUrl = ('https://web.archive.org/web/20141027141116/' baseUrl = ('https://web.archive.org/web/20141027141116/'
'http://catenamanor.com/') 'http://catenamanor.com/')
url = baseUrl + 'archives' url = baseUrl + 'archives'
stripUrl = baseUrl + '%s/' stripUrl = baseUrl + '%s/'
firstStripUrl = stripUrl % '2003/07' firstStripUrl = stripUrl % '2003/07'
imageSearch = '//img[@class="comicthumbnail"]' imageSearch = '//img[d:class("comicthumbnail")]'
multipleImagesPerStrip = True multipleImagesPerStrip = True
endOfLife = True endOfLife = True
strips: List[str] = [] strips: List[str] = []
@ -136,7 +136,7 @@ class CatenaManor(_ParserScraper):
def starter(self): def starter(self):
# Retrieve archive links and select valid range # Retrieve archive links and select valid range
archivePage = self.getPage(self.url) archivePage = self.getPage(self.url)
archiveStrips = archivePage.xpath('//div[@id="archivepage"]//a') archiveStrips = self.match(archivePage, '//div[@id="archivepage"]//a')
valid = False valid = False
for link in archiveStrips: for link in archiveStrips:
if self.stripUrl % '2012/01' in link.get('href'): if self.stripUrl % '2012/01' in link.get('href'):
@ -404,7 +404,7 @@ class CrossTimeCafe(_ParserScraper):
class CSectionComics(WordPressScraper): class CSectionComics(WordPressScraper):
url = 'https://www.csectioncomics.com/' url = 'https://www.csectioncomics.com/'
firstStripUrl = url + 'comics/one-day-in-country' firstStripUrl = url + 'comics/one-day-in-country'
namer = joinPathPartsNamer((), (-3, -2, -1)) namer = joinPathPartsNamer(imageparts=(-3, -2, -1))
multipleImagesPerStrip = True multipleImagesPerStrip = True
@ -466,7 +466,7 @@ class CyanideAndHappiness(ParserScraper):
prevSearch = '//div[@type="comic"]//a[*[local-name()="svg" and @rotate="180deg"]]' prevSearch = '//div[@type="comic"]//a[*[local-name()="svg" and @rotate="180deg"]]'
nextSearch = '//div[@type="comic"]//a[*[local-name()="svg" and @rotate="0deg"]]' nextSearch = '//div[@type="comic"]//a[*[local-name()="svg" and @rotate="0deg"]]'
starter = bounceStarter starter = bounceStarter
namer = joinPathPartsNamer((), range(-4, 0)) namer = joinPathPartsNamer(imageparts=range(-4, 0))
class CynWolf(_ParserScraper): class CynWolf(_ParserScraper):

View file

@ -1,8 +1,8 @@
# SPDX-License-Identifier: MIT # SPDX-License-Identifier: MIT
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs # SPDX-FileCopyrightText: © 2004 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam # SPDX-FileCopyrightText: © 2012 Bastian Kleineidam
# Copyright (C) 2015-2022 Tobias Gruetzmacher # SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher
# Copyright (C) 2019-2020 Daniel Ring # SPDX-FileCopyrightText: © 2019 Daniel Ring
import os import os
from ..scraper import ParserScraper from ..scraper import ParserScraper
@ -79,7 +79,7 @@ class ComicFury(ParserScraper):
num = parts[-1] num = parts[-1]
if self.multipleImagesPerStrip: if self.multipleImagesPerStrip:
page = self.getPage(pageUrl) page = self.getPage(pageUrl)
images = page.xpath('//img[@class="comicsegmentimage"]/@src') images = self.match(page, '//img[d:class("comicsegmentimage")]/@src')
if len(images) > 1: if len(images) > 1:
imageIndex = images.index(imageUrl) + 1 imageIndex = images.index(imageUrl) + 1
return "%s_%s-%d%s" % (self.prefix, num, imageIndex, ext) return "%s_%s-%d%s" % (self.prefix, num, imageIndex, ext)
@ -88,8 +88,8 @@ class ComicFury(ParserScraper):
def shouldSkipUrl(self, url, data): def shouldSkipUrl(self, url, data):
"""Skip pages without images.""" """Skip pages without images."""
# Videos on Underverse # Videos on Underverse
return (data.xpath('//div[@id="comicimagewrap"]//video') and return (self.match(data, '//div[@id="comicimagewrap"]//video') and
not data.xpath('//div[@id="comicimagewrap"]//img')) not self.match(data, '//div[@id="comicimagewrap"]//img'))
@classmethod @classmethod
def getmodules(cls): # noqa: CFQ001 def getmodules(cls): # noqa: CFQ001

View file

@ -1,41 +1,35 @@
# SPDX-License-Identifier: MIT # SPDX-License-Identifier: MIT
# SPDX-FileCopyrightText: © 2019 Tobias Gruetzmacher # SPDX-FileCopyrightText: © 2019 Tobias Gruetzmacher
# SPDX-FileCopyrightText: © 2019 Thomas W. Littauer # SPDX-FileCopyrightText: © 2019 Thomas W. Littauer
try: from ..helpers import indirectStarter
from importlib_resources import as_file, files
except ImportError:
from importlib.resources import as_file, files
from ..helpers import bounceStarter, joinPathPartsNamer
from ..scraper import ParserScraper from ..scraper import ParserScraper
class ComicsKingdom(ParserScraper): class ComicsKingdom(ParserScraper):
imageSearch = '//img[@id="theComicImage"]' partDiv = '//div[d:class("comic-reader-item")]'
prevSearch = '//a[./img[contains(@alt, "Previous")]]' imageSearch = '//meta[@property="og:image"]/@content'
nextSearch = '//a[./img[contains(@alt, "Next")]]' prevSearch = partDiv + '[2]/@data-link'
starter = bounceStarter starter = indirectStarter
namer = joinPathPartsNamer((-2, -1), ())
help = 'Index format: yyyy-mm-dd' help = 'Index format: yyyy-mm-dd'
def __init__(self, name, path, lang=None): def __init__(self, name, path, lang=None):
super().__init__('ComicsKingdom/' + name) super().__init__('ComicsKingdom/' + name)
self.url = 'https://comicskingdom.com/' + path self.url = 'https://comicskingdom.com/' + path
self.stripUrl = self.url + '/%s' self.stripUrl = self.url + '/%s'
self.latestSearch = f'//a[re:test(@href, "/{path}/[0-9-]+$")]'
if lang: if lang:
self.lang = lang self.lang = lang
def link_modifier(self, fromurl, tourl):
return tourl.replace('//wp.', '//', 1)
@classmethod @classmethod
def getmodules(cls): # noqa: CFQ001 def getmodules(cls): # noqa: CFQ001
return ( return (
# Some comics are not listed on the "all" page (too old?)
cls('Retail', 'retail'),
# do not edit anything below since these entries are generated from # do not edit anything below since these entries are generated from
# scripts/comicskingdom.py # scripts/comicskingdom.py
# START AUTOUPDATE # START AUTOUPDATE
cls('AmazingSpiderman', 'amazing-spider-man'), cls('Alice', 'alice'),
cls('AmazingSpidermanSpanish', 'hombre-arana', lang='es'),
cls('Apartment3G', 'apartment-3-g_1'), cls('Apartment3G', 'apartment-3-g_1'),
cls('ArcticCircle', 'arctic-circle'), cls('ArcticCircle', 'arctic-circle'),
cls('ATodaVelocidadSpanish', 'a-toda-velocidad', lang='es'), cls('ATodaVelocidadSpanish', 'a-toda-velocidad', lang='es'),
@ -43,22 +37,25 @@ class ComicsKingdom(ParserScraper):
cls('BarneyGoogleAndSnuffySmithSpanish', 'tapon', lang='es'), cls('BarneyGoogleAndSnuffySmithSpanish', 'tapon', lang='es'),
cls('BeetleBailey', 'beetle-bailey-1'), cls('BeetleBailey', 'beetle-bailey-1'),
cls('BeetleBaileySpanish', 'beto-el-recluta', lang='es'), cls('BeetleBaileySpanish', 'beto-el-recluta', lang='es'),
cls('BeetleMoses', 'beetle-moses'),
cls('BetweenFriends', 'between-friends'), cls('BetweenFriends', 'between-friends'),
cls('BewareOfToddler', 'beware-of-toddler'),
cls('BigBenBolt', 'big-ben-bolt'), cls('BigBenBolt', 'big-ben-bolt'),
cls('BigBenBoltSundays', 'big-ben-bolt-sundays'),
cls('Bizarro', 'bizarro'), cls('Bizarro', 'bizarro'),
cls('Blondie', 'blondie'), cls('Blondie', 'blondie'),
cls('BlondieSpanish', 'pepita', lang='es'), cls('BlondieSpanish', 'pepita', lang='es'),
cls('BobMankoffPresentsShowMeTheFunny', 'show-me-the-funny'),
cls('BobMankoffPresentsShowMeTheFunnyAnimalEdition', 'show-me-the-funny-pets'),
cls('BonersArk', 'boners-ark'), cls('BonersArk', 'boners-ark'),
cls('BonersArkSundays', 'boners-ark-sundays'), cls('BreakOfDay', 'break-of-day'),
cls('BrianDuffy', 'brian-duffy'),
cls('BrickBradford', 'brick-bradford'), cls('BrickBradford', 'brick-bradford'),
cls('BrilliantMindOfEdisonLee', 'brilliant-mind-of-edison-lee'), cls('BrilliantMindOfEdisonLee', 'brilliant-mind-of-edison-lee'),
cls('BringingUpFather', 'bringing-up-father'), cls('BringingUpFather', 'bringing-up-father'),
cls('BringingUpFatherSpanish', 'educando-a-papa', lang='es'), cls('BringingUpFatherSpanish', 'educando-a-papa', lang='es'),
cls('BuzSawyer', 'buz-sawyer'), cls('BuzSawyer', 'buz-sawyer'),
cls('Candorville', 'candorville'),
cls('CarpeDiem', 'carpe-diem'), cls('CarpeDiem', 'carpe-diem'),
cls('Crankshaft', 'crankshaft'), cls('Comiclicious', 'comiclicious'),
cls('Crock', 'crock'), cls('Crock', 'crock'),
cls('CrockSpanish', 'crock-spanish', lang='es'), cls('CrockSpanish', 'crock-spanish', lang='es'),
cls('Curtis', 'curtis'), cls('Curtis', 'curtis'),
@ -67,6 +64,7 @@ class ComicsKingdom(ParserScraper):
cls('DavidMHitch', 'david-m-hitch'), cls('DavidMHitch', 'david-m-hitch'),
cls('DennisTheMenace', 'dennis-the-menace'), cls('DennisTheMenace', 'dennis-the-menace'),
cls('DennisTheMenaceSpanish', 'daniel-el-travieso', lang='es'), cls('DennisTheMenaceSpanish', 'daniel-el-travieso', lang='es'),
cls('Dumplings', 'dumplings'),
cls('Dustin', 'dustin'), cls('Dustin', 'dustin'),
cls('EdGamble', 'ed-gamble'), cls('EdGamble', 'ed-gamble'),
# EdgeCity has a duplicate in GoComics/EdgeCity # EdgeCity has a duplicate in GoComics/EdgeCity
@ -74,18 +72,15 @@ class ComicsKingdom(ParserScraper):
cls('FamilyCircusSpanish', 'circulo-familiar', lang='es'), cls('FamilyCircusSpanish', 'circulo-familiar', lang='es'),
cls('FlashForward', 'flash-forward'), cls('FlashForward', 'flash-forward'),
cls('FlashGordon', 'flash-gordon'), cls('FlashGordon', 'flash-gordon'),
cls('FlashGordonSundays', 'flash-gordon-sundays'), cls('FunnyOnlineAnimals', 'funny-online-animals'),
cls('FunkyWinkerbean', 'funky-winkerbean'), cls('GearheadGertie', 'gearhead-gertie'),
cls('FunkyWinkerbeanSunday', 'funky-winkerbean-sundays'), cls('GodsHands', 'gods-hands'),
cls('FunkyWinkerbeanVintage', 'funky-winkerbean-1'),
cls('FunnyOnlineAnimals', 'Funny-Online-Animals'),
cls('GearheadGertie', 'Gearhead-Gertie'),
cls('HagarTheHorrible', 'hagar-the-horrible'), cls('HagarTheHorrible', 'hagar-the-horrible'),
cls('HagarTheHorribleSpanish', 'olafo', lang='es'), cls('HagarTheHorribleSpanish', 'olafo', lang='es'),
cls('HeartOfJulietJones', 'heart-of-juliet-jones'), cls('HeartOfJulietJones', 'heart-of-juliet-jones'),
cls('HeartOfJulietJonesSundays', 'heart-of-juliet-jones-sundays'),
cls('HiAndLois', 'hi-and-lois'), cls('HiAndLois', 'hi-and-lois'),
cls('IntelligentLife', 'Intelligent'), cls('InsanityStreak', 'insanity-streak'),
cls('IntelligentLife', 'intelligent'),
cls('JimmyMargulies', 'jimmy-margulies'), cls('JimmyMargulies', 'jimmy-margulies'),
cls('JohnBranch', 'john-branch'), cls('JohnBranch', 'john-branch'),
cls('JohnnyHazard', 'johnny-hazard'), cls('JohnnyHazard', 'johnny-hazard'),
@ -93,7 +88,6 @@ class ComicsKingdom(ParserScraper):
cls('JungleJimSundays', 'jungle-jim-sundays'), cls('JungleJimSundays', 'jungle-jim-sundays'),
cls('KatzenjammerKids', 'katzenjammer-kids'), cls('KatzenjammerKids', 'katzenjammer-kids'),
cls('KatzenjammerKidsSpanish', 'maldades-de-dos-pilluelos', lang='es'), cls('KatzenjammerKidsSpanish', 'maldades-de-dos-pilluelos', lang='es'),
cls('KatzenjammerKidsSundays', 'katzenjammer-kids-sundays'),
cls('KevinAndKell', 'kevin-and-kell'), cls('KevinAndKell', 'kevin-and-kell'),
cls('KingOfTheRoyalMounted', 'king-of-the-royal-mounted'), cls('KingOfTheRoyalMounted', 'king-of-the-royal-mounted'),
cls('KirkWalters', 'kirk-walters'), cls('KirkWalters', 'kirk-walters'),
@ -101,44 +95,42 @@ class ComicsKingdom(ParserScraper):
cls('LaloYLolaSpanish', 'lalo-y-lola', lang='es'), cls('LaloYLolaSpanish', 'lalo-y-lola', lang='es'),
cls('LeeJudge', 'lee-judge'), cls('LeeJudge', 'lee-judge'),
cls('LegalizationNation', 'legalization-nation'), cls('LegalizationNation', 'legalization-nation'),
cls('LegendOfBill', 'Legend-of-Bill'), cls('LegendOfBill', 'legend-of-bill'),
cls('LittleIodineSundays', 'little-iodine-sundays'), cls('LittleIodineSundays', 'little-iodine-sundays'),
cls('LittleKing', 'the-little-king'), cls('LittleKing', 'the-little-king'),
cls('Lockhorns', 'lockhorns'), cls('Macanudo', 'macanudo'),
cls('Macanudo', 'Macanudo'),
cls('MacanudoSpanish', 'macanudo-spanish', lang='es'), cls('MacanudoSpanish', 'macanudo-spanish', lang='es'),
cls('MallardFillmore', 'mallard-fillmore'), cls('MallardFillmore', 'mallard-fillmore'),
cls('MandrakeTheMagician', 'mandrake-the-magician-1'), cls('MandrakeTheMagician', 'mandrake-the-magician'),
cls('MandrakeTheMagicianSpanish', 'mandrake-the-magician-spanish', lang='es'), cls('MandrakeTheMagicianSpanish', 'mandrake-the-magician-spanish', lang='es'),
cls('MandrakeTheMagicianSundays', 'mandrake-the-magician-sundays'), cls('MaraLlaveKeeperOfTime', 'mara-llave-keeper-of-time'),
cls('MarkTrail', 'mark-trail'), cls('MarkTrail', 'mark-trail'),
cls('MarkTrailSpanish', 'mark-trail-spanish', lang='es'), cls('MarkTrailSpanish', 'mark-trail-spanish', lang='es'),
cls('MarkTrailVintage', 'Mark-Trail-Vintage'),
cls('Marvin', 'marvin'), cls('Marvin', 'marvin'),
cls('MarvinSpanish', 'marvin-spanish', lang='es'), cls('MarvinSpanish', 'marvin-spanish', lang='es'),
cls('MaryWorth', 'mary-worth'), cls('MaryWorth', 'mary-worth'),
cls('MaryWorthSpanish', 'maria-de-oro', lang='es'), cls('MaryWorthSpanish', 'maria-de-oro', lang='es'),
cls('MikePeters', 'mike-peters'), cls('Mazetoons', 'mazetoons'),
cls('MikeShelton', 'mike-shelton'), cls('MikeShelton', 'mike-shelton'),
cls('MikeSmith', 'mike-smith'), cls('MikeSmith', 'mike-smith'),
cls('MooseAndMolly', 'moose-and-molly'), cls('MooseAndMolly', 'moose-and-molly'),
cls('MooseAndMollySpanish', 'quintin', lang='es'), cls('MooseAndMollySpanish', 'quintin', lang='es'),
cls('MotherGooseAndGrimm', 'mother-goose-grimm'),
cls('MrAbernathySpanish', 'don-abundio', lang='es'), cls('MrAbernathySpanish', 'don-abundio', lang='es'),
cls('Mutts', 'mutts'), cls('Mutts', 'mutts'),
cls('MuttsSpanish', 'motas', lang='es'), cls('MuttsSpanish', 'motas', lang='es'),
cls('NeverBeenDeader', 'never-been-deader'),
cls('OfficeHours', 'office-hours'), cls('OfficeHours', 'office-hours'),
cls('OliveAndPopeye', 'olive-popeye'),
cls('OnTheFastrack', 'on-the-fastrack'), cls('OnTheFastrack', 'on-the-fastrack'),
cls('PajamaDiaries', 'pajama-diaries'), cls('PajamaDiaries', 'pajama-diaries'),
cls('PardonMyPlanet', 'pardon-my-planet'), cls('PardonMyPlanet', 'pardon-my-planet'),
cls('Phantom', 'phantom'), cls('Phantom', 'phantom'),
cls('PhantomSpanish', 'el-fantasma', lang='es'), cls('PhantomSpanish', 'el-fantasma', lang='es'),
cls('PhantomSundays', 'phantom-sundays'), cls('PlanetSyndicate', 'the_planet_syndicate'),
cls('Popeye', 'popeye'), cls('Popeye', 'popeye'),
cls('PopeyesCartoonClub', 'popeyes-cartoon-club'), cls('PopeyesCartoonClub', 'popeyes-cartoon-club'),
cls('PopeyeSpanish', 'popeye-spanish', lang='es'), cls('PopeyeSpanish', 'popeye-spanish', lang='es'),
cls('PrinceValiant', 'prince-valiant'), cls('PrinceValiant', 'prince-valiant'),
cls('PrinceValiantSundays', 'prince-valiant-sundays'),
cls('PrincipeValienteSpanish', 'principe-valiente', lang='es'), cls('PrincipeValienteSpanish', 'principe-valiente', lang='es'),
cls('ProsAndCons', 'pros-cons'), cls('ProsAndCons', 'pros-cons'),
cls('Quincy', 'quincy'), cls('Quincy', 'quincy'),
@ -148,7 +140,9 @@ class ComicsKingdom(ParserScraper):
cls('RexMorganMDSpanish', 'rex-morgan-md-spanish', lang='es'), cls('RexMorganMDSpanish', 'rex-morgan-md-spanish', lang='es'),
cls('RhymesWithOrange', 'rhymes-with-orange'), cls('RhymesWithOrange', 'rhymes-with-orange'),
cls('RipKirby', 'rip-kirby'), cls('RipKirby', 'rip-kirby'),
# Rosebuds has a duplicate in GoComics/Rosebuds
cls('SafeHavens', 'safe-havens'), cls('SafeHavens', 'safe-havens'),
cls('SagaOfBrannBjornson', 'the-saga-of-brann-bjornson'),
cls('Sales', 'sales'), cls('Sales', 'sales'),
cls('SallyForth', 'sally-forth'), cls('SallyForth', 'sally-forth'),
cls('SamAndSilo', 'sam-and-silo'), cls('SamAndSilo', 'sam-and-silo'),
@ -156,17 +150,18 @@ class ComicsKingdom(ParserScraper):
cls('SecretAgentX9', 'secret-agent-x-9'), cls('SecretAgentX9', 'secret-agent-x-9'),
# Shoe has a duplicate in GoComics/Shoe # Shoe has a duplicate in GoComics/Shoe
cls('SixChix', 'six-chix'), cls('SixChix', 'six-chix'),
cls('SlylockFoxAndComicsForKids', 'slylock-fox-and-comics-for-kids'), cls('SlylockFox', 'slylock-fox-and-comics-for-kids'),
cls('SlylockFoxAndComicsForKidsSpanish', 'solo-para-ninos', lang='es'), cls('SlylockFoxSpanish', 'solo-para-ninos', lang='es'),
cls('SuburbanFairyTales', 'suburban-fairy-tales'),
cls('TakeItFromTheTinkersons', 'take-it-from-the-tinkersons'), cls('TakeItFromTheTinkersons', 'take-it-from-the-tinkersons'),
cls('TheyllDoItEveryTimeSpanish', 'nunca-falta-alguien-asi', lang='es'), cls('TheyllDoItEveryTimeSpanish', 'nunca-falta-alguien-asi', lang='es'),
cls('ThimbleTheater', 'thimble-theater'), cls('ThimbleTheater', 'thimble-theater'),
cls('Tiger', 'tiger'), cls('Tiger', 'tiger'),
cls('TigerSpanish', 'tigrillo', lang='es'), cls('TigerSpanish', 'tigrillo', lang='es'),
cls('TigerVintage', 'tiger-1'),
cls('TigerVintageSundays', 'tiger-sundays'),
cls('TinasGroove', 'tina-s-groove'), cls('TinasGroove', 'tina-s-groove'),
cls('ToddTheDinosaur', 'todd-the-dinosaur'), cls('ToddTheDinosaur', 'todd-the-dinosaur'),
cls('WillyBlack', 'willy-black'),
cls('WillyBlacksSpanish', 'willy-black-spanish', lang='es'),
cls('ZippyThePinhead', 'zippy-the-pinhead'), cls('ZippyThePinhead', 'zippy-the-pinhead'),
cls('Zits', 'zits'), cls('Zits', 'zits'),
cls('ZitsSpanish', 'jeremias', lang='es'), cls('ZitsSpanish', 'jeremias', lang='es'),

View file

@ -1,8 +1,8 @@
# SPDX-License-Identifier: MIT # SPDX-License-Identifier: MIT
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs # SPDX-FileCopyrightText: © 2004 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam # SPDX-FileCopyrightText: © 2012 Bastian Kleineidam
# Copyright (C) 2015-2022 Tobias Gruetzmacher # SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher
# Copyright (C) 2019-2020 Daniel Ring # SPDX-FileCopyrightText: © 2019 Daniel Ring
from re import compile, escape from re import compile, escape
from ..scraper import _BasicScraper, _ParserScraper, ParserScraper from ..scraper import _BasicScraper, _ParserScraper, ParserScraper
@ -328,19 +328,14 @@ class DreamKeepersPrelude(_ParserScraper):
help = 'Index format: n' help = 'Index format: n'
class DresdenCodak(_ParserScraper): class DresdenCodak(ParserScraper):
url = 'http://dresdencodak.com/' url = 'http://dresdencodak.com/'
startUrl = url + 'cat/comic/'
firstStripUrl = url + '2007/02/08/pom/' firstStripUrl = url + '2007/02/08/pom/'
imageSearch = '//section[d:class("entry-content")]//img[d:class("aligncenter")]' imageSearch = '//section[d:class("entry-content")]//img[d:class("aligncenter")]'
prevSearch = '//a[img[contains(@src, "prev")]]' prevSearch = '//a[img[contains(@src, "prev")]]'
latestSearch = '//a[d:class("tc-grid-bg-link")]' latestSearch = '//a[d:class("tc-grid-bg-link")]'
starter = indirectStarter starter = indirectStarter
# Blog and comic are mixed...
def shouldSkipUrl(self, url, data):
return not data.xpath(self.imageSearch)
class DrFun(_ParserScraper): class DrFun(_ParserScraper):
baseUrl = ('https://web.archive.org/web/20180726145737/' baseUrl = ('https://web.archive.org/web/20180726145737/'
@ -355,14 +350,12 @@ class DrFun(_ParserScraper):
help = 'Index format: nnnnn' help = 'Index format: nnnnn'
class Drive(_BasicScraper): class Drive(ParserScraper):
url = 'http://www.drivecomic.com/' url = 'http://www.drivecomic.com/'
rurl = escape(url) firstStripUrl = url + 'comic/act-1-pg-001/'
stripUrl = url + 'archive/%s.html' imageSearch = ('//div[@id="unspliced-comic"]//img/@data-src-img',
firstStripUrl = stripUrl % '090815' '//div[@id="unspliced-comic"]//picture//img')
imageSearch = compile(tagre("img", "src", r'(http://cdn\.drivecomic\.com/strips/main/[^"]+)')) prevSearch = '//a[d:class("previous-comic")]'
prevSearch = compile(tagre("a", "href", r'(%sarchive/\d+\.html)' % rurl) + "Previous")
help = 'Index format: yymmdd'
class DrMcNinja(_ParserScraper): class DrMcNinja(_ParserScraper):

View file

@ -1,6 +1,6 @@
# SPDX-License-Identifier: MIT # SPDX-License-Identifier: MIT
# Copyright (C) 2019-2022 Tobias Gruetzmacher # SPDX-FileCopyrightText: © 2019 Tobias Gruetzmacher
# Copyright (C) 2019-2020 Daniel Ring # SPDX-FileCopyrightText: © 2019 Daniel Ring
from ..scraper import ParserScraper from ..scraper import ParserScraper
from ..helpers import indirectStarter from ..helpers import indirectStarter
@ -27,7 +27,7 @@ class Derideal(ParserScraper):
def starter(self): def starter(self):
indexPage = self.getPage(self.url) indexPage = self.getPage(self.url)
self.chapters = indexPage.xpath('//a[contains(text(), "Read this episode")]/@href') self.chapters = self.match(indexPage, '//a[contains(text(), "Read this episode")]/@href')
self.currentChapter = len(self.chapters) self.currentChapter = len(self.chapters)
return indirectStarter(self) return indirectStarter(self)

View file

@ -113,7 +113,7 @@ class Erfworld(ParserScraper):
def shouldSkipUrl(self, url, data): def shouldSkipUrl(self, url, data):
"""Skip pages without images.""" """Skip pages without images."""
return not data.xpath(self.imageSearch) return not self.match(data, self.imageSearch)
def namer(self, imageUrl, pageUrl): def namer(self, imageUrl, pageUrl):
# Fix inconsistent filenames # Fix inconsistent filenames
@ -167,15 +167,6 @@ class Erstwhile(WordPressNavi):
endOfLife = True endOfLife = True
class Everblue(ComicControlScraper):
url = 'http://www.everblue-comic.com/comic/'
stripUrl = url + '%s'
firstStripUrl = stripUrl % '1'
def namer(self, imageUrl, pageUrl):
return imageUrl.rsplit('/', 1)[-1].split('-', 1)[1]
class EverybodyLovesEricRaymond(_ParserScraper): class EverybodyLovesEricRaymond(_ParserScraper):
url = 'http://geekz.co.uk/lovesraymond/' url = 'http://geekz.co.uk/lovesraymond/'
firstStripUrl = url + 'archive/slashdotted' firstStripUrl = url + 'archive/slashdotted'
@ -190,9 +181,10 @@ class EvilDiva(WordPressScraper):
endOfLife = True endOfLife = True
class EvilInc(_ParserScraper): class EvilInc(ParserScraper):
url = 'https://www.evil-inc.com/' url = 'https://www.evil-inc.com/'
imageSearch = '//div[@id="unspliced-comic"]/img/@data-src' imageSearch = ('//div[@id="unspliced-comic"]/img',
'//div[@id="unspliced-comic"]/picture//img')
prevSearch = '//a[./i[d:class("fa-chevron-left")]]' prevSearch = '//a[./i[d:class("fa-chevron-left")]]'
firstStripUrl = url + 'comic/monday-3/' firstStripUrl = url + 'comic/monday-3/'
@ -263,7 +255,7 @@ class ExtraFabulousComics(WordPressScraper):
return '_'.join((pagepart, imagename)) return '_'.join((pagepart, imagename))
def shouldSkipUrl(self, url, data): def shouldSkipUrl(self, url, data):
return data.xpath('//div[@id="comic"]//iframe') return self.match(data, '//div[@id="comic"]//iframe')
class ExtraLife(_BasicScraper): class ExtraLife(_BasicScraper):

View file

@ -140,7 +140,7 @@ class FoxDad(ParserScraper):
def namer(self, imageUrl, pageUrl): def namer(self, imageUrl, pageUrl):
page = self.getPage(pageUrl) page = self.getPage(pageUrl)
post = page.xpath('//li[@class="timestamp"]/a/@href')[0] post = self.match(page, '//li[d:class("timestamp")]/a/@href')[0]
post = post.replace('https://foxdad.com/post/', '') post = post.replace('https://foxdad.com/post/', '')
if '-consider-support' in post: if '-consider-support' in post:
post = post.split('-consider-support')[0] post = post.split('-consider-support')[0]
@ -171,7 +171,7 @@ class Fragile(_ParserScraper):
endOfLife = True endOfLife = True
class FredoAndPidjin(_ParserScraper): class FredoAndPidjin(ParserScraper):
url = 'https://www.pidjin.net/' url = 'https://www.pidjin.net/'
stripUrl = url + '%s/' stripUrl = url + '%s/'
firstStripUrl = stripUrl % '2006/02/19/goofy-monday' firstStripUrl = stripUrl % '2006/02/19/goofy-monday'
@ -180,7 +180,7 @@ class FredoAndPidjin(_ParserScraper):
prevSearch = '//span[d:class("prev")]/a' prevSearch = '//span[d:class("prev")]/a'
latestSearch = '//section[d:class("latest")]//a' latestSearch = '//section[d:class("latest")]//a'
starter = indirectStarter starter = indirectStarter
namer = joinPathPartsNamer((0, 1, 2)) namer = joinPathPartsNamer(pageparts=(0, 1, 2), imageparts=(-1,))
class Freefall(_ParserScraper): class Freefall(_ParserScraper):
@ -216,7 +216,7 @@ class FriendsYouAreStuckWith(WordPressScraper):
def namer(self, imageUrl, pageUrl): def namer(self, imageUrl, pageUrl):
page = self.getPage(pageUrl) page = self.getPage(pageUrl)
strip = page.xpath('//div[@id="comic-wrap"]/@class')[0].replace('comic-id-', '') strip = self.match(page, '//div[@id="comic-wrap"]/@class')[0].replace('comic-id-', '')
return strip + '_' + imageUrl.rstrip('/').rsplit('/', 1)[-1] return strip + '_' + imageUrl.rstrip('/').rsplit('/', 1)[-1]

View file

@ -3,11 +3,11 @@
# SPDX-FileCopyrightText: © 2012 Bastian Kleineidam # SPDX-FileCopyrightText: © 2012 Bastian Kleineidam
# SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher # SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher
# SPDX-FileCopyrightText: © 2019 Daniel Ring # SPDX-FileCopyrightText: © 2019 Daniel Ring
from re import compile, escape from re import compile
from ..scraper import _BasicScraper, _ParserScraper from ..scraper import _BasicScraper, _ParserScraper, ParserScraper
from ..helpers import indirectStarter from ..helpers import indirectStarter
from ..util import tagre from ..util import tagre, getQueryParams
from .common import ComicControlScraper, WordPressScraper, WordPressNavi from .common import ComicControlScraper, WordPressScraper, WordPressNavi
@ -27,13 +27,9 @@ class Garanos(WordPressScraper):
endOfLife = True endOfLife = True
class GastroPhobia(_ParserScraper): class GastroPhobia(ComicControlScraper):
url = 'http://www.gastrophobia.com/' url = 'https://gastrophobia.com/'
stripUrl = url + 'index.php?date=%s' firstStripUrl = url + 'comix/the-mane-event'
firstStripUrl = stripUrl % '2008-07-30'
imageSearch = '//div[@id="comic"]//img'
prevSearch = '//div[@id="prev"]/a'
help = 'Index format: yyyy-mm-dd'
class Geeks(_ParserScraper): class Geeks(_ParserScraper):
@ -51,7 +47,7 @@ class GeeksNextDoor(_ParserScraper):
url = 'http://www.geeksnextcomic.com/' url = 'http://www.geeksnextcomic.com/'
stripUrl = url + '%s.html' stripUrl = url + '%s.html'
firstStripUrl = stripUrl % '2007-03-27' # '2010-10-04' firstStripUrl = stripUrl % '2007-03-27' # '2010-10-04'
imageSearch = '//p/img' imageSearch = ('//p/img', '//p/span/img')
prevSearch = ( prevSearch = (
'//a[img[contains(@src, "/nav_prev")]]', '//a[img[contains(@src, "/nav_prev")]]',
'//a[contains(text(), "< prev")]', # start page is different '//a[contains(text(), "< prev")]', # start page is different
@ -59,16 +55,12 @@ class GeeksNextDoor(_ParserScraper):
help = 'Index format: yyyy-mm-dd' help = 'Index format: yyyy-mm-dd'
class GirlGenius(_BasicScraper): class GirlGenius(ParserScraper):
baseUrl = 'http://www.girlgeniusonline.com/' url = 'https://www.girlgeniusonline.com/comic.php'
rurl = escape(baseUrl)
url = baseUrl + 'comic.php'
stripUrl = url + '?date=%s' stripUrl = url + '?date=%s'
firstStripUrl = stripUrl % '20021104' firstStripUrl = stripUrl % '20021104'
imageSearch = compile( imageSearch = '//img[@alt="Comic"]'
tagre("img", "src", r"(%sggmain/strips/[^']*)" % rurl, quote="'")) prevSearch = '//a[@id="topprev"]'
prevSearch = compile(tagre("a", "id", "topprev", quote="\"",
before=r"(%s[^\"']+)" % rurl))
multipleImagesPerStrip = True multipleImagesPerStrip = True
help = 'Index format: yyyymmdd' help = 'Index format: yyyymmdd'
@ -99,20 +91,18 @@ class GoGetARoomie(ComicControlScraper):
url = 'http://www.gogetaroomie.com' url = 'http://www.gogetaroomie.com'
class GoneWithTheBlastwave(_BasicScraper): class GoneWithTheBlastwave(ParserScraper):
url = 'http://www.blastwave-comic.com/index.php?p=comic&nro=1' stripUrl = 'http://www.blastwave-comic.com/index.php?p=comic&nro=%s'
starter = indirectStarter
stripUrl = url[:-1] + '%s'
firstStripUrl = stripUrl % '1' firstStripUrl = stripUrl % '1'
imageSearch = compile(r'<img.+src=".+(/comics/.+?)"') url = firstStripUrl
prevSearch = compile(r'href="(index.php\?p=comic&amp;nro=\d+)">' + starter = indirectStarter
r'<img src="images/page/default/previous') imageSearch = '//*[@id="comic_ruutu"]/center/img'
latestSearch = compile(r'href="(index.php\?p=comic&amp;nro=\d+)">' + prevSearch = '//a[img[contains(@src, "previous")]]'
r'<img src="images/page/default/latest') latestSearch = '//a[img[contains(@src, "latest")]]'
help = 'Index format: n' help = 'Index format: n'
def namer(self, image_url, page_url): def namer(self, image_url, page_url):
return '%02d' % int(compile(r'nro=(\d+)').search(page_url).group(1)) return '%02d' % int(getQueryParams(page_url)['nro'][0])
class GrrlPower(WordPressScraper): class GrrlPower(WordPressScraper):
@ -130,13 +120,12 @@ class GuildedAge(WordPressScraper):
firstStripUrl = url + 'comic/chapter-1-cover/' firstStripUrl = url + 'comic/chapter-1-cover/'
class GUComics(_BasicScraper): class GUComics(ParserScraper):
url = 'http://www.gucomics.com/' stripUrl = 'https://www.gucomics.com/%s'
stripUrl = url + '%s' url = stripUrl % 'comic/'
firstStripUrl = stripUrl % '20000710' firstStripUrl = stripUrl % '20000710'
imageSearch = compile(tagre("img", "src", r'(/comics/\d{4}/gu_[^"]+)')) imageSearch = '//img[contains(@src, "/comics/2")]'
prevSearch = compile(tagre("a", "href", r'(/\d+)') + prevSearch = '//a[img[contains(@alt, "previous")]]'
tagre("img", "src", r'/images/nav/prev\.png'))
help = 'Index format: yyyymmdd' help = 'Index format: yyyymmdd'

View file

@ -1,7 +1,7 @@
# SPDX-License-Identifier: MIT # SPDX-License-Identifier: MIT
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs # SPDX-FileCopyrightText: © 2004 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam # SPDX-FileCopyrightText: © 2012 Bastian Kleineidam
# Copyright (C) 2015-2022 Tobias Gruetzmacher # SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher
from ..scraper import ParserScraper from ..scraper import ParserScraper
from ..helpers import indirectStarter from ..helpers import indirectStarter
@ -31,7 +31,7 @@ class GoComics(ParserScraper):
def shouldSkipUrl(self, url, data): def shouldSkipUrl(self, url, data):
"""Skip pages without images.""" """Skip pages without images."""
return data.xpath('//img[contains(@src, "content-error-missing")]') return self.match(data, '//img[contains(@src, "content-error-missing")]')
@classmethod @classmethod
def getmodules(cls): # noqa: CFQ001 def getmodules(cls): # noqa: CFQ001
@ -44,7 +44,6 @@ class GoComics(ParserScraper):
# START AUTOUPDATE # START AUTOUPDATE
cls('1AndDone', '1-and-done'), cls('1AndDone', '1-and-done'),
cls('9ChickweedLane', '9chickweedlane'), cls('9ChickweedLane', '9chickweedlane'),
cls('9ChickweedLaneClassics', '9-chickweed-lane-classics'),
cls('9To5', '9to5'), cls('9To5', '9to5'),
cls('Aaggghhh', 'Aaggghhh', 'es'), cls('Aaggghhh', 'Aaggghhh', 'es'),
cls('AdamAtHome', 'adamathome'), cls('AdamAtHome', 'adamathome'),
@ -62,6 +61,7 @@ class GoComics(ParserScraper):
cls('Annie', 'annie'), cls('Annie', 'annie'),
cls('AProblemLikeJamal', 'a-problem-like-jamal'), cls('AProblemLikeJamal', 'a-problem-like-jamal'),
cls('ArloAndJanis', 'arloandjanis'), cls('ArloAndJanis', 'arloandjanis'),
cls('ArtByMoga', 'artbymoga'),
cls('AskShagg', 'askshagg'), cls('AskShagg', 'askshagg'),
cls('AtTavicat', 'tavicat'), cls('AtTavicat', 'tavicat'),
cls('AuntyAcid', 'aunty-acid'), cls('AuntyAcid', 'aunty-acid'),
@ -69,7 +69,6 @@ class GoComics(ParserScraper):
cls('BackInTheDay', 'backintheday'), cls('BackInTheDay', 'backintheday'),
cls('BackToBC', 'back-to-bc'), cls('BackToBC', 'back-to-bc'),
cls('Bacon', 'bacon'), cls('Bacon', 'bacon'),
cls('Badlands', 'badlands'),
cls('BadMachinery', 'bad-machinery'), cls('BadMachinery', 'bad-machinery'),
cls('Baldo', 'baldo'), cls('Baldo', 'baldo'),
cls('BaldoEnEspanol', 'baldoespanol', 'es'), cls('BaldoEnEspanol', 'baldoespanol', 'es'),
@ -90,8 +89,8 @@ class GoComics(ParserScraper):
cls('Betty', 'betty'), cls('Betty', 'betty'),
cls('BFGFSyndrome', 'bfgf-syndrome'), cls('BFGFSyndrome', 'bfgf-syndrome'),
cls('BigNate', 'bignate'), cls('BigNate', 'bignate'),
cls('BigNateFirstClass', 'big-nate-first-class'),
cls('BigTop', 'bigtop'), cls('BigTop', 'bigtop'),
cls('BillBramhall', 'bill-bramhall'),
cls('BirdAndMoon', 'bird-and-moon'), cls('BirdAndMoon', 'bird-and-moon'),
cls('Birdbrains', 'birdbrains'), cls('Birdbrains', 'birdbrains'),
cls('BleekerTheRechargeableDog', 'bleeker'), cls('BleekerTheRechargeableDog', 'bleeker'),
@ -99,14 +98,14 @@ class GoComics(ParserScraper):
cls('BloomCounty', 'bloomcounty'), cls('BloomCounty', 'bloomcounty'),
cls('BloomCounty2019', 'bloom-county'), cls('BloomCounty2019', 'bloom-county'),
cls('BobGorrell', 'bobgorrell'), cls('BobGorrell', 'bobgorrell'),
cls('BobTheAngryFlower', 'bob-the-angry-flower'),
cls('BobTheSquirrel', 'bobthesquirrel'), cls('BobTheSquirrel', 'bobthesquirrel'),
cls('BoNanas', 'bonanas'), cls('BoNanas', 'bonanas'),
cls('Boomerangs', 'boomerangs'), cls('Boomerangs', 'boomerangs'),
cls('Bottomliners', 'bottomliners'), cls('BottomLiners', 'bottomliners'),
cls('BoundAndGagged', 'boundandgagged'), cls('BoundAndGagged', 'boundandgagged'),
cls('Bozo', 'bozo'), cls('Bozo', 'bozo'),
cls('BreakingCatNews', 'breaking-cat-news'), cls('BreakingCatNews', 'breaking-cat-news'),
cls('BreakOfDay', 'break-of-day'),
cls('Brevity', 'brevity'), cls('Brevity', 'brevity'),
cls('BrewsterRockit', 'brewsterrockit'), cls('BrewsterRockit', 'brewsterrockit'),
cls('BrianMcFadden', 'brian-mcfadden'), cls('BrianMcFadden', 'brian-mcfadden'),
@ -116,7 +115,6 @@ class GoComics(ParserScraper):
cls('Buni', 'buni'), cls('Buni', 'buni'),
cls('CalvinAndHobbes', 'calvinandhobbes'), cls('CalvinAndHobbes', 'calvinandhobbes'),
cls('CalvinAndHobbesEnEspanol', 'calvinandhobbesespanol', 'es'), cls('CalvinAndHobbesEnEspanol', 'calvinandhobbesespanol', 'es'),
cls('Candorville', 'candorville'),
cls('CatanaComics', 'little-moments-of-love'), cls('CatanaComics', 'little-moments-of-love'),
cls('CathyClassics', 'cathy'), cls('CathyClassics', 'cathy'),
cls('CathyCommiserations', 'cathy-commiserations'), cls('CathyCommiserations', 'cathy-commiserations'),
@ -139,17 +137,18 @@ class GoComics(ParserScraper):
cls('CowAndBoyClassics', 'cowandboy'), cls('CowAndBoyClassics', 'cowandboy'),
cls('CowTown', 'cowtown'), cls('CowTown', 'cowtown'),
cls('Crabgrass', 'crabgrass'), cls('Crabgrass', 'crabgrass'),
# Crankshaft has a duplicate in ComicsKingdom/Crankshaft
cls('Crumb', 'crumb'), cls('Crumb', 'crumb'),
cls('CulDeSac', 'culdesac'), cls('CulDeSac', 'culdesac'),
cls('Curses', 'curses'),
cls('DaddysHome', 'daddyshome'), cls('DaddysHome', 'daddyshome'),
cls('DanaSummers', 'danasummers'), cls('DanaSummers', 'danasummers'),
cls('DarkSideOfTheHorse', 'darksideofthehorse'), cls('DarkSideOfTheHorse', 'darksideofthehorse'),
cls('DayByDave', 'day-by-dave'),
cls('DeepDarkFears', 'deep-dark-fears'), cls('DeepDarkFears', 'deep-dark-fears'),
cls('DeFlocked', 'deflocked'), cls('DeFlocked', 'deflocked'),
cls('DiamondLil', 'diamondlil'), cls('DiamondLil', 'diamondlil'),
cls('DickTracy', 'dicktracy'), cls('DickTracy', 'dicktracy'),
cls('DilbertClassics', 'dilbert-classics'),
cls('DilbertEnEspanol', 'dilbert-en-espanol', 'es'),
cls('DinosaurComics', 'dinosaur-comics'), cls('DinosaurComics', 'dinosaur-comics'),
cls('DogEatDoug', 'dogeatdoug'), cls('DogEatDoug', 'dogeatdoug'),
cls('DogsOfCKennel', 'dogsofckennel'), cls('DogsOfCKennel', 'dogsofckennel'),
@ -160,15 +159,14 @@ class GoComics(ParserScraper):
cls('Doonesbury', 'doonesbury'), cls('Doonesbury', 'doonesbury'),
cls('Drabble', 'drabble'), cls('Drabble', 'drabble'),
cls('DrewSheneman', 'drewsheneman'), cls('DrewSheneman', 'drewsheneman'),
cls('DumbwichCastle', 'dumbwich-castle'),
cls('EdgeCity', 'edge-city'), cls('EdgeCity', 'edge-city'),
cls('Eek', 'eek'), cls('Eek', 'eek'),
cls('ElCafDePoncho', 'el-cafe-de-poncho', 'es'), cls('ElCafDePoncho', 'el-cafe-de-poncho', 'es'),
cls('EmmyLou', 'emmy-lou'), cls('EmmyLou', 'emmy-lou'),
cls('Endtown', 'endtown'), cls('Endtown', 'endtown'),
cls('EricAllie', 'eric-allie'),
cls('EverydayPeopleCartoons', 'everyday-people-cartoons'), cls('EverydayPeopleCartoons', 'everyday-people-cartoons'),
cls('Eyebeam', 'eyebeam'), cls('Eyebeam', 'eyebeam'),
cls('EyebeamClassic', 'eyebeam-classic'),
cls('FalseKnees', 'false-knees'), cls('FalseKnees', 'false-knees'),
cls('FamilyTree', 'familytree'), cls('FamilyTree', 'familytree'),
cls('Farcus', 'farcus'), cls('Farcus', 'farcus'),
@ -191,8 +189,8 @@ class GoComics(ParserScraper):
cls('FreeRange', 'freerange'), cls('FreeRange', 'freerange'),
cls('FreshlySqueezed', 'freshlysqueezed'), cls('FreshlySqueezed', 'freshlysqueezed'),
cls('FrogApplause', 'frogapplause'), cls('FrogApplause', 'frogapplause'),
cls('FurBabies', 'furbabies'),
cls('Garfield', 'garfield'), cls('Garfield', 'garfield'),
cls('GarfieldClassics', 'garfield-classics'),
cls('GarfieldEnEspanol', 'garfieldespanol', 'es'), cls('GarfieldEnEspanol', 'garfieldespanol', 'es'),
cls('GaryMarkstein', 'garymarkstein'), cls('GaryMarkstein', 'garymarkstein'),
cls('GaryVarvel', 'garyvarvel'), cls('GaryVarvel', 'garyvarvel'),
@ -222,6 +220,7 @@ class GoComics(ParserScraper):
cls('HerbAndJamaal', 'herbandjamaal'), cls('HerbAndJamaal', 'herbandjamaal'),
cls('Herman', 'herman'), cls('Herman', 'herman'),
cls('HomeAndAway', 'homeandaway'), cls('HomeAndAway', 'homeandaway'),
cls('HomeFree', 'homefree'),
cls('HotComicsForCoolPeople', 'hot-comics-for-cool-people'), cls('HotComicsForCoolPeople', 'hot-comics-for-cool-people'),
cls('HutchOwen', 'hutch-owen'), cls('HutchOwen', 'hutch-owen'),
cls('ImagineThis', 'imaginethis'), cls('ImagineThis', 'imaginethis'),
@ -238,10 +237,12 @@ class GoComics(ParserScraper):
cls('JeffDanziger', 'jeffdanziger'), cls('JeffDanziger', 'jeffdanziger'),
cls('JeffStahler', 'jeffstahler'), cls('JeffStahler', 'jeffstahler'),
cls('JenSorensen', 'jen-sorensen'), cls('JenSorensen', 'jen-sorensen'),
cls('JerryKingComics', 'jerry-king-comics'),
cls('JimBentonCartoons', 'jim-benton-cartoons'), cls('JimBentonCartoons', 'jim-benton-cartoons'),
cls('JimMorin', 'jimmorin'), cls('JimMorin', 'jimmorin'),
cls('JoeHeller', 'joe-heller'), cls('JoeHeller', 'joe-heller'),
cls('JoelPett', 'joelpett'), cls('JoelPett', 'joelpett'),
cls('JoeyWeatherford', 'joey-weatherford'),
cls('JohnDeering', 'johndeering'), cls('JohnDeering', 'johndeering'),
cls('JumpStart', 'jumpstart'), cls('JumpStart', 'jumpstart'),
cls('JunkDrawer', 'junk-drawer'), cls('JunkDrawer', 'junk-drawer'),
@ -287,7 +288,6 @@ class GoComics(ParserScraper):
cls('Lunarbaboon', 'lunarbaboon'), cls('Lunarbaboon', 'lunarbaboon'),
cls('M2Bulls', 'm2bulls'), cls('M2Bulls', 'm2bulls'),
cls('Maintaining', 'maintaining'), cls('Maintaining', 'maintaining'),
cls('MakingIt', 'making-it'),
cls('MannequinOnTheMoon', 'mannequin-on-the-moon'), cls('MannequinOnTheMoon', 'mannequin-on-the-moon'),
cls('MariasDay', 'marias-day'), cls('MariasDay', 'marias-day'),
cls('Marmaduke', 'marmaduke'), cls('Marmaduke', 'marmaduke'),
@ -299,6 +299,7 @@ class GoComics(ParserScraper):
cls('MessycowComics', 'messy-cow'), cls('MessycowComics', 'messy-cow'),
cls('MexikidStories', 'mexikid-stories'), cls('MexikidStories', 'mexikid-stories'),
cls('MichaelRamirez', 'michaelramirez'), cls('MichaelRamirez', 'michaelramirez'),
cls('MikeBeckom', 'mike-beckom'),
cls('MikeDuJour', 'mike-du-jour'), cls('MikeDuJour', 'mike-du-jour'),
cls('MikeLester', 'mike-lester'), cls('MikeLester', 'mike-lester'),
cls('MikeLuckovich', 'mikeluckovich'), cls('MikeLuckovich', 'mikeluckovich'),
@ -307,9 +308,9 @@ class GoComics(ParserScraper):
cls('Momma', 'momma'), cls('Momma', 'momma'),
cls('Monty', 'monty'), cls('Monty', 'monty'),
cls('MontyDiaros', 'monty-diaros', 'es'), cls('MontyDiaros', 'monty-diaros', 'es'),
# MotherGooseAndGrimm has a duplicate in ComicsKingdom/MotherGooseAndGrimm
cls('MotleyClassics', 'motley-classics'), cls('MotleyClassics', 'motley-classics'),
cls('MrLowe', 'mr-lowe'), cls('MrLowe', 'mr-lowe'),
cls('MtPleasant', 'mtpleasant'),
cls('MuttAndJeff', 'muttandjeff'), cls('MuttAndJeff', 'muttandjeff'),
cls('MyDadIsDracula', 'my-dad-is-dracula'), cls('MyDadIsDracula', 'my-dad-is-dracula'),
cls('MythTickle', 'mythtickle'), cls('MythTickle', 'mythtickle'),
@ -341,10 +342,10 @@ class GoComics(ParserScraper):
cls('OverTheHedge', 'overthehedge'), cls('OverTheHedge', 'overthehedge'),
cls('OzyAndMillie', 'ozy-and-millie'), cls('OzyAndMillie', 'ozy-and-millie'),
cls('PatOliphant', 'patoliphant'), cls('PatOliphant', 'patoliphant'),
cls('PCAndPixel', 'pcandpixel'),
cls('Peanuts', 'peanuts'), cls('Peanuts', 'peanuts'),
cls('PeanutsBegins', 'peanuts-begins'), cls('PeanutsBegins', 'peanuts-begins'),
cls('PearlsBeforeSwine', 'pearlsbeforeswine'), cls('PearlsBeforeSwine', 'pearlsbeforeswine'),
cls('PedroXMolina', 'pedroxmolina'),
cls('Periquita', 'periquita', 'es'), cls('Periquita', 'periquita', 'es'),
cls('PerlasParaLosCerdos', 'perlas-para-los-cerdos', 'es'), cls('PerlasParaLosCerdos', 'perlas-para-los-cerdos', 'es'),
cls('PerryBibleFellowship', 'perry-bible-fellowship'), cls('PerryBibleFellowship', 'perry-bible-fellowship'),
@ -383,7 +384,6 @@ class GoComics(ParserScraper):
cls('RoseIsRose', 'roseisrose'), cls('RoseIsRose', 'roseisrose'),
cls('Rubes', 'rubes'), cls('Rubes', 'rubes'),
cls('RudyPark', 'rudypark'), cls('RudyPark', 'rudypark'),
cls('SaltNPepper', 'salt-n-pepper'),
cls('SarahsScribbles', 'sarahs-scribbles'), cls('SarahsScribbles', 'sarahs-scribbles'),
cls('SaturdayMorningBreakfastCereal', 'saturday-morning-breakfast-cereal'), cls('SaturdayMorningBreakfastCereal', 'saturday-morning-breakfast-cereal'),
cls('SavageChickens', 'savage-chickens'), cls('SavageChickens', 'savage-chickens'),
@ -394,13 +394,11 @@ class GoComics(ParserScraper):
cls('ShermansLagoon', 'shermanslagoon'), cls('ShermansLagoon', 'shermanslagoon'),
cls('ShirleyAndSonClassics', 'shirley-and-son-classics'), cls('ShirleyAndSonClassics', 'shirley-and-son-classics'),
cls('Shoe', 'shoe'), cls('Shoe', 'shoe'),
cls('SigneWilkinson', 'signewilkinson'),
cls('SketchsharkComics', 'sketchshark-comics'), cls('SketchsharkComics', 'sketchshark-comics'),
cls('SkinHorse', 'skinhorse'), cls('SkinHorse', 'skinhorse'),
cls('Skippy', 'skippy'), cls('Skippy', 'skippy'),
cls('SmallPotatoes', 'small-potatoes'), cls('SmallPotatoes', 'small-potatoes'),
cls('SnoopyEnEspanol', 'peanuts-espanol', 'es'), cls('SnoopyEnEspanol', 'peanuts-espanol', 'es'),
cls('Snowflakes', 'snowflakes'),
cls('SnowSez', 'snow-sez'), cls('SnowSez', 'snow-sez'),
cls('SpeedBump', 'speedbump'), cls('SpeedBump', 'speedbump'),
cls('SpiritOfTheStaircase', 'spirit-of-the-staircase'), cls('SpiritOfTheStaircase', 'spirit-of-the-staircase'),
@ -410,9 +408,7 @@ class GoComics(ParserScraper):
cls('SteveKelley', 'stevekelley'), cls('SteveKelley', 'stevekelley'),
cls('StickyComics', 'sticky-comics'), cls('StickyComics', 'sticky-comics'),
cls('StoneSoup', 'stonesoup'), cls('StoneSoup', 'stonesoup'),
cls('StoneSoupClassics', 'stone-soup-classics'),
cls('StrangeBrew', 'strangebrew'), cls('StrangeBrew', 'strangebrew'),
cls('StuartCarlson', 'stuartcarlson'),
cls('StudioJantze', 'studio-jantze'), cls('StudioJantze', 'studio-jantze'),
cls('SunnyStreet', 'sunny-street'), cls('SunnyStreet', 'sunny-street'),
cls('SunshineState', 'sunshine-state'), cls('SunshineState', 'sunshine-state'),
@ -425,6 +421,7 @@ class GoComics(ParserScraper):
cls('TarzanEnEspanol', 'tarzan-en-espanol', 'es'), cls('TarzanEnEspanol', 'tarzan-en-espanol', 'es'),
cls('TedRall', 'ted-rall'), cls('TedRall', 'ted-rall'),
cls('TenCats', 'ten-cats'), cls('TenCats', 'ten-cats'),
cls('Tex', 'tex'),
cls('TextsFromMittens', 'texts-from-mittens'), cls('TextsFromMittens', 'texts-from-mittens'),
cls('Thatababy', 'thatababy'), cls('Thatababy', 'thatababy'),
cls('ThatIsPriceless', 'that-is-priceless'), cls('ThatIsPriceless', 'that-is-priceless'),
@ -451,6 +448,7 @@ class GoComics(ParserScraper):
cls('TheHumbleStumble', 'humble-stumble'), cls('TheHumbleStumble', 'humble-stumble'),
cls('TheKChronicles', 'thekchronicles'), cls('TheKChronicles', 'thekchronicles'),
cls('TheKnightLife', 'theknightlife'), cls('TheKnightLife', 'theknightlife'),
cls('TheLockhorns', 'lockhorns'),
cls('TheMartianConfederacy', 'the-martian-confederacy'), cls('TheMartianConfederacy', 'the-martian-confederacy'),
cls('TheMeaningOfLila', 'meaningoflila'), cls('TheMeaningOfLila', 'meaningoflila'),
cls('TheMiddleAge', 'the-middle-age'), cls('TheMiddleAge', 'the-middle-age'),
@ -473,6 +471,7 @@ class GoComics(ParserScraper):
cls('TruthFacts', 'truth-facts'), cls('TruthFacts', 'truth-facts'),
cls('Tutelandia', 'tutelandia', 'es'), cls('Tutelandia', 'tutelandia', 'es'),
cls('TwoPartyOpera', 'two-party-opera'), cls('TwoPartyOpera', 'two-party-opera'),
cls('UFO', 'ufo'),
cls('UnderpantsAndOverbites', 'underpants-and-overbites'), cls('UnderpantsAndOverbites', 'underpants-and-overbites'),
cls('UnderstandingChaos', 'understanding-chaos'), cls('UnderstandingChaos', 'understanding-chaos'),
cls('UnstrangePhenomena', 'unstrange-phenomena'), cls('UnstrangePhenomena', 'unstrange-phenomena'),
@ -487,6 +486,7 @@ class GoComics(ParserScraper):
cls('ViiviAndWagner', 'viivi-and-wagner'), cls('ViiviAndWagner', 'viivi-and-wagner'),
cls('WallaceTheBrave', 'wallace-the-brave'), cls('WallaceTheBrave', 'wallace-the-brave'),
cls('WaltHandelsman', 'walthandelsman'), cls('WaltHandelsman', 'walthandelsman'),
cls('Wannabe', 'wannabe'),
cls('Warped', 'warped'), cls('Warped', 'warped'),
cls('WatchYourHead', 'watchyourhead'), cls('WatchYourHead', 'watchyourhead'),
cls('Wawawiwa', 'wawawiwa'), cls('Wawawiwa', 'wawawiwa'),
@ -505,6 +505,7 @@ class GoComics(ParserScraper):
cls('WuMo', 'wumo'), cls('WuMo', 'wumo'),
cls('WumoEnEspanol', 'wumoespanol', 'es'), cls('WumoEnEspanol', 'wumoespanol', 'es'),
cls('Yaffle', 'yaffle'), cls('Yaffle', 'yaffle'),
cls('YeahItsChill', 'yeah-its-chill'),
cls('YesImHotInThis', 'yesimhotinthis'), cls('YesImHotInThis', 'yesimhotinthis'),
cls('ZackHill', 'zackhill'), cls('ZackHill', 'zackhill'),
cls('ZenPencils', 'zen-pencils'), cls('ZenPencils', 'zen-pencils'),

View file

@ -1,6 +1,6 @@
# SPDX-License-Identifier: MIT # SPDX-License-Identifier: MIT
# Copyright (C) 2019-2022 Tobias Gruetzmacher # SPDX-FileCopyrightText: © 2019 Tobias Gruetzmacher
# Copyright (C) 2019-2020 Daniel Ring # SPDX-FileCopyrightText: © 2019 Daniel Ring
from ..scraper import ParserScraper from ..scraper import ParserScraper
@ -44,7 +44,7 @@ class KemonoCafe(ParserScraper):
# Fix unordered filenames # Fix unordered filenames
if 'addictivescience' in pageUrl: if 'addictivescience' in pageUrl:
page = self.getPage(pageUrl) page = self.getPage(pageUrl)
num = int(page.xpath('//div[@id="comic-wrap"]/@class')[0].replace('comic-id-', '')) num = int(self.match(page, '//div[@id="comic-wrap"]/@class')[0].replace('comic-id-', ''))
filename = '%04d_%s' % (num, filename) filename = '%04d_%s' % (num, filename)
elif 'CaughtInOrbit' in filename: elif 'CaughtInOrbit' in filename:
filename = filename.replace('CaughtInOrbit', 'CIO') filename = filename.replace('CaughtInOrbit', 'CIO')

View file

@ -5,24 +5,7 @@
# SPDX-FileCopyrightText: © 2019 Daniel Ring # SPDX-FileCopyrightText: © 2019 Daniel Ring
from ..scraper import ParserScraper, _ParserScraper from ..scraper import ParserScraper, _ParserScraper
from ..helpers import bounceStarter, indirectStarter from ..helpers import bounceStarter, indirectStarter
from .common import ComicControlScraper, WordPressScraper, WordPressNaviIn from .common import ComicControlScraper, WordPressScraper
class Lackadaisy(ParserScraper):
url = 'https://www.lackadaisy.com/comic.php'
stripUrl = url + '?comicid=%s'
firstStripUrl = stripUrl % '1'
imageSearch = '//div[@id="exhibit"]/img[contains(@src, "comic/")]'
prevSearch = '//div[@class="prev"]/a'
nextSearch = '//div[@class="next"]/a'
help = 'Index format: n'
starter = bounceStarter
def namer(self, imageUrl, pageUrl):
# Use comic id for filename
num = pageUrl.rsplit('=', 1)[-1]
ext = imageUrl.rsplit('.', 1)[-1]
return 'lackadaisy_%s.%s' % (num, ext)
class Lancer(WordPressScraper): class Lancer(WordPressScraper):
@ -55,7 +38,7 @@ class LazJonesAndTheMayfieldRegulatorsSideStories(LazJonesAndTheMayfieldRegulato
def getPrevUrl(self, url, data): def getPrevUrl(self, url, data):
# Fix broken navigation links # Fix broken navigation links
if url == self.url and data.xpath(self.prevSearch + '/@href')[0] == self.stripUrl % 'summer00': if url == self.url and self.match(data, self.prevSearch + '/@href')[0] == self.stripUrl % 'summer00':
return self.stripUrl % 'summer21' return self.stripUrl % 'summer21'
return super(LazJonesAndTheMayfieldRegulators, self).getPrevUrl(url, data) return super(LazJonesAndTheMayfieldRegulators, self).getPrevUrl(url, data)

View file

@ -4,22 +4,18 @@
# SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher # SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher
# SPDX-FileCopyrightText: © 2019 Daniel Ring # SPDX-FileCopyrightText: © 2019 Daniel Ring
import json import json
from re import compile, escape, IGNORECASE from re import compile, IGNORECASE
from ..helpers import indirectStarter from ..helpers import indirectStarter
from ..scraper import ParserScraper, _BasicScraper, _ParserScraper from ..scraper import ParserScraper, _BasicScraper, _ParserScraper
from ..util import tagre from ..util import tagre
from ..xml import NS
from .common import ComicControlScraper, WordPressScraper, WordPressWebcomic from .common import ComicControlScraper, WordPressScraper, WordPressWebcomic
class MacHall(_BasicScraper): class MacHall(ComicControlScraper):
url = 'http://www.machall.com/' url = 'https://www.machall.com/'
stripUrl = url + 'view.php?date=%s' stripUrl = url + 'comic/%s'
firstStripUrl = stripUrl % '2000-11-07' firstStripUrl = stripUrl % 'moving-in'
imageSearch = compile(r'<img src="(comics/.+?)"')
prevSearch = compile(r'<a href="(.+?)"><img[^>]+?src=\'drop_shadow/previous.gif\'>')
help = 'Index format: yyyy-mm-dd'
class MadamAndEve(_BasicScraper): class MadamAndEve(_BasicScraper):
@ -58,12 +54,12 @@ class MareInternum(WordPressScraper):
firstStripUrl = stripUrl % 'intro-page-1' firstStripUrl = stripUrl % 'intro-page-1'
class Marilith(_BasicScraper): class Marilith(ParserScraper):
url = 'http://www.marilith.com/' url = 'https://web.archive.org/web/20170619193143/http://www.marilith.com/'
stripUrl = url + 'archive.php?date=%s' stripUrl = url + 'archive.php?date=%s'
firstStripUrl = stripUrl % '20041215' firstStripUrl = stripUrl % '20041215'
imageSearch = compile(r'<img src="(comics/.+?)" border') imageSearch = '//img[contains(@src, "comics/")]'
prevSearch = compile(r'<a href="(archive\.php\?date=.+?)"><img border=0 name=previous_day') prevSearch = '//a[img[@name="previous_day"]]'
help = 'Index format: yyyymmdd' help = 'Index format: yyyymmdd'
@ -80,22 +76,14 @@ class MarriedToTheSea(_ParserScraper):
return '%s-%s' % (date, filename) return '%s-%s' % (date, filename)
class MarryMe(_ParserScraper): class MarryMe(ParserScraper):
url = 'http://marryme.keenspot.com/' stripUrl = 'http://marryme.keenspot.com/d/%s.html'
stripUrl = url + 'd/%s.html' url = stripUrl % '20191001'
firstStripUrl = stripUrl % '20120730' firstStripUrl = stripUrl % '20120730'
imageSearch = '//img[@class="ksc"]' imageSearch = '//img[@class="ksc"]'
prevSearch = '//a[@rel="prev"]' prevSearch = '//a[@rel="prev"]'
endOfLife = True endOfLife = True
help = 'Index format: yyyymmdd'
class MaxOveracts(_ParserScraper):
url = 'http://occasionalcomics.com/'
stripUrl = url + '%s/'
css = True
imageSearch = '#comic img'
prevSearch = '.nav-previous > a'
help = 'Index format: nnn'
class Meek(WordPressScraper): class Meek(WordPressScraper):
@ -149,20 +137,22 @@ class MisfileHellHigh(Misfile):
help = 'Index format: yyyy-mm-dd' help = 'Index format: yyyy-mm-dd'
class MistyTheMouse(WordPressScraper): class MistyTheMouse(ParserScraper):
url = 'http://www.mistythemouse.com/' url = 'http://www.mistythemouse.com/'
prevSearch = '//a[@rel="prev"]' imageSearch = '//center/p/img'
firstStripUrl = 'http://www.mistythemouse.com/?p=12' prevSearch = '//a[img[contains(@src, "Previous")]]'
firstStripUrl = url + 'The_Live_In.html'
class MonkeyUser(_ParserScraper): class MonkeyUser(ParserScraper):
url = 'https://www.monkeyuser.com/' url = 'https://www.monkeyuser.com/'
prevSearch = '//div[@title="previous"]/a'
imageSearch = '//div[d:class("content")]/p/img' imageSearch = '//div[d:class("content")]/p/img'
prevSearch = '//a[text()="Prev"]'
multipleImagesPerStrip = True
def shouldSkipUrl(self, url, data): def shouldSkipUrl(self, url, data):
# videos # videos
return data.xpath('//div[d:class("video-container")]', namespaces=NS) return self.match(data, '//div[d:class("video-container")]')
class MonsieurLeChien(ParserScraper): class MonsieurLeChien(ParserScraper):
@ -195,43 +185,10 @@ class Moonlace(WordPressWebcomic):
return indirectStarter(self) return indirectStarter(self)
class Moonsticks(_ParserScraper): class Moonsticks(ParserScraper):
url = "http://moonsticks.org/" url = "https://moonsticks.org/"
imageSearch = "//div[@class='entry']//img" imageSearch = "//div[d:class('entry-content')]//img"
prevSearch = u"//a[text()='\u00AB Prev']" prevSearch = ('//a[@rel="prev"]', "//a[text()='\u00AB Prev']")
class MrLovenstein(_BasicScraper):
url = 'http://www.mrlovenstein.com/'
stripUrl = url + 'comic/%s'
firstStripUrl = stripUrl % '1'
imageSearch = (
# captures rollover comic
compile(tagre("div", "class", r'comic_image') + r'\s*.*\s*' +
tagre("div", "style", r'display: none;') + r'\s*.*\s' +
tagre("img", "src", r'(/images/comics/[^"]+)')),
# captures standard comic
compile(tagre("img", "src", r'(/images/comics/[^"]+)',
before="comic_main_image")),
)
prevSearch = compile(tagre("a", "href", r'([^"]+)') +
tagre("img", "src", "/images/nav_left.png"))
textSearch = compile(r'<meta name="description" content="(.+?)" />')
help = 'Index Format: n'
class MyCartoons(_BasicScraper):
url = 'http://mycartoons.de/'
rurl = escape(url)
stripUrl = url + 'page/%s'
imageSearch = (
compile(tagre("img", "src", r'(%swp-content/cartoons/(?:[^"]+/)?\d+-\d+-\d+[^"]+)' % rurl)),
compile(tagre("img", "src", r'(%scartoons/[^"]+/\d+-\d+-\d+[^"]+)' % rurl)),
)
prevSearch = compile(tagre("a", "href", r'(%spage/[^"]+)' % rurl) +
"&laquo;")
help = 'Index format: number'
lang = 'de'
class MyLifeWithFel(ParserScraper): class MyLifeWithFel(ParserScraper):

View file

@ -11,6 +11,12 @@ from ..util import tagre
from .common import WordPressScraper, WordPressNavi from .common import WordPressScraper, WordPressNavi
class OccasionalComicsDisorder(WordPressScraper):
url = 'https://occasionalcomics.com/'
stripUrl = url + 'comic/%s/'
firstStripUrl = stripUrl % 'latest-comic-2'
class OctopusPie(_ParserScraper): class OctopusPie(_ParserScraper):
url = 'http://www.octopuspie.com/' url = 'http://www.octopuspie.com/'
rurl = escape(url) rurl = escape(url)

View file

@ -604,7 +604,6 @@ class Removed(Scraper):
cls('WotNow'), cls('WotNow'),
# Removed in 3.0 # Removed in 3.0
cls('CatenaManor/CatenaCafe'),
cls('ComicFury/AdventuresOftheGreatCaptainMaggieandCrew'), cls('ComicFury/AdventuresOftheGreatCaptainMaggieandCrew'),
cls('ComicFury/AWAKENING'), cls('ComicFury/AWAKENING'),
cls('ComicFury/Beebleville'), cls('ComicFury/Beebleville'),
@ -833,8 +832,6 @@ class Removed(Scraper):
cls('ComicsKingdom/Redeye'), cls('ComicsKingdom/Redeye'),
cls('ComicsKingdom/RedeyeSundays'), cls('ComicsKingdom/RedeyeSundays'),
cls('CrapIDrewOnMyLunchBreak'), cls('CrapIDrewOnMyLunchBreak'),
cls('FalseStart'),
cls('Ginpu'),
cls('GoComics/060'), cls('GoComics/060'),
cls('GoComics/2CowsAndAChicken'), cls('GoComics/2CowsAndAChicken'),
cls('GoComics/ABitSketch'), cls('GoComics/ABitSketch'),
@ -995,11 +992,9 @@ class Removed(Scraper):
cls('GoComics/Wrobbertcartoons'), cls('GoComics/Wrobbertcartoons'),
cls('GoComics/Zootopia'), cls('GoComics/Zootopia'),
cls('JustAnotherEscape'), cls('JustAnotherEscape'),
cls('KemonoCafe/PrincessBunny'),
cls('Laiyu', 'brk'), cls('Laiyu', 'brk'),
cls('MangaDex/DrStone', 'legal'), cls('MangaDex/DrStone', 'legal'),
cls('MangaDex/HeavensDesignTeam', 'legal'), cls('MangaDex/HeavensDesignTeam', 'legal'),
cls('MangaDex/ImTheMaxLevelNewbie', 'legal'),
cls('MangaDex/SPYxFAMILY', 'legal'), cls('MangaDex/SPYxFAMILY', 'legal'),
cls('Ryugou'), cls('Ryugou'),
cls('SeelPeel'), cls('SeelPeel'),
@ -1573,22 +1568,82 @@ class Removed(Scraper):
cls('SnafuComics/Tin'), cls('SnafuComics/Tin'),
cls('SnafuComics/Titan'), cls('SnafuComics/Titan'),
cls('StudioKhimera/Eorah', 'mov'), cls('StudioKhimera/Eorah', 'mov'),
cls('StudioKhimera/Mousechevious'),
cls('StuffNoOneToldMe'), cls('StuffNoOneToldMe'),
cls('TaleOfTenThousand'), cls('TaleOfTenThousand'),
cls('TalesAndTactics'),
cls('TheCyantianChronicles/CookieCaper'), cls('TheCyantianChronicles/CookieCaper'),
cls('TheCyantianChronicles/Pawprints'), cls('TheCyantianChronicles/Pawprints'),
cls('VampireHunterBoyfriends'),
cls('VGCats/Adventure'), cls('VGCats/Adventure'),
cls('VGCats/Super'), cls('VGCats/Super'),
cls('VictimsOfTheSystem'), cls('VictimsOfTheSystem'),
cls('WebDesignerCOTW'), cls('WebDesignerCOTW'),
cls('WebToons/Adamsville'), cls('WebToons/Adamsville'),
cls('WebToons/CrapIDrewOnMyLunchBreak'), cls('WebToons/CrapIDrewOnMyLunchBreak'),
cls('WintersLight'),
# Removed in 3.1
cls('AbbysAgency', 'brk'),
cls('AcademyVale'),
cls('AhoyEarth', 'block'),
cls('Anaria', 'del'),
cls('Angels2200', 'del'),
cls('BlackRose', 'brk'),
cls('CatenaManor/CatenaCafe'),
cls('ComicsKingdom/AmazingSpiderman'),
cls('ComicsKingdom/AmazingSpidermanSpanish'),
cls('ComicsKingdom/BigBenBoltSundays'),
cls('ComicsKingdom/BonersArkSundays'),
cls('ComicsKingdom/BrianDuffy'),
cls('ComicsKingdom/Crankshaft'),
cls('ComicsKingdom/FlashGordonSundays'),
cls('ComicsKingdom/FunkyWinkerbean'),
cls('ComicsKingdom/FunkyWinkerbeanSunday'),
cls('ComicsKingdom/FunkyWinkerbeanSundays'),
cls('ComicsKingdom/FunkyWinkerbeanVintage'),
cls('ComicsKingdom/HeartOfJulietJonesSundays'),
cls('ComicsKingdom/KatzenjammerKidsSundays'),
cls('ComicsKingdom/Lockhorns'),
cls('ComicsKingdom/MandrakeTheMagicianSundays'),
cls('ComicsKingdom/MarkTrailVintage'),
cls('ComicsKingdom/MikePeters'),
cls('ComicsKingdom/MotherGooseAndGrimm'),
cls('ComicsKingdom/PhantomSundays'),
cls('ComicsKingdom/PrinceValiantSundays'),
cls('ComicsKingdom/Retail'),
cls('ComicsKingdom/TigerSundays'),
cls('ComicsKingdom/TigerVintage'),
cls('ComicsKingdom/TigerVintageSundays'),
cls('Everblue', 'block'),
cls('FalseStart'),
cls('Ginpu'),
cls('GoComics/9ChickweedLaneClassics'),
cls('GoComics/Badlands'),
cls('GoComics/BigNateFirstClass'),
cls('GoComics/BreakOfDay'),
cls('GoComics/Candorville'),
cls('GoComics/DilbertClassics'),
cls('GoComics/DilbertEnEspanol'),
cls('GoComics/DumbwichCastle'),
cls('GoComics/EyebeamClassic'),
cls('GoComics/GarfieldClassics'),
cls('GoComics/MakingIt'),
cls('GoComics/MtPleasant'),
cls('GoComics/PCAndPixel'),
cls('GoComics/SaltNPepper'),
cls('GoComics/SigneWilkinson'),
cls('GoComics/Snowflakes'),
cls('GoComics/StoneSoupClassics'),
cls('GoComics/StuartCarlson'),
cls('KemonoCafe/PrincessBunny'),
cls('Lackadaisy', 'block'),
cls('MangaDex/ImTheMaxLevelNewbie', 'legal'),
cls('MrLovenstein', 'jsh'),
cls('MyCartoons'),
cls('Shivae/BlackRose', 'brk'),
cls('StudioKhimera/Mousechevious'),
cls('TalesAndTactics'),
cls('VampireHunterBoyfriends'),
cls('WebToons/CrystalVirus'), cls('WebToons/CrystalVirus'),
cls('WebToons/OVERPOWERED'), cls('WebToons/OVERPOWERED'),
cls('WintersLight'),
) )
@ -1667,10 +1722,8 @@ class Renamed(Scraper):
# Renamed in 3.0 # Renamed in 3.0
cls('AHClub', 'RickGriffinStudios/AHClub'), cls('AHClub', 'RickGriffinStudios/AHClub'),
cls('ComicFury/MuddlemarchMudCompany', 'ComicFury/MudCompany'), cls('ComicFury/MuddlemarchMudCompany', 'ComicFury/MudCompany'),
cls('ComicsKingdom/FunkyWinkerbeanSundays', 'ComicsKingdom/FunkyWinkerbeanSunday'),
cls('ComicsKingdom/ShermansLagoon', 'GoComics/ShermansLagoon'), cls('ComicsKingdom/ShermansLagoon', 'GoComics/ShermansLagoon'),
cls('ComicsKingdom/TheLittleKing', 'ComicsKingdom/LittleKing'), cls('ComicsKingdom/TheLittleKing', 'ComicsKingdom/LittleKing'),
cls('ComicsKingdom/TigerSundays', 'ComicsKingdom/TigerVintageSundays'),
cls('GoComics/BloomCounty2017', 'GoComics/BloomCounty2019'), cls('GoComics/BloomCounty2017', 'GoComics/BloomCounty2019'),
cls('GoComics/Cathy', 'GoComics/CathyClassics'), cls('GoComics/Cathy', 'GoComics/CathyClassics'),
cls('GoComics/DarrinBell', 'ComicsKingdom/DarrinBell'), cls('GoComics/DarrinBell', 'ComicsKingdom/DarrinBell'),
@ -1681,7 +1734,6 @@ class Renamed(Scraper):
cls('GoComics/Widdershins', 'Widdershins'), cls('GoComics/Widdershins', 'Widdershins'),
cls('Guardia', 'ComicFury/Guardia'), cls('Guardia', 'ComicFury/Guardia'),
cls('RadioactivePanda', 'Tapas/RadioactivePanda'), cls('RadioactivePanda', 'Tapas/RadioactivePanda'),
cls('Shivae/BlackRose', 'BlackRose'),
cls('SmackJeeves/BlackTapestries', 'ComicFury/BlackTapestries'), cls('SmackJeeves/BlackTapestries', 'ComicFury/BlackTapestries'),
cls('SmackJeeves/ByTheBook', 'ByTheBook'), cls('SmackJeeves/ByTheBook', 'ByTheBook'),
cls('SmackJeeves/FurryExperience', 'ComicFury/FurryExperience'), cls('SmackJeeves/FurryExperience', 'ComicFury/FurryExperience'),
@ -1694,6 +1746,9 @@ class Renamed(Scraper):
cls('TracesOfThePast/NSFW', 'RickGriffinStudios/TracesOfThePastNSFW'), cls('TracesOfThePast/NSFW', 'RickGriffinStudios/TracesOfThePastNSFW'),
# Renamed in 3.1 # Renamed in 3.1
cls('ComicsKingdom/SlylockFoxAndComicsForKids', 'ComicsKingdom/SlylockFox'),
cls('ComicsKingdom/SlylockFoxAndComicsForKidsSpanish', 'ComicsKingdom/SlylockFoxSpanish'),
cls('Exiern', 'ComicFury/Exiern'), cls('Exiern', 'ComicFury/Exiern'),
cls('MaxOveracts', 'OccasionalComicsDisorder'),
cls('SafelyEndangered', 'WebToons/SafelyEndangered'), cls('SafelyEndangered', 'WebToons/SafelyEndangered'),
) )

View file

@ -1,8 +1,8 @@
# SPDX-License-Identifier: MIT # SPDX-License-Identifier: MIT
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs # SPDX-FileCopyrightText: © 2004 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam # SPDX-FileCopyrightText: © 2012 Bastian Kleineidam
# Copyright (C) 2015-2022 Tobias Gruetzmacher # SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher
# Copyright (C) 2019-2020 Daniel Ring # SPDX-FileCopyrightText: © 2019 Daniel Ring
from re import compile, escape from re import compile, escape
from ..scraper import _BasicScraper, _ParserScraper, ParserScraper from ..scraper import _BasicScraper, _ParserScraper, ParserScraper
@ -34,16 +34,11 @@ class ParadigmShift(_BasicScraper):
help = 'Index format: custom' help = 'Index format: custom'
class ParallelUniversum(_BasicScraper): class ParallelUniversum(WordPressScraper):
url = 'http://www.paralleluniversum.net/' url = 'https://www.paralleluniversum.net/'
rurl = escape(url)
stripUrl = url + '%s/' stripUrl = url + '%s/'
firstStripUrl = stripUrl % '001-der-comic-ist-tot' firstStripUrl = stripUrl % '001-der-comic-ist-tot'
imageSearch = compile(tagre("img", "src", prevSearch = '//a[@rel="prev"]'
r'(%scomics/\d+-\d+-\d+[^"]+)' % rurl))
prevSearch = compile(tagre("a", "href", r'(%s[^"]+/)' % rurl) +
tagre("span", "class", "prev"))
help = 'Index format: number-stripname'
lang = 'de' lang = 'de'
@ -95,14 +90,12 @@ class PebbleVersion(_ParserScraper):
help = 'Index format: n (unpadded)' help = 'Index format: n (unpadded)'
class PennyAndAggie(_BasicScraper): class PennyAndAggie(ComicControlScraper):
url = 'http://pennyandaggie.com/' url = 'https://pixietrixcomix.com/penny-and-aggie'
rurl = escape(url) stripUrl = url + '/%s'
stripUrl = url + 'index.php?p=%s' firstStripUrl = stripUrl % '2004-09-06'
imageSearch = compile(tagre("img", "src", r'(http://www\.pennyandaggie\.com/comics/[^"]+)')) endOfLife = True
prevSearch = compile(tagre("a", "href", r"(index\.php\?p\=\d+)", quote="'") + help = 'Index format: yyyy-mm-dd'
tagre("img", "src", r'%simages/previous_day\.gif' % rurl, quote=""))
help = 'Index format: n (unpadded)'
class PennyArcade(_ParserScraper): class PennyArcade(_ParserScraper):
@ -117,19 +110,17 @@ class PennyArcade(_ParserScraper):
help = 'Index format: yyyy/mm/dd' help = 'Index format: yyyy/mm/dd'
class PeppermintSaga(WordPressNavi): class PeppermintSaga(WordPressScraper):
url = 'http://www.pepsaga.com/' url = 'http://www.pepsaga.com/'
stripUrl = url + '?p=%s' stripUrl = url + 'comics/%s/'
firstStripUrl = stripUrl % '3' firstStripUrl = stripUrl % 'the-sword-of-truth-vol1'
help = 'Index format: number'
adult = True adult = True
class PeppermintSagaBGR(WordPressNavi): class PeppermintSagaBGR(WordPressScraper):
url = 'http://bgr.pepsaga.com/' url = 'http://bgr.pepsaga.com/'
stripUrl = url + '?p=%s' stripUrl = url + '?comic=%s'
firstStripUrl = stripUrl % '4' firstStripUrl = stripUrl % '04172011'
help = 'Index format: number'
adult = True adult = True
@ -150,14 +141,16 @@ class PeterAndWhitney(_ParserScraper):
prevSearch = '//a[./img[contains(@src, "nav_previous")]]' prevSearch = '//a[./img[contains(@src, "nav_previous")]]'
class PHDComics(_ParserScraper): class PHDComics(ParserScraper):
BROKEN_COMMENT_END = compile(r'--!>') BROKEN_COMMENT_END = compile(r'--!>')
baseUrl = 'http://phdcomics.com/' baseUrl = 'http://phdcomics.com/'
url = baseUrl + 'comics.php' url = baseUrl + 'comics.php'
stripUrl = baseUrl + 'comics/archive.php?comicid=%s' stripUrl = baseUrl + 'comics/archive.php?comicid=%s'
firstStripUrl = stripUrl % '1' firstStripUrl = stripUrl % '1'
imageSearch = '//img[@id="comic2"]' imageSearch = ('//img[@id="comic2"]',
r'//img[d:class("img-responsive") and re:test(@name, "comic\d+")]')
multipleImagesPerStrip = True
prevSearch = '//a[img[contains(@src, "prev_button")]]' prevSearch = '//a[img[contains(@src, "prev_button")]]'
nextSearch = '//a[img[contains(@src, "next_button")]]' nextSearch = '//a[img[contains(@src, "next_button")]]'
help = 'Index format: n (unpadded)' help = 'Index format: n (unpadded)'
@ -173,7 +166,7 @@ class PHDComics(_ParserScraper):
# video # video
self.stripUrl % '1880', self.stripUrl % '1880',
self.stripUrl % '1669', self.stripUrl % '1669',
) ) or self.match(data, '//img[@id="comic" and contains(@src, "phd083123s")]')
class Picklewhistle(ComicControlScraper): class Picklewhistle(ComicControlScraper):
@ -333,11 +326,12 @@ class PS238(_ParserScraper):
class PvPOnline(ParserScraper): class PvPOnline(ParserScraper):
baseUrl = 'https://www.toonhoundstudios.com/' baseUrl = 'https://www.toonhoundstudios.com/'
url = baseUrl + 'pvp/' stripUrl = baseUrl + 'comic/%s/?sid=372'
stripUrl = baseUrl + 'comic/%s/' url = stripUrl % 'pvp-2022-09-16'
firstStripUrl = stripUrl % '19980504' firstStripUrl = stripUrl % '19980504'
imageSearch = '//div[@id="spliced-comic"]//img/@data-src-img' imageSearch = '//div[@id="spliced-comic"]//img/@data-src-img'
prevSearch = '//a[d:class("prev")]' prevSearch = '//a[d:class("prev")]'
endOfLife = True
def namer(self, imageUrl, pageUrl): def namer(self, image_url, page_url):
return 'pvp' + imageUrl.rsplit('/', 1)[-1] return 'pvp' + image_url.rsplit('/', 1)[-1]

View file

@ -1,8 +1,8 @@
# SPDX-License-Identifier: MIT # SPDX-License-Identifier: MIT
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs # SPDX-FileCopyrightText: © 2004 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam # SPDX-FileCopyrightText: © 2012 Bastian Kleineidam
# Copyright (C) 2015-2021 Tobias Gruetzmacher # SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher
# Copyright (C) 2019-2020 Daniel Ring # SPDX-FileCopyrightText: © 2019 Daniel Ring
from re import compile from re import compile
from urllib.parse import urljoin from urllib.parse import urljoin
@ -121,7 +121,7 @@ class Requiem(WordPressScraper):
firstStripUrl = stripUrl % '2004-06-07-3' firstStripUrl = stripUrl % '2004-06-07-3'
class Replay(_ParserScraper): class Replay(ParserScraper):
url = 'http://replaycomic.com/' url = 'http://replaycomic.com/'
stripUrl = url + 'comic/%s/' stripUrl = url + 'comic/%s/'
firstStripUrl = stripUrl % 'red-desert' firstStripUrl = stripUrl % 'red-desert'
@ -132,11 +132,11 @@ class Replay(_ParserScraper):
def starter(self): def starter(self):
# Retrieve archive page to identify chapters # Retrieve archive page to identify chapters
archivePage = self.getPage(self.url + 'archive') archivePage = self.getPage(self.url + 'archive')
archive = archivePage.xpath('//div[@class="comic-archive-chapter-wrap"]') archive = self.match(archivePage, '//div[d:class("comic-archive-chapter-wrap")]')
self.chapter = len(archive) - 1 self.chapter = len(archive) - 1
self.startOfChapter = [] self.startOfChapter = []
for archiveChapter in archive: for archiveChapter in archive:
self.startOfChapter.append(archiveChapter.xpath('.//a')[0].get('href')) self.startOfChapter.append(self.match(archiveChapter, './/a')[0].get('href'))
return bounceStarter(self) return bounceStarter(self)
def namer(self, imageUrl, pageUrl): def namer(self, imageUrl, pageUrl):

View file

@ -196,7 +196,7 @@ class Sharksplode(WordPressScraper):
class Sheldon(ParserScraper): class Sheldon(ParserScraper):
url = 'https://www.sheldoncomics.com/' url = 'https://www.sheldoncomics.com/'
firstStripUrl = url + 'comic/well-who-is-this/' firstStripUrl = url + 'comic/well-who-is-this/'
imageSearch = '//div[@id="comic"]//img' imageSearch = '//div[@id="comic"]//img/@data-src-img'
prevSearch = '//a[img[d:class("left")]]' prevSearch = '//a[img[d:class("left")]]'
@ -435,7 +435,7 @@ class SpaceFurries(ParserScraper):
def extract_image_urls(self, url, data): def extract_image_urls(self, url, data):
# Website requires JS, so build the list of image URLs manually # Website requires JS, so build the list of image URLs manually
imageurls = [] imageurls = []
current = int(data.xpath('//input[@name="pagnum"]')[0].get('value')) current = int(self.match(data, '//input[@name="pagnum"]')[0].get('value'))
for page in reversed(range(1, current + 1)): for page in reversed(range(1, current + 1)):
imageurls.append(self.url + 'comics/' + str(page) + '.jpg') imageurls.append(self.url + 'comics/' + str(page) + '.jpg')
return imageurls return imageurls
@ -636,16 +636,16 @@ class StrongFemaleProtagonist(_ParserScraper):
) )
class StupidFox(_ParserScraper): class StupidFox(ParserScraper):
url = 'http://stupidfox.net/' url = 'http://stupidfox.net/'
stripUrl = url + '%s' stripUrl = url + '%s'
firstStripUrl = stripUrl % 'hello' firstStripUrl = stripUrl % 'hello'
imageSearch = '//div[@class="comicmid"]//img' imageSearch = '//div[d:class("comicmid")]//img'
prevSearch = '//a[@accesskey="p"]' prevSearch = '//a[@accesskey="p"]'
def namer(self, imageUrl, pageUrl): def namer(self, imageUrl, pageUrl):
page = self.getPage(pageUrl) page = self.getPage(pageUrl)
title = page.xpath(self.imageSearch + '/@title')[0].replace(' - ', '-').replace(' ', '-') title = self.match(page, self.imageSearch + '/@title')[0].replace(' - ', '-').replace(' ', '-')
return title + '.' + imageUrl.rsplit('.', 1)[-1] return title + '.' + imageUrl.rsplit('.', 1)[-1]

View file

@ -1,6 +1,6 @@
# SPDX-License-Identifier: MIT # SPDX-License-Identifier: MIT
# Copyright (C) 2019-2022 Tobias Gruetzmacher # SPDX-FileCopyrightText: © 2019 Tobias Gruetzmacher
# Copyright (C) 2019-2021 Daniel Ring # SPDX-FileCopyrightText: © 2019 Daniel Ring
from .common import WordPressSpliced from .common import WordPressSpliced
@ -12,22 +12,20 @@ class _CommonMulti(WordPressSpliced):
self.endOfLife = eol self.endOfLife = eol
class AbbysAgency(WordPressSpliced):
url = 'https://abbysagency.us/'
stripUrl = url + 'blog/comic/%s/'
firstStripUrl = stripUrl % 'a'
class AlienDice(WordPressSpliced): class AlienDice(WordPressSpliced):
url = 'https://aliendice.com/' url = 'https://aliendice.com/'
stripUrl = url + 'comic/%s/' stripUrl = url + 'comic/%s/'
firstStripUrl = stripUrl % '05162001' firstStripUrl = stripUrl % '05162001'
def shouldSkipUrl(self, url, data):
"""Skip pages without images."""
return not self.match(data, self.imageSearch)
def getPrevUrl(self, url, data): def getPrevUrl(self, url, data):
# Fix broken navigation # Fix broken navigation
if url == self.stripUrl % 'day-29-part-2-page-3-4': if url == self.stripUrl % 'day-29-part-2-page-3-4':
return self.stripUrl % 'day-29-part-2-page-3-2' return self.stripUrl % 'day-29-part-2-page-3-2'
return super(AlienDice, self).getPrevUrl(url, data) return super().getPrevUrl(url, data)
def namer(self, imageUrl, pageUrl): def namer(self, imageUrl, pageUrl):
# Fix inconsistent filename # Fix inconsistent filename
@ -47,12 +45,6 @@ class AlienDiceLegacy(WordPressSpliced):
return super().isfirststrip(url.rsplit('?', 1)[0]) return super().isfirststrip(url.rsplit('?', 1)[0])
class BlackRose(WordPressSpliced):
url = 'https://www.blackrose.monster/'
stripUrl = url + 'comic/%s/'
firstStripUrl = stripUrl % '2004-11-01'
class TheCyantianChronicles(_CommonMulti): class TheCyantianChronicles(_CommonMulti):
baseUrl = 'https://cyantian.net/' baseUrl = 'https://cyantian.net/'
@ -81,9 +73,9 @@ class TheCyantianChronicles(_CommonMulti):
class Shivae(WordPressSpliced): class Shivae(WordPressSpliced):
url = 'https://shivae.com/' url = 'https://shivae.net/'
stripUrl = url + 'comic/%s/' stripUrl = url + 'comic/%s/'
firstStripUrl = stripUrl % '09202001' firstStripUrl = stripUrl % '2002-02-27'
class ShivaeComics(_CommonMulti): class ShivaeComics(_CommonMulti):

View file

@ -4,10 +4,7 @@
# SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher # SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher
# SPDX-FileCopyrightText: © 2019 Daniel Ring # SPDX-FileCopyrightText: © 2019 Daniel Ring
from re import compile, escape, MULTILINE from re import compile, escape, MULTILINE
try: from functools import cached_property
from functools import cached_property
except ImportError:
from cached_property import cached_property
from ..scraper import _BasicScraper, _ParserScraper, ParserScraper from ..scraper import _BasicScraper, _ParserScraper, ParserScraper
from ..helpers import indirectStarter, joinPathPartsNamer from ..helpers import indirectStarter, joinPathPartsNamer
@ -275,7 +272,7 @@ class ToonHole(ParserScraper):
prevSearch = '//a[@rel="prev"]' prevSearch = '//a[@rel="prev"]'
latestSearch = '//a[@rel="bookmark"]' latestSearch = '//a[@rel="bookmark"]'
starter = indirectStarter starter = indirectStarter
namer = joinPathPartsNamer((), (-3, -2, -1)) namer = joinPathPartsNamer(imageparts=(-3, -2, -1))
class TrippingOverYou(_BasicScraper): class TrippingOverYou(_BasicScraper):

View file

@ -3,7 +3,6 @@
# SPDX-FileCopyrightText: © 2019 Daniel Ring # SPDX-FileCopyrightText: © 2019 Daniel Ring
from ..output import out from ..output import out
from ..scraper import ParserScraper from ..scraper import ParserScraper
from ..xml import NS
class Tapas(ParserScraper): class Tapas(ParserScraper):
@ -21,7 +20,7 @@ class Tapas(ParserScraper):
def starter(self): def starter(self):
# Retrieve comic metadata from info page # Retrieve comic metadata from info page
info = self.getPage(self.url) info = self.getPage(self.url)
series = info.xpath('//@data-series-id')[0] series = self.match(info, '//@data-series-id')[0]
# Retrieve comic metadata from API # Retrieve comic metadata from API
data = self.session.get(self.baseUrl + 'series/' + series + '/episodes?sort=NEWEST') data = self.session.get(self.baseUrl + 'series/' + series + '/episodes?sort=NEWEST')
data.raise_for_status() data.raise_for_status()
@ -43,7 +42,7 @@ class Tapas(ParserScraper):
return self._cached_image_urls return self._cached_image_urls
def shouldSkipUrl(self, url, data): def shouldSkipUrl(self, url, data):
if data.xpath('//button[d:class("js-have-to-sign")]', namespaces=NS): if self.match(data, '//button[d:class("js-have-to-sign")]'):
out.warn(f'Nothing to download on "{url}", because a login is required.') out.warn(f'Nothing to download on "{url}", because a login is required.')
return True return True
return False return False

View file

@ -107,7 +107,7 @@ class Unsounded(ParserScraper):
return urls return urls
def extract_css_bg(self, page) -> str | None: def extract_css_bg(self, page) -> str | None:
comicdivs = page.xpath('//div[@id="comic"]') comicdivs = self.match(page, '//div[@id="comic"]')
if comicdivs: if comicdivs:
style = comicdivs[0].attrib.get('style') style = comicdivs[0].attrib.get('style')
if style: if style:

View file

@ -1,8 +1,8 @@
# SPDX-License-Identifier: MIT # SPDX-License-Identifier: MIT
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs # SPDX-FileCopyrightText: © 2004 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam # SPDX-FileCopyrightText: © 2012 Bastian Kleineidam
# Copyright (C) 2015-2020 Tobias Gruetzmacher # SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher
# Copyright (C) 2019-2020 Daniel Ring # SPDX-FileCopyrightText: © 2019 Daniel Ring
from ..scraper import ParserScraper, _ParserScraper from ..scraper import ParserScraper, _ParserScraper
from ..helpers import bounceStarter, indirectStarter from ..helpers import bounceStarter, indirectStarter
@ -27,7 +27,7 @@ class VGCats(_ParserScraper):
url = 'https://www.vgcats.com/comics/' url = 'https://www.vgcats.com/comics/'
stripUrl = url + '?strip_id=%s' stripUrl = url + '?strip_id=%s'
firstStripUrl = stripUrl % '0' firstStripUrl = stripUrl % '0'
imageSearch = '//td/img[contains(@src, "images/")]' imageSearch = '//td/font/img[contains(@src, "images/")]'
prevSearch = '//a[img[contains(@src, "back.")]]' prevSearch = '//a[img[contains(@src, "back.")]]'
help = 'Index format: n (unpadded)' help = 'Index format: n (unpadded)'
@ -44,15 +44,15 @@ class Vibe(ParserScraper):
help = 'Index format: VIBEnnn (padded)' help = 'Index format: VIBEnnn (padded)'
class VickiFox(_ParserScraper): class VickiFox(ParserScraper):
url = 'http://www.vickifox.com/comic/strip' url = 'http://www.vickifox.com/comic/strip'
stripUrl = url + '?id=%s' stripUrl = url + '?id=%s'
firstStripUrl = stripUrl % '001' firstStripUrl = stripUrl % '001'
imageSearch = '//img[contains(@src, "comic/")]' imageSearch = '//img[contains(@src, "comic/")]'
prevSearch = '//button[@id="btnPrev"]/@value' prevSearch = '//button[@id="btnPrev"]/@value'
def getPrevUrl(self, url, data): def link_modifier(self, fromurl, tourl):
return self.stripUrl % self.getPage(url).xpath(self.prevSearch)[0] return self.stripUrl % tourl
class ViiviJaWagner(_ParserScraper): class ViiviJaWagner(_ParserScraper):

View file

@ -1,8 +1,8 @@
# SPDX-License-Identifier: MIT # SPDX-License-Identifier: MIT
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs # SPDX-FileCopyrightText: © 2004 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam # SPDX-FileCopyrightText: © 2012 Bastian Kleineidam
# Copyright (C) 2015-2022 Tobias Gruetzmacher # SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher
# Copyright (C) 2019-2020 Daniel Ring # SPDX-FileCopyrightText: © 2019 Daniel Ring
from re import compile, escape, IGNORECASE from re import compile, escape, IGNORECASE
from ..scraper import ParserScraper, _BasicScraper, _ParserScraper from ..scraper import ParserScraper, _BasicScraper, _ParserScraper
@ -17,7 +17,7 @@ class WapsiSquare(WordPressNaviIn):
def shouldSkipUrl(self, url, data): def shouldSkipUrl(self, url, data):
"""Skip pages without images.""" """Skip pages without images."""
return data.xpath('//iframe') # videos return self.match(data, '//iframe') # videos
class WastedTalent(_ParserScraper): class WastedTalent(_ParserScraper):

View file

@ -24,9 +24,9 @@ class WebToons(ParserScraper):
self.session.cookies.set(cookie, 'false', domain='webtoons.com') self.session.cookies.set(cookie, 'false', domain='webtoons.com')
# Find current episode number # Find current episode number
listPage = self.getPage(self.listUrl) listPage = self.getPage(self.listUrl)
currentEpisode = listPage.xpath('//div[@class="detail_lst"]/ul/li')[0].attrib['data-episode-no'] currentEpisode = self.match(listPage, '//div[d:class("detail_lst")]/ul/li')[0].attrib['data-episode-no']
# Check for completed tag # Check for completed tag
self.endOfLife = (listPage.xpath('//div[@id="_asideDetail"]//span[@class="txt_ico_completed2"]') != []) self.endOfLife = not self.match(listPage, '//div[@id="_asideDetail"]//span[d:class("txt_ico_completed2")]')
return self.stripUrl % currentEpisode return self.stripUrl % currentEpisode
def extract_image_urls(self, url, data): def extract_image_urls(self, url, data):
@ -52,6 +52,7 @@ class WebToons(ParserScraper):
cls('1111Animals', 'comedy/1111-animals', 437), cls('1111Animals', 'comedy/1111-animals', 437),
cls('2015SpaceSeries', 'sf/2015-space-series', 391), cls('2015SpaceSeries', 'sf/2015-space-series', 391),
cls('3SecondStrip', 'comedy/3-second-strip', 380), cls('3SecondStrip', 'comedy/3-second-strip', 380),
cls('99ReinforcedStick', 'comedy/99-reinforced-wooden-stick', 4286),
cls('ABittersweetLife', 'slice-of-life/a-bittersweet-life', 294), cls('ABittersweetLife', 'slice-of-life/a-bittersweet-life', 294),
cls('AboutDeath', 'drama/about-death', 82), cls('AboutDeath', 'drama/about-death', 82),
cls('ABudgiesLife', 'slice-of-life/its-a-budgies-life', 985), cls('ABudgiesLife', 'slice-of-life/its-a-budgies-life', 985),
@ -64,6 +65,7 @@ class WebToons(ParserScraper):
cls('AGoodDayToBeADog', 'romance/a-good-day-tobe-a-dog', 1390), cls('AGoodDayToBeADog', 'romance/a-good-day-tobe-a-dog', 1390),
cls('Aisopos', 'drama/aisopos', 76), cls('Aisopos', 'drama/aisopos', 76),
cls('AliceElise', 'fantasy/alice-elise', 1481), cls('AliceElise', 'fantasy/alice-elise', 1481),
cls('AlloyComics', 'canvas/alloy-comics', 747447),
cls('AllThatWeHopeToBe', 'slice-of-life/all-that-we-hope-to-be', 470), cls('AllThatWeHopeToBe', 'slice-of-life/all-that-we-hope-to-be', 470),
cls('AllThatYouAre', 'drama/all-that-you-are', 403), cls('AllThatYouAre', 'drama/all-that-you-are', 403),
cls('AlwaysHuman', 'romance/always-human', 557), cls('AlwaysHuman', 'romance/always-human', 557),
@ -128,6 +130,7 @@ class WebToons(ParserScraper):
cls('CursedPrincessClub', 'comedy/cursed-princess-club', 1537), cls('CursedPrincessClub', 'comedy/cursed-princess-club', 1537),
cls('Cyberbunk', 'sf/cyberbunk', 466), cls('Cyberbunk', 'sf/cyberbunk', 466),
cls('Cyberforce', 'super-hero/cyberforce', 531), cls('Cyberforce', 'super-hero/cyberforce', 531),
cls('CydoniaShattering', 'fantasy/cydonia-shattering', 2881),
cls('CykoKO', 'super-hero/cyko-ko', 560), cls('CykoKO', 'super-hero/cyko-ko', 560),
cls('Darbi', 'action/darbi', 1098), cls('Darbi', 'action/darbi', 1098),
cls('Darchon', 'challenge/darchon', 532053), cls('Darchon', 'challenge/darchon', 532053),
@ -153,6 +156,8 @@ class WebToons(ParserScraper):
cls('DrawnToYou', 'challenge/drawn-to-you', 172022), cls('DrawnToYou', 'challenge/drawn-to-you', 172022),
cls('DrFrost', 'drama/dr-frost', 371), cls('DrFrost', 'drama/dr-frost', 371),
cls('DuelIdentity', 'challenge/duel-identity', 532064), cls('DuelIdentity', 'challenge/duel-identity', 532064),
cls('DungeonCleaningLife', 'action/the-dungeon-cleaning-life-of-a-once-genius-hunter', 4677),
cls('DungeonsAndDoodlesTalesFromTheTables', 'canvas/dungeons-doodles-tales-from-the-tables', 682646),
cls('DungeonMinis', 'challenge/dungeonminis', 64132), cls('DungeonMinis', 'challenge/dungeonminis', 64132),
cls('Dustinteractive', 'comedy/dustinteractive', 907), cls('Dustinteractive', 'comedy/dustinteractive', 907),
cls('DutyAfterSchool', 'sf/duty-after-school', 370), cls('DutyAfterSchool', 'sf/duty-after-school', 370),
@ -170,6 +175,7 @@ class WebToons(ParserScraper):
cls('FAMILYMAN', 'drama/family-man', 85), cls('FAMILYMAN', 'drama/family-man', 85),
cls('FantasySketchTheGame', 'sf/fantasy-sketch', 1020), cls('FantasySketchTheGame', 'sf/fantasy-sketch', 1020),
cls('Faust', 'supernatural/faust', 522), cls('Faust', 'supernatural/faust', 522),
cls('FinalRaidBoss', 'fantasy/the-final-raid-boss', 3921),
cls('FINALITY', 'mystery/finality', 1457), cls('FINALITY', 'mystery/finality', 1457),
cls('Firebrand', 'supernatural/firebrand', 877), cls('Firebrand', 'supernatural/firebrand', 877),
cls('FirstDefense', 'challenge/first-defense', 532072), cls('FirstDefense', 'challenge/first-defense', 532072),
@ -204,11 +210,13 @@ class WebToons(ParserScraper):
cls('HeliosFemina', 'fantasy/helios-femina', 638), cls('HeliosFemina', 'fantasy/helios-femina', 638),
cls('HelloWorld', 'slice-of-life/hello-world', 827), cls('HelloWorld', 'slice-of-life/hello-world', 827),
cls('Hellper', 'fantasy/hellper', 185), cls('Hellper', 'fantasy/hellper', 185),
cls('Hench', 'canvas/hench/', 857225),
cls('HeroineChic', 'super-hero/heroine-chic', 561), cls('HeroineChic', 'super-hero/heroine-chic', 561),
cls('HIVE', 'thriller/hive', 65), cls('HIVE', 'thriller/hive', 65),
cls('Hooky', 'fantasy/hooky', 425), cls('Hooky', 'fantasy/hooky', 425),
cls('HoovesOfDeath', 'fantasy/hooves-of-death', 1535), cls('HoovesOfDeath', 'fantasy/hooves-of-death', 1535),
cls('HouseOfStars', 'fantasy/house-of-stars', 1620), cls('HouseOfStars', 'fantasy/house-of-stars', 1620),
cls('HowToBeAMindReaver', 'canvas/how-to-be-a-mind-reaver', 301213),
cls('HowToBecomeADragon', 'fantasy/how-to-become-a-dragon', 1973), cls('HowToBecomeADragon', 'fantasy/how-to-become-a-dragon', 1973),
cls('HowToLove', 'slice-of-life/how-to-love', 472), cls('HowToLove', 'slice-of-life/how-to-love', 472),
cls('IDontWantThisKindOfHero', 'super-hero/i-dont-want-this-kind-of-hero', 98), cls('IDontWantThisKindOfHero', 'super-hero/i-dont-want-this-kind-of-hero', 98),
@ -235,6 +243,7 @@ class WebToons(ParserScraper):
cls('KindOfLove', 'slice-of-life/kind-of-love', 1850), cls('KindOfLove', 'slice-of-life/kind-of-love', 1850),
cls('KissItGoodbye', 'challenge/kiss-it-goodbye', 443703), cls('KissItGoodbye', 'challenge/kiss-it-goodbye', 443703),
cls('KnightRun', 'sf/knight-run', 67), cls('KnightRun', 'sf/knight-run', 67),
cls('KnightUnderMyHeart', 'action/knight-under-my-heart', 4215),
cls('Kubera', 'fantasy/kubera', 83), cls('Kubera', 'fantasy/kubera', 83),
cls('LalinsCurse', 'supernatural/lalins-curse', 1601), cls('LalinsCurse', 'supernatural/lalins-curse', 1601),
cls('Lars', 'slice-of-life/lars', 358), cls('Lars', 'slice-of-life/lars', 358),
@ -261,6 +270,7 @@ class WebToons(ParserScraper):
cls('LUMINE', 'fantasy/lumine', 1022), cls('LUMINE', 'fantasy/lumine', 1022),
cls('Lunarbaboon', 'slice-of-life/lunarbaboon', 523), cls('Lunarbaboon', 'slice-of-life/lunarbaboon', 523),
cls('MageAndDemonQueen', 'comedy/mage-and-demon-queen', 1438), cls('MageAndDemonQueen', 'comedy/mage-and-demon-queen', 1438),
cls('MageAndMimic', 'comedy/mage-and-mimic', 5973),
cls('Magical12thGraders', 'super-hero/magical-12th-graders', 90), cls('Magical12thGraders', 'super-hero/magical-12th-graders', 90),
cls('Magician', 'fantasy/magician', 70), cls('Magician', 'fantasy/magician', 70),
cls('MagicSodaPop', 'fantasy/magic-soda-pop', 1947), cls('MagicSodaPop', 'fantasy/magic-soda-pop', 1947),
@ -292,6 +302,8 @@ class WebToons(ParserScraper):
cls('MyGiantNerdBoyfriend', 'slice-of-life/my-giant-nerd-boyfriend', 958), cls('MyGiantNerdBoyfriend', 'slice-of-life/my-giant-nerd-boyfriend', 958),
cls('MyKittyAndOldDog', 'slice-of-life/my-kitty-and-old-dog', 184), cls('MyKittyAndOldDog', 'slice-of-life/my-kitty-and-old-dog', 184),
cls('MyNameIsBenny', 'slice-of-life/my-name-is-benny', 1279), cls('MyNameIsBenny', 'slice-of-life/my-name-is-benny', 1279),
cls('MySClassHunter', 'action/my-s-class-hunters', 3963),
cls('MythicItemObtained', 'fantasy/mythic-item-obtained', 4582),
cls('MyWallflowerKiss', 'challenge/my-wallflower-kiss', 151869), cls('MyWallflowerKiss', 'challenge/my-wallflower-kiss', 151869),
cls('NanoList', 'sf/nano-list', 700), cls('NanoList', 'sf/nano-list', 700),
cls('NationalDogDay2016', 'slice-of-life/national-dog-day', 747), cls('NationalDogDay2016', 'slice-of-life/national-dog-day', 747),
@ -439,6 +451,7 @@ class WebToons(ParserScraper):
cls('UpAndOut', 'slice-of-life/up-and-out', 488), cls('UpAndOut', 'slice-of-life/up-and-out', 488),
cls('UrbanAnimal', 'super-hero/urban-animal', 1483), cls('UrbanAnimal', 'super-hero/urban-animal', 1483),
cls('Uriah', 'horror/uriah', 1607), cls('Uriah', 'horror/uriah', 1607),
cls('VampireFamily', 'comedy/vampire-family', 6402),
cls('VarsityNoir', 'mystery/varsity-noir', 1613), cls('VarsityNoir', 'mystery/varsity-noir', 1613),
cls('VersionDayAndNight', 'drama/version-day-and-night', 1796), cls('VersionDayAndNight', 'drama/version-day-and-night', 1796),
cls('WafflesAndPancakes', 'slice-of-life/waffles-and-pancakes', 1310), cls('WafflesAndPancakes', 'slice-of-life/waffles-and-pancakes', 1310),

View file

@ -1,6 +1,6 @@
# SPDX-License-Identifier: MIT # SPDX-License-Identifier: MIT
# Copyright (C) 2019-2022 Tobias Gruetzmacher # SPDX-FileCopyrightText: © 2019 Tobias Gruetzmacher
# Copyright (C) 2019-2022 Daniel Ring # SPDX-FileCopyrightText: © 2019 Daniel Ring
from ..scraper import ParserScraper from ..scraper import ParserScraper
from ..helpers import indirectStarter from ..helpers import indirectStarter
@ -15,21 +15,21 @@ class Wrongside(ParserScraper):
def starter(self): def starter(self):
archivePage = self.getPage(self.url) archivePage = self.getPage(self.url)
chapterUrls = archivePage.xpath('//ul[@class="albThumbs"]//a/@href') chapterUrls = self.match(archivePage, '//ul[d:class("albThumbs")]//a/@href')
self.archive = [] self.archive = []
for chapterUrl in chapterUrls: for chapterUrl in chapterUrls:
chapterPage = self.getPage(chapterUrl) chapterPage = self.getPage(chapterUrl)
self.archive.append(chapterPage.xpath('(//ul[@id="thumbnails"]//a/@href)[last()]')[0]) self.archive.append(self.match(chapterPage, '(//ul[@id="thumbnails"]//a/@href)[last()]')[0])
return self.archive[0] return self.archive[0]
def getPrevUrl(self, url, data): def getPrevUrl(self, url, data):
if data.xpath(self.prevSearch) == [] and len(self.archive) > 0: if self.match(data, self.prevSearch) == [] and len(self.archive) > 0:
return self.archive.pop() return self.archive.pop()
return super(Wrongside, self).getPrevUrl(url, data) return super(Wrongside, self).getPrevUrl(url, data)
def namer(self, imageUrl, pageUrl): def namer(self, imageUrl, pageUrl):
page = self.getPage(pageUrl) page = self.getPage(pageUrl)
title = page.xpath('//div[@class="browsePath"]/h2/text()')[0] title = self.match(page, '//div[d:class("browsePath")]/h2/text()')[0]
return title.replace('"', '') + '.' + imageUrl.rsplit('.', 1)[-1] return title.replace('"', '') + '.' + imageUrl.rsplit('.', 1)[-1]
@ -71,5 +71,5 @@ class WrongsideSideStories(ParserScraper):
def namer(self, imageUrl, pageUrl): def namer(self, imageUrl, pageUrl):
page = self.getPage(pageUrl) page = self.getPage(pageUrl)
title = page.xpath('//div[@class="browsePath"]/h2/text()')[0] title = self.match(page, '//div[d:class("browsePath")]/h2/text()')[0]
return title.replace('"', '') + '.' + imageUrl.rsplit('.', 1)[-1] return title.replace('"', '') + '.' + imageUrl.rsplit('.', 1)[-1]

View file

@ -23,7 +23,7 @@ class Zapiro(ParserScraper):
imageSearch = '//div[@id="cartoon"]/img' imageSearch = '//div[@id="cartoon"]/img'
prevSearch = '//a[d:class("left")]' prevSearch = '//a[d:class("left")]'
nextSearch = '//a[d:class("right")]' nextSearch = '//a[d:class("right")]'
namer = joinPathPartsNamer((-1,), ()) namer = joinPathPartsNamer(pageparts=(-1,))
class ZenPencils(WordPressNavi): class ZenPencils(WordPressNavi):
@ -60,7 +60,7 @@ class Zwarwald(BasicScraper):
tagre("img", "src", tagre("img", "src",
r'http://zwarwald\.de/images/prev\.jpg', r'http://zwarwald\.de/images/prev\.jpg',
quote="'")) quote="'"))
namer = joinPathPartsNamer((), (-3, -2, -1)) namer = joinPathPartsNamer(imageparts=(-3, -2, -1))
help = 'Index format: number' help = 'Index format: number'
def shouldSkipUrl(self, url, data): def shouldSkipUrl(self, url, data):

View file

@ -119,45 +119,45 @@ class Scraper:
if val: if val:
self._indexes = tuple(sorted(val)) self._indexes = tuple(sorted(val))
def __init__(self, name): def __init__(self, name: str) -> None:
"""Initialize internal variables.""" """Initialize internal variables."""
self.name = name self.name = name
self.urls = set() self.urls: set[str] = set()
self._indexes = () self._indexes = ()
self.skippedUrls = set() self.skippedUrls: set[str] = set()
self.hitFirstStripUrl = False self.hitFirstStripUrl = False
def __hash__(self): def __hash__(self) -> int:
"""Get hash value from name and index list.""" """Get hash value from name and index list."""
return hash((self.name, self.indexes)) return hash((self.name, self.indexes))
def shouldSkipUrl(self, url, data): def shouldSkipUrl(self, url: str, data) -> bool:
"""Determine if search for images in given URL should be skipped.""" """Determine if search for images in given URL should be skipped."""
return False return False
def getComicStrip(self, url, data): def getComicStrip(self, url, data) -> ComicStrip:
"""Get comic strip downloader for given URL and data.""" """Get comic strip downloader for given URL and data."""
imageUrls = self.extract_image_urls(url, data) urls = self.extract_image_urls(url, data)
# map modifier function on image URLs # map modifier function on image URLs
imageUrls = [self.imageUrlModifier(x, data) for x in imageUrls] urls = [self.imageUrlModifier(x, data) for x in urls]
# remove duplicate URLs # remove duplicate URLs
imageUrls = uniq(imageUrls) urls = uniq(urls)
if len(imageUrls) > 1 and not self.multipleImagesPerStrip: if len(urls) > 1 and not self.multipleImagesPerStrip:
out.warn( out.warn(
u"Found %d images instead of 1 at %s with expressions %s" % u"Found %d images instead of 1 at %s with expressions %s" %
(len(imageUrls), url, prettyMatcherList(self.imageSearch))) (len(urls), url, prettyMatcherList(self.imageSearch)))
image = imageUrls[0] image = urls[0]
out.warn(u"Choosing image %s" % image) out.warn("Choosing image %s" % image)
imageUrls = (image,) urls = (image,)
elif not imageUrls: elif not urls:
out.warn(u"Found no images at %s with expressions %s" % (url, out.warn("Found no images at %s with expressions %s" % (url,
prettyMatcherList(self.imageSearch))) prettyMatcherList(self.imageSearch)))
if self.textSearch: if self.textSearch:
text = self.fetchText(url, data, self.textSearch, text = self.fetchText(url, data, self.textSearch,
optional=self.textOptional) optional=self.textOptional)
else: else:
text = None text = None
return ComicStrip(self, url, imageUrls, text=text) return ComicStrip(self, url, urls, text=text)
def getStrips(self, maxstrips=None): def getStrips(self, maxstrips=None):
"""Get comic strips.""" """Get comic strips."""
@ -217,7 +217,7 @@ class Scraper:
break break
url = prevUrl url = prevUrl
def isfirststrip(self, url): def isfirststrip(self, url: str) -> bool:
"""Check if the specified URL is the first strip of a comic. This is """Check if the specified URL is the first strip of a comic. This is
specially for comics taken from archive.org, since the base URL of specially for comics taken from archive.org, since the base URL of
archive.org changes whenever pages are taken from a different archive.org changes whenever pages are taken from a different
@ -228,7 +228,7 @@ class Scraper:
currenturl = ARCHIVE_ORG_URL.sub('', url) currenturl = ARCHIVE_ORG_URL.sub('', url)
return firsturl == currenturl return firsturl == currenturl
def getPrevUrl(self, url, data): def getPrevUrl(self, url: str, data) -> str | None:
"""Find previous URL.""" """Find previous URL."""
prevUrl = None prevUrl = None
if self.prevSearch: if self.prevSearch:
@ -243,40 +243,40 @@ class Scraper:
getHandler().comicPageLink(self, url, prevUrl) getHandler().comicPageLink(self, url, prevUrl)
return prevUrl return prevUrl
def getIndexStripUrl(self, index): def getIndexStripUrl(self, index: str) -> str:
"""Get comic strip URL from index.""" """Get comic strip URL from index."""
return self.stripUrl % index return self.stripUrl % index
def starter(self): def starter(self) -> str:
"""Get starter URL from where to scrape comic strips.""" """Get starter URL from where to scrape comic strips."""
return self.url return self.url
def namer(self, image_url, page_url): def namer(self, image_url: str, page_url: str) -> str | None:
"""Return filename for given image and page URL.""" """Return filename for given image and page URL."""
return return
def link_modifier(self, fromurl, tourl): def link_modifier(self, fromurl: str, tourl: str) -> str:
"""Optional modification of parsed link (previous/back/latest) URLs. """Optional modification of parsed link (previous/back/latest) URLs.
Useful if there are domain redirects. The default implementation does Useful if there are domain redirects. The default implementation does
not modify the URL. not modify the URL.
""" """
return tourl return tourl
def imageUrlModifier(self, image_url, data): def imageUrlModifier(self, image_url: str, data) -> str:
"""Optional modification of parsed image URLs. Useful if the URL """Optional modification of parsed image URLs. Useful if the URL
needs to be fixed before usage. The default implementation does needs to be fixed before usage. The default implementation does
not modify the URL. The given data is the URL page data. not modify the URL. The given data is the URL page data.
""" """
return image_url return image_url
def vote(self): def vote(self) -> None:
"""Cast a public vote for this comic.""" """Cast a public vote for this comic."""
uid = get_system_uid() uid = get_system_uid()
data = {"name": self.name.replace('/', '_'), "uid": uid} data = {"name": self.name.replace('/', '_'), "uid": uid}
response = self.session.post(configuration.VoteUrl, data=data) response = self.session.post(configuration.VoteUrl, data=data)
response.raise_for_status() response.raise_for_status()
def get_download_dir(self, basepath): def get_download_dir(self, basepath: str) -> str:
"""Try to find the corect download directory, ignoring case """Try to find the corect download directory, ignoring case
differences.""" differences."""
path = basepath path = basepath
@ -294,16 +294,16 @@ class Scraper:
path = os.path.join(path, part) path = os.path.join(path, part)
return path return path
def getCompleteFile(self, basepath): def getCompleteFile(self, basepath: str) -> str:
"""Get filename indicating all comics are downloaded.""" """Get filename indicating all comics are downloaded."""
dirname = self.get_download_dir(basepath) dirname = self.get_download_dir(basepath)
return os.path.join(dirname, "complete.txt") return os.path.join(dirname, "complete.txt")
def isComplete(self, basepath): def isComplete(self, basepath: str) -> bool:
"""Check if all comics are downloaded.""" """Check if all comics are downloaded."""
return os.path.isfile(self.getCompleteFile(basepath)) return os.path.isfile(self.getCompleteFile(basepath))
def setComplete(self, basepath): def setComplete(self, basepath: str) -> None:
"""Set complete flag for this comic, ie. all comics are downloaded.""" """Set complete flag for this comic, ie. all comics are downloaded."""
if self.endOfLife: if self.endOfLife:
filename = self.getCompleteFile(basepath) filename = self.getCompleteFile(basepath)
@ -521,15 +521,10 @@ class ParserScraper(Scraper):
return text.strip() return text.strip()
def _matchPattern(self, data, patterns): def _matchPattern(self, data, patterns):
if self.css:
searchFun = data.cssselect
else:
def searchFun(s):
return data.xpath(s, namespaces=NS)
patterns = makeSequence(patterns) patterns = makeSequence(patterns)
for search in patterns: for search in patterns:
matched = False matched = False
for match in searchFun(search): for match in self.match(data, search):
matched = True matched = True
yield match, search yield match, search
@ -537,6 +532,13 @@ class ParserScraper(Scraper):
# do not search other links if one pattern matched # do not search other links if one pattern matched
break break
def match(self, data, pattern):
"""Match a pattern (XPath/CSS) against a page."""
if self.css:
return data.cssselect(pattern)
else:
return data.xpath(pattern, namespaces=NS)
def getDisabledReasons(self): def getDisabledReasons(self):
res = {} res = {}
if self.css and cssselect is None: if self.css and cssselect is None:

View file

@ -17,7 +17,6 @@ classifiers = [
"Programming Language :: Python", "Programming Language :: Python",
"Programming Language :: Python :: 3", "Programming Language :: Python :: 3",
"Programming Language :: Python :: 3 :: Only", "Programming Language :: Python :: 3 :: Only",
"Programming Language :: Python :: 3.7",
"Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.10",
@ -27,15 +26,13 @@ classifiers = [
"Topic :: Multimedia :: Graphics", "Topic :: Multimedia :: Graphics",
] ]
keywords = ["comic", "webcomic", "downloader", "archiver", "crawler"] keywords = ["comic", "webcomic", "downloader", "archiver", "crawler"]
requires-python = ">=3.7" requires-python = ">=3.8"
dependencies = [ dependencies = [
"colorama", "colorama",
"imagesize", "imagesize",
"lxml>=4.0.0", "lxml>=4.0.0",
"platformdirs", "platformdirs",
"requests>=2.0", "requests>=2.0",
"cached_property;python_version<'3.8'",
"importlib_metadata;python_version<'3.8'",
"importlib_resources>=5.0.0;python_version<'3.9'", "importlib_resources>=5.0.0;python_version<'3.9'",
] ]
dynamic = ["version"] dynamic = ["version"]
@ -101,7 +98,7 @@ ignore = [
] ]
noqa-require-code = true noqa-require-code = true
no-accept-encodings = true no-accept-encodings = true
min-version = "3.7" min-version = "3.8"
extend-exclude = [ extend-exclude = [
'.venv', '.venv',
'build', 'build',

View file

@ -1,7 +1,7 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
# SPDX-License-Identifier: MIT # SPDX-License-Identifier: MIT
# Copyright (C) 2019-2022 Tobias Gruetzmacher # SPDX-FileCopyrightText: © 2019 Tobias Gruetzmacher
# Copyright (C) 2019 Thomas W. Littauer # SPDX-FileCopyrightText: © 2019 Thomas W. Littauer
""" """
Script to get a list of comicskingdom.com comics and save the info in a JSON Script to get a list of comicskingdom.com comics and save the info in a JSON
file for further processing. file for further processing.
@ -19,39 +19,17 @@ class ComicsKingdomUpdater(ComicListUpdater):
"ComicGenesis/%s", "ComicGenesis/%s",
) )
def handle_startpage(self, page): def handle_listing(self, page):
"""Parse list of comics from the bottom of the start page.""" for link in page.xpath('//ul[d:class("index")]//a', namespaces=NS):
for li in page.xpath('//div[d:class("comics-list")]//li', namespaces=NS): name = link.text_content().removeprefix('The ')
link = li.xpath('./a')[0]
url = link.attrib['href'] url = link.attrib['href']
name = link.text.removeprefix('The ') lang = 'es' if ' (Spanish)' in name else None
self.add_comic(name, (url, None)) self.add_comic(name, (url, lang))
def handle_listing(self, page, lang: str = None, add: str = ''):
hasnew = True
while hasnew:
hasnew = False
for comicdiv in page.xpath('//div[d:class("tile")]', namespaces=NS):
nametag = comicdiv.xpath('./a/comic-name')
if len(nametag) == 0:
continue
name = nametag[0].text.removeprefix('The ') + add
url = comicdiv.xpath('./a')[0].attrib['href']
if self.add_comic(name, (url, lang)):
hasnew = True
nextlink = page.xpath('//a[./img[contains(@src, "page-right")]]')
page = self.get_url(nextlink[0].attrib['href'])
def collect_results(self): def collect_results(self):
"""Parse all search result pages.""" """Parse all search result pages."""
page = self.get_url('https://www.comicskingdom.com/') self.handle_listing(self.get_url('https://comicskingdom.com/features'))
self.handle_startpage(page)
self.handle_listing(page)
self.handle_listing(self.get_url('https://www.comicskingdom.com/spanish'), 'es', 'Spanish')
def get_entry(self, name: str, data: tuple[str, str]): def get_entry(self, name: str, data: tuple[str, str]):
opt = f", lang='{data[1]}'" if data[1] else '' opt = f", lang='{data[1]}'" if data[1] else ''

View file

@ -1,28 +1,30 @@
# SPDX-License-Identifier: MIT # SPDX-License-Identifier: MIT
# Copyright (C) 2017-2020 Tobias Gruetzmacher # SPDX-FileCopyrightText: © 2017 Tobias Gruetzmacher
import re
from importlib import metadata
# Idea from # Idea from
# https://github.com/pyinstaller/pyinstaller/wiki/Recipe-Setuptools-Entry-Point, # https://github.com/pyinstaller/pyinstaller/wiki/Recipe-Setuptools-Entry-Point,
# but with importlib # but with importlib
def Entrypoint(group, name, **kwargs): def entrypoint(group, name, **kwargs):
import re
try:
from importlib.metadata import entry_points
except ImportError:
from importlib_metadata import entry_points
# get the entry point # get the entry point
eps = entry_points()[group] eps = metadata.entry_points()
ep = next(ep for ep in eps if ep.name == name) if 'select' in dir(eps):
module, attr = re.split(r'\s*:\s*', ep.value, 1) # modern
ep = eps.select(group=group)[name]
else:
# legacy (pre-3.10)
ep = next(ep for ep in eps[group] if ep.name == name)
module, attr = re.split(r'\s*:\s*', ep.value, maxsplit=1)
# script name must not be a valid module name to avoid name clashes on import # script name must not be a valid module name to avoid name clashes on import
script_path = os.path.join(workpath, name + '-script.py') script_path = os.path.join(workpath, name + '-script.py')
print("creating script for entry point", group, name) print("creating script for entry point", group, name)
with open(script_path, 'w') as fh: with open(script_path, mode='w', encoding='utf-8') as fh:
print("import sys", file=fh) print("import sys", file=fh)
print("import", module, file=fh) print("import", module, file=fh)
print("sys.exit(%s.%s())" % (module, attr), file=fh) print(f"sys.exit({module}.{attr}())", file=fh)
return Analysis( return Analysis(
[script_path] + kwargs.get('scripts', []), [script_path] + kwargs.get('scripts', []),
@ -30,7 +32,7 @@ def Entrypoint(group, name, **kwargs):
) )
a = Entrypoint('console_scripts', 'dosage') a = entrypoint('console_scripts', 'dosage')
a.binaries = [x for x in a.binaries if not x[1].lower().startswith(r'c:\windows')] a.binaries = [x for x in a.binaries if not x[1].lower().startswith(r'c:\windows')]

View file

@ -1,8 +1,8 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
# SPDX-License-Identifier: MIT # SPDX-License-Identifier: MIT
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs # SPDX-FileCopyrightText: © 2004 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam # SPDX-FileCopyrightText: © 2012 Bastian Kleineidam
# Copyright (C) 2015-2022 Tobias Gruetzmacher # SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher
""" """
Script to get a list of gocomics and save the info in a JSON file for further Script to get a list of gocomics and save the info in a JSON file for further
processing. processing.
@ -20,6 +20,8 @@ class GoComicsUpdater(ComicListUpdater):
excluded_comics = ( excluded_comics = (
# too short # too short
'LukeyMcGarrysTLDR', 'LukeyMcGarrysTLDR',
# Has its own module
'Widdershins',
) )
def handle_gocomics(self, url, outercss='a.gc-blended-link', lang=None): def handle_gocomics(self, url, outercss='a.gc-blended-link', lang=None):

View file

@ -61,7 +61,10 @@ def create_symlinks(d):
else: else:
order.extend(data["pages"][work]["images"].values()) order.extend(data["pages"][work]["images"].values())
if "prev" in data["pages"][work]: if "prev" in data["pages"][work]:
work = data["pages"][work]["prev"] if data["pages"][work]["prev"] == work:
work = None
else:
work = data["pages"][work]["prev"]
else: else:
work = None work = None
order.reverse() order.reverse()

View file

@ -3,12 +3,15 @@
# Copyright (C) 2012-2014 Bastian Kleineidam # Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2022 Tobias Gruetzmacher # Copyright (C) 2015-2022 Tobias Gruetzmacher
import re import re
from operator import attrgetter
import pytest
from dosagelib.scraper import scrapers from dosagelib.scraper import scrapers
from dosagelib.plugins import old from dosagelib.plugins import old
class TestComicNames(object): class TestComicNames:
def test_names(self): def test_names(self):
for scraperobj in scrapers.all(): for scraperobj in scrapers.all():
@ -20,11 +23,11 @@ class TestComicNames(object):
comicname = name comicname = name
assert re.sub("[^0-9a-zA-Z_]", "", comicname) == comicname assert re.sub("[^0-9a-zA-Z_]", "", comicname) == comicname
def test_renamed(self): @pytest.mark.parametrize(('scraperobj'),
for scraperobj in scrapers.all(include_removed=True): [obj for obj in scrapers.all(include_removed=True)
if not isinstance(scraperobj, old.Renamed): if isinstance(obj, old.Renamed)], ids=attrgetter('name'))
continue def test_renamed(self, scraperobj):
assert len(scraperobj.getDisabledReasons()) > 0 assert len(scraperobj.getDisabledReasons()) > 0
# Renamed scraper should only point to an non-disabled scraper # Renamed scraper should only point to an non-disabled scraper
newscraper = scrapers.find(scraperobj.newname) newscraper = scrapers.find(scraperobj.newname)
assert len(newscraper.getDisabledReasons()) == 0 assert len(newscraper.getDisabledReasons()) == 0

View file

@ -1,9 +1,9 @@
# SPDX-License-Identifier: MIT # SPDX-License-Identifier: MIT
# Copyright (C) 2019 Tobias Gruetzmacher # SPDX-FileCopyrightText: © 2019 Tobias Gruetzmacher
from dosagelib.helpers import joinPathPartsNamer, queryNamer from dosagelib.helpers import joinPathPartsNamer, queryNamer
class TestNamer(object): class TestNamer:
""" """
Tests for comic namer. Tests for comic namer.
""" """
@ -16,6 +16,8 @@ class TestNamer(object):
def test_joinPathPartsNamer(self): def test_joinPathPartsNamer(self):
imgurl = 'https://HOST/wp-content/uploads/2019/02/tennis5wp-1.png' imgurl = 'https://HOST/wp-content/uploads/2019/02/tennis5wp-1.png'
pageurl = 'https://HOST/2019/03/11/12450/' pageurl = 'https://HOST/2019/03/11/12450/'
assert joinPathPartsNamer((0, 1, 2))(self, imgurl, pageurl) == '2019_03_11_tennis5wp-1.png' assert joinPathPartsNamer(pageparts=(0, 1, 2), imageparts=(-1,))(self,
assert joinPathPartsNamer((0, 1, 2), (-1,), '-')(self, imgurl, pageurl) == '2019-03-11-tennis5wp-1.png' imgurl, pageurl) == '2019_03_11_tennis5wp-1.png'
assert joinPathPartsNamer((0, -2), ())(self, imgurl, pageurl) == '2019_12450' assert joinPathPartsNamer(pageparts=(0, 1, 2), imageparts=(-1,), joinchar='-')(self,
imgurl, pageurl) == '2019-03-11-tennis5wp-1.png'
assert joinPathPartsNamer(pageparts=(0, -2))(self, imgurl, pageurl) == '2019_12450'

View file

@ -1,10 +1,9 @@
[tox] [tox]
envlist = py37, py38, py39, py310, py311, py312, flake8 envlist = py38, py39, py310, py311, py312, flake8
isolated_build = True isolated_build = True
[gh-actions] [gh-actions]
python = python =
3.7: py37
3.8: py38 3.8: py38
3.9: py39 3.9: py39
3.10: py310 3.10: py310