Merge remote-tracking branch 'vendor/master'

This commit is contained in:
D. Moonfire 2024-07-02 18:57:33 -05:00
commit bf9e7d2760
44 changed files with 517 additions and 568 deletions

View file

@ -13,7 +13,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"]
python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
steps:
- uses: actions/checkout@v4
@ -32,7 +32,7 @@ jobs:
if: ${{ matrix.python-version != env.DEFAULT_PYTHON }}
- name: Test with tox (and upload coverage)
uses: paambaati/codeclimate-action@v5.0.0
uses: paambaati/codeclimate-action@v8.0.0
if: ${{ matrix.python-version == env.DEFAULT_PYTHON }}
env:
CC_TEST_REPORTER_ID: 2a411f596959fc32f5d73f3ba7cef8cc4d5733299d742dbfc97fd6c190b9010c
@ -42,6 +42,6 @@ jobs:
${{ github.workspace }}/.tox/reports/*/coverage.xml:coverage.py
prefix: ${{ github.workspace }}/.tox/py39/lib/python3.9/site-packages
- uses: codecov/codecov-action@v3
- uses: codecov/codecov-action@v4
with:
directory: '.tox/reports'

View file

@ -5,12 +5,19 @@ on:
push:
branches:
- master
workflow_dispatch:
permissions:
contents: write
contents: read
pages: write
id-token: write
concurrency:
group: "pages"
cancel-in-progress: false
jobs:
deploy:
build:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
@ -28,10 +35,24 @@ jobs:
pip install wheel
pip install git+https://github.com/spanezz/staticsite.git@v2.3
ssite build --output public
cd public
rm -rf Jenkinsfile dosagelib scripts tests
- name: Deploy
uses: peaceiris/actions-gh-pages@v3
- name: Setup Pages
id: pages
uses: actions/configure-pages@v5
- name: Upload artifact
uses: actions/upload-pages-artifact@v3
with:
cname: dosage.rocks
github_token: ${{ secrets.GITHUB_TOKEN }}
exclude_assets: 'Jenkinsfile,dosagelib,scripts,setup.*,tests,*.ini'
path: public
deploy:
environment:
name: github-pages
url: ${{ steps.deployment.outputs.page_url }}
runs-on: ubuntu-latest
needs: build
steps:
- name: Deploy to GitHub Pages
id: deployment
uses: actions/deploy-pages@v4

View file

@ -1,6 +1,6 @@
Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
Copyright (C) 2012-2014 Bastian Kleineidam
Copyright (C) 2015-2022 Tobias Gruetzmacher
Copyright (C) 2015-2024 Tobias Gruetzmacher
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the

3
Jenkinsfile vendored
View file

@ -4,7 +4,6 @@ def pys = [
[name: 'Python 3.10', docker: '3.10-bookworm', tox:'py310', main: false],
[name: 'Python 3.9', docker: '3.9-bookworm', tox:'py39', main: false],
[name: 'Python 3.8', docker: '3.8-bookworm', tox:'py38', main: false],
[name: 'Python 3.7', docker: '3.7-bookworm', tox:'py37', main: false],
]
properties([
@ -75,7 +74,7 @@ pys.each { py ->
parallel(tasks)
parallel modern: {
stage('Modern Windows binary') {
windowsBuild('3.11', 'dosage.exe')
windowsBuild('3.12', 'dosage.exe')
}
},
legacy: {

View file

@ -1,9 +1,9 @@
# Dosage
[![Tests](https://github.com/webcomics/dosage/actions/workflows/test.yml/badge.svg)](https://github.com/webcomics/dosage/actions/workflows/test.yml)
[![CI](https://github.com/webcomics/dosage/actions/workflows/ci.yaml/badge.svg)](https://github.com/webcomics/dosage/actions/workflows/ci.yaml)
[![Code Climate](https://codeclimate.com/github/webcomics/dosage/badges/gpa.svg)](https://codeclimate.com/github/webcomics/dosage)
[![codecov](https://codecov.io/gh/webcomics/dosage/branch/master/graph/badge.svg)](https://codecov.io/gh/webcomics/dosage)
![Maintenance](https://img.shields.io/maintenance/yes/2023.svg)
![Maintenance](https://img.shields.io/maintenance/yes/2024.svg)
![License](https://img.shields.io/github/license/webcomics/dosage)
Dosage is designed to keep a local copy of specific webcomics and other
@ -72,7 +72,7 @@ are old enough to view them.
### Dependencies
Since dosage is written in [Python](http://www.python.org/), a Python
installation is required: Dosage needs at least Python 3.7. Dosage requires
installation is required: Dosage needs at least Python 3.8. Dosage requires
some Python modules from PyPI, so installation with `pip` is recommended.
### Using the Windows binary

View file

@ -1,7 +1,7 @@
# SPDX-License-Identifier: MIT
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2019 Tobias Gruetzmacher
# SPDX-FileCopyrightText: © 2004 Tristan Seligmann and Jonathan Jacobs
# SPDX-FileCopyrightText: © 2012 Bastian Kleineidam
# SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher
"""
Automated comic downloader. Dosage traverses comic websites in
order to download each strip of the comic. The intended use is for
@ -14,14 +14,11 @@ The primary interface is the 'dosage' commandline script.
Comic modules for each comic are located in L{dosagelib.plugins}.
"""
try:
from importlib.metadata import version, PackageNotFoundError
except ImportError:
from importlib_metadata import version, PackageNotFoundError
from importlib.metadata import version, PackageNotFoundError
from .output import out
AppName = u'dosage'
AppName = 'dosage'
try:
__version__ = version(AppName) # PEP 396
except PackageNotFoundError:

View file

@ -1,12 +1,15 @@
# SPDX-License-Identifier: MIT
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2016 Tobias Gruetzmacher
# SPDX-FileCopyrightText: © 2004 Tristan Seligmann and Jonathan Jacobs
# SPDX-FileCopyrightText: © 2012 Bastian Kleineidam
# SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher
from __future__ import annotations
import os
import glob
import codecs
import contextlib
from datetime import datetime
from typing import Iterator
from .output import out
from .util import unquote, getFilename, urlopen, strsize
@ -14,27 +17,27 @@ from .events import getHandler
# Maximum content size for images
MaxImageBytes = 1024 * 1024 * 20 # 20 MB
MAX_IMAGE_BYTES = 1024 * 1024 * 20 # 20 MB
# RFC 1123 format, as preferred by RFC 2616
RFC_1123_DT_STR = "%a, %d %b %Y %H:%M:%S GMT"
class ComicStrip(object):
class ComicStrip:
"""A list of comic image URLs."""
def __init__(self, scraper, strip_url, image_urls, text=None):
def __init__(self, scraper, strip_url: str, image_urls: str, text=None) -> None:
"""Store the image URL list."""
self.scraper = scraper
self.strip_url = strip_url
self.image_urls = image_urls
self.text = text
def getImages(self):
def getImages(self) -> Iterator[ComicImage]:
"""Get a list of image downloaders."""
for image_url in self.image_urls:
yield self.getDownloader(image_url)
def getDownloader(self, url):
def getDownloader(self, url: str) -> ComicImage:
"""Get an image downloader."""
filename = self.scraper.namer(url, self.strip_url)
if filename is None:
@ -43,7 +46,7 @@ class ComicStrip(object):
text=self.text)
class ComicImage(object):
class ComicImage:
"""A comic image downloader."""
ChunkBytes = 1024 * 100 # 100KB
@ -64,7 +67,7 @@ class ComicImage(object):
headers['If-Modified-Since'] = lastchange.strftime(RFC_1123_DT_STR)
self.urlobj = urlopen(self.url, self.scraper.session,
referrer=self.referrer,
max_content_bytes=MaxImageBytes, stream=True,
max_content_bytes=MAX_IMAGE_BYTES, stream=True,
headers=headers)
if self.urlobj.status_code == 304: # Not modified
return

View file

@ -1,39 +1,49 @@
# SPDX-License-Identifier: MIT
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2020 Tobias Gruetzmacher
# Copyright (C) 2019-2020 Daniel Ring
# SPDX-FileCopyrightText: © 2004 Tristan Seligmann and Jonathan Jacobs
# SPDX-FileCopyrightText: © 2012 Bastian Kleineidam
# SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher
# SPDX-FileCopyrightText: © 2019 Daniel Ring
from __future__ import annotations
from typing import Protocol
from .util import getQueryParams
from .scraper import Scraper
def queryNamer(param, use_page_url=False):
class Namer(Protocol):
"""A protocol for generic callbacks to name web comic images."""
def __call__(_, self: Scraper, image_url: str, page_url: str) -> str | None:
...
def queryNamer(param, use_page_url=False) -> Namer:
"""Get name from URL query part."""
def _namer(self, image_url, page_url):
def _namer(self, image_url: str, page_url: str) -> str | None:
"""Get URL query part."""
url = page_url if use_page_url else image_url
return getQueryParams(url)[param][0]
return _namer
def regexNamer(regex, use_page_url=False):
def regexNamer(regex, use_page_url=False) -> Namer:
"""Get name from regular expression."""
def _namer(self, image_url, page_url):
def _namer(self, image_url: str, page_url: str) -> str | None:
"""Get first regular expression group."""
url = page_url if use_page_url else image_url
mo = regex.search(url)
if mo:
return mo.group(1)
return mo.group(1) if mo else None
return _namer
def joinPathPartsNamer(pageurlparts, imageurlparts=(-1,), joinchar='_'):
def joinPathPartsNamer(pageparts=(), imageparts=(), joinchar='_') -> Namer:
"""Get name by mashing path parts together with underscores."""
def _namer(self, imageurl, pageurl):
def _namer(self: Scraper, image_url: str, page_url: str) -> str | None:
# Split and drop host name
pageurlsplit = pageurl.split('/')[3:]
imageurlsplit = imageurl.split('/')[3:]
joinparts = ([pageurlsplit[i] for i in pageurlparts] +
[imageurlsplit[i] for i in imageurlparts])
pagesplit = page_url.split('/')[3:]
imagesplit = image_url.split('/')[3:]
joinparts = ([pagesplit[i] for i in pageparts] +
[imagesplit[i] for i in imageparts])
return joinchar.join(joinparts)
return _namer

View file

@ -1,18 +1,18 @@
# SPDX-License-Identifier: MIT
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2022 Tobias Gruetzmacher
# Copyright (C) 2019-2020 Daniel Ring
# SPDX-FileCopyrightText: © 2004 Tristan Seligmann and Jonathan Jacobs
# SPDX-FileCopyrightText: © 2012 Bastian Kleineidam
# SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher
# SPDX-FileCopyrightText: © 2019 Daniel Ring
from re import compile, escape, sub, MULTILINE
from ..util import tagre
from ..scraper import BasicScraper, ParserScraper, _BasicScraper, _ParserScraper
from ..helpers import regexNamer, bounceStarter, indirectStarter
from ..scraper import ParserScraper, _BasicScraper, _ParserScraper
from ..helpers import joinPathPartsNamer, bounceStarter, indirectStarter
from .common import WordPressScraper, WordPressNavi, WordPressWebcomic
class AbstruseGoose(_ParserScraper):
url = 'https://abstrusegoose.com/'
class AbstruseGoose(ParserScraper):
url = 'https://web.archive.org/web/20230930172141/https://abstrusegoose.com/'
starter = bounceStarter
stripUrl = url + '%s'
firstStripUrl = stripUrl % '1'
@ -41,24 +41,16 @@ class AbsurdNotions(_BasicScraper):
help = 'Index format: n (unpadded)'
class AcademyVale(_BasicScraper):
url = 'http://www.imagerie.com/vale/'
stripUrl = url + 'avarch.cgi?%s'
firstStripUrl = stripUrl % '001'
imageSearch = compile(tagre('img', 'src', r'(avale\d{4}-\d{2}\.gif)'))
prevSearch = compile(tagre('a', 'href', r'(avarch[^">]+)', quote="") +
tagre('img', 'src', r'AVNavBack\.gif'))
help = 'Index format: nnn'
class Achewood(_ParserScraper):
url = 'https://www.achewood.com/'
stripUrl = url + 'index.php?date=%s'
firstStripUrl = stripUrl % '10012001'
imageSearch = '//p[@id="comic_body"]//img'
prevSearch = '//span[d:class("left")]/a[d:class("dateNav")]'
help = 'Index format: mmddyyyy'
namer = regexNamer(compile(r'date=(\d+)'))
class Achewood(ParserScraper):
baseUrl = 'https://achewood.com/'
stripUrl = baseUrl + '%s/title.html'
url = stripUrl % '2016/12/25'
firstStripUrl = stripUrl % '2001/10/01'
imageSearch = '//img[d:class("comicImage")]'
prevSearch = '//a[d:class("comic_prev")]'
namer = joinPathPartsNamer(pageparts=range(0, 2))
help = 'Index format: yyyy/mm/dd'
endOfLife = True
class AdventuresOfFifne(_ParserScraper):
@ -117,12 +109,8 @@ class AhoiPolloi(_ParserScraper):
help = 'Index format: yyyymmdd'
class AhoyEarth(WordPressNavi):
url = 'http://www.ahoyearth.com/'
class AirForceBlues(WordPressScraper):
url = 'http://farvatoons.com/'
url = 'https://web.archive.org/web/20210102113825/http://farvatoons.com/'
firstStripUrl = url + 'comic/in-texas-there-are-texans/'
@ -235,14 +223,11 @@ class AltermetaOld(_ParserScraper):
help = 'Index format: n (unpadded)'
class AmazingSuperPowers(_BasicScraper):
url = 'http://www.amazingsuperpowers.com/'
rurl = escape(url)
class AmazingSuperPowers(WordPressNavi):
url = 'https://www.amazingsuperpowers.com/'
stripUrl = url + '%s/'
firstStripUrl = stripUrl % '2007/09/heredity'
imageSearch = compile(tagre("img", "src", r'(%scomics/[^"]+)' % rurl))
prevSearch = compile(tagre("a", "href", r'(%s[^"]+)' % rurl, after="prev"))
help = 'Index format: yyyy/mm/name'
imageSearch = '//div[d:class("comicpane")]/img'
def shouldSkipUrl(self, url, data):
"""Skip pages without images."""
@ -271,19 +256,7 @@ class Amya(WordPressScraper):
url = 'http://www.amyachronicles.com/'
class Anaria(_ParserScraper):
url = 'https://www.leahbriere.com/anaria-the-witchs-dream/'
firstStripUrl = url
imageSearch = '//div[contains(@class, "gallery")]//a'
multipleImagesPerStrip = True
endOfLife = True
def namer(self, imageUrl, pageUrl):
filename = imageUrl.rsplit('/', 1)[-1]
return filename.replace('00.jpg', 'new00.jpg').replace('new', '1')
class Angband(_ParserScraper):
class Angband(ParserScraper):
url = 'http://angband.calamarain.net/'
stripUrl = url + '%s'
imageSearch = '//img'
@ -292,7 +265,7 @@ class Angband(_ParserScraper):
def starter(self):
page = self.getPage(self.url)
self.pages = page.xpath('//p/a[not(contains(@href, "cast"))]/@href')
self.pages = self.match(page, '//p/a[not(contains(@href, "cast"))]/@href')
self.firstStripUrl = self.pages[0]
return self.pages[-1]
@ -300,14 +273,6 @@ class Angband(_ParserScraper):
return self.pages[self.pages.index(url) - 1]
class Angels2200(_BasicScraper):
url = 'http://www.janahoffmann.com/angels/'
stripUrl = url + '%s'
imageSearch = compile(tagre("img", "src", r"(http://www\.janahoffmann\.com/angels/comics/[^']+)", quote="'"))
prevSearch = compile(tagre("a", "href", r'([^"]+)') + "« Previous")
help = 'Index format: yyyy/mm/dd/part-<n>-comic-<n>'
class Annyseed(_ParserScraper):
baseUrl = ('https://web.archive.org/web/20190511031451/'
'http://www.mirrorwoodcomics.com/')
@ -330,7 +295,7 @@ class Annyseed(_ParserScraper):
return tourl
class AntiheroForHire(_ParserScraper):
class AntiheroForHire(ParserScraper):
stripUrl = 'https://www.giantrobot.club/antihero-for-hire/%s'
firstStripUrl = stripUrl % '2016/6/8/entrance-vigil'
url = firstStripUrl
@ -341,7 +306,7 @@ class AntiheroForHire(_ParserScraper):
def starter(self):
# Build list of chapters for navigation
page = self.getPage(self.url)
self.chapters = page.xpath('//ul[@class="archive-group-list"]//a[contains(@class, "archive-item-link")]/@href')
self.chapters = self.match(page, '//ul[d:class("archive-group-list")]//a[d:class("archive-item-link")]/@href')
return self.chapters[0]
def getPrevUrl(self, url, data):
@ -377,7 +342,7 @@ class ArtificialIncident(WordPressWebcomic):
firstStripUrl = stripUrl % 'issue-one-life-changing'
class AstronomyPOTD(_ParserScraper):
class AstronomyPOTD(ParserScraper):
baseUrl = 'http://apod.nasa.gov/apod/'
url = baseUrl + 'astropix.html'
starter = bounceStarter
@ -391,7 +356,7 @@ class AstronomyPOTD(_ParserScraper):
def shouldSkipUrl(self, url, data):
"""Skip pages without images."""
return data.xpath('//iframe') # videos
return self.match(data, '//iframe') # videos
def namer(self, image_url, page_url):
return '%s-%s' % (page_url.split('/')[-1].split('.')[0][2:],

View file

@ -34,11 +34,11 @@ class CaptainSNES(_BasicScraper):
help = 'Index format: yyyy/mm/dd/nnn-stripname'
class CarryOn(_ParserScraper):
class CarryOn(ParserScraper):
url = 'http://www.hirezfox.com/km/co/'
stripUrl = url + 'd/%s.html'
firstStripUrl = stripUrl % '20040701'
imageSearch = '//div[@class="strip"]/img'
imageSearch = '//div[d:class("strip")]/img'
prevSearch = '//a[text()="Previous Day"]'
multipleImagesPerStrip = True
@ -122,13 +122,13 @@ class CatAndGirl(_ParserScraper):
prevSearch = '//a[d:class("pager--prev")]'
class CatenaManor(_ParserScraper):
class CatenaManor(ParserScraper):
baseUrl = ('https://web.archive.org/web/20141027141116/'
'http://catenamanor.com/')
url = baseUrl + 'archives'
stripUrl = baseUrl + '%s/'
firstStripUrl = stripUrl % '2003/07'
imageSearch = '//img[@class="comicthumbnail"]'
imageSearch = '//img[d:class("comicthumbnail")]'
multipleImagesPerStrip = True
endOfLife = True
strips: List[str] = []
@ -136,7 +136,7 @@ class CatenaManor(_ParserScraper):
def starter(self):
# Retrieve archive links and select valid range
archivePage = self.getPage(self.url)
archiveStrips = archivePage.xpath('//div[@id="archivepage"]//a')
archiveStrips = self.match(archivePage, '//div[@id="archivepage"]//a')
valid = False
for link in archiveStrips:
if self.stripUrl % '2012/01' in link.get('href'):
@ -404,7 +404,7 @@ class CrossTimeCafe(_ParserScraper):
class CSectionComics(WordPressScraper):
url = 'https://www.csectioncomics.com/'
firstStripUrl = url + 'comics/one-day-in-country'
namer = joinPathPartsNamer((), (-3, -2, -1))
namer = joinPathPartsNamer(imageparts=(-3, -2, -1))
multipleImagesPerStrip = True
@ -466,7 +466,7 @@ class CyanideAndHappiness(ParserScraper):
prevSearch = '//div[@type="comic"]//a[*[local-name()="svg" and @rotate="180deg"]]'
nextSearch = '//div[@type="comic"]//a[*[local-name()="svg" and @rotate="0deg"]]'
starter = bounceStarter
namer = joinPathPartsNamer((), range(-4, 0))
namer = joinPathPartsNamer(imageparts=range(-4, 0))
class CynWolf(_ParserScraper):

View file

@ -1,8 +1,8 @@
# SPDX-License-Identifier: MIT
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2022 Tobias Gruetzmacher
# Copyright (C) 2019-2020 Daniel Ring
# SPDX-FileCopyrightText: © 2004 Tristan Seligmann and Jonathan Jacobs
# SPDX-FileCopyrightText: © 2012 Bastian Kleineidam
# SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher
# SPDX-FileCopyrightText: © 2019 Daniel Ring
import os
from ..scraper import ParserScraper
@ -79,7 +79,7 @@ class ComicFury(ParserScraper):
num = parts[-1]
if self.multipleImagesPerStrip:
page = self.getPage(pageUrl)
images = page.xpath('//img[@class="comicsegmentimage"]/@src')
images = self.match(page, '//img[d:class("comicsegmentimage")]/@src')
if len(images) > 1:
imageIndex = images.index(imageUrl) + 1
return "%s_%s-%d%s" % (self.prefix, num, imageIndex, ext)
@ -88,8 +88,8 @@ class ComicFury(ParserScraper):
def shouldSkipUrl(self, url, data):
"""Skip pages without images."""
# Videos on Underverse
return (data.xpath('//div[@id="comicimagewrap"]//video') and
not data.xpath('//div[@id="comicimagewrap"]//img'))
return (self.match(data, '//div[@id="comicimagewrap"]//video') and
not self.match(data, '//div[@id="comicimagewrap"]//img'))
@classmethod
def getmodules(cls): # noqa: CFQ001

View file

@ -1,41 +1,35 @@
# SPDX-License-Identifier: MIT
# SPDX-FileCopyrightText: © 2019 Tobias Gruetzmacher
# SPDX-FileCopyrightText: © 2019 Thomas W. Littauer
try:
from importlib_resources import as_file, files
except ImportError:
from importlib.resources import as_file, files
from ..helpers import bounceStarter, joinPathPartsNamer
from ..helpers import indirectStarter
from ..scraper import ParserScraper
class ComicsKingdom(ParserScraper):
imageSearch = '//img[@id="theComicImage"]'
prevSearch = '//a[./img[contains(@alt, "Previous")]]'
nextSearch = '//a[./img[contains(@alt, "Next")]]'
starter = bounceStarter
namer = joinPathPartsNamer((-2, -1), ())
partDiv = '//div[d:class("comic-reader-item")]'
imageSearch = '//meta[@property="og:image"]/@content'
prevSearch = partDiv + '[2]/@data-link'
starter = indirectStarter
help = 'Index format: yyyy-mm-dd'
def __init__(self, name, path, lang=None):
super().__init__('ComicsKingdom/' + name)
self.url = 'https://comicskingdom.com/' + path
self.stripUrl = self.url + '/%s'
self.latestSearch = f'//a[re:test(@href, "/{path}/[0-9-]+$")]'
if lang:
self.lang = lang
def link_modifier(self, fromurl, tourl):
return tourl.replace('//wp.', '//', 1)
@classmethod
def getmodules(cls): # noqa: CFQ001
return (
# Some comics are not listed on the "all" page (too old?)
cls('Retail', 'retail'),
# do not edit anything below since these entries are generated from
# scripts/comicskingdom.py
# START AUTOUPDATE
cls('AmazingSpiderman', 'amazing-spider-man'),
cls('AmazingSpidermanSpanish', 'hombre-arana', lang='es'),
cls('Alice', 'alice'),
cls('Apartment3G', 'apartment-3-g_1'),
cls('ArcticCircle', 'arctic-circle'),
cls('ATodaVelocidadSpanish', 'a-toda-velocidad', lang='es'),
@ -43,22 +37,25 @@ class ComicsKingdom(ParserScraper):
cls('BarneyGoogleAndSnuffySmithSpanish', 'tapon', lang='es'),
cls('BeetleBailey', 'beetle-bailey-1'),
cls('BeetleBaileySpanish', 'beto-el-recluta', lang='es'),
cls('BeetleMoses', 'beetle-moses'),
cls('BetweenFriends', 'between-friends'),
cls('BewareOfToddler', 'beware-of-toddler'),
cls('BigBenBolt', 'big-ben-bolt'),
cls('BigBenBoltSundays', 'big-ben-bolt-sundays'),
cls('Bizarro', 'bizarro'),
cls('Blondie', 'blondie'),
cls('BlondieSpanish', 'pepita', lang='es'),
cls('BobMankoffPresentsShowMeTheFunny', 'show-me-the-funny'),
cls('BobMankoffPresentsShowMeTheFunnyAnimalEdition', 'show-me-the-funny-pets'),
cls('BonersArk', 'boners-ark'),
cls('BonersArkSundays', 'boners-ark-sundays'),
cls('BrianDuffy', 'brian-duffy'),
cls('BreakOfDay', 'break-of-day'),
cls('BrickBradford', 'brick-bradford'),
cls('BrilliantMindOfEdisonLee', 'brilliant-mind-of-edison-lee'),
cls('BringingUpFather', 'bringing-up-father'),
cls('BringingUpFatherSpanish', 'educando-a-papa', lang='es'),
cls('BuzSawyer', 'buz-sawyer'),
cls('Candorville', 'candorville'),
cls('CarpeDiem', 'carpe-diem'),
cls('Crankshaft', 'crankshaft'),
cls('Comiclicious', 'comiclicious'),
cls('Crock', 'crock'),
cls('CrockSpanish', 'crock-spanish', lang='es'),
cls('Curtis', 'curtis'),
@ -67,6 +64,7 @@ class ComicsKingdom(ParserScraper):
cls('DavidMHitch', 'david-m-hitch'),
cls('DennisTheMenace', 'dennis-the-menace'),
cls('DennisTheMenaceSpanish', 'daniel-el-travieso', lang='es'),
cls('Dumplings', 'dumplings'),
cls('Dustin', 'dustin'),
cls('EdGamble', 'ed-gamble'),
# EdgeCity has a duplicate in GoComics/EdgeCity
@ -74,18 +72,15 @@ class ComicsKingdom(ParserScraper):
cls('FamilyCircusSpanish', 'circulo-familiar', lang='es'),
cls('FlashForward', 'flash-forward'),
cls('FlashGordon', 'flash-gordon'),
cls('FlashGordonSundays', 'flash-gordon-sundays'),
cls('FunkyWinkerbean', 'funky-winkerbean'),
cls('FunkyWinkerbeanSunday', 'funky-winkerbean-sundays'),
cls('FunkyWinkerbeanVintage', 'funky-winkerbean-1'),
cls('FunnyOnlineAnimals', 'Funny-Online-Animals'),
cls('GearheadGertie', 'Gearhead-Gertie'),
cls('FunnyOnlineAnimals', 'funny-online-animals'),
cls('GearheadGertie', 'gearhead-gertie'),
cls('GodsHands', 'gods-hands'),
cls('HagarTheHorrible', 'hagar-the-horrible'),
cls('HagarTheHorribleSpanish', 'olafo', lang='es'),
cls('HeartOfJulietJones', 'heart-of-juliet-jones'),
cls('HeartOfJulietJonesSundays', 'heart-of-juliet-jones-sundays'),
cls('HiAndLois', 'hi-and-lois'),
cls('IntelligentLife', 'Intelligent'),
cls('InsanityStreak', 'insanity-streak'),
cls('IntelligentLife', 'intelligent'),
cls('JimmyMargulies', 'jimmy-margulies'),
cls('JohnBranch', 'john-branch'),
cls('JohnnyHazard', 'johnny-hazard'),
@ -93,7 +88,6 @@ class ComicsKingdom(ParserScraper):
cls('JungleJimSundays', 'jungle-jim-sundays'),
cls('KatzenjammerKids', 'katzenjammer-kids'),
cls('KatzenjammerKidsSpanish', 'maldades-de-dos-pilluelos', lang='es'),
cls('KatzenjammerKidsSundays', 'katzenjammer-kids-sundays'),
cls('KevinAndKell', 'kevin-and-kell'),
cls('KingOfTheRoyalMounted', 'king-of-the-royal-mounted'),
cls('KirkWalters', 'kirk-walters'),
@ -101,44 +95,42 @@ class ComicsKingdom(ParserScraper):
cls('LaloYLolaSpanish', 'lalo-y-lola', lang='es'),
cls('LeeJudge', 'lee-judge'),
cls('LegalizationNation', 'legalization-nation'),
cls('LegendOfBill', 'Legend-of-Bill'),
cls('LegendOfBill', 'legend-of-bill'),
cls('LittleIodineSundays', 'little-iodine-sundays'),
cls('LittleKing', 'the-little-king'),
cls('Lockhorns', 'lockhorns'),
cls('Macanudo', 'Macanudo'),
cls('Macanudo', 'macanudo'),
cls('MacanudoSpanish', 'macanudo-spanish', lang='es'),
cls('MallardFillmore', 'mallard-fillmore'),
cls('MandrakeTheMagician', 'mandrake-the-magician-1'),
cls('MandrakeTheMagician', 'mandrake-the-magician'),
cls('MandrakeTheMagicianSpanish', 'mandrake-the-magician-spanish', lang='es'),
cls('MandrakeTheMagicianSundays', 'mandrake-the-magician-sundays'),
cls('MaraLlaveKeeperOfTime', 'mara-llave-keeper-of-time'),
cls('MarkTrail', 'mark-trail'),
cls('MarkTrailSpanish', 'mark-trail-spanish', lang='es'),
cls('MarkTrailVintage', 'Mark-Trail-Vintage'),
cls('Marvin', 'marvin'),
cls('MarvinSpanish', 'marvin-spanish', lang='es'),
cls('MaryWorth', 'mary-worth'),
cls('MaryWorthSpanish', 'maria-de-oro', lang='es'),
cls('MikePeters', 'mike-peters'),
cls('Mazetoons', 'mazetoons'),
cls('MikeShelton', 'mike-shelton'),
cls('MikeSmith', 'mike-smith'),
cls('MooseAndMolly', 'moose-and-molly'),
cls('MooseAndMollySpanish', 'quintin', lang='es'),
cls('MotherGooseAndGrimm', 'mother-goose-grimm'),
cls('MrAbernathySpanish', 'don-abundio', lang='es'),
cls('Mutts', 'mutts'),
cls('MuttsSpanish', 'motas', lang='es'),
cls('NeverBeenDeader', 'never-been-deader'),
cls('OfficeHours', 'office-hours'),
cls('OliveAndPopeye', 'olive-popeye'),
cls('OnTheFastrack', 'on-the-fastrack'),
cls('PajamaDiaries', 'pajama-diaries'),
cls('PardonMyPlanet', 'pardon-my-planet'),
cls('Phantom', 'phantom'),
cls('PhantomSpanish', 'el-fantasma', lang='es'),
cls('PhantomSundays', 'phantom-sundays'),
cls('PlanetSyndicate', 'the_planet_syndicate'),
cls('Popeye', 'popeye'),
cls('PopeyesCartoonClub', 'popeyes-cartoon-club'),
cls('PopeyeSpanish', 'popeye-spanish', lang='es'),
cls('PrinceValiant', 'prince-valiant'),
cls('PrinceValiantSundays', 'prince-valiant-sundays'),
cls('PrincipeValienteSpanish', 'principe-valiente', lang='es'),
cls('ProsAndCons', 'pros-cons'),
cls('Quincy', 'quincy'),
@ -148,7 +140,9 @@ class ComicsKingdom(ParserScraper):
cls('RexMorganMDSpanish', 'rex-morgan-md-spanish', lang='es'),
cls('RhymesWithOrange', 'rhymes-with-orange'),
cls('RipKirby', 'rip-kirby'),
# Rosebuds has a duplicate in GoComics/Rosebuds
cls('SafeHavens', 'safe-havens'),
cls('SagaOfBrannBjornson', 'the-saga-of-brann-bjornson'),
cls('Sales', 'sales'),
cls('SallyForth', 'sally-forth'),
cls('SamAndSilo', 'sam-and-silo'),
@ -156,17 +150,18 @@ class ComicsKingdom(ParserScraper):
cls('SecretAgentX9', 'secret-agent-x-9'),
# Shoe has a duplicate in GoComics/Shoe
cls('SixChix', 'six-chix'),
cls('SlylockFoxAndComicsForKids', 'slylock-fox-and-comics-for-kids'),
cls('SlylockFoxAndComicsForKidsSpanish', 'solo-para-ninos', lang='es'),
cls('SlylockFox', 'slylock-fox-and-comics-for-kids'),
cls('SlylockFoxSpanish', 'solo-para-ninos', lang='es'),
cls('SuburbanFairyTales', 'suburban-fairy-tales'),
cls('TakeItFromTheTinkersons', 'take-it-from-the-tinkersons'),
cls('TheyllDoItEveryTimeSpanish', 'nunca-falta-alguien-asi', lang='es'),
cls('ThimbleTheater', 'thimble-theater'),
cls('Tiger', 'tiger'),
cls('TigerSpanish', 'tigrillo', lang='es'),
cls('TigerVintage', 'tiger-1'),
cls('TigerVintageSundays', 'tiger-sundays'),
cls('TinasGroove', 'tina-s-groove'),
cls('ToddTheDinosaur', 'todd-the-dinosaur'),
cls('WillyBlack', 'willy-black'),
cls('WillyBlacksSpanish', 'willy-black-spanish', lang='es'),
cls('ZippyThePinhead', 'zippy-the-pinhead'),
cls('Zits', 'zits'),
cls('ZitsSpanish', 'jeremias', lang='es'),

View file

@ -1,8 +1,8 @@
# SPDX-License-Identifier: MIT
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2022 Tobias Gruetzmacher
# Copyright (C) 2019-2020 Daniel Ring
# SPDX-FileCopyrightText: © 2004 Tristan Seligmann and Jonathan Jacobs
# SPDX-FileCopyrightText: © 2012 Bastian Kleineidam
# SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher
# SPDX-FileCopyrightText: © 2019 Daniel Ring
from re import compile, escape
from ..scraper import _BasicScraper, _ParserScraper, ParserScraper
@ -328,19 +328,14 @@ class DreamKeepersPrelude(_ParserScraper):
help = 'Index format: n'
class DresdenCodak(_ParserScraper):
class DresdenCodak(ParserScraper):
url = 'http://dresdencodak.com/'
startUrl = url + 'cat/comic/'
firstStripUrl = url + '2007/02/08/pom/'
imageSearch = '//section[d:class("entry-content")]//img[d:class("aligncenter")]'
prevSearch = '//a[img[contains(@src, "prev")]]'
latestSearch = '//a[d:class("tc-grid-bg-link")]'
starter = indirectStarter
# Blog and comic are mixed...
def shouldSkipUrl(self, url, data):
return not data.xpath(self.imageSearch)
class DrFun(_ParserScraper):
baseUrl = ('https://web.archive.org/web/20180726145737/'
@ -355,14 +350,12 @@ class DrFun(_ParserScraper):
help = 'Index format: nnnnn'
class Drive(_BasicScraper):
class Drive(ParserScraper):
url = 'http://www.drivecomic.com/'
rurl = escape(url)
stripUrl = url + 'archive/%s.html'
firstStripUrl = stripUrl % '090815'
imageSearch = compile(tagre("img", "src", r'(http://cdn\.drivecomic\.com/strips/main/[^"]+)'))
prevSearch = compile(tagre("a", "href", r'(%sarchive/\d+\.html)' % rurl) + "Previous")
help = 'Index format: yymmdd'
firstStripUrl = url + 'comic/act-1-pg-001/'
imageSearch = ('//div[@id="unspliced-comic"]//img/@data-src-img',
'//div[@id="unspliced-comic"]//picture//img')
prevSearch = '//a[d:class("previous-comic")]'
class DrMcNinja(_ParserScraper):

View file

@ -1,6 +1,6 @@
# SPDX-License-Identifier: MIT
# Copyright (C) 2019-2022 Tobias Gruetzmacher
# Copyright (C) 2019-2020 Daniel Ring
# SPDX-FileCopyrightText: © 2019 Tobias Gruetzmacher
# SPDX-FileCopyrightText: © 2019 Daniel Ring
from ..scraper import ParserScraper
from ..helpers import indirectStarter
@ -27,7 +27,7 @@ class Derideal(ParserScraper):
def starter(self):
indexPage = self.getPage(self.url)
self.chapters = indexPage.xpath('//a[contains(text(), "Read this episode")]/@href')
self.chapters = self.match(indexPage, '//a[contains(text(), "Read this episode")]/@href')
self.currentChapter = len(self.chapters)
return indirectStarter(self)

View file

@ -113,7 +113,7 @@ class Erfworld(ParserScraper):
def shouldSkipUrl(self, url, data):
"""Skip pages without images."""
return not data.xpath(self.imageSearch)
return not self.match(data, self.imageSearch)
def namer(self, imageUrl, pageUrl):
# Fix inconsistent filenames
@ -167,15 +167,6 @@ class Erstwhile(WordPressNavi):
endOfLife = True
class Everblue(ComicControlScraper):
url = 'http://www.everblue-comic.com/comic/'
stripUrl = url + '%s'
firstStripUrl = stripUrl % '1'
def namer(self, imageUrl, pageUrl):
return imageUrl.rsplit('/', 1)[-1].split('-', 1)[1]
class EverybodyLovesEricRaymond(_ParserScraper):
url = 'http://geekz.co.uk/lovesraymond/'
firstStripUrl = url + 'archive/slashdotted'
@ -190,9 +181,10 @@ class EvilDiva(WordPressScraper):
endOfLife = True
class EvilInc(_ParserScraper):
class EvilInc(ParserScraper):
url = 'https://www.evil-inc.com/'
imageSearch = '//div[@id="unspliced-comic"]/img/@data-src'
imageSearch = ('//div[@id="unspliced-comic"]/img',
'//div[@id="unspliced-comic"]/picture//img')
prevSearch = '//a[./i[d:class("fa-chevron-left")]]'
firstStripUrl = url + 'comic/monday-3/'
@ -263,7 +255,7 @@ class ExtraFabulousComics(WordPressScraper):
return '_'.join((pagepart, imagename))
def shouldSkipUrl(self, url, data):
return data.xpath('//div[@id="comic"]//iframe')
return self.match(data, '//div[@id="comic"]//iframe')
class ExtraLife(_BasicScraper):

View file

@ -140,7 +140,7 @@ class FoxDad(ParserScraper):
def namer(self, imageUrl, pageUrl):
page = self.getPage(pageUrl)
post = page.xpath('//li[@class="timestamp"]/a/@href')[0]
post = self.match(page, '//li[d:class("timestamp")]/a/@href')[0]
post = post.replace('https://foxdad.com/post/', '')
if '-consider-support' in post:
post = post.split('-consider-support')[0]
@ -171,7 +171,7 @@ class Fragile(_ParserScraper):
endOfLife = True
class FredoAndPidjin(_ParserScraper):
class FredoAndPidjin(ParserScraper):
url = 'https://www.pidjin.net/'
stripUrl = url + '%s/'
firstStripUrl = stripUrl % '2006/02/19/goofy-monday'
@ -180,7 +180,7 @@ class FredoAndPidjin(_ParserScraper):
prevSearch = '//span[d:class("prev")]/a'
latestSearch = '//section[d:class("latest")]//a'
starter = indirectStarter
namer = joinPathPartsNamer((0, 1, 2))
namer = joinPathPartsNamer(pageparts=(0, 1, 2), imageparts=(-1,))
class Freefall(_ParserScraper):
@ -216,7 +216,7 @@ class FriendsYouAreStuckWith(WordPressScraper):
def namer(self, imageUrl, pageUrl):
page = self.getPage(pageUrl)
strip = page.xpath('//div[@id="comic-wrap"]/@class')[0].replace('comic-id-', '')
strip = self.match(page, '//div[@id="comic-wrap"]/@class')[0].replace('comic-id-', '')
return strip + '_' + imageUrl.rstrip('/').rsplit('/', 1)[-1]

View file

@ -3,11 +3,11 @@
# SPDX-FileCopyrightText: © 2012 Bastian Kleineidam
# SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher
# SPDX-FileCopyrightText: © 2019 Daniel Ring
from re import compile, escape
from re import compile
from ..scraper import _BasicScraper, _ParserScraper
from ..scraper import _BasicScraper, _ParserScraper, ParserScraper
from ..helpers import indirectStarter
from ..util import tagre
from ..util import tagre, getQueryParams
from .common import ComicControlScraper, WordPressScraper, WordPressNavi
@ -27,13 +27,9 @@ class Garanos(WordPressScraper):
endOfLife = True
class GastroPhobia(_ParserScraper):
url = 'http://www.gastrophobia.com/'
stripUrl = url + 'index.php?date=%s'
firstStripUrl = stripUrl % '2008-07-30'
imageSearch = '//div[@id="comic"]//img'
prevSearch = '//div[@id="prev"]/a'
help = 'Index format: yyyy-mm-dd'
class GastroPhobia(ComicControlScraper):
url = 'https://gastrophobia.com/'
firstStripUrl = url + 'comix/the-mane-event'
class Geeks(_ParserScraper):
@ -51,7 +47,7 @@ class GeeksNextDoor(_ParserScraper):
url = 'http://www.geeksnextcomic.com/'
stripUrl = url + '%s.html'
firstStripUrl = stripUrl % '2007-03-27' # '2010-10-04'
imageSearch = '//p/img'
imageSearch = ('//p/img', '//p/span/img')
prevSearch = (
'//a[img[contains(@src, "/nav_prev")]]',
'//a[contains(text(), "< prev")]', # start page is different
@ -59,16 +55,12 @@ class GeeksNextDoor(_ParserScraper):
help = 'Index format: yyyy-mm-dd'
class GirlGenius(_BasicScraper):
baseUrl = 'http://www.girlgeniusonline.com/'
rurl = escape(baseUrl)
url = baseUrl + 'comic.php'
class GirlGenius(ParserScraper):
url = 'https://www.girlgeniusonline.com/comic.php'
stripUrl = url + '?date=%s'
firstStripUrl = stripUrl % '20021104'
imageSearch = compile(
tagre("img", "src", r"(%sggmain/strips/[^']*)" % rurl, quote="'"))
prevSearch = compile(tagre("a", "id", "topprev", quote="\"",
before=r"(%s[^\"']+)" % rurl))
imageSearch = '//img[@alt="Comic"]'
prevSearch = '//a[@id="topprev"]'
multipleImagesPerStrip = True
help = 'Index format: yyyymmdd'
@ -99,20 +91,18 @@ class GoGetARoomie(ComicControlScraper):
url = 'http://www.gogetaroomie.com'
class GoneWithTheBlastwave(_BasicScraper):
url = 'http://www.blastwave-comic.com/index.php?p=comic&nro=1'
starter = indirectStarter
stripUrl = url[:-1] + '%s'
class GoneWithTheBlastwave(ParserScraper):
stripUrl = 'http://www.blastwave-comic.com/index.php?p=comic&nro=%s'
firstStripUrl = stripUrl % '1'
imageSearch = compile(r'<img.+src=".+(/comics/.+?)"')
prevSearch = compile(r'href="(index.php\?p=comic&amp;nro=\d+)">' +
r'<img src="images/page/default/previous')
latestSearch = compile(r'href="(index.php\?p=comic&amp;nro=\d+)">' +
r'<img src="images/page/default/latest')
url = firstStripUrl
starter = indirectStarter
imageSearch = '//*[@id="comic_ruutu"]/center/img'
prevSearch = '//a[img[contains(@src, "previous")]]'
latestSearch = '//a[img[contains(@src, "latest")]]'
help = 'Index format: n'
def namer(self, image_url, page_url):
return '%02d' % int(compile(r'nro=(\d+)').search(page_url).group(1))
return '%02d' % int(getQueryParams(page_url)['nro'][0])
class GrrlPower(WordPressScraper):
@ -130,13 +120,12 @@ class GuildedAge(WordPressScraper):
firstStripUrl = url + 'comic/chapter-1-cover/'
class GUComics(_BasicScraper):
url = 'http://www.gucomics.com/'
stripUrl = url + '%s'
class GUComics(ParserScraper):
stripUrl = 'https://www.gucomics.com/%s'
url = stripUrl % 'comic/'
firstStripUrl = stripUrl % '20000710'
imageSearch = compile(tagre("img", "src", r'(/comics/\d{4}/gu_[^"]+)'))
prevSearch = compile(tagre("a", "href", r'(/\d+)') +
tagre("img", "src", r'/images/nav/prev\.png'))
imageSearch = '//img[contains(@src, "/comics/2")]'
prevSearch = '//a[img[contains(@alt, "previous")]]'
help = 'Index format: yyyymmdd'

View file

@ -1,7 +1,7 @@
# SPDX-License-Identifier: MIT
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2022 Tobias Gruetzmacher
# SPDX-FileCopyrightText: © 2004 Tristan Seligmann and Jonathan Jacobs
# SPDX-FileCopyrightText: © 2012 Bastian Kleineidam
# SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher
from ..scraper import ParserScraper
from ..helpers import indirectStarter
@ -31,7 +31,7 @@ class GoComics(ParserScraper):
def shouldSkipUrl(self, url, data):
"""Skip pages without images."""
return data.xpath('//img[contains(@src, "content-error-missing")]')
return self.match(data, '//img[contains(@src, "content-error-missing")]')
@classmethod
def getmodules(cls): # noqa: CFQ001
@ -44,7 +44,6 @@ class GoComics(ParserScraper):
# START AUTOUPDATE
cls('1AndDone', '1-and-done'),
cls('9ChickweedLane', '9chickweedlane'),
cls('9ChickweedLaneClassics', '9-chickweed-lane-classics'),
cls('9To5', '9to5'),
cls('Aaggghhh', 'Aaggghhh', 'es'),
cls('AdamAtHome', 'adamathome'),
@ -62,6 +61,7 @@ class GoComics(ParserScraper):
cls('Annie', 'annie'),
cls('AProblemLikeJamal', 'a-problem-like-jamal'),
cls('ArloAndJanis', 'arloandjanis'),
cls('ArtByMoga', 'artbymoga'),
cls('AskShagg', 'askshagg'),
cls('AtTavicat', 'tavicat'),
cls('AuntyAcid', 'aunty-acid'),
@ -69,7 +69,6 @@ class GoComics(ParserScraper):
cls('BackInTheDay', 'backintheday'),
cls('BackToBC', 'back-to-bc'),
cls('Bacon', 'bacon'),
cls('Badlands', 'badlands'),
cls('BadMachinery', 'bad-machinery'),
cls('Baldo', 'baldo'),
cls('BaldoEnEspanol', 'baldoespanol', 'es'),
@ -90,8 +89,8 @@ class GoComics(ParserScraper):
cls('Betty', 'betty'),
cls('BFGFSyndrome', 'bfgf-syndrome'),
cls('BigNate', 'bignate'),
cls('BigNateFirstClass', 'big-nate-first-class'),
cls('BigTop', 'bigtop'),
cls('BillBramhall', 'bill-bramhall'),
cls('BirdAndMoon', 'bird-and-moon'),
cls('Birdbrains', 'birdbrains'),
cls('BleekerTheRechargeableDog', 'bleeker'),
@ -99,14 +98,14 @@ class GoComics(ParserScraper):
cls('BloomCounty', 'bloomcounty'),
cls('BloomCounty2019', 'bloom-county'),
cls('BobGorrell', 'bobgorrell'),
cls('BobTheAngryFlower', 'bob-the-angry-flower'),
cls('BobTheSquirrel', 'bobthesquirrel'),
cls('BoNanas', 'bonanas'),
cls('Boomerangs', 'boomerangs'),
cls('Bottomliners', 'bottomliners'),
cls('BottomLiners', 'bottomliners'),
cls('BoundAndGagged', 'boundandgagged'),
cls('Bozo', 'bozo'),
cls('BreakingCatNews', 'breaking-cat-news'),
cls('BreakOfDay', 'break-of-day'),
cls('Brevity', 'brevity'),
cls('BrewsterRockit', 'brewsterrockit'),
cls('BrianMcFadden', 'brian-mcfadden'),
@ -116,7 +115,6 @@ class GoComics(ParserScraper):
cls('Buni', 'buni'),
cls('CalvinAndHobbes', 'calvinandhobbes'),
cls('CalvinAndHobbesEnEspanol', 'calvinandhobbesespanol', 'es'),
cls('Candorville', 'candorville'),
cls('CatanaComics', 'little-moments-of-love'),
cls('CathyClassics', 'cathy'),
cls('CathyCommiserations', 'cathy-commiserations'),
@ -139,17 +137,18 @@ class GoComics(ParserScraper):
cls('CowAndBoyClassics', 'cowandboy'),
cls('CowTown', 'cowtown'),
cls('Crabgrass', 'crabgrass'),
# Crankshaft has a duplicate in ComicsKingdom/Crankshaft
cls('Crumb', 'crumb'),
cls('CulDeSac', 'culdesac'),
cls('Curses', 'curses'),
cls('DaddysHome', 'daddyshome'),
cls('DanaSummers', 'danasummers'),
cls('DarkSideOfTheHorse', 'darksideofthehorse'),
cls('DayByDave', 'day-by-dave'),
cls('DeepDarkFears', 'deep-dark-fears'),
cls('DeFlocked', 'deflocked'),
cls('DiamondLil', 'diamondlil'),
cls('DickTracy', 'dicktracy'),
cls('DilbertClassics', 'dilbert-classics'),
cls('DilbertEnEspanol', 'dilbert-en-espanol', 'es'),
cls('DinosaurComics', 'dinosaur-comics'),
cls('DogEatDoug', 'dogeatdoug'),
cls('DogsOfCKennel', 'dogsofckennel'),
@ -160,15 +159,14 @@ class GoComics(ParserScraper):
cls('Doonesbury', 'doonesbury'),
cls('Drabble', 'drabble'),
cls('DrewSheneman', 'drewsheneman'),
cls('DumbwichCastle', 'dumbwich-castle'),
cls('EdgeCity', 'edge-city'),
cls('Eek', 'eek'),
cls('ElCafDePoncho', 'el-cafe-de-poncho', 'es'),
cls('EmmyLou', 'emmy-lou'),
cls('Endtown', 'endtown'),
cls('EricAllie', 'eric-allie'),
cls('EverydayPeopleCartoons', 'everyday-people-cartoons'),
cls('Eyebeam', 'eyebeam'),
cls('EyebeamClassic', 'eyebeam-classic'),
cls('FalseKnees', 'false-knees'),
cls('FamilyTree', 'familytree'),
cls('Farcus', 'farcus'),
@ -191,8 +189,8 @@ class GoComics(ParserScraper):
cls('FreeRange', 'freerange'),
cls('FreshlySqueezed', 'freshlysqueezed'),
cls('FrogApplause', 'frogapplause'),
cls('FurBabies', 'furbabies'),
cls('Garfield', 'garfield'),
cls('GarfieldClassics', 'garfield-classics'),
cls('GarfieldEnEspanol', 'garfieldespanol', 'es'),
cls('GaryMarkstein', 'garymarkstein'),
cls('GaryVarvel', 'garyvarvel'),
@ -222,6 +220,7 @@ class GoComics(ParserScraper):
cls('HerbAndJamaal', 'herbandjamaal'),
cls('Herman', 'herman'),
cls('HomeAndAway', 'homeandaway'),
cls('HomeFree', 'homefree'),
cls('HotComicsForCoolPeople', 'hot-comics-for-cool-people'),
cls('HutchOwen', 'hutch-owen'),
cls('ImagineThis', 'imaginethis'),
@ -238,10 +237,12 @@ class GoComics(ParserScraper):
cls('JeffDanziger', 'jeffdanziger'),
cls('JeffStahler', 'jeffstahler'),
cls('JenSorensen', 'jen-sorensen'),
cls('JerryKingComics', 'jerry-king-comics'),
cls('JimBentonCartoons', 'jim-benton-cartoons'),
cls('JimMorin', 'jimmorin'),
cls('JoeHeller', 'joe-heller'),
cls('JoelPett', 'joelpett'),
cls('JoeyWeatherford', 'joey-weatherford'),
cls('JohnDeering', 'johndeering'),
cls('JumpStart', 'jumpstart'),
cls('JunkDrawer', 'junk-drawer'),
@ -287,7 +288,6 @@ class GoComics(ParserScraper):
cls('Lunarbaboon', 'lunarbaboon'),
cls('M2Bulls', 'm2bulls'),
cls('Maintaining', 'maintaining'),
cls('MakingIt', 'making-it'),
cls('MannequinOnTheMoon', 'mannequin-on-the-moon'),
cls('MariasDay', 'marias-day'),
cls('Marmaduke', 'marmaduke'),
@ -299,6 +299,7 @@ class GoComics(ParserScraper):
cls('MessycowComics', 'messy-cow'),
cls('MexikidStories', 'mexikid-stories'),
cls('MichaelRamirez', 'michaelramirez'),
cls('MikeBeckom', 'mike-beckom'),
cls('MikeDuJour', 'mike-du-jour'),
cls('MikeLester', 'mike-lester'),
cls('MikeLuckovich', 'mikeluckovich'),
@ -307,9 +308,9 @@ class GoComics(ParserScraper):
cls('Momma', 'momma'),
cls('Monty', 'monty'),
cls('MontyDiaros', 'monty-diaros', 'es'),
# MotherGooseAndGrimm has a duplicate in ComicsKingdom/MotherGooseAndGrimm
cls('MotleyClassics', 'motley-classics'),
cls('MrLowe', 'mr-lowe'),
cls('MtPleasant', 'mtpleasant'),
cls('MuttAndJeff', 'muttandjeff'),
cls('MyDadIsDracula', 'my-dad-is-dracula'),
cls('MythTickle', 'mythtickle'),
@ -341,10 +342,10 @@ class GoComics(ParserScraper):
cls('OverTheHedge', 'overthehedge'),
cls('OzyAndMillie', 'ozy-and-millie'),
cls('PatOliphant', 'patoliphant'),
cls('PCAndPixel', 'pcandpixel'),
cls('Peanuts', 'peanuts'),
cls('PeanutsBegins', 'peanuts-begins'),
cls('PearlsBeforeSwine', 'pearlsbeforeswine'),
cls('PedroXMolina', 'pedroxmolina'),
cls('Periquita', 'periquita', 'es'),
cls('PerlasParaLosCerdos', 'perlas-para-los-cerdos', 'es'),
cls('PerryBibleFellowship', 'perry-bible-fellowship'),
@ -383,7 +384,6 @@ class GoComics(ParserScraper):
cls('RoseIsRose', 'roseisrose'),
cls('Rubes', 'rubes'),
cls('RudyPark', 'rudypark'),
cls('SaltNPepper', 'salt-n-pepper'),
cls('SarahsScribbles', 'sarahs-scribbles'),
cls('SaturdayMorningBreakfastCereal', 'saturday-morning-breakfast-cereal'),
cls('SavageChickens', 'savage-chickens'),
@ -394,13 +394,11 @@ class GoComics(ParserScraper):
cls('ShermansLagoon', 'shermanslagoon'),
cls('ShirleyAndSonClassics', 'shirley-and-son-classics'),
cls('Shoe', 'shoe'),
cls('SigneWilkinson', 'signewilkinson'),
cls('SketchsharkComics', 'sketchshark-comics'),
cls('SkinHorse', 'skinhorse'),
cls('Skippy', 'skippy'),
cls('SmallPotatoes', 'small-potatoes'),
cls('SnoopyEnEspanol', 'peanuts-espanol', 'es'),
cls('Snowflakes', 'snowflakes'),
cls('SnowSez', 'snow-sez'),
cls('SpeedBump', 'speedbump'),
cls('SpiritOfTheStaircase', 'spirit-of-the-staircase'),
@ -410,9 +408,7 @@ class GoComics(ParserScraper):
cls('SteveKelley', 'stevekelley'),
cls('StickyComics', 'sticky-comics'),
cls('StoneSoup', 'stonesoup'),
cls('StoneSoupClassics', 'stone-soup-classics'),
cls('StrangeBrew', 'strangebrew'),
cls('StuartCarlson', 'stuartcarlson'),
cls('StudioJantze', 'studio-jantze'),
cls('SunnyStreet', 'sunny-street'),
cls('SunshineState', 'sunshine-state'),
@ -425,6 +421,7 @@ class GoComics(ParserScraper):
cls('TarzanEnEspanol', 'tarzan-en-espanol', 'es'),
cls('TedRall', 'ted-rall'),
cls('TenCats', 'ten-cats'),
cls('Tex', 'tex'),
cls('TextsFromMittens', 'texts-from-mittens'),
cls('Thatababy', 'thatababy'),
cls('ThatIsPriceless', 'that-is-priceless'),
@ -451,6 +448,7 @@ class GoComics(ParserScraper):
cls('TheHumbleStumble', 'humble-stumble'),
cls('TheKChronicles', 'thekchronicles'),
cls('TheKnightLife', 'theknightlife'),
cls('TheLockhorns', 'lockhorns'),
cls('TheMartianConfederacy', 'the-martian-confederacy'),
cls('TheMeaningOfLila', 'meaningoflila'),
cls('TheMiddleAge', 'the-middle-age'),
@ -473,6 +471,7 @@ class GoComics(ParserScraper):
cls('TruthFacts', 'truth-facts'),
cls('Tutelandia', 'tutelandia', 'es'),
cls('TwoPartyOpera', 'two-party-opera'),
cls('UFO', 'ufo'),
cls('UnderpantsAndOverbites', 'underpants-and-overbites'),
cls('UnderstandingChaos', 'understanding-chaos'),
cls('UnstrangePhenomena', 'unstrange-phenomena'),
@ -487,6 +486,7 @@ class GoComics(ParserScraper):
cls('ViiviAndWagner', 'viivi-and-wagner'),
cls('WallaceTheBrave', 'wallace-the-brave'),
cls('WaltHandelsman', 'walthandelsman'),
cls('Wannabe', 'wannabe'),
cls('Warped', 'warped'),
cls('WatchYourHead', 'watchyourhead'),
cls('Wawawiwa', 'wawawiwa'),
@ -505,6 +505,7 @@ class GoComics(ParserScraper):
cls('WuMo', 'wumo'),
cls('WumoEnEspanol', 'wumoespanol', 'es'),
cls('Yaffle', 'yaffle'),
cls('YeahItsChill', 'yeah-its-chill'),
cls('YesImHotInThis', 'yesimhotinthis'),
cls('ZackHill', 'zackhill'),
cls('ZenPencils', 'zen-pencils'),

View file

@ -1,6 +1,6 @@
# SPDX-License-Identifier: MIT
# Copyright (C) 2019-2022 Tobias Gruetzmacher
# Copyright (C) 2019-2020 Daniel Ring
# SPDX-FileCopyrightText: © 2019 Tobias Gruetzmacher
# SPDX-FileCopyrightText: © 2019 Daniel Ring
from ..scraper import ParserScraper
@ -44,7 +44,7 @@ class KemonoCafe(ParserScraper):
# Fix unordered filenames
if 'addictivescience' in pageUrl:
page = self.getPage(pageUrl)
num = int(page.xpath('//div[@id="comic-wrap"]/@class')[0].replace('comic-id-', ''))
num = int(self.match(page, '//div[@id="comic-wrap"]/@class')[0].replace('comic-id-', ''))
filename = '%04d_%s' % (num, filename)
elif 'CaughtInOrbit' in filename:
filename = filename.replace('CaughtInOrbit', 'CIO')

View file

@ -5,24 +5,7 @@
# SPDX-FileCopyrightText: © 2019 Daniel Ring
from ..scraper import ParserScraper, _ParserScraper
from ..helpers import bounceStarter, indirectStarter
from .common import ComicControlScraper, WordPressScraper, WordPressNaviIn
class Lackadaisy(ParserScraper):
url = 'https://www.lackadaisy.com/comic.php'
stripUrl = url + '?comicid=%s'
firstStripUrl = stripUrl % '1'
imageSearch = '//div[@id="exhibit"]/img[contains(@src, "comic/")]'
prevSearch = '//div[@class="prev"]/a'
nextSearch = '//div[@class="next"]/a'
help = 'Index format: n'
starter = bounceStarter
def namer(self, imageUrl, pageUrl):
# Use comic id for filename
num = pageUrl.rsplit('=', 1)[-1]
ext = imageUrl.rsplit('.', 1)[-1]
return 'lackadaisy_%s.%s' % (num, ext)
from .common import ComicControlScraper, WordPressScraper
class Lancer(WordPressScraper):
@ -55,7 +38,7 @@ class LazJonesAndTheMayfieldRegulatorsSideStories(LazJonesAndTheMayfieldRegulato
def getPrevUrl(self, url, data):
# Fix broken navigation links
if url == self.url and data.xpath(self.prevSearch + '/@href')[0] == self.stripUrl % 'summer00':
if url == self.url and self.match(data, self.prevSearch + '/@href')[0] == self.stripUrl % 'summer00':
return self.stripUrl % 'summer21'
return super(LazJonesAndTheMayfieldRegulators, self).getPrevUrl(url, data)

View file

@ -4,22 +4,18 @@
# SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher
# SPDX-FileCopyrightText: © 2019 Daniel Ring
import json
from re import compile, escape, IGNORECASE
from re import compile, IGNORECASE
from ..helpers import indirectStarter
from ..scraper import ParserScraper, _BasicScraper, _ParserScraper
from ..util import tagre
from ..xml import NS
from .common import ComicControlScraper, WordPressScraper, WordPressWebcomic
class MacHall(_BasicScraper):
url = 'http://www.machall.com/'
stripUrl = url + 'view.php?date=%s'
firstStripUrl = stripUrl % '2000-11-07'
imageSearch = compile(r'<img src="(comics/.+?)"')
prevSearch = compile(r'<a href="(.+?)"><img[^>]+?src=\'drop_shadow/previous.gif\'>')
help = 'Index format: yyyy-mm-dd'
class MacHall(ComicControlScraper):
url = 'https://www.machall.com/'
stripUrl = url + 'comic/%s'
firstStripUrl = stripUrl % 'moving-in'
class MadamAndEve(_BasicScraper):
@ -58,12 +54,12 @@ class MareInternum(WordPressScraper):
firstStripUrl = stripUrl % 'intro-page-1'
class Marilith(_BasicScraper):
url = 'http://www.marilith.com/'
class Marilith(ParserScraper):
url = 'https://web.archive.org/web/20170619193143/http://www.marilith.com/'
stripUrl = url + 'archive.php?date=%s'
firstStripUrl = stripUrl % '20041215'
imageSearch = compile(r'<img src="(comics/.+?)" border')
prevSearch = compile(r'<a href="(archive\.php\?date=.+?)"><img border=0 name=previous_day')
imageSearch = '//img[contains(@src, "comics/")]'
prevSearch = '//a[img[@name="previous_day"]]'
help = 'Index format: yyyymmdd'
@ -80,22 +76,14 @@ class MarriedToTheSea(_ParserScraper):
return '%s-%s' % (date, filename)
class MarryMe(_ParserScraper):
url = 'http://marryme.keenspot.com/'
stripUrl = url + 'd/%s.html'
class MarryMe(ParserScraper):
stripUrl = 'http://marryme.keenspot.com/d/%s.html'
url = stripUrl % '20191001'
firstStripUrl = stripUrl % '20120730'
imageSearch = '//img[@class="ksc"]'
prevSearch = '//a[@rel="prev"]'
endOfLife = True
class MaxOveracts(_ParserScraper):
url = 'http://occasionalcomics.com/'
stripUrl = url + '%s/'
css = True
imageSearch = '#comic img'
prevSearch = '.nav-previous > a'
help = 'Index format: nnn'
help = 'Index format: yyyymmdd'
class Meek(WordPressScraper):
@ -149,20 +137,22 @@ class MisfileHellHigh(Misfile):
help = 'Index format: yyyy-mm-dd'
class MistyTheMouse(WordPressScraper):
class MistyTheMouse(ParserScraper):
url = 'http://www.mistythemouse.com/'
prevSearch = '//a[@rel="prev"]'
firstStripUrl = 'http://www.mistythemouse.com/?p=12'
imageSearch = '//center/p/img'
prevSearch = '//a[img[contains(@src, "Previous")]]'
firstStripUrl = url + 'The_Live_In.html'
class MonkeyUser(_ParserScraper):
class MonkeyUser(ParserScraper):
url = 'https://www.monkeyuser.com/'
prevSearch = '//div[@title="previous"]/a'
imageSearch = '//div[d:class("content")]/p/img'
prevSearch = '//a[text()="Prev"]'
multipleImagesPerStrip = True
def shouldSkipUrl(self, url, data):
# videos
return data.xpath('//div[d:class("video-container")]', namespaces=NS)
return self.match(data, '//div[d:class("video-container")]')
class MonsieurLeChien(ParserScraper):
@ -195,43 +185,10 @@ class Moonlace(WordPressWebcomic):
return indirectStarter(self)
class Moonsticks(_ParserScraper):
url = "http://moonsticks.org/"
imageSearch = "//div[@class='entry']//img"
prevSearch = u"//a[text()='\u00AB Prev']"
class MrLovenstein(_BasicScraper):
url = 'http://www.mrlovenstein.com/'
stripUrl = url + 'comic/%s'
firstStripUrl = stripUrl % '1'
imageSearch = (
# captures rollover comic
compile(tagre("div", "class", r'comic_image') + r'\s*.*\s*' +
tagre("div", "style", r'display: none;') + r'\s*.*\s' +
tagre("img", "src", r'(/images/comics/[^"]+)')),
# captures standard comic
compile(tagre("img", "src", r'(/images/comics/[^"]+)',
before="comic_main_image")),
)
prevSearch = compile(tagre("a", "href", r'([^"]+)') +
tagre("img", "src", "/images/nav_left.png"))
textSearch = compile(r'<meta name="description" content="(.+?)" />')
help = 'Index Format: n'
class MyCartoons(_BasicScraper):
url = 'http://mycartoons.de/'
rurl = escape(url)
stripUrl = url + 'page/%s'
imageSearch = (
compile(tagre("img", "src", r'(%swp-content/cartoons/(?:[^"]+/)?\d+-\d+-\d+[^"]+)' % rurl)),
compile(tagre("img", "src", r'(%scartoons/[^"]+/\d+-\d+-\d+[^"]+)' % rurl)),
)
prevSearch = compile(tagre("a", "href", r'(%spage/[^"]+)' % rurl) +
"&laquo;")
help = 'Index format: number'
lang = 'de'
class Moonsticks(ParserScraper):
url = "https://moonsticks.org/"
imageSearch = "//div[d:class('entry-content')]//img"
prevSearch = ('//a[@rel="prev"]', "//a[text()='\u00AB Prev']")
class MyLifeWithFel(ParserScraper):

View file

@ -11,6 +11,12 @@ from ..util import tagre
from .common import WordPressScraper, WordPressNavi
class OccasionalComicsDisorder(WordPressScraper):
url = 'https://occasionalcomics.com/'
stripUrl = url + 'comic/%s/'
firstStripUrl = stripUrl % 'latest-comic-2'
class OctopusPie(_ParserScraper):
url = 'http://www.octopuspie.com/'
rurl = escape(url)

View file

@ -604,7 +604,6 @@ class Removed(Scraper):
cls('WotNow'),
# Removed in 3.0
cls('CatenaManor/CatenaCafe'),
cls('ComicFury/AdventuresOftheGreatCaptainMaggieandCrew'),
cls('ComicFury/AWAKENING'),
cls('ComicFury/Beebleville'),
@ -833,8 +832,6 @@ class Removed(Scraper):
cls('ComicsKingdom/Redeye'),
cls('ComicsKingdom/RedeyeSundays'),
cls('CrapIDrewOnMyLunchBreak'),
cls('FalseStart'),
cls('Ginpu'),
cls('GoComics/060'),
cls('GoComics/2CowsAndAChicken'),
cls('GoComics/ABitSketch'),
@ -995,11 +992,9 @@ class Removed(Scraper):
cls('GoComics/Wrobbertcartoons'),
cls('GoComics/Zootopia'),
cls('JustAnotherEscape'),
cls('KemonoCafe/PrincessBunny'),
cls('Laiyu', 'brk'),
cls('MangaDex/DrStone', 'legal'),
cls('MangaDex/HeavensDesignTeam', 'legal'),
cls('MangaDex/ImTheMaxLevelNewbie', 'legal'),
cls('MangaDex/SPYxFAMILY', 'legal'),
cls('Ryugou'),
cls('SeelPeel'),
@ -1573,22 +1568,82 @@ class Removed(Scraper):
cls('SnafuComics/Tin'),
cls('SnafuComics/Titan'),
cls('StudioKhimera/Eorah', 'mov'),
cls('StudioKhimera/Mousechevious'),
cls('StuffNoOneToldMe'),
cls('TaleOfTenThousand'),
cls('TalesAndTactics'),
cls('TheCyantianChronicles/CookieCaper'),
cls('TheCyantianChronicles/Pawprints'),
cls('VampireHunterBoyfriends'),
cls('VGCats/Adventure'),
cls('VGCats/Super'),
cls('VictimsOfTheSystem'),
cls('WebDesignerCOTW'),
cls('WebToons/Adamsville'),
cls('WebToons/CrapIDrewOnMyLunchBreak'),
cls('WintersLight'),
# Removed in 3.1
cls('AbbysAgency', 'brk'),
cls('AcademyVale'),
cls('AhoyEarth', 'block'),
cls('Anaria', 'del'),
cls('Angels2200', 'del'),
cls('BlackRose', 'brk'),
cls('CatenaManor/CatenaCafe'),
cls('ComicsKingdom/AmazingSpiderman'),
cls('ComicsKingdom/AmazingSpidermanSpanish'),
cls('ComicsKingdom/BigBenBoltSundays'),
cls('ComicsKingdom/BonersArkSundays'),
cls('ComicsKingdom/BrianDuffy'),
cls('ComicsKingdom/Crankshaft'),
cls('ComicsKingdom/FlashGordonSundays'),
cls('ComicsKingdom/FunkyWinkerbean'),
cls('ComicsKingdom/FunkyWinkerbeanSunday'),
cls('ComicsKingdom/FunkyWinkerbeanSundays'),
cls('ComicsKingdom/FunkyWinkerbeanVintage'),
cls('ComicsKingdom/HeartOfJulietJonesSundays'),
cls('ComicsKingdom/KatzenjammerKidsSundays'),
cls('ComicsKingdom/Lockhorns'),
cls('ComicsKingdom/MandrakeTheMagicianSundays'),
cls('ComicsKingdom/MarkTrailVintage'),
cls('ComicsKingdom/MikePeters'),
cls('ComicsKingdom/MotherGooseAndGrimm'),
cls('ComicsKingdom/PhantomSundays'),
cls('ComicsKingdom/PrinceValiantSundays'),
cls('ComicsKingdom/Retail'),
cls('ComicsKingdom/TigerSundays'),
cls('ComicsKingdom/TigerVintage'),
cls('ComicsKingdom/TigerVintageSundays'),
cls('Everblue', 'block'),
cls('FalseStart'),
cls('Ginpu'),
cls('GoComics/9ChickweedLaneClassics'),
cls('GoComics/Badlands'),
cls('GoComics/BigNateFirstClass'),
cls('GoComics/BreakOfDay'),
cls('GoComics/Candorville'),
cls('GoComics/DilbertClassics'),
cls('GoComics/DilbertEnEspanol'),
cls('GoComics/DumbwichCastle'),
cls('GoComics/EyebeamClassic'),
cls('GoComics/GarfieldClassics'),
cls('GoComics/MakingIt'),
cls('GoComics/MtPleasant'),
cls('GoComics/PCAndPixel'),
cls('GoComics/SaltNPepper'),
cls('GoComics/SigneWilkinson'),
cls('GoComics/Snowflakes'),
cls('GoComics/StoneSoupClassics'),
cls('GoComics/StuartCarlson'),
cls('KemonoCafe/PrincessBunny'),
cls('Lackadaisy', 'block'),
cls('MangaDex/ImTheMaxLevelNewbie', 'legal'),
cls('MrLovenstein', 'jsh'),
cls('MyCartoons'),
cls('Shivae/BlackRose', 'brk'),
cls('StudioKhimera/Mousechevious'),
cls('TalesAndTactics'),
cls('VampireHunterBoyfriends'),
cls('WebToons/CrystalVirus'),
cls('WebToons/OVERPOWERED'),
cls('WintersLight'),
)
@ -1667,10 +1722,8 @@ class Renamed(Scraper):
# Renamed in 3.0
cls('AHClub', 'RickGriffinStudios/AHClub'),
cls('ComicFury/MuddlemarchMudCompany', 'ComicFury/MudCompany'),
cls('ComicsKingdom/FunkyWinkerbeanSundays', 'ComicsKingdom/FunkyWinkerbeanSunday'),
cls('ComicsKingdom/ShermansLagoon', 'GoComics/ShermansLagoon'),
cls('ComicsKingdom/TheLittleKing', 'ComicsKingdom/LittleKing'),
cls('ComicsKingdom/TigerSundays', 'ComicsKingdom/TigerVintageSundays'),
cls('GoComics/BloomCounty2017', 'GoComics/BloomCounty2019'),
cls('GoComics/Cathy', 'GoComics/CathyClassics'),
cls('GoComics/DarrinBell', 'ComicsKingdom/DarrinBell'),
@ -1681,7 +1734,6 @@ class Renamed(Scraper):
cls('GoComics/Widdershins', 'Widdershins'),
cls('Guardia', 'ComicFury/Guardia'),
cls('RadioactivePanda', 'Tapas/RadioactivePanda'),
cls('Shivae/BlackRose', 'BlackRose'),
cls('SmackJeeves/BlackTapestries', 'ComicFury/BlackTapestries'),
cls('SmackJeeves/ByTheBook', 'ByTheBook'),
cls('SmackJeeves/FurryExperience', 'ComicFury/FurryExperience'),
@ -1694,6 +1746,9 @@ class Renamed(Scraper):
cls('TracesOfThePast/NSFW', 'RickGriffinStudios/TracesOfThePastNSFW'),
# Renamed in 3.1
cls('ComicsKingdom/SlylockFoxAndComicsForKids', 'ComicsKingdom/SlylockFox'),
cls('ComicsKingdom/SlylockFoxAndComicsForKidsSpanish', 'ComicsKingdom/SlylockFoxSpanish'),
cls('Exiern', 'ComicFury/Exiern'),
cls('MaxOveracts', 'OccasionalComicsDisorder'),
cls('SafelyEndangered', 'WebToons/SafelyEndangered'),
)

View file

@ -1,8 +1,8 @@
# SPDX-License-Identifier: MIT
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2022 Tobias Gruetzmacher
# Copyright (C) 2019-2020 Daniel Ring
# SPDX-FileCopyrightText: © 2004 Tristan Seligmann and Jonathan Jacobs
# SPDX-FileCopyrightText: © 2012 Bastian Kleineidam
# SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher
# SPDX-FileCopyrightText: © 2019 Daniel Ring
from re import compile, escape
from ..scraper import _BasicScraper, _ParserScraper, ParserScraper
@ -34,16 +34,11 @@ class ParadigmShift(_BasicScraper):
help = 'Index format: custom'
class ParallelUniversum(_BasicScraper):
url = 'http://www.paralleluniversum.net/'
rurl = escape(url)
class ParallelUniversum(WordPressScraper):
url = 'https://www.paralleluniversum.net/'
stripUrl = url + '%s/'
firstStripUrl = stripUrl % '001-der-comic-ist-tot'
imageSearch = compile(tagre("img", "src",
r'(%scomics/\d+-\d+-\d+[^"]+)' % rurl))
prevSearch = compile(tagre("a", "href", r'(%s[^"]+/)' % rurl) +
tagre("span", "class", "prev"))
help = 'Index format: number-stripname'
prevSearch = '//a[@rel="prev"]'
lang = 'de'
@ -95,14 +90,12 @@ class PebbleVersion(_ParserScraper):
help = 'Index format: n (unpadded)'
class PennyAndAggie(_BasicScraper):
url = 'http://pennyandaggie.com/'
rurl = escape(url)
stripUrl = url + 'index.php?p=%s'
imageSearch = compile(tagre("img", "src", r'(http://www\.pennyandaggie\.com/comics/[^"]+)'))
prevSearch = compile(tagre("a", "href", r"(index\.php\?p\=\d+)", quote="'") +
tagre("img", "src", r'%simages/previous_day\.gif' % rurl, quote=""))
help = 'Index format: n (unpadded)'
class PennyAndAggie(ComicControlScraper):
url = 'https://pixietrixcomix.com/penny-and-aggie'
stripUrl = url + '/%s'
firstStripUrl = stripUrl % '2004-09-06'
endOfLife = True
help = 'Index format: yyyy-mm-dd'
class PennyArcade(_ParserScraper):
@ -117,19 +110,17 @@ class PennyArcade(_ParserScraper):
help = 'Index format: yyyy/mm/dd'
class PeppermintSaga(WordPressNavi):
class PeppermintSaga(WordPressScraper):
url = 'http://www.pepsaga.com/'
stripUrl = url + '?p=%s'
firstStripUrl = stripUrl % '3'
help = 'Index format: number'
stripUrl = url + 'comics/%s/'
firstStripUrl = stripUrl % 'the-sword-of-truth-vol1'
adult = True
class PeppermintSagaBGR(WordPressNavi):
class PeppermintSagaBGR(WordPressScraper):
url = 'http://bgr.pepsaga.com/'
stripUrl = url + '?p=%s'
firstStripUrl = stripUrl % '4'
help = 'Index format: number'
stripUrl = url + '?comic=%s'
firstStripUrl = stripUrl % '04172011'
adult = True
@ -150,14 +141,16 @@ class PeterAndWhitney(_ParserScraper):
prevSearch = '//a[./img[contains(@src, "nav_previous")]]'
class PHDComics(_ParserScraper):
class PHDComics(ParserScraper):
BROKEN_COMMENT_END = compile(r'--!>')
baseUrl = 'http://phdcomics.com/'
url = baseUrl + 'comics.php'
stripUrl = baseUrl + 'comics/archive.php?comicid=%s'
firstStripUrl = stripUrl % '1'
imageSearch = '//img[@id="comic2"]'
imageSearch = ('//img[@id="comic2"]',
r'//img[d:class("img-responsive") and re:test(@name, "comic\d+")]')
multipleImagesPerStrip = True
prevSearch = '//a[img[contains(@src, "prev_button")]]'
nextSearch = '//a[img[contains(@src, "next_button")]]'
help = 'Index format: n (unpadded)'
@ -173,7 +166,7 @@ class PHDComics(_ParserScraper):
# video
self.stripUrl % '1880',
self.stripUrl % '1669',
)
) or self.match(data, '//img[@id="comic" and contains(@src, "phd083123s")]')
class Picklewhistle(ComicControlScraper):
@ -333,11 +326,12 @@ class PS238(_ParserScraper):
class PvPOnline(ParserScraper):
baseUrl = 'https://www.toonhoundstudios.com/'
url = baseUrl + 'pvp/'
stripUrl = baseUrl + 'comic/%s/'
stripUrl = baseUrl + 'comic/%s/?sid=372'
url = stripUrl % 'pvp-2022-09-16'
firstStripUrl = stripUrl % '19980504'
imageSearch = '//div[@id="spliced-comic"]//img/@data-src-img'
prevSearch = '//a[d:class("prev")]'
endOfLife = True
def namer(self, imageUrl, pageUrl):
return 'pvp' + imageUrl.rsplit('/', 1)[-1]
def namer(self, image_url, page_url):
return 'pvp' + image_url.rsplit('/', 1)[-1]

View file

@ -1,8 +1,8 @@
# SPDX-License-Identifier: MIT
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2021 Tobias Gruetzmacher
# Copyright (C) 2019-2020 Daniel Ring
# SPDX-FileCopyrightText: © 2004 Tristan Seligmann and Jonathan Jacobs
# SPDX-FileCopyrightText: © 2012 Bastian Kleineidam
# SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher
# SPDX-FileCopyrightText: © 2019 Daniel Ring
from re import compile
from urllib.parse import urljoin
@ -121,7 +121,7 @@ class Requiem(WordPressScraper):
firstStripUrl = stripUrl % '2004-06-07-3'
class Replay(_ParserScraper):
class Replay(ParserScraper):
url = 'http://replaycomic.com/'
stripUrl = url + 'comic/%s/'
firstStripUrl = stripUrl % 'red-desert'
@ -132,11 +132,11 @@ class Replay(_ParserScraper):
def starter(self):
# Retrieve archive page to identify chapters
archivePage = self.getPage(self.url + 'archive')
archive = archivePage.xpath('//div[@class="comic-archive-chapter-wrap"]')
archive = self.match(archivePage, '//div[d:class("comic-archive-chapter-wrap")]')
self.chapter = len(archive) - 1
self.startOfChapter = []
for archiveChapter in archive:
self.startOfChapter.append(archiveChapter.xpath('.//a')[0].get('href'))
self.startOfChapter.append(self.match(archiveChapter, './/a')[0].get('href'))
return bounceStarter(self)
def namer(self, imageUrl, pageUrl):

View file

@ -196,7 +196,7 @@ class Sharksplode(WordPressScraper):
class Sheldon(ParserScraper):
url = 'https://www.sheldoncomics.com/'
firstStripUrl = url + 'comic/well-who-is-this/'
imageSearch = '//div[@id="comic"]//img'
imageSearch = '//div[@id="comic"]//img/@data-src-img'
prevSearch = '//a[img[d:class("left")]]'
@ -435,7 +435,7 @@ class SpaceFurries(ParserScraper):
def extract_image_urls(self, url, data):
# Website requires JS, so build the list of image URLs manually
imageurls = []
current = int(data.xpath('//input[@name="pagnum"]')[0].get('value'))
current = int(self.match(data, '//input[@name="pagnum"]')[0].get('value'))
for page in reversed(range(1, current + 1)):
imageurls.append(self.url + 'comics/' + str(page) + '.jpg')
return imageurls
@ -636,16 +636,16 @@ class StrongFemaleProtagonist(_ParserScraper):
)
class StupidFox(_ParserScraper):
class StupidFox(ParserScraper):
url = 'http://stupidfox.net/'
stripUrl = url + '%s'
firstStripUrl = stripUrl % 'hello'
imageSearch = '//div[@class="comicmid"]//img'
imageSearch = '//div[d:class("comicmid")]//img'
prevSearch = '//a[@accesskey="p"]'
def namer(self, imageUrl, pageUrl):
page = self.getPage(pageUrl)
title = page.xpath(self.imageSearch + '/@title')[0].replace(' - ', '-').replace(' ', '-')
title = self.match(page, self.imageSearch + '/@title')[0].replace(' - ', '-').replace(' ', '-')
return title + '.' + imageUrl.rsplit('.', 1)[-1]

View file

@ -1,6 +1,6 @@
# SPDX-License-Identifier: MIT
# Copyright (C) 2019-2022 Tobias Gruetzmacher
# Copyright (C) 2019-2021 Daniel Ring
# SPDX-FileCopyrightText: © 2019 Tobias Gruetzmacher
# SPDX-FileCopyrightText: © 2019 Daniel Ring
from .common import WordPressSpliced
@ -12,22 +12,20 @@ class _CommonMulti(WordPressSpliced):
self.endOfLife = eol
class AbbysAgency(WordPressSpliced):
url = 'https://abbysagency.us/'
stripUrl = url + 'blog/comic/%s/'
firstStripUrl = stripUrl % 'a'
class AlienDice(WordPressSpliced):
url = 'https://aliendice.com/'
stripUrl = url + 'comic/%s/'
firstStripUrl = stripUrl % '05162001'
def shouldSkipUrl(self, url, data):
"""Skip pages without images."""
return not self.match(data, self.imageSearch)
def getPrevUrl(self, url, data):
# Fix broken navigation
if url == self.stripUrl % 'day-29-part-2-page-3-4':
return self.stripUrl % 'day-29-part-2-page-3-2'
return super(AlienDice, self).getPrevUrl(url, data)
return super().getPrevUrl(url, data)
def namer(self, imageUrl, pageUrl):
# Fix inconsistent filename
@ -47,12 +45,6 @@ class AlienDiceLegacy(WordPressSpliced):
return super().isfirststrip(url.rsplit('?', 1)[0])
class BlackRose(WordPressSpliced):
url = 'https://www.blackrose.monster/'
stripUrl = url + 'comic/%s/'
firstStripUrl = stripUrl % '2004-11-01'
class TheCyantianChronicles(_CommonMulti):
baseUrl = 'https://cyantian.net/'
@ -81,9 +73,9 @@ class TheCyantianChronicles(_CommonMulti):
class Shivae(WordPressSpliced):
url = 'https://shivae.com/'
url = 'https://shivae.net/'
stripUrl = url + 'comic/%s/'
firstStripUrl = stripUrl % '09202001'
firstStripUrl = stripUrl % '2002-02-27'
class ShivaeComics(_CommonMulti):

View file

@ -4,10 +4,7 @@
# SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher
# SPDX-FileCopyrightText: © 2019 Daniel Ring
from re import compile, escape, MULTILINE
try:
from functools import cached_property
except ImportError:
from cached_property import cached_property
from functools import cached_property
from ..scraper import _BasicScraper, _ParserScraper, ParserScraper
from ..helpers import indirectStarter, joinPathPartsNamer
@ -275,7 +272,7 @@ class ToonHole(ParserScraper):
prevSearch = '//a[@rel="prev"]'
latestSearch = '//a[@rel="bookmark"]'
starter = indirectStarter
namer = joinPathPartsNamer((), (-3, -2, -1))
namer = joinPathPartsNamer(imageparts=(-3, -2, -1))
class TrippingOverYou(_BasicScraper):

View file

@ -3,7 +3,6 @@
# SPDX-FileCopyrightText: © 2019 Daniel Ring
from ..output import out
from ..scraper import ParserScraper
from ..xml import NS
class Tapas(ParserScraper):
@ -21,7 +20,7 @@ class Tapas(ParserScraper):
def starter(self):
# Retrieve comic metadata from info page
info = self.getPage(self.url)
series = info.xpath('//@data-series-id')[0]
series = self.match(info, '//@data-series-id')[0]
# Retrieve comic metadata from API
data = self.session.get(self.baseUrl + 'series/' + series + '/episodes?sort=NEWEST')
data.raise_for_status()
@ -43,7 +42,7 @@ class Tapas(ParserScraper):
return self._cached_image_urls
def shouldSkipUrl(self, url, data):
if data.xpath('//button[d:class("js-have-to-sign")]', namespaces=NS):
if self.match(data, '//button[d:class("js-have-to-sign")]'):
out.warn(f'Nothing to download on "{url}", because a login is required.')
return True
return False

View file

@ -107,7 +107,7 @@ class Unsounded(ParserScraper):
return urls
def extract_css_bg(self, page) -> str | None:
comicdivs = page.xpath('//div[@id="comic"]')
comicdivs = self.match(page, '//div[@id="comic"]')
if comicdivs:
style = comicdivs[0].attrib.get('style')
if style:

View file

@ -1,8 +1,8 @@
# SPDX-License-Identifier: MIT
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2020 Tobias Gruetzmacher
# Copyright (C) 2019-2020 Daniel Ring
# SPDX-FileCopyrightText: © 2004 Tristan Seligmann and Jonathan Jacobs
# SPDX-FileCopyrightText: © 2012 Bastian Kleineidam
# SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher
# SPDX-FileCopyrightText: © 2019 Daniel Ring
from ..scraper import ParserScraper, _ParserScraper
from ..helpers import bounceStarter, indirectStarter
@ -27,7 +27,7 @@ class VGCats(_ParserScraper):
url = 'https://www.vgcats.com/comics/'
stripUrl = url + '?strip_id=%s'
firstStripUrl = stripUrl % '0'
imageSearch = '//td/img[contains(@src, "images/")]'
imageSearch = '//td/font/img[contains(@src, "images/")]'
prevSearch = '//a[img[contains(@src, "back.")]]'
help = 'Index format: n (unpadded)'
@ -44,15 +44,15 @@ class Vibe(ParserScraper):
help = 'Index format: VIBEnnn (padded)'
class VickiFox(_ParserScraper):
class VickiFox(ParserScraper):
url = 'http://www.vickifox.com/comic/strip'
stripUrl = url + '?id=%s'
firstStripUrl = stripUrl % '001'
imageSearch = '//img[contains(@src, "comic/")]'
prevSearch = '//button[@id="btnPrev"]/@value'
def getPrevUrl(self, url, data):
return self.stripUrl % self.getPage(url).xpath(self.prevSearch)[0]
def link_modifier(self, fromurl, tourl):
return self.stripUrl % tourl
class ViiviJaWagner(_ParserScraper):

View file

@ -1,8 +1,8 @@
# SPDX-License-Identifier: MIT
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2022 Tobias Gruetzmacher
# Copyright (C) 2019-2020 Daniel Ring
# SPDX-FileCopyrightText: © 2004 Tristan Seligmann and Jonathan Jacobs
# SPDX-FileCopyrightText: © 2012 Bastian Kleineidam
# SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher
# SPDX-FileCopyrightText: © 2019 Daniel Ring
from re import compile, escape, IGNORECASE
from ..scraper import ParserScraper, _BasicScraper, _ParserScraper
@ -17,7 +17,7 @@ class WapsiSquare(WordPressNaviIn):
def shouldSkipUrl(self, url, data):
"""Skip pages without images."""
return data.xpath('//iframe') # videos
return self.match(data, '//iframe') # videos
class WastedTalent(_ParserScraper):

View file

@ -24,9 +24,9 @@ class WebToons(ParserScraper):
self.session.cookies.set(cookie, 'false', domain='webtoons.com')
# Find current episode number
listPage = self.getPage(self.listUrl)
currentEpisode = listPage.xpath('//div[@class="detail_lst"]/ul/li')[0].attrib['data-episode-no']
currentEpisode = self.match(listPage, '//div[d:class("detail_lst")]/ul/li')[0].attrib['data-episode-no']
# Check for completed tag
self.endOfLife = (listPage.xpath('//div[@id="_asideDetail"]//span[@class="txt_ico_completed2"]') != [])
self.endOfLife = not self.match(listPage, '//div[@id="_asideDetail"]//span[d:class("txt_ico_completed2")]')
return self.stripUrl % currentEpisode
def extract_image_urls(self, url, data):
@ -52,6 +52,7 @@ class WebToons(ParserScraper):
cls('1111Animals', 'comedy/1111-animals', 437),
cls('2015SpaceSeries', 'sf/2015-space-series', 391),
cls('3SecondStrip', 'comedy/3-second-strip', 380),
cls('99ReinforcedStick', 'comedy/99-reinforced-wooden-stick', 4286),
cls('ABittersweetLife', 'slice-of-life/a-bittersweet-life', 294),
cls('AboutDeath', 'drama/about-death', 82),
cls('ABudgiesLife', 'slice-of-life/its-a-budgies-life', 985),
@ -64,6 +65,7 @@ class WebToons(ParserScraper):
cls('AGoodDayToBeADog', 'romance/a-good-day-tobe-a-dog', 1390),
cls('Aisopos', 'drama/aisopos', 76),
cls('AliceElise', 'fantasy/alice-elise', 1481),
cls('AlloyComics', 'canvas/alloy-comics', 747447),
cls('AllThatWeHopeToBe', 'slice-of-life/all-that-we-hope-to-be', 470),
cls('AllThatYouAre', 'drama/all-that-you-are', 403),
cls('AlwaysHuman', 'romance/always-human', 557),
@ -128,6 +130,7 @@ class WebToons(ParserScraper):
cls('CursedPrincessClub', 'comedy/cursed-princess-club', 1537),
cls('Cyberbunk', 'sf/cyberbunk', 466),
cls('Cyberforce', 'super-hero/cyberforce', 531),
cls('CydoniaShattering', 'fantasy/cydonia-shattering', 2881),
cls('CykoKO', 'super-hero/cyko-ko', 560),
cls('Darbi', 'action/darbi', 1098),
cls('Darchon', 'challenge/darchon', 532053),
@ -153,6 +156,8 @@ class WebToons(ParserScraper):
cls('DrawnToYou', 'challenge/drawn-to-you', 172022),
cls('DrFrost', 'drama/dr-frost', 371),
cls('DuelIdentity', 'challenge/duel-identity', 532064),
cls('DungeonCleaningLife', 'action/the-dungeon-cleaning-life-of-a-once-genius-hunter', 4677),
cls('DungeonsAndDoodlesTalesFromTheTables', 'canvas/dungeons-doodles-tales-from-the-tables', 682646),
cls('DungeonMinis', 'challenge/dungeonminis', 64132),
cls('Dustinteractive', 'comedy/dustinteractive', 907),
cls('DutyAfterSchool', 'sf/duty-after-school', 370),
@ -170,6 +175,7 @@ class WebToons(ParserScraper):
cls('FAMILYMAN', 'drama/family-man', 85),
cls('FantasySketchTheGame', 'sf/fantasy-sketch', 1020),
cls('Faust', 'supernatural/faust', 522),
cls('FinalRaidBoss', 'fantasy/the-final-raid-boss', 3921),
cls('FINALITY', 'mystery/finality', 1457),
cls('Firebrand', 'supernatural/firebrand', 877),
cls('FirstDefense', 'challenge/first-defense', 532072),
@ -204,11 +210,13 @@ class WebToons(ParserScraper):
cls('HeliosFemina', 'fantasy/helios-femina', 638),
cls('HelloWorld', 'slice-of-life/hello-world', 827),
cls('Hellper', 'fantasy/hellper', 185),
cls('Hench', 'canvas/hench/', 857225),
cls('HeroineChic', 'super-hero/heroine-chic', 561),
cls('HIVE', 'thriller/hive', 65),
cls('Hooky', 'fantasy/hooky', 425),
cls('HoovesOfDeath', 'fantasy/hooves-of-death', 1535),
cls('HouseOfStars', 'fantasy/house-of-stars', 1620),
cls('HowToBeAMindReaver', 'canvas/how-to-be-a-mind-reaver', 301213),
cls('HowToBecomeADragon', 'fantasy/how-to-become-a-dragon', 1973),
cls('HowToLove', 'slice-of-life/how-to-love', 472),
cls('IDontWantThisKindOfHero', 'super-hero/i-dont-want-this-kind-of-hero', 98),
@ -235,6 +243,7 @@ class WebToons(ParserScraper):
cls('KindOfLove', 'slice-of-life/kind-of-love', 1850),
cls('KissItGoodbye', 'challenge/kiss-it-goodbye', 443703),
cls('KnightRun', 'sf/knight-run', 67),
cls('KnightUnderMyHeart', 'action/knight-under-my-heart', 4215),
cls('Kubera', 'fantasy/kubera', 83),
cls('LalinsCurse', 'supernatural/lalins-curse', 1601),
cls('Lars', 'slice-of-life/lars', 358),
@ -261,6 +270,7 @@ class WebToons(ParserScraper):
cls('LUMINE', 'fantasy/lumine', 1022),
cls('Lunarbaboon', 'slice-of-life/lunarbaboon', 523),
cls('MageAndDemonQueen', 'comedy/mage-and-demon-queen', 1438),
cls('MageAndMimic', 'comedy/mage-and-mimic', 5973),
cls('Magical12thGraders', 'super-hero/magical-12th-graders', 90),
cls('Magician', 'fantasy/magician', 70),
cls('MagicSodaPop', 'fantasy/magic-soda-pop', 1947),
@ -292,6 +302,8 @@ class WebToons(ParserScraper):
cls('MyGiantNerdBoyfriend', 'slice-of-life/my-giant-nerd-boyfriend', 958),
cls('MyKittyAndOldDog', 'slice-of-life/my-kitty-and-old-dog', 184),
cls('MyNameIsBenny', 'slice-of-life/my-name-is-benny', 1279),
cls('MySClassHunter', 'action/my-s-class-hunters', 3963),
cls('MythicItemObtained', 'fantasy/mythic-item-obtained', 4582),
cls('MyWallflowerKiss', 'challenge/my-wallflower-kiss', 151869),
cls('NanoList', 'sf/nano-list', 700),
cls('NationalDogDay2016', 'slice-of-life/national-dog-day', 747),
@ -439,6 +451,7 @@ class WebToons(ParserScraper):
cls('UpAndOut', 'slice-of-life/up-and-out', 488),
cls('UrbanAnimal', 'super-hero/urban-animal', 1483),
cls('Uriah', 'horror/uriah', 1607),
cls('VampireFamily', 'comedy/vampire-family', 6402),
cls('VarsityNoir', 'mystery/varsity-noir', 1613),
cls('VersionDayAndNight', 'drama/version-day-and-night', 1796),
cls('WafflesAndPancakes', 'slice-of-life/waffles-and-pancakes', 1310),

View file

@ -1,6 +1,6 @@
# SPDX-License-Identifier: MIT
# Copyright (C) 2019-2022 Tobias Gruetzmacher
# Copyright (C) 2019-2022 Daniel Ring
# SPDX-FileCopyrightText: © 2019 Tobias Gruetzmacher
# SPDX-FileCopyrightText: © 2019 Daniel Ring
from ..scraper import ParserScraper
from ..helpers import indirectStarter
@ -15,21 +15,21 @@ class Wrongside(ParserScraper):
def starter(self):
archivePage = self.getPage(self.url)
chapterUrls = archivePage.xpath('//ul[@class="albThumbs"]//a/@href')
chapterUrls = self.match(archivePage, '//ul[d:class("albThumbs")]//a/@href')
self.archive = []
for chapterUrl in chapterUrls:
chapterPage = self.getPage(chapterUrl)
self.archive.append(chapterPage.xpath('(//ul[@id="thumbnails"]//a/@href)[last()]')[0])
self.archive.append(self.match(chapterPage, '(//ul[@id="thumbnails"]//a/@href)[last()]')[0])
return self.archive[0]
def getPrevUrl(self, url, data):
if data.xpath(self.prevSearch) == [] and len(self.archive) > 0:
if self.match(data, self.prevSearch) == [] and len(self.archive) > 0:
return self.archive.pop()
return super(Wrongside, self).getPrevUrl(url, data)
def namer(self, imageUrl, pageUrl):
page = self.getPage(pageUrl)
title = page.xpath('//div[@class="browsePath"]/h2/text()')[0]
title = self.match(page, '//div[d:class("browsePath")]/h2/text()')[0]
return title.replace('"', '') + '.' + imageUrl.rsplit('.', 1)[-1]
@ -71,5 +71,5 @@ class WrongsideSideStories(ParserScraper):
def namer(self, imageUrl, pageUrl):
page = self.getPage(pageUrl)
title = page.xpath('//div[@class="browsePath"]/h2/text()')[0]
title = self.match(page, '//div[d:class("browsePath")]/h2/text()')[0]
return title.replace('"', '') + '.' + imageUrl.rsplit('.', 1)[-1]

View file

@ -23,7 +23,7 @@ class Zapiro(ParserScraper):
imageSearch = '//div[@id="cartoon"]/img'
prevSearch = '//a[d:class("left")]'
nextSearch = '//a[d:class("right")]'
namer = joinPathPartsNamer((-1,), ())
namer = joinPathPartsNamer(pageparts=(-1,))
class ZenPencils(WordPressNavi):
@ -60,7 +60,7 @@ class Zwarwald(BasicScraper):
tagre("img", "src",
r'http://zwarwald\.de/images/prev\.jpg',
quote="'"))
namer = joinPathPartsNamer((), (-3, -2, -1))
namer = joinPathPartsNamer(imageparts=(-3, -2, -1))
help = 'Index format: number'
def shouldSkipUrl(self, url, data):

View file

@ -119,45 +119,45 @@ class Scraper:
if val:
self._indexes = tuple(sorted(val))
def __init__(self, name):
def __init__(self, name: str) -> None:
"""Initialize internal variables."""
self.name = name
self.urls = set()
self.urls: set[str] = set()
self._indexes = ()
self.skippedUrls = set()
self.skippedUrls: set[str] = set()
self.hitFirstStripUrl = False
def __hash__(self):
def __hash__(self) -> int:
"""Get hash value from name and index list."""
return hash((self.name, self.indexes))
def shouldSkipUrl(self, url, data):
def shouldSkipUrl(self, url: str, data) -> bool:
"""Determine if search for images in given URL should be skipped."""
return False
def getComicStrip(self, url, data):
def getComicStrip(self, url, data) -> ComicStrip:
"""Get comic strip downloader for given URL and data."""
imageUrls = self.extract_image_urls(url, data)
urls = self.extract_image_urls(url, data)
# map modifier function on image URLs
imageUrls = [self.imageUrlModifier(x, data) for x in imageUrls]
urls = [self.imageUrlModifier(x, data) for x in urls]
# remove duplicate URLs
imageUrls = uniq(imageUrls)
if len(imageUrls) > 1 and not self.multipleImagesPerStrip:
urls = uniq(urls)
if len(urls) > 1 and not self.multipleImagesPerStrip:
out.warn(
u"Found %d images instead of 1 at %s with expressions %s" %
(len(imageUrls), url, prettyMatcherList(self.imageSearch)))
image = imageUrls[0]
out.warn(u"Choosing image %s" % image)
imageUrls = (image,)
elif not imageUrls:
out.warn(u"Found no images at %s with expressions %s" % (url,
(len(urls), url, prettyMatcherList(self.imageSearch)))
image = urls[0]
out.warn("Choosing image %s" % image)
urls = (image,)
elif not urls:
out.warn("Found no images at %s with expressions %s" % (url,
prettyMatcherList(self.imageSearch)))
if self.textSearch:
text = self.fetchText(url, data, self.textSearch,
optional=self.textOptional)
else:
text = None
return ComicStrip(self, url, imageUrls, text=text)
return ComicStrip(self, url, urls, text=text)
def getStrips(self, maxstrips=None):
"""Get comic strips."""
@ -217,7 +217,7 @@ class Scraper:
break
url = prevUrl
def isfirststrip(self, url):
def isfirststrip(self, url: str) -> bool:
"""Check if the specified URL is the first strip of a comic. This is
specially for comics taken from archive.org, since the base URL of
archive.org changes whenever pages are taken from a different
@ -228,7 +228,7 @@ class Scraper:
currenturl = ARCHIVE_ORG_URL.sub('', url)
return firsturl == currenturl
def getPrevUrl(self, url, data):
def getPrevUrl(self, url: str, data) -> str | None:
"""Find previous URL."""
prevUrl = None
if self.prevSearch:
@ -243,40 +243,40 @@ class Scraper:
getHandler().comicPageLink(self, url, prevUrl)
return prevUrl
def getIndexStripUrl(self, index):
def getIndexStripUrl(self, index: str) -> str:
"""Get comic strip URL from index."""
return self.stripUrl % index
def starter(self):
def starter(self) -> str:
"""Get starter URL from where to scrape comic strips."""
return self.url
def namer(self, image_url, page_url):
def namer(self, image_url: str, page_url: str) -> str | None:
"""Return filename for given image and page URL."""
return
def link_modifier(self, fromurl, tourl):
def link_modifier(self, fromurl: str, tourl: str) -> str:
"""Optional modification of parsed link (previous/back/latest) URLs.
Useful if there are domain redirects. The default implementation does
not modify the URL.
"""
return tourl
def imageUrlModifier(self, image_url, data):
def imageUrlModifier(self, image_url: str, data) -> str:
"""Optional modification of parsed image URLs. Useful if the URL
needs to be fixed before usage. The default implementation does
not modify the URL. The given data is the URL page data.
"""
return image_url
def vote(self):
def vote(self) -> None:
"""Cast a public vote for this comic."""
uid = get_system_uid()
data = {"name": self.name.replace('/', '_'), "uid": uid}
response = self.session.post(configuration.VoteUrl, data=data)
response.raise_for_status()
def get_download_dir(self, basepath):
def get_download_dir(self, basepath: str) -> str:
"""Try to find the correct download directory, ignoring case
differences."""
path = basepath
@ -294,16 +294,16 @@ class Scraper:
path = os.path.join(path, part)
return path
def getCompleteFile(self, basepath):
def getCompleteFile(self, basepath: str) -> str:
"""Get filename indicating all comics are downloaded."""
dirname = self.get_download_dir(basepath)
return os.path.join(dirname, "complete.txt")
def isComplete(self, basepath):
def isComplete(self, basepath: str) -> bool:
"""Check if all comics are downloaded."""
return os.path.isfile(self.getCompleteFile(basepath))
def setComplete(self, basepath):
def setComplete(self, basepath: str) -> None:
"""Set complete flag for this comic, ie. all comics are downloaded."""
if self.endOfLife:
filename = self.getCompleteFile(basepath)
@ -521,15 +521,10 @@ class ParserScraper(Scraper):
return text.strip()
def _matchPattern(self, data, patterns):
if self.css:
searchFun = data.cssselect
else:
def searchFun(s):
return data.xpath(s, namespaces=NS)
patterns = makeSequence(patterns)
for search in patterns:
matched = False
for match in searchFun(search):
for match in self.match(data, search):
matched = True
yield match, search
@ -537,6 +532,13 @@ class ParserScraper(Scraper):
# do not search other links if one pattern matched
break
def match(self, data, pattern):
"""Match a pattern (XPath/CSS) against a page."""
if self.css:
return data.cssselect(pattern)
else:
return data.xpath(pattern, namespaces=NS)
def getDisabledReasons(self):
res = {}
if self.css and cssselect is None:

View file

@ -17,7 +17,6 @@ classifiers = [
"Programming Language :: Python",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3 :: Only",
"Programming Language :: Python :: 3.7",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
@ -27,15 +26,13 @@ classifiers = [
"Topic :: Multimedia :: Graphics",
]
keywords = ["comic", "webcomic", "downloader", "archiver", "crawler"]
requires-python = ">=3.7"
requires-python = ">=3.8"
dependencies = [
"colorama",
"imagesize",
"lxml>=4.0.0",
"platformdirs",
"requests>=2.0",
"cached_property;python_version<'3.8'",
"importlib_metadata;python_version<'3.8'",
"importlib_resources>=5.0.0;python_version<'3.9'",
]
dynamic = ["version"]
@ -101,7 +98,7 @@ ignore = [
]
noqa-require-code = true
no-accept-encodings = true
min-version = "3.7"
min-version = "3.8"
extend-exclude = [
'.venv',
'build',

View file

@ -1,7 +1,7 @@
#!/usr/bin/env python3
# SPDX-License-Identifier: MIT
# Copyright (C) 2019-2022 Tobias Gruetzmacher
# Copyright (C) 2019 Thomas W. Littauer
# SPDX-FileCopyrightText: © 2019 Tobias Gruetzmacher
# SPDX-FileCopyrightText: © 2019 Thomas W. Littauer
"""
Script to get a list of comicskingdom.com comics and save the info in a JSON
file for further processing.
@ -19,39 +19,17 @@ class ComicsKingdomUpdater(ComicListUpdater):
"ComicGenesis/%s",
)
def handle_startpage(self, page):
"""Parse list of comics from the bottom of the start page."""
for li in page.xpath('//div[d:class("comics-list")]//li', namespaces=NS):
link = li.xpath('./a')[0]
def handle_listing(self, page):
for link in page.xpath('//ul[d:class("index")]//a', namespaces=NS):
name = link.text_content().removeprefix('The ')
url = link.attrib['href']
name = link.text.removeprefix('The ')
lang = 'es' if ' (Spanish)' in name else None
self.add_comic(name, (url, None))
def handle_listing(self, page, lang: str = None, add: str = ''):
hasnew = True
while hasnew:
hasnew = False
for comicdiv in page.xpath('//div[d:class("tile")]', namespaces=NS):
nametag = comicdiv.xpath('./a/comic-name')
if len(nametag) == 0:
continue
name = nametag[0].text.removeprefix('The ') + add
url = comicdiv.xpath('./a')[0].attrib['href']
if self.add_comic(name, (url, lang)):
hasnew = True
nextlink = page.xpath('//a[./img[contains(@src, "page-right")]]')
page = self.get_url(nextlink[0].attrib['href'])
self.add_comic(name, (url, lang))
def collect_results(self):
"""Parse all search result pages."""
page = self.get_url('https://www.comicskingdom.com/')
self.handle_startpage(page)
self.handle_listing(page)
self.handle_listing(self.get_url('https://www.comicskingdom.com/spanish'), 'es', 'Spanish')
self.handle_listing(self.get_url('https://comicskingdom.com/features'))
def get_entry(self, name: str, data: tuple[str, str]):
opt = f", lang='{data[1]}'" if data[1] else ''

View file

@ -1,28 +1,30 @@
# SPDX-License-Identifier: MIT
# Copyright (C) 2017-2020 Tobias Gruetzmacher
# SPDX-FileCopyrightText: © 2017 Tobias Gruetzmacher
import re
from importlib import metadata
# Idea from
# https://github.com/pyinstaller/pyinstaller/wiki/Recipe-Setuptools-Entry-Point,
# but with importlib
def Entrypoint(group, name, **kwargs):
import re
try:
from importlib.metadata import entry_points
except ImportError:
from importlib_metadata import entry_points
def entrypoint(group, name, **kwargs):
# get the entry point
eps = entry_points()[group]
ep = next(ep for ep in eps if ep.name == name)
module, attr = re.split(r'\s*:\s*', ep.value, 1)
eps = metadata.entry_points()
if 'select' in dir(eps):
# modern
ep = eps.select(group=group)[name]
else:
# legacy (pre-3.10)
ep = next(ep for ep in eps[group] if ep.name == name)
module, attr = re.split(r'\s*:\s*', ep.value, maxsplit=1)
# script name must not be a valid module name to avoid name clashes on import
script_path = os.path.join(workpath, name + '-script.py')
print("creating script for entry point", group, name)
with open(script_path, 'w') as fh:
with open(script_path, mode='w', encoding='utf-8') as fh:
print("import sys", file=fh)
print("import", module, file=fh)
print("sys.exit(%s.%s())" % (module, attr), file=fh)
print(f"sys.exit({module}.{attr}())", file=fh)
return Analysis(
[script_path] + kwargs.get('scripts', []),
@ -30,7 +32,7 @@ def Entrypoint(group, name, **kwargs):
)
a = Entrypoint('console_scripts', 'dosage')
a = entrypoint('console_scripts', 'dosage')
a.binaries = [x for x in a.binaries if not x[1].lower().startswith(r'c:\windows')]

View file

@ -1,8 +1,8 @@
#!/usr/bin/env python3
# SPDX-License-Identifier: MIT
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2022 Tobias Gruetzmacher
# SPDX-FileCopyrightText: © 2004 Tristan Seligmann and Jonathan Jacobs
# SPDX-FileCopyrightText: © 2012 Bastian Kleineidam
# SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher
"""
Script to get a list of gocomics and save the info in a JSON file for further
processing.
@ -20,6 +20,8 @@ class GoComicsUpdater(ComicListUpdater):
excluded_comics = (
# too short
'LukeyMcGarrysTLDR',
# Has its own module
'Widdershins',
)
def handle_gocomics(self, url, outercss='a.gc-blended-link', lang=None):

View file

@ -61,7 +61,10 @@ def create_symlinks(d):
else:
order.extend(data["pages"][work]["images"].values())
if "prev" in data["pages"][work]:
work = data["pages"][work]["prev"]
if data["pages"][work]["prev"] == work:
work = None
else:
work = data["pages"][work]["prev"]
else:
work = None
order.reverse()

View file

@ -3,12 +3,15 @@
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2022 Tobias Gruetzmacher
import re
from operator import attrgetter
import pytest
from dosagelib.scraper import scrapers
from dosagelib.plugins import old
class TestComicNames(object):
class TestComicNames:
def test_names(self):
for scraperobj in scrapers.all():
@ -20,11 +23,11 @@ class TestComicNames(object):
comicname = name
assert re.sub("[^0-9a-zA-Z_]", "", comicname) == comicname
def test_renamed(self):
for scraperobj in scrapers.all(include_removed=True):
if not isinstance(scraperobj, old.Renamed):
continue
assert len(scraperobj.getDisabledReasons()) > 0
# Renamed scraper should only point to a non-disabled scraper
newscraper = scrapers.find(scraperobj.newname)
assert len(newscraper.getDisabledReasons()) == 0
@pytest.mark.parametrize(('scraperobj'),
[obj for obj in scrapers.all(include_removed=True)
if isinstance(obj, old.Renamed)], ids=attrgetter('name'))
def test_renamed(self, scraperobj):
assert len(scraperobj.getDisabledReasons()) > 0
# Renamed scraper should only point to a non-disabled scraper
newscraper = scrapers.find(scraperobj.newname)
assert len(newscraper.getDisabledReasons()) == 0

View file

@ -1,9 +1,9 @@
# SPDX-License-Identifier: MIT
# Copyright (C) 2019 Tobias Gruetzmacher
# SPDX-FileCopyrightText: © 2019 Tobias Gruetzmacher
from dosagelib.helpers import joinPathPartsNamer, queryNamer
class TestNamer(object):
class TestNamer:
"""
Tests for comic namer.
"""
@ -16,6 +16,8 @@ class TestNamer(object):
def test_joinPathPartsNamer(self):
imgurl = 'https://HOST/wp-content/uploads/2019/02/tennis5wp-1.png'
pageurl = 'https://HOST/2019/03/11/12450/'
assert joinPathPartsNamer((0, 1, 2))(self, imgurl, pageurl) == '2019_03_11_tennis5wp-1.png'
assert joinPathPartsNamer((0, 1, 2), (-1,), '-')(self, imgurl, pageurl) == '2019-03-11-tennis5wp-1.png'
assert joinPathPartsNamer((0, -2), ())(self, imgurl, pageurl) == '2019_12450'
assert joinPathPartsNamer(pageparts=(0, 1, 2), imageparts=(-1,))(self,
imgurl, pageurl) == '2019_03_11_tennis5wp-1.png'
assert joinPathPartsNamer(pageparts=(0, 1, 2), imageparts=(-1,), joinchar='-')(self,
imgurl, pageurl) == '2019-03-11-tennis5wp-1.png'
assert joinPathPartsNamer(pageparts=(0, -2))(self, imgurl, pageurl) == '2019_12450'

View file

@ -1,10 +1,9 @@
[tox]
envlist = py37, py38, py39, py310, py311, py312, flake8
envlist = py38, py39, py310, py311, py312, flake8
isolated_build = True
[gh-actions]
python =
3.7: py37
3.8: py38
3.9: py39
3.10: py310