dosage/dosagelib/plugins/z.py

# SPDX-License-Identifier: MIT
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2022 Tobias Gruetzmacher
from re import compile, escape

from ..scraper import BasicScraper, ParserScraper
from ..util import tagre
from ..helpers import bounceStarter, joinPathPartsNamer
from .common import WordPressNavi


class ZapComic(ParserScraper):
    """Scraper definition for the comic served from zapcomic.com."""

    url = 'http://www.zapcomic.com/'
    # NOTE(review): css = True presumably makes ParserScraper interpret the
    # search expressions below as CSS selectors instead of XPath -- confirm
    # against the ParserScraper implementation.
    css = True
    # Link leading to the previous comic page.
    prevSearch = 'a.previous-comic-link'
    # Image element holding the comic itself.
    imageSearch = 'img.comic-item'


class Zapiro(ParserScraper):
    """Scraper definition for the Zapiro cartoons under mg.co.za/zapiro/."""

    url = 'http://mg.co.za/zapiro/'
    # The cartoon is the <img> directly inside the element with id "cartoon".
    imageSearch = '//div[@id="cartoon"]/img'
    # Navigation anchors are identified by their "right"/"left" classes.
    nextSearch = '//a[d:class("right")]'
    prevSearch = '//a[d:class("left")]'
    # NOTE(review): bounceStarter presumably uses prevSearch/nextSearch to
    # derive the real URL of the latest page -- confirm in ..helpers.
    starter = bounceStarter
    # NOTE(review): presumably names files after the last path part of the
    # page URL -- confirm against joinPathPartsNamer.
    namer = joinPathPartsNamer(pageparts=(-1,))


class ZenPencils(WordPressNavi):
    """Scraper definition for ZenPencils, read from a web.archive.org capture."""

    # The base URL points at an Internet Archive snapshot (20200723091741)
    # of zenpencils.com rather than at the live site.
    url = 'https://web.archive.org/web/20200723091741/https://zenpencils.com/'
    firstStripUrl = f'{url}comic/1-ralph-waldo-emerson-make-them-cry/'
    endOfLife = True
    multipleImagesPerStrip = True
    # Navigation anchors are identified by their "navi-next"/"navi-prev" classes.
    nextSearch = '//a[d:class("navi-next")]'
    prevSearch = '//a[d:class("navi-prev")]'
    # NOTE(review): bounceStarter presumably uses prevSearch/nextSearch to
    # derive the real URL of the latest page -- confirm in ..helpers.
    starter = bounceStarter


class ZombieHunters(BasicScraper):
    """Scraper definition for the comic served from thezombiehunters.com."""

    help = 'Index format: n(unpadded)'
    url = 'http://www.thezombiehunters.com/'
    # Individual strips are addressed through the strip_id query parameter.
    stripUrl = f'{url}?strip_id=%s'
    firstStripUrl = stripUrl % '1'
    # Captures the src of an <img> located below /istrip_files/strips/.
    imageSearch = compile(
        tagre("img", "src", r'(/istrip_files/strips/[^"]+)')
    )
    # Captures the href of an anchor pattern that is directly followed by the
    # pattern for the <img> whose id is "prevcomic".
    prevSearch = compile(
        tagre("a", "href", r'(\?strip_id=\d+)')
        + tagre("img", "id", "prevcomic")
    )


class Zwarwald(BasicScraper):
    """Scraper definition for the comic at zwarwald.de (lang is set to 'de')."""

    lang = 'de'
    help = 'Index format: number'
    url = "http://www.zwarwald.de/"
    # Regex-escaped copy of the base URL, embedded in the patterns below.
    rurl = escape(url)
    stripUrl = f'{url}index.php/page/%s/'
    # anything before page 495 seems to be flash
    firstStripUrl = stripUrl % '495'
    # Two alternatives: images below the site's own images/ directory, or
    # below the wordpress/images/ directory on the hosteurope.de host.
    imageSearch = (
        compile(tagre("img", "src", rf'({rurl}images/\d+/\d+/[^"]+)')),
        compile(tagre(
            "img", "src",
            r'(http://wp1163540\.wp190\.webpack\.hosteurope\.de/wordpress/images/\d+/\d+/[^"]+)',
        )),
    )
    # Captures the href of the page link pattern that is directly followed by
    # the pattern for the prev.jpg image (note: host without "www").
    prevSearch = compile(
        tagre("a", "href", rf'({rurl}index\.php/page/\d+/)')
        + tagre("img", "src", r'http://zwarwald\.de/images/prev\.jpg', quote="'")
    )
    namer = joinPathPartsNamer(imageparts=(-3, -2, -1))

    def shouldSkipUrl(self, url, data):
        """Return True when url is one of the listed pages with flash content.

        The data argument is accepted but not used here.
        """
        flash_pages = ("112", "222", "223", "246", "368", '495')
        return any(self.stripUrl % page == url for page in flash_pages)
Update file headers The default encoding for source files is UTF-8 since Python 3, so we can drop all encoding headers. While we are at it, just replace them with SPDX headers. 2020-04-18 11:45:44 +00:00			`# SPDX-License-Identifier: MIT`
Fixup copyright years. 2016-10-28 22:21:41 +00:00			`# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs`
Updated copyright. 2014-01-05 15:50:57 +00:00			`# Copyright (C) 2012-2014 Bastian Kleineidam`
Deprecate underscore-prefixed parent classes This is trying to strike a balance between updating as much existing classes as possible, but not making the diff too big... 2022-06-06 10:08:32 +00:00			`# Copyright (C) 2015-2022 Tobias Gruetzmacher`
Use re.escape and add some firstStripUrl. 2013-04-10 16:19:11 +00:00			`from re import compile, escape`
Fix ZapComics, remove ZebraGirl. - ZebraGirl is now ComicFury/ZebraGirl... 2016-04-03 22:23:47 +00:00
Deprecate underscore-prefixed parent classes This is trying to strike a balance between updating as much existing classes as possible, but not making the diff too big... 2022-06-06 10:08:32 +00:00			`from ..scraper import BasicScraper, ParserScraper`
Fix some comics. 2012-11-26 06:13:32 +00:00			`from ..util import tagre`
Replace xpath_class with custom xpath function 2020-07-31 20:56:30 +00:00			`from ..helpers import bounceStarter, joinPathPartsNamer`
Deprecate underscore-prefixed parent classes This is trying to strike a balance between updating as much existing classes as possible, but not making the diff too big... 2022-06-06 10:08:32 +00:00			`from .common import WordPressNavi`
Initial commit to Github. 2012-06-20 19:58:13 +00:00

Deprecate underscore-prefixed parent classes This is trying to strike a balance between updating as much existing classes as possible, but not making the diff too big... 2022-06-06 10:08:32 +00:00			`class ZapComic(ParserScraper):`
Always have an url attribute in comic scrapers. 2013-02-04 20:00:26 +00:00			`url = 'http://www.zapcomic.com/'`
Fix ZapComics, remove ZebraGirl. - ZebraGirl is now ComicFury/ZebraGirl... 2016-04-03 22:23:47 +00:00			`css = True`
			`imageSearch = 'img.comic-item'`
			`prevSearch = 'a.previous-comic-link'`
Fix more comics. 2012-12-07 23:45:18 +00:00

Deprecate underscore-prefixed parent classes This is trying to strike a balance between updating as much existing classes as possible, but not making the diff too big... 2022-06-06 10:08:32 +00:00			`class Zapiro(ParserScraper):`
Fix some more comic modules. 2016-05-16 21:16:29 +00:00			`url = 'http://mg.co.za/zapiro/'`
Refactor: Convert starter to simple method. 2016-04-13 18:01:51 +00:00			`starter = bounceStarter`
Random module fixes. 2017-05-21 22:30:31 +00:00			`imageSearch = '//div[@id="cartoon"]/img'`
Replace xpath_class with custom xpath function 2020-07-31 20:56:30 +00:00			`prevSearch = '//a[d:class("left")]'`
			`nextSearch = '//a[d:class("right")]'`
Update joinPathPartsNamer: Remove defaults 2024-02-18 17:02:02 +00:00			`namer = joinPathPartsNamer(pageparts=(-1,))`
Fix comics. 2012-12-04 06:02:40 +00:00
Initial commit to Github. 2012-06-20 19:58:13 +00:00
Deprecate underscore-prefixed parent classes This is trying to strike a balance between updating as much existing classes as possible, but not making the diff too big... 2022-06-06 10:08:32 +00:00			`class ZenPencils(WordPressNavi):`
Try to fix some more comics using the Internet Archive 2021-01-31 22:40:21 +00:00			`url = 'https://web.archive.org/web/20200723091741/https://zenpencils.com/'`
ZenPencils: Allow multiple images per page. 2015-09-03 21:24:28 +00:00			`multipleImagesPerStrip = True`
Move more comics to common WordPressScraper. 2016-04-10 21:04:34 +00:00			`firstStripUrl = url + 'comic/1-ralph-waldo-emerson-make-them-cry/'`
Fix multiple imgs for json flag & ZenPencils bouncer (#133) When using the JSON output flag, if the page has more than one image, dictionary indexing cannot be used as list. For the ZenPencils comic, the bouncer is missing, saving the page url as the root url. 2019-06-19 05:09:33 +00:00			`starter = bounceStarter`
Replace xpath_class with custom xpath function 2020-07-31 20:56:30 +00:00			`prevSearch = '//a[d:class("navi-prev")]'`
			`nextSearch = '//a[d:class("navi-next")]'`
Try to fix some more comics using the Internet Archive 2021-01-31 22:40:21 +00:00			`endOfLife = True`
Added ZenPencils. 2013-04-09 17:38:47 +00:00

Deprecate underscore-prefixed parent classes This is trying to strike a balance between updating as much existing classes as possible, but not making the diff too big... 2022-06-06 10:08:32 +00:00			`class ZombieHunters(BasicScraper):`
Always have an url attribute in comic scrapers. 2013-02-04 20:00:26 +00:00			`url = 'http://www.thezombiehunters.com/'`
			`stripUrl = url + '?strip_id=%s'`
Add firstStripUrls. 2013-04-10 21:57:09 +00:00			`firstStripUrl = stripUrl % '1'`
Fix some comics. 2012-11-26 06:13:32 +00:00			`imageSearch = compile(tagre("img", "src", r'(/istrip_files/strips/[^"]+)'))`
			`prevSearch = compile(tagre("a", "href", r'(\?strip_id=\d+)') + tagre("img", "id", "prevcomic"))`
Initial commit to Github. 2012-06-20 19:58:13 +00:00			`help = 'Index format: n(unpadded)'`
Added Zwarwald and AhoiPolloi 2013-03-07 22:51:55 +00:00

Deprecate underscore-prefixed parent classes This is trying to strike a balance between updating as much existing classes as possible, but not making the diff too big... 2022-06-06 10:08:32 +00:00			`class Zwarwald(BasicScraper):`
Added Zwarwald and AhoiPolloi 2013-03-07 22:51:55 +00:00			`url = "http://www.zwarwald.de/"`
Use re.escape and add some firstStripUrl. 2013-04-10 16:19:11 +00:00			`rurl = escape(url)`
Added Zwarwald and AhoiPolloi 2013-03-07 22:51:55 +00:00			`stripUrl = url + 'index.php/page/%s/'`
Fix some comics and add language tag. 2013-03-08 21:33:05 +00:00			`# anything before page 495 seems to be flash`
			`firstStripUrl = stripUrl % '495'`
			`lang = 'de'`
Fix zwarwald 2013-04-10 18:14:43 +00:00			`imageSearch = (`
			`compile(tagre("img", "src", r'(%simages/\d+/\d+/[^"]+)' % rurl)),`
			`compile(tagre("img", "src", r'(http://wp1163540\.wp190\.webpack\.hosteurope\.de/wordpress/images/\d+/\d+/[^"]+)')),`
			`)`
Use re.escape and add some firstStripUrl. 2013-04-10 16:19:11 +00:00			`prevSearch = compile(tagre("a", "href", r'(%sindex\.php/page/\d+/)' % rurl) +`
Fix ZapComics, remove ZebraGirl. - ZebraGirl is now ComicFury/ZebraGirl... 2016-04-03 22:23:47 +00:00			`tagre("img", "src",`
			`r'http://zwarwald\.de/images/prev\.jpg',`
			`quote="'"))`
Update joinPathPartsNamer: Remove defaults 2024-02-18 17:02:02 +00:00			`namer = joinPathPartsNamer(imageparts=(-3, -2, -1))`
Added Zwarwald and AhoiPolloi 2013-03-07 22:51:55 +00:00			`help = 'Index format: number'`

Provide page data in shouldSkipUrl() function 2014-02-10 20:58:09 +00:00			`def shouldSkipUrl(self, url, data):`
Fix some comics and add language tag. 2013-03-08 21:33:05 +00:00			`"""Some pages have flash content."""`
			`return url in (`
			`self.stripUrl % "112",`
			`self.stripUrl % "222",`
			`self.stripUrl % "223",`
			`self.stripUrl % "246",`
			`self.stripUrl % "368",`
			`self.stripUrl % '495',`
			`)`