dosage/dosagelib/plugins/g.py

# SPDX-License-Identifier: MIT
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2022 Tobias Gruetzmacher
# Copyright (C) 2019-2020 Daniel Ring
from re import compile, escape

from ..scraper import _BasicScraper, _ParserScraper
from ..helpers import indirectStarter
from ..util import tagre
from .common import ComicControlScraper, WordPressScraper, WordPressNavi


class Galaxion(WordPressNavi):
    """Scraper settings for Galaxion (galaxioncomics.com).

    Strip pages live directly under the site root; the index is the
    slash-separated path described in ``help``.
    """

    help = "Index format: n-comic/book-n/chapter-n/title-nnn"
    multipleImagesPerStrip = True
    url = "http://galaxioncomics.com/"
    # Template for a single strip page; '%s' is replaced by the index.
    stripUrl = f"{url}%s/"
    firstStripUrl = stripUrl % "1-comic/the-story-so-far/the-story-so-far"


class Garanos(WordPressScraper):
    """Scraper settings for Garanos, read from a web.archive.org snapshot.

    The comic is flagged end-of-life; the start page is pinned to a fixed
    strip ('page-487') of the archived copy.
    """

    endOfLife = True
    # Every page URL goes through the same Internet Archive snapshot prefix.
    stripUrl = (
        "https://web.archive.org/web/20180314181433/"
        + "http://garanos.alexheberling.com/pages/%s/"
    )
    firstStripUrl = stripUrl % "vol01"
    url = stripUrl % "page-487"


class GastroPhobia(_ParserScraper):
    """Scraper settings for GastroPhobia (www.gastrophobia.com).

    Strips are addressed through the ``date`` query parameter of index.php.
    """

    help = "Index format: yyyy-mm-dd"
    url = "http://www.gastrophobia.com/"
    stripUrl = f"{url}index.php?date=%s"
    firstStripUrl = stripUrl % "2008-07-30"
    # XPath: any image inside the 'comic' div, and the link in the 'prev' div.
    prevSearch = '//div[@id="prev"]/a'
    imageSearch = '//div[@id="comic"]//img'


class Geeks(_ParserScraper):
    """Scraper settings for Geeks, read from a web.archive.org snapshot.

    The comic is flagged end-of-life.
    """

    endOfLife = True
    help = "Index format: nnn"
    url = (
        "https://web.archive.org/web/20190527194921/"
        + "http://sevenfloorsdown.com/geeks/"
    )
    stripUrl = f"{url}archives/%s"
    firstStripUrl = stripUrl % "10"
    # XPath: image directly below the 'comic' div; link whose text
    # contains "Previous".
    prevSearch = '//a[contains(text(), "Previous")]'
    imageSearch = '//div[@id="comic"]/img'


class GeeksNextDoor(_ParserScraper):
    """Scraper settings for Geeks Next Door (www.geeksnextcomic.com).

    Each strip is a ``<date>.html`` page below the site root.
    """

    help = "Index format: yyyy-mm-dd"
    url = "http://www.geeksnextcomic.com/"
    stripUrl = url + "%s.html"
    firstStripUrl = stripUrl % "2007-03-27"  # '2010-10-04'
    imageSearch = "//p/img"
    # Two alternatives, in this order: the image-based "prev" button, then
    # a plain text link.
    prevSearch = (
        '//a[img[contains(@src, "/nav_prev")]]',
        # start page is different
        '//a[contains(text(), "< prev")]',
    )


class Ginpu(WordPressNavi):
    """Scraper settings for Ginpu (www.ginpu.us)."""

    url = "https://www.ginpu.us/"
    stripUrl = f"{url}comic/%s/"
    firstStripUrl = stripUrl % "filler-2"

    def namer(self, imageUrl, pageUrl):
        """Derive the file name from the tail of the image URL.

        The URL is split from the right on '/' at most three times; the
        last three pieces are combined as ``<first>-<second>_<third>``.
        ``pageUrl`` is not used.
        """
        segments = imageUrl.rsplit("/", 3)
        return f"{segments[1]}-{segments[2]}_{segments[3]}"


class GirlGenius(_BasicScraper):
    """Scraper settings for Girl Genius (www.girlgeniusonline.com).

    Both search patterns are anchored on the regex-escaped site root.
    """

    help = "Index format: yyyymmdd"
    multipleImagesPerStrip = True
    baseUrl = "http://www.girlgeniusonline.com/"
    # Regex-escaped copy of the base URL for embedding in the patterns below.
    rurl = escape(baseUrl)
    url = baseUrl + "comic.php"
    stripUrl = url + "?date=%s"
    firstStripUrl = stripUrl % "20021104"
    # Images: <img> whose single-quoted src points below ggmain/strips/.
    imageSearch = compile(tagre(
        "img", "src", r"(%sggmain/strips/[^']*)" % rurl, quote="'",
    ))
    # Previous link: <a id="topprev">; the captured URL is passed to tagre
    # through its ``before`` argument.
    prevSearch = compile(tagre(
        "a", "id", "topprev", before=r"(%s[^\"']+)" % rurl, quote='"',
    ))


class GirlsWithSlingshots(_BasicScraper):
    """Scraper settings for Girls With Slingshots (girlswithslingshots.com).

    Two image patterns are supplied: one for images served below the site
    root and one for images served from the cdn. host name.
    """

    help = "Index format: stripname"
    url = "https://girlswithslingshots.com/"
    # Regex-escaped copy of the site URL for embedding in the patterns below.
    rurl = escape(url)
    stripUrl = f"{url}comic/%s"
    firstStripUrl = stripUrl % "gws1"
    imageSearch = (
        compile(tagre("img", "src", r'(%scomics/[^"]+)' % rurl)),
        compile(tagre(
            "img", "src",
            r'(http://cdn\.girlswithslingshots\.com/comics/[^"]+)',
        )),
    )
    # Previous link: an <a> with rel="prev" passed as tagre's ``before``
    # argument, and an href below <site>/comic/.
    prevSearch = compile(tagre(
        "a", "href", r'(%scomic/[^"]+)' % rurl, before='rel="prev"',
    ))


class GleefulNihilism(WordPressScraper):
    """Scraper settings for Gleeful Nihilism, read from a web.archive.org
    snapshot.

    The comic is flagged end-of-life.
    """

    endOfLife = True
    help = "Index format: stripname"
    url = (
        "https://web.archive.org/web/20170911203122/"
        + "http://gleefulnihilism.com/"
    )
    stripUrl = f"{url}comic/%s/"
    firstStripUrl = stripUrl % "amoeba"


class GoblinsComic(ComicControlScraper):
    """Scraper settings for Goblins (www.goblinscomic.org).

    Only the start URL is set here; everything else comes from the
    ComicControlScraper base class.
    """

    url = "http://www.goblinscomic.org/"


class GodChild(WordPressScraper):
    """Scraper settings for GodChild (godchild.keenspot.com).

    Only the start URL is set here; everything else comes from the
    WordPressScraper base class.
    """

    url = "http://godchild.keenspot.com/"


class GoGetARoomie(ComicControlScraper):
    """Scraper settings for Go Get a Roomie (www.gogetaroomie.com).

    Only the start URL is set here; everything else comes from the
    ComicControlScraper base class.
    """

    # NOTE(review): unlike the other URLs in this module this one has no
    # trailing slash - confirm that is intentional.
    url = "http://www.gogetaroomie.com"


class GoneWithTheBlastwave(_BasicScraper):
    """Scraper settings for Gone With The Blastwave (www.blastwave-comic.com).

    Strips are numbered through the ``nro`` query parameter. The class uses
    ``indirectStarter`` together with ``latestSearch``, a pattern matching
    the link that wraps the "latest" navigation image.
    """

    help = "Index format: n"
    url = "http://www.blastwave-comic.com/index.php?p=comic&nro=1"
    # Drop the trailing '1' from the start URL to get the strip template.
    stripUrl = url[:-1] + "%s"
    firstStripUrl = stripUrl % "1"
    starter = indirectStarter
    imageSearch = compile(r'<img.+src=".+(/comics/.+?)"')
    # Both navigation patterns capture the href of the <a> that directly
    # wraps the matching navigation image.
    latestSearch = compile(
        r'href="(index.php\?p=comic&amp;nro=\d+)">'
        r'<img src="images/page/default/latest'
    )
    prevSearch = compile(
        r'href="(index.php\?p=comic&amp;nro=\d+)">'
        r'<img src="images/page/default/previous'
    )

    def namer(self, image_url, page_url):
        """Name the image after the strip number found in the page URL.

        The digits following ``nro=`` in ``page_url`` are converted to an
        integer and zero-padded to at least two digits. ``image_url`` is
        not used.
        """
        strip_number = compile(r'nro=(\d+)').search(page_url).group(1)
        return f"{int(strip_number):02d}"


class GrrlPower(WordPressScraper):
    """Scraper settings for Grrl Power (grrlpowercomic.com).

    Registers a throttle for the comic's host on the session when an
    instance is created.
    """

    url = "https://grrlpowercomic.com/"
    stripUrl = f"{url}archives/comic/%s/"
    firstStripUrl = stripUrl % "gp0001"

    def __init__(self, name):
        """Run the base initialisation, then throttle the comic's host."""
        super().__init__(name)
        # NOTE(review): 1.0 and 1.5 are presumably the delay bounds in
        # seconds - confirm against the session's add_throttle signature.
        self.session.add_throttle("grrlpowercomic.com", 1.0, 1.5)


class GuildedAge(WordPressScraper):
    """Scraper settings for Guilded Age (guildedage.net).

    Sets the start URL and the address of the first strip; no strip URL
    template or help text is defined here.
    """

    url = "http://guildedage.net/"
    firstStripUrl = f"{url}comic/chapter-1-cover/"


class GUComics(_BasicScraper):
    """Scraper settings for GU Comics (www.gucomics.com).

    Strip pages sit directly below the site root and are named by date.
    """

    help = "Index format: yyyymmdd"
    url = "http://www.gucomics.com/"
    stripUrl = url + "%s"
    firstStripUrl = stripUrl % "20000710"
    # Images: site-relative path /comics/<4 digits>/gu_...
    imageSearch = compile(
        tagre("img", "src", r'(/comics/\d{4}/gu_[^"]+)')
    )
    # Previous link: an <a href="/<digits>"> immediately followed by the
    # prev.png navigation image.
    prevSearch = compile(
        tagre("a", "href", r'(/\d+)')
        + tagre("img", "src", r'/images/nav/prev\.png')
    )


class GunnerkriggCourt(_ParserScraper):
    """Scraper settings for Gunnerkrigg Court (www.gunnerkrigg.com).

    Strips are addressed through the ``p`` query parameter.
    """

    help = "Index format: number"
    url = "http://www.gunnerkrigg.com/"
    stripUrl = f"{url}?p=%s"
    firstStripUrl = stripUrl % "1"
    # XPath: the image with class 'comic_image'; the link wrapping an image
    # whose src contains "prev".
    prevSearch = '//a[./img[contains(@src, "prev")]]'
    imageSearch = '//img[@class="comic_image"]'


class Gunshow(_BasicScraper):
    """Scraper settings for Gunshow (gunshowcomic.com).

    Strip pages sit directly below the site root and are numbered.
    """

    help = "Index format: n"
    multipleImagesPerStrip = True
    url = "http://gunshowcomic.com/"
    stripUrl = url + "%s"
    firstStripUrl = stripUrl % "1"
    # Images: absolute http:// URLs below /comics/ on the comic's host.
    imageSearch = compile(
        tagre("img", "src", r'(http://gunshowcomic\.com/comics/[^"]+)')
    )
    # Previous link: an <a href> immediately followed by the small
    # "previous" menu image.
    prevSearch = compile(
        tagre("a", "href", r'([^"]+)')
        + tagre("img", "src", r'[^"]*menu/small/previous\.gif')
    )
Update file headers The default encoding for source files is UTF-8 since Python 3, so we can drop all encoding headers. While we are at it, just replace them with SPDX headers. 2020-04-18 11:45:44 +00:00			`# SPDX-License-Identifier: MIT`
Fixup copyright years. 2016-10-28 22:21:41 +00:00			`# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs`
Updated copyright. 2014-01-05 15:50:57 +00:00			`# Copyright (C) 2012-2014 Bastian Kleineidam`
Clean up some minor warnings 2022-05-28 15:52:42 +00:00			`# Copyright (C) 2015-2022 Tobias Gruetzmacher`
Add self to authors list, update copyright headers 2020-01-13 06:34:05 +00:00			`# Copyright (C) 2019-2020 Daniel Ring`
Use re.escape and add some firstStripUrl. 2013-04-10 16:19:11 +00:00			`from re import compile, escape`
Initial commit to Github. 2012-06-20 19:58:13 +00:00
fixed GoGetARoomie 2015-06-01 03:11:16 +00:00			`from ..scraper import _BasicScraper, _ParserScraper`
Clean up some minor warnings 2022-05-28 15:52:42 +00:00			`from ..helpers import indirectStarter`
Updated documentation and fix some comics. 2012-11-20 17:53:53 +00:00			`from ..util import tagre`
Deprecate underscore-prefixed parent classes This is trying to strike a balance between updating as much existing classes as possible, but not making the diff too big... 2022-06-06 10:08:32 +00:00			`from .common import ComicControlScraper, WordPressScraper, WordPressNavi`
Initial commit to Github. 2012-06-20 19:58:13 +00:00

Deprecate underscore-prefixed parent classes This is trying to strike a balance between updating as much existing classes as possible, but not making the diff too big... 2022-06-06 10:08:32 +00:00			`class Galaxion(WordPressNavi):`
Always have an url attribute in comic scrapers. 2013-02-04 20:00:26 +00:00			`url = 'http://galaxioncomics.com/'`
			`stripUrl = url + '%s/'`
Add firstStripUrls. 2013-04-10 21:57:09 +00:00			`firstStripUrl = stripUrl % '1-comic/the-story-so-far/the-story-so-far'`
Fix Galaxion 2019-07-29 00:59:37 +00:00			`multipleImagesPerStrip = True`
Fix some comics. 2012-11-21 20:57:26 +00:00			`help = 'Index format: n-comic/book-n/chapter-n/title-nnn'`
Initial commit to Github. 2012-06-20 19:58:13 +00:00

Deprecate underscore-prefixed parent classes This is trying to strike a balance between updating as much existing classes as possible, but not making the diff too big... 2022-06-06 10:08:32 +00:00			`class Garanos(WordPressScraper):`
Fix some old modules using the Internet Archive 2020-01-09 16:38:13 +00:00			`stripUrl = ('https://web.archive.org/web/20180314181433/'`
			`'http://garanos.alexheberling.com/pages/%s/')`
			`url = stripUrl % 'page-487'`
			`firstStripUrl = stripUrl % 'vol01'`
			`endOfLife = True`
Initial commit to Github. 2012-06-20 19:58:13 +00:00

Fix GastroPhobia, remove GeneralProtectionFault. (& formatting) 2016-03-20 19:10:04 +00:00			`class GastroPhobia(_ParserScraper):`
Sort comics. 2013-03-06 19:21:10 +00:00			`url = 'http://www.gastrophobia.com/'`
			`stripUrl = url + 'index.php?date=%s'`
Add firstStripUrls. 2013-04-10 21:57:09 +00:00			`firstStripUrl = stripUrl % '2008-07-30'`
Fix GastroPhobia, remove GeneralProtectionFault. (& formatting) 2016-03-20 19:10:04 +00:00			`imageSearch = '//div[@id="comic"]//img'`
			`prevSearch = '//div[@id="prev"]/a'`
Sort comics. 2013-03-06 19:21:10 +00:00			`help = 'Index format: yyyy-mm-dd'`


Fix some old modules using the Internet Archive 2020-01-09 16:38:13 +00:00			`class Geeks(_ParserScraper):`
			`url = ('https://web.archive.org/web/20190527194921/'`
			`'http://sevenfloorsdown.com/geeks/')`
Sort comics. 2013-03-06 19:21:10 +00:00			`stripUrl = url + 'archives/%s'`
Add firstStripUrls. 2013-04-10 21:57:09 +00:00			`firstStripUrl = stripUrl % '10'`
Fix some old modules using the Internet Archive 2020-01-09 16:38:13 +00:00			`imageSearch = '//div[@id="comic"]/img'`
			`prevSearch = '//a[contains(text(), "Previous")]'`
			`endOfLife = True`
Sort comics. 2013-03-06 19:21:10 +00:00			`help = 'Index format: nnn'`
Initial commit to Github. 2012-06-20 19:58:13 +00:00

Fix GeeksNextDoor (& switch to ParserScraper) 2020-10-18 18:44:47 +00:00			`class GeeksNextDoor(_ParserScraper):`
Added GeeksNextDoor. 2013-03-11 21:51:45 +00:00			`url = 'http://www.geeksnextcomic.com/'`
			`stripUrl = url + '%s.html'`
Fix GeeksNextDoor (& switch to ParserScraper) 2020-10-18 18:44:47 +00:00			`firstStripUrl = stripUrl % '2007-03-27' # '2010-10-04'`
			`imageSearch = '//p/img'`
			`prevSearch = (`
			`'//a[img[contains(@src, "/nav_prev")]]',`
			`'//a[contains(text(), "< prev")]', # start page is different`
			`)`
Added GeeksNextDoor. 2013-03-11 21:51:45 +00:00			`help = 'Index format: yyyy-mm-dd'`


Deprecate underscore-prefixed parent classes This is trying to strike a balance between updating as much existing classes as possible, but not making the diff too big... 2022-06-06 10:08:32 +00:00			`class Ginpu(WordPressNavi):`
Minor URL fixes, switched some modules to https 2019-12-31 00:44:19 +00:00			`url = 'https://www.ginpu.us/'`
Add Ginpu 2019-07-21 23:47:14 +00:00			`stripUrl = url + 'comic/%s/'`
			`firstStripUrl = stripUrl % 'filler-2'`

			`def namer(self, imageUrl, pageUrl):`
			`filename = imageUrl.rsplit('/', 3)`
			`return '%s-%s_%s' % (filename[1], filename[2], filename[3])`


Added GirlGenius 2013-04-25 18:58:24 +00:00			`class GirlGenius(_BasicScraper):`
			`baseUrl = 'http://www.girlgeniusonline.com/'`
			`rurl = escape(baseUrl)`
			`url = baseUrl + 'comic.php'`
Fix GirlGenious strip url. 2013-04-26 17:52:45 +00:00			`stripUrl = url + '?date=%s'`
Added GirlGenius 2013-04-25 18:58:24 +00:00			`firstStripUrl = stripUrl % '20021104'`
Fix GastroPhobia, remove GeneralProtectionFault. (& formatting) 2016-03-20 19:10:04 +00:00			`imageSearch = compile(`
			`tagre("img", "src", r"(%sggmain/strips/[^']*)" % rurl, quote="'"))`
Updated GirlGenius to new markup GG markup has changed, so I fixed the prevSearch regex to find the "previous" button on the redesigned page. As well, I set multipleImagesPerStrip to true, since there are quite a few comics with multiple images that were being discarded. 2014-06-13 20:43:40 +00:00			`prevSearch = compile(tagre("a", "id", "topprev", quote="\"",`
Fix GastroPhobia, remove GeneralProtectionFault. (& formatting) 2016-03-20 19:10:04 +00:00			`before=r"(%s[^\"']+)" % rurl))`
Updated GirlGenius to new markup GG markup has changed, so I fixed the prevSearch regex to find the "previous" button on the redesigned page. As well, I set multipleImagesPerStrip to true, since there are quite a few comics with multiple images that were being discarded. 2014-06-13 20:43:40 +00:00			`multipleImagesPerStrip = True`
Added GirlGenius 2013-04-25 18:58:24 +00:00			`help = 'Index format: yyyymmdd'`

Fix GastroPhobia, remove GeneralProtectionFault. (& formatting) 2016-03-20 19:10:04 +00:00
Initial commit to Github. 2012-06-20 19:58:13 +00:00			`class GirlsWithSlingshots(_BasicScraper):`
Fix Girls with Slingshots matchers (#136) Domain name and URLs have changed slightly. Fixes #105. 2019-06-26 21:22:45 +00:00			`url = 'https://girlswithslingshots.com/'`
Add firstStripUrls. 2013-04-10 21:57:09 +00:00			`rurl = escape(url)`
Fix GastroPhobia, remove GeneralProtectionFault. (& formatting) 2016-03-20 19:10:04 +00:00			`stripUrl = url + 'comic/%s'`
Fix some comics. 2014-07-02 17:51:53 +00:00			`firstStripUrl = stripUrl % 'gws1'`
Add firstStripUrls. 2013-04-10 21:57:09 +00:00			`imageSearch = (`
			`compile(tagre("img", "src", r'(%scomics/[^"]+)' % rurl)),`
Fix GastroPhobia, remove GeneralProtectionFault. (& formatting) 2016-03-20 19:10:04 +00:00			`compile(tagre("img", "src",`
			`r'(http://cdn\.girlswithslingshots\.com/comics/[^"]+)')),`
Add firstStripUrls. 2013-04-10 21:57:09 +00:00			`)`
Fix GastroPhobia, remove GeneralProtectionFault. (& formatting) 2016-03-20 19:10:04 +00:00			`prevSearch = compile(tagre("a", "href", r'(%scomic/[^"]+)' % rurl,`
Fix Girls with Slingshots matchers (#136) Domain name and URLs have changed slightly. Fixes #105. 2019-06-26 21:22:45 +00:00			`before='rel="prev"'))`
Fix some comics. 2014-07-02 17:51:53 +00:00			`help = 'Index format: stripname'`
Initial commit to Github. 2012-06-20 19:58:13 +00:00

Deprecate underscore-prefixed parent classes This is trying to strike a balance between updating as much existing classes as possible, but not making the diff too big... 2022-06-06 10:08:32 +00:00			`class GleefulNihilism(WordPressScraper):`
Fix some old modules using the Internet Archive 2020-01-09 16:38:13 +00:00			`url = ('https://web.archive.org/web/20170911203122/'`
			`'http://gleefulnihilism.com/')`
Fix broken comics 2013-11-12 17:33:14 +00:00			`stripUrl = url + 'comic/%s/'`
			`firstStripUrl = stripUrl % 'amoeba'`
Fix some old modules using the Internet Archive 2020-01-09 16:38:13 +00:00			`endOfLife = True`
Fix broken comics 2013-11-12 17:33:14 +00:00			`help = 'Index format: stripname'`
Initial commit to Github. 2012-06-20 19:58:13 +00:00

Deprecate underscore-prefixed parent classes This is trying to strike a balance between updating as much existing classes as possible, but not making the diff too big... 2022-06-06 10:08:32 +00:00			`class GoblinsComic(ComicControlScraper):`
Fix some comics. 2013-07-09 20:21:17 +00:00			`url = 'http://www.goblinscomic.org/'`
Added GoblinsComic 2013-04-09 17:37:24 +00:00

Deprecate underscore-prefixed parent classes This is trying to strike a balance between updating as much existing classes as possible, but not making the diff too big... 2022-06-06 10:08:32 +00:00			`class GodChild(WordPressScraper):`
Rework/fix KeenSpot modules. 2016-10-13 22:14:53 +00:00			`url = 'http://godchild.keenspot.com/'`


Deprecate underscore-prefixed parent classes This is trying to strike a balance between updating as much existing classes as possible, but not making the diff too big... 2022-06-06 10:08:32 +00:00			`class GoGetARoomie(ComicControlScraper):`
Move ComicControl into common module. - Move all comics using ComicControl into alphabetical files. - Add BalderDash & Picklewhistle 2016-04-03 22:12:53 +00:00			`url = 'http://www.gogetaroomie.com'`


Initial commit to Github. 2012-06-20 19:58:13 +00:00			`class GoneWithTheBlastwave(_BasicScraper):`
Rename latestUrl in url 2013-02-05 18:51:46 +00:00			`url = 'http://www.blastwave-comic.com/index.php?p=comic&nro=1'`
Refactor: Convert starter to simple method. 2016-04-13 18:01:51 +00:00			`starter = indirectStarter`
Rename latestUrl in url 2013-02-05 18:51:46 +00:00			`stripUrl = url[:-1] + '%s'`
Add firstStripUrls. 2013-04-10 21:57:09 +00:00			`firstStripUrl = stripUrl % '1'`
Initial commit to Github. 2012-06-20 19:58:13 +00:00			`imageSearch = compile(r'<img.+src=".+(/comics/.+?)"')`
Fix GastroPhobia, remove GeneralProtectionFault. (& formatting) 2016-03-20 19:10:04 +00:00			`prevSearch = compile(r'href="(index.php\?p=comic&nro=\d+)">' +`
			`r'<img src="images/page/default/previous')`
Read starter parameters from class. This allows to specify starters in a more declarative and dynamic way. 2016-04-12 21:11:39 +00:00			`latestSearch = compile(r'href="(index.php\?p=comic&nro=\d+)">' +`
			`r'<img src="images/page/default/latest')`
Initial commit to Github. 2012-06-20 19:58:13 +00:00			`help = 'Index format: n'`

Refactor: Make namer a method. When #42 is realized, the naming of files might differ between comic modules, so the namer's logical location is the instance, not the class. 2016-04-21 06:20:49 +00:00			`def namer(self, image_url, page_url):`
			`return '%02d' % int(compile(r'nro=(\d+)').search(page_url).group(1))`
Initial commit to Github. 2012-06-20 19:58:13 +00:00

Deprecate underscore-prefixed parent classes This is trying to strike a balance between updating as much existing classes as possible, but not making the diff too big... 2022-06-06 10:08:32 +00:00			`class GrrlPower(WordPressScraper):`
Minor URL fixes, switched some modules to https 2019-12-31 00:44:19 +00:00			`url = 'https://grrlpowercomic.com/'`
Fix GrrlPower 2019-06-13 06:04:09 +00:00			`stripUrl = url + 'archives/comic/%s/'`
			`firstStripUrl = stripUrl % 'gp0001'`
Add GrrlPower comic. 2013-01-29 20:42:10 +00:00
Throttle GrrlPower (fixes #179) 2021-03-19 23:52:28 +00:00			`def __init__(self, name):`
			`super().__init__(name)`
			`self.session.add_throttle('grrlpowercomic.com', 1.0, 1.5)`

Add GrrlPower comic. 2013-01-29 20:42:10 +00:00
Deprecate underscore-prefixed parent classes This is trying to strike a balance between updating as much existing classes as possible, but not making the diff too big... 2022-06-06 10:08:32 +00:00			`class GuildedAge(WordPressScraper):`
Add GuildedAge 2020-09-13 14:52:38 +00:00			`url = 'http://guildedage.net/'`
			`firstStripUrl = url + 'comic/chapter-1-cover/'`


Sort comics alphabetically & PEP8 style fixes. 2016-03-31 21:13:54 +00:00			`class GUComics(_BasicScraper):`
			`url = 'http://www.gucomics.com/'`
			`stripUrl = url + '%s'`
			`firstStripUrl = stripUrl % '20000710'`
			`imageSearch = compile(tagre("img", "src", r'(/comics/\d{4}/gu_[^"]+)'))`
			`prevSearch = compile(tagre("a", "href", r'(/\d+)') +`
			`tagre("img", "src", r'/images/nav/prev\.png'))`
			`help = 'Index format: yyyymmdd'`


Fix GunnerkriggCourt 2019-06-12 04:26:42 +00:00			`class GunnerkriggCourt(_ParserScraper):`
			`url = 'http://www.gunnerkrigg.com/'`
Always have an url attribute in comic scrapers. 2013-02-04 20:00:26 +00:00			`stripUrl = url + '?p=%s'`
Fix GunnerkriggCourt 2019-06-12 04:26:42 +00:00			`firstStripUrl = stripUrl % '1'`
			`imageSearch = '//img[@class="comic_image"]'`
			`prevSearch = '//a[./img[contains(@src, "prev")]]'`
Fix GunnerkrigCourt 2013-01-29 18:00:29 +00:00			`help = 'Index format: number'`
Initial commit to Github. 2012-06-20 19:58:13 +00:00

			`class Gunshow(_BasicScraper):`
Always have an url attribute in comic scrapers. 2013-02-04 20:00:26 +00:00			`url = 'http://gunshowcomic.com/'`
			`stripUrl = url + '%s'`
Add firstStripUrls. 2013-04-10 21:57:09 +00:00			`firstStripUrl = stripUrl % '1'`
Fix GastroPhobia, remove GeneralProtectionFault. (& formatting) 2016-03-20 19:10:04 +00:00			`imageSearch = compile(tagre("img", "src",`
			`r'(http://gunshowcomic\.com/comics/[^"]+)'))`
Fix comics. 2012-12-04 06:02:40 +00:00			`multipleImagesPerStrip = True`
Fix GastroPhobia, remove GeneralProtectionFault. (& formatting) 2016-03-20 19:10:04 +00:00			`prevSearch = compile(`
			`tagre("a", "href", r'([^"]+)') +`
			`tagre("img", "src", r'[^"]*menu/small/previous\.gif'))`
Fix some comics. 2012-11-21 20:57:26 +00:00			`help = 'Index format: n'`