Unify XPath NS config over modules
commit 23125c74d4
parent 2e912bcd2c
22 changed files with 81 additions and 86 deletions
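The change is mechanical but wide: every comic module that called lxml directly, as in page.xpath(pattern, namespaces=NS) or data.xpath(pattern), now goes through a single ParserScraper.match(page, pattern) helper, so the XPath namespace map (and the optional CSS-selector mode) is configured in one place instead of in every module. A representative before/after pair from the ComicFury hunk below:

    # before
    images = page.xpath('//img[@class="comicsegmentimage"]/@src')
    # after
    images = self.match(page, '//img[d:class("comicsegmentimage")]/@src')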
@@ -228,7 +228,7 @@ class Amya(WordPressScraper):
     url = 'http://www.amyachronicles.com/'


-class Angband(_ParserScraper):
+class Angband(ParserScraper):
     url = 'http://angband.calamarain.net/'
     stripUrl = url + '%s'
     imageSearch = '//img'

@@ -237,7 +237,7 @@ class Angband(_ParserScraper):

     def starter(self):
         page = self.getPage(self.url)
-        self.pages = page.xpath('//p/a[not(contains(@href, "cast"))]/@href')
+        self.pages = self.match(page, '//p/a[not(contains(@href, "cast"))]/@href')
         self.firstStripUrl = self.pages[0]
         return self.pages[-1]

@@ -267,7 +267,7 @@ class Annyseed(_ParserScraper):
         return tourl


-class AntiheroForHire(_ParserScraper):
+class AntiheroForHire(ParserScraper):
     stripUrl = 'https://www.giantrobot.club/antihero-for-hire/%s'
     firstStripUrl = stripUrl % '2016/6/8/entrance-vigil'
     url = firstStripUrl

@@ -278,7 +278,7 @@ class AntiheroForHire(_ParserScraper):
     def starter(self):
         # Build list of chapters for navigation
         page = self.getPage(self.url)
-        self.chapters = page.xpath('//ul[@class="archive-group-list"]//a[contains(@class, "archive-item-link")]/@href')
+        self.chapters = self.match(page, '//ul[d:class("archive-group-list")]//a[d:class("archive-item-link")]/@href')
         return self.chapters[0]

     def getPrevUrl(self, url, data):
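Several rewritten expressions also switch from literal @class comparisons to d:class(...), an XPath extension function in the project's own namespace (the NS map that scraper.py passes to every xpath() call). It matches a single class token, which keeps working when an element carries several classes. A minimal sketch of how such an extension function can be registered with lxml; the namespace URI and the function body here are illustrative assumptions, not dosage's exact code:

    from lxml import etree, html

    # Assumed URI; dosage keeps its real prefix->URI map in dosagelib/xml.py as NS.
    NS = {'d': 'urn:example:dosage-xpath'}

    def class_token(context, token):
        # True if the context element's @class attribute contains the given token.
        return token in (context.context_node.get('class') or '').split()

    etree.FunctionNamespace(NS['d'])['class'] = class_token

    doc = html.fromstring('<div class="strip large"><img src="x.png"></div>')
    print(doc.xpath('//div[d:class("strip")]/img/@src', namespaces=NS))  # ['x.png']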
@@ -314,7 +314,7 @@ class ArtificialIncident(WordPressWebcomic):
     firstStripUrl = stripUrl % 'issue-one-life-changing'


-class AstronomyPOTD(_ParserScraper):
+class AstronomyPOTD(ParserScraper):
     baseUrl = 'http://apod.nasa.gov/apod/'
     url = baseUrl + 'astropix.html'
     starter = bounceStarter

@@ -328,7 +328,7 @@ class AstronomyPOTD(_ParserScraper):

     def shouldSkipUrl(self, url, data):
         """Skip pages without images."""
-        return data.xpath('//iframe')  # videos
+        return self.match(data, '//iframe')  # videos

     def namer(self, image_url, page_url):
         return '%s-%s' % (page_url.split('/')[-1].split('.')[0][2:],

@@ -34,11 +34,11 @@ class CaptainSNES(_BasicScraper):
     help = 'Index format: yyyy/mm/dd/nnn-stripname'


-class CarryOn(_ParserScraper):
+class CarryOn(ParserScraper):
     url = 'http://www.hirezfox.com/km/co/'
     stripUrl = url + 'd/%s.html'
     firstStripUrl = stripUrl % '20040701'
-    imageSearch = '//div[@class="strip"]/img'
+    imageSearch = '//div[d:class("strip")]/img'
     prevSearch = '//a[text()="Previous Day"]'
     multipleImagesPerStrip = True

@@ -122,13 +122,13 @@ class CatAndGirl(_ParserScraper):
     prevSearch = '//a[d:class("pager--prev")]'


-class CatenaManor(_ParserScraper):
+class CatenaManor(ParserScraper):
     baseUrl = ('https://web.archive.org/web/20141027141116/'
                'http://catenamanor.com/')
     url = baseUrl + 'archives'
     stripUrl = baseUrl + '%s/'
     firstStripUrl = stripUrl % '2003/07'
-    imageSearch = '//img[@class="comicthumbnail"]'
+    imageSearch = '//img[d:class("comicthumbnail")]'
     multipleImagesPerStrip = True
     endOfLife = True
     strips: List[str] = []

@@ -136,7 +136,7 @@ class CatenaManor(_ParserScraper):
     def starter(self):
         # Retrieve archive links and select valid range
         archivePage = self.getPage(self.url)
-        archiveStrips = archivePage.xpath('//div[@id="archivepage"]//a')
+        archiveStrips = self.match(archivePage, '//div[@id="archivepage"]//a')
         valid = False
         for link in archiveStrips:
             if self.stripUrl % '2012/01' in link.get('href'):

@@ -1,8 +1,8 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
-# Copyright (C) 2012-2014 Bastian Kleineidam
-# Copyright (C) 2015-2022 Tobias Gruetzmacher
-# Copyright (C) 2019-2020 Daniel Ring
+# SPDX-FileCopyrightText: © 2004 Tristan Seligmann and Jonathan Jacobs
+# SPDX-FileCopyrightText: © 2012 Bastian Kleineidam
+# SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher
+# SPDX-FileCopyrightText: © 2019 Daniel Ring
 import os

 from ..scraper import ParserScraper

@@ -79,7 +79,7 @@ class ComicFury(ParserScraper):
         num = parts[-1]
         if self.multipleImagesPerStrip:
             page = self.getPage(pageUrl)
-            images = page.xpath('//img[@class="comicsegmentimage"]/@src')
+            images = self.match(page, '//img[d:class("comicsegmentimage")]/@src')
             if len(images) > 1:
                 imageIndex = images.index(imageUrl) + 1
                 return "%s_%s-%d%s" % (self.prefix, num, imageIndex, ext)

@@ -88,8 +88,8 @@ class ComicFury(ParserScraper):
     def shouldSkipUrl(self, url, data):
         """Skip pages without images."""
         # Videos on Underverse
-        return (data.xpath('//div[@id="comicimagewrap"]//video') and
-                not data.xpath('//div[@id="comicimagewrap"]//img'))
+        return (self.match(data, '//div[@id="comicimagewrap"]//video') and
+                not self.match(data, '//div[@id="comicimagewrap"]//img'))

     @classmethod
     def getmodules(cls):  # noqa: CFQ001

@@ -1,8 +1,8 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
-# Copyright (C) 2012-2014 Bastian Kleineidam
-# Copyright (C) 2015-2022 Tobias Gruetzmacher
-# Copyright (C) 2019-2020 Daniel Ring
+# SPDX-FileCopyrightText: © 2004 Tristan Seligmann and Jonathan Jacobs
+# SPDX-FileCopyrightText: © 2012 Bastian Kleineidam
+# SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher
+# SPDX-FileCopyrightText: © 2019 Daniel Ring
 from re import compile, escape

 from ..scraper import _BasicScraper, _ParserScraper, ParserScraper
@@ -328,19 +328,14 @@ class DreamKeepersPrelude(_ParserScraper):
     help = 'Index format: n'


-class DresdenCodak(_ParserScraper):
+class DresdenCodak(ParserScraper):
     url = 'http://dresdencodak.com/'
-    startUrl = url + 'cat/comic/'
     firstStripUrl = url + '2007/02/08/pom/'
     imageSearch = '//section[d:class("entry-content")]//img[d:class("aligncenter")]'
     prevSearch = '//a[img[contains(@src, "prev")]]'
     latestSearch = '//a[d:class("tc-grid-bg-link")]'
     starter = indirectStarter

-    # Blog and comic are mixed...
-    def shouldSkipUrl(self, url, data):
-        return not data.xpath(self.imageSearch)
-

 class DrFun(_ParserScraper):
     baseUrl = ('https://web.archive.org/web/20180726145737/'
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2019-2022 Tobias Gruetzmacher
-# Copyright (C) 2019-2020 Daniel Ring
+# SPDX-FileCopyrightText: © 2019 Tobias Gruetzmacher
+# SPDX-FileCopyrightText: © 2019 Daniel Ring
 from ..scraper import ParserScraper
 from ..helpers import indirectStarter


@@ -27,7 +27,7 @@ class Derideal(ParserScraper):

     def starter(self):
         indexPage = self.getPage(self.url)
-        self.chapters = indexPage.xpath('//a[contains(text(), "Read this episode")]/@href')
+        self.chapters = self.match(indexPage, '//a[contains(text(), "Read this episode")]/@href')
         self.currentChapter = len(self.chapters)
         return indirectStarter(self)


@@ -113,7 +113,7 @@ class Erfworld(ParserScraper):

     def shouldSkipUrl(self, url, data):
         """Skip pages without images."""
-        return not data.xpath(self.imageSearch)
+        return not self.match(data, self.imageSearch)

     def namer(self, imageUrl, pageUrl):
         # Fix inconsistent filenames

@@ -232,7 +232,7 @@ class ExtraFabulousComics(WordPressScraper):
         return '_'.join((pagepart, imagename))

     def shouldSkipUrl(self, url, data):
-        return data.xpath('//div[@id="comic"]//iframe')
+        return self.match(data, '//div[@id="comic"]//iframe')


 class ExtraLife(_BasicScraper):

@@ -140,7 +140,7 @@ class FoxDad(ParserScraper):

     def namer(self, imageUrl, pageUrl):
         page = self.getPage(pageUrl)
-        post = page.xpath('//li[@class="timestamp"]/a/@href')[0]
+        post = self.match(page, '//li[d:class("timestamp")]/a/@href')[0]
         post = post.replace('https://foxdad.com/post/', '')
         if '-consider-support' in post:
             post = post.split('-consider-support')[0]

@@ -216,7 +216,7 @@ class FriendsYouAreStuckWith(WordPressScraper):

     def namer(self, imageUrl, pageUrl):
         page = self.getPage(pageUrl)
-        strip = page.xpath('//div[@id="comic-wrap"]/@class')[0].replace('comic-id-', '')
+        strip = self.match(page, '//div[@id="comic-wrap"]/@class')[0].replace('comic-id-', '')
         return strip + '_' + imageUrl.rstrip('/').rsplit('/', 1)[-1]


@@ -31,7 +31,7 @@ class GoComics(ParserScraper):

     def shouldSkipUrl(self, url, data):
         """Skip pages without images."""
-        return data.xpath('//img[contains(@src, "content-error-missing")]')
+        return self.match(data, '//img[contains(@src, "content-error-missing")]')

     @classmethod
     def getmodules(cls):  # noqa: CFQ001

@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2019-2022 Tobias Gruetzmacher
-# Copyright (C) 2019-2020 Daniel Ring
+# SPDX-FileCopyrightText: © 2019 Tobias Gruetzmacher
+# SPDX-FileCopyrightText: © 2019 Daniel Ring
 from ..scraper import ParserScraper


@@ -44,7 +44,7 @@ class KemonoCafe(ParserScraper):
         # Fix unordered filenames
         if 'addictivescience' in pageUrl:
             page = self.getPage(pageUrl)
-            num = int(page.xpath('//div[@id="comic-wrap"]/@class')[0].replace('comic-id-', ''))
+            num = int(self.match(page, '//div[@id="comic-wrap"]/@class')[0].replace('comic-id-', ''))
             filename = '%04d_%s' % (num, filename)
         elif 'CaughtInOrbit' in filename:
             filename = filename.replace('CaughtInOrbit', 'CIO')

@@ -38,7 +38,7 @@ class LazJonesAndTheMayfieldRegulatorsSideStories(LazJonesAndTheMayfieldRegulato

     def getPrevUrl(self, url, data):
         # Fix broken navigation links
-        if url == self.url and data.xpath(self.prevSearch + '/@href')[0] == self.stripUrl % 'summer00':
+        if url == self.url and self.match(data, self.prevSearch + '/@href')[0] == self.stripUrl % 'summer00':
             return self.stripUrl % 'summer21'
         return super(LazJonesAndTheMayfieldRegulators, self).getPrevUrl(url, data)

@@ -9,7 +9,6 @@ from re import compile, IGNORECASE
 from ..helpers import indirectStarter
 from ..scraper import ParserScraper, _BasicScraper, _ParserScraper
 from ..util import tagre
-from ..xml import NS
 from .common import ComicControlScraper, WordPressScraper, WordPressWebcomic


@@ -153,7 +152,7 @@ class MonkeyUser(ParserScraper):

     def shouldSkipUrl(self, url, data):
         # videos
-        return data.xpath('//div[d:class("video-container")]', namespaces=NS)
+        return self.match(data, '//div[d:class("video-container")]')


 class MonsieurLeChien(ParserScraper):

@@ -166,7 +166,7 @@ class PHDComics(ParserScraper):
             # video
             self.stripUrl % '1880',
             self.stripUrl % '1669',
-        ) or data.xpath('//img[@id="comic" and contains(@src, "phd083123s")]')
+        ) or self.match(data, '//img[@id="comic" and contains(@src, "phd083123s")]')


 class Picklewhistle(ComicControlScraper):

@@ -1,8 +1,8 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
-# Copyright (C) 2012-2014 Bastian Kleineidam
-# Copyright (C) 2015-2021 Tobias Gruetzmacher
-# Copyright (C) 2019-2020 Daniel Ring
+# SPDX-FileCopyrightText: © 2004 Tristan Seligmann and Jonathan Jacobs
+# SPDX-FileCopyrightText: © 2012 Bastian Kleineidam
+# SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher
+# SPDX-FileCopyrightText: © 2019 Daniel Ring
 from re import compile
 from urllib.parse import urljoin


@@ -121,7 +121,7 @@ class Requiem(WordPressScraper):
     firstStripUrl = stripUrl % '2004-06-07-3'


-class Replay(_ParserScraper):
+class Replay(ParserScraper):
     url = 'http://replaycomic.com/'
     stripUrl = url + 'comic/%s/'
     firstStripUrl = stripUrl % 'red-desert'

@@ -132,11 +132,11 @@ class Replay(_ParserScraper):
     def starter(self):
         # Retrieve archive page to identify chapters
         archivePage = self.getPage(self.url + 'archive')
-        archive = archivePage.xpath('//div[@class="comic-archive-chapter-wrap"]')
+        archive = self.match(archivePage, '//div[d:class("comic-archive-chapter-wrap")]')
         self.chapter = len(archive) - 1
         self.startOfChapter = []
         for archiveChapter in archive:
-            self.startOfChapter.append(archiveChapter.xpath('.//a')[0].get('href'))
+            self.startOfChapter.append(self.match(archiveChapter, './/a')[0].get('href'))
         return bounceStarter(self)

     def namer(self, imageUrl, pageUrl):

@@ -435,7 +435,7 @@ class SpaceFurries(ParserScraper):
     def extract_image_urls(self, url, data):
         # Website requires JS, so build the list of image URLs manually
         imageurls = []
-        current = int(data.xpath('//input[@name="pagnum"]')[0].get('value'))
+        current = int(self.match(data, '//input[@name="pagnum"]')[0].get('value'))
         for page in reversed(range(1, current + 1)):
             imageurls.append(self.url + 'comics/' + str(page) + '.jpg')
         return imageurls

@@ -636,16 +636,16 @@ class StrongFemaleProtagonist(_ParserScraper):
     )


-class StupidFox(_ParserScraper):
+class StupidFox(ParserScraper):
     url = 'http://stupidfox.net/'
     stripUrl = url + '%s'
     firstStripUrl = stripUrl % 'hello'
-    imageSearch = '//div[@class="comicmid"]//img'
+    imageSearch = '//div[d:class("comicmid")]//img'
     prevSearch = '//a[@accesskey="p"]'

     def namer(self, imageUrl, pageUrl):
         page = self.getPage(pageUrl)
-        title = page.xpath(self.imageSearch + '/@title')[0].replace(' - ', '-').replace(' ', '-')
+        title = self.match(page, self.imageSearch + '/@title')[0].replace(' - ', '-').replace(' ', '-')
         return title + '.' + imageUrl.rsplit('.', 1)[-1]

@@ -19,7 +19,7 @@ class AlienDice(WordPressSpliced):

     def shouldSkipUrl(self, url, data):
         """Skip pages without images."""
-        return not data.xpath(self.imageSearch)
+        return not self.match(data, self.imageSearch)

     def getPrevUrl(self, url, data):
         # Fix broken navigation

@@ -3,7 +3,6 @@
 # SPDX-FileCopyrightText: © 2019 Daniel Ring
 from ..output import out
 from ..scraper import ParserScraper
-from ..xml import NS


 class Tapas(ParserScraper):

@@ -21,7 +20,7 @@ class Tapas(ParserScraper):
     def starter(self):
         # Retrieve comic metadata from info page
         info = self.getPage(self.url)
-        series = info.xpath('//@data-series-id')[0]
+        series = self.match(info, '//@data-series-id')[0]
         # Retrieve comic metadata from API
         data = self.session.get(self.baseUrl + 'series/' + series + '/episodes?sort=NEWEST')
         data.raise_for_status()

@@ -43,7 +42,7 @@ class Tapas(ParserScraper):
         return self._cached_image_urls

     def shouldSkipUrl(self, url, data):
-        if data.xpath('//button[d:class("js-have-to-sign")]', namespaces=NS):
+        if self.match(data, '//button[d:class("js-have-to-sign")]'):
             out.warn(f'Nothing to download on "{url}", because a login is required.')
             return True
         return False

@@ -107,7 +107,7 @@ class Unsounded(ParserScraper):
         return urls

     def extract_css_bg(self, page) -> str | None:
-        comicdivs = page.xpath('//div[@id="comic"]')
+        comicdivs = self.match(page, '//div[@id="comic"]')
         if comicdivs:
             style = comicdivs[0].attrib.get('style')
             if style:

@@ -1,8 +1,8 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
-# Copyright (C) 2012-2014 Bastian Kleineidam
-# Copyright (C) 2015-2020 Tobias Gruetzmacher
-# Copyright (C) 2019-2020 Daniel Ring
+# SPDX-FileCopyrightText: © 2004 Tristan Seligmann and Jonathan Jacobs
+# SPDX-FileCopyrightText: © 2012 Bastian Kleineidam
+# SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher
+# SPDX-FileCopyrightText: © 2019 Daniel Ring

 from ..scraper import ParserScraper, _ParserScraper
 from ..helpers import bounceStarter, indirectStarter

@@ -44,15 +44,15 @@ class Vibe(ParserScraper):
     help = 'Index format: VIBEnnn (padded)'


-class VickiFox(_ParserScraper):
+class VickiFox(ParserScraper):
     url = 'http://www.vickifox.com/comic/strip'
     stripUrl = url + '?id=%s'
     firstStripUrl = stripUrl % '001'
     imageSearch = '//img[contains(@src, "comic/")]'
     prevSearch = '//button[@id="btnPrev"]/@value'

-    def getPrevUrl(self, url, data):
-        return self.stripUrl % self.getPage(url).xpath(self.prevSearch)[0]
+    def link_modifier(self, fromurl, tourl):
+        return self.stripUrl % tourl


 class ViiviJaWagner(_ParserScraper):

@@ -1,8 +1,8 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
-# Copyright (C) 2012-2014 Bastian Kleineidam
-# Copyright (C) 2015-2022 Tobias Gruetzmacher
-# Copyright (C) 2019-2020 Daniel Ring
+# SPDX-FileCopyrightText: © 2004 Tristan Seligmann and Jonathan Jacobs
+# SPDX-FileCopyrightText: © 2012 Bastian Kleineidam
+# SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher
+# SPDX-FileCopyrightText: © 2019 Daniel Ring
 from re import compile, escape, IGNORECASE

 from ..scraper import ParserScraper, _BasicScraper, _ParserScraper

@@ -17,7 +17,7 @@ class WapsiSquare(WordPressNaviIn):

     def shouldSkipUrl(self, url, data):
         """Skip pages without images."""
-        return data.xpath('//iframe')  # videos
+        return self.match(data, '//iframe')  # videos


 class WastedTalent(_ParserScraper):
@@ -24,9 +24,9 @@ class WebToons(ParserScraper):
             self.session.cookies.set(cookie, 'false', domain='webtoons.com')
         # Find current episode number
         listPage = self.getPage(self.listUrl)
-        currentEpisode = listPage.xpath('//div[@class="detail_lst"]/ul/li')[0].attrib['data-episode-no']
+        currentEpisode = self.match(listPage, '//div[d:class("detail_lst")]/ul/li')[0].attrib['data-episode-no']
         # Check for completed tag
-        self.endOfLife = (listPage.xpath('//div[@id="_asideDetail"]//span[@class="txt_ico_completed2"]') != [])
+        self.endOfLife = not self.match(listPage, '//div[@id="_asideDetail"]//span[d:class("txt_ico_completed2")]')
         return self.stripUrl % currentEpisode

     def extract_image_urls(self, url, data):
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2019-2022 Tobias Gruetzmacher
-# Copyright (C) 2019-2022 Daniel Ring
+# SPDX-FileCopyrightText: © 2019 Tobias Gruetzmacher
+# SPDX-FileCopyrightText: © 2019 Daniel Ring
 from ..scraper import ParserScraper
 from ..helpers import indirectStarter


@@ -15,21 +15,21 @@ class Wrongside(ParserScraper):

     def starter(self):
         archivePage = self.getPage(self.url)
-        chapterUrls = archivePage.xpath('//ul[@class="albThumbs"]//a/@href')
+        chapterUrls = self.match(archivePage, '//ul[d:class("albThumbs")]//a/@href')
         self.archive = []
         for chapterUrl in chapterUrls:
             chapterPage = self.getPage(chapterUrl)
-            self.archive.append(chapterPage.xpath('(//ul[@id="thumbnails"]//a/@href)[last()]')[0])
+            self.archive.append(self.match(chapterPage, '(//ul[@id="thumbnails"]//a/@href)[last()]')[0])
         return self.archive[0]

     def getPrevUrl(self, url, data):
-        if data.xpath(self.prevSearch) == [] and len(self.archive) > 0:
+        if self.match(data, self.prevSearch) == [] and len(self.archive) > 0:
             return self.archive.pop()
         return super(Wrongside, self).getPrevUrl(url, data)

     def namer(self, imageUrl, pageUrl):
         page = self.getPage(pageUrl)
-        title = page.xpath('//div[@class="browsePath"]/h2/text()')[0]
+        title = self.match(page, '//div[d:class("browsePath")]/h2/text()')[0]
         return title.replace('"', '') + '.' + imageUrl.rsplit('.', 1)[-1]


@@ -71,5 +71,5 @@ class WrongsideSideStories(ParserScraper):

     def namer(self, imageUrl, pageUrl):
         page = self.getPage(pageUrl)
-        title = page.xpath('//div[@class="browsePath"]/h2/text()')[0]
+        title = self.match(page, '//div[d:class("browsePath")]/h2/text()')[0]
         return title.replace('"', '') + '.' + imageUrl.rsplit('.', 1)[-1]
@@ -521,15 +521,10 @@ class ParserScraper(Scraper):
         return text.strip()

     def _matchPattern(self, data, patterns):
-        if self.css:
-            searchFun = data.cssselect
-        else:
-            def searchFun(s):
-                return data.xpath(s, namespaces=NS)
         patterns = makeSequence(patterns)
         for search in patterns:
             matched = False
-            for match in searchFun(search):
+            for match in self.match(data, search):
                 matched = True
                 yield match, search

@@ -537,6 +532,13 @@
                 # do not search other links if one pattern matched
                 break

+    def match(self, data, pattern):
+        """Match a pattern (XPath/CSS) against a page."""
+        if self.css:
+            return data.cssselect(pattern)
+        else:
+            return data.xpath(pattern, namespaces=NS)
+
     def getDisabledReasons(self):
         res = {}
         if self.css and cssselect is None:
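The two scraper.py hunks above are the heart of the commit: the per-call searchFun closure in _matchPattern is replaced by the new match() method, which is also what lets the modules drop their "from ..xml import NS" imports. A stripped-down, runnable sketch of the same dispatch; the DemoScraper class is invented for illustration, while in the real code match() lives on ParserScraper and NS comes from dosagelib/xml.py:

    from lxml import html

    NS = {'d': 'urn:example:dosage-xpath'}  # assumed URI, see the note above

    class DemoScraper:
        css = False  # scrapers that use CSS selectors set this to True

        def match(self, data, pattern):
            """Match a pattern (XPath/CSS) against a page."""
            if self.css:
                return data.cssselect(pattern)  # needs the cssselect package
            return data.xpath(pattern, namespaces=NS)

    page = html.fromstring('<p><a href="/strip/1">next</a></p>')
    print(DemoScraper().match(page, '//p/a/@href'))  # ['/strip/1']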