diff --git a/dosagelib/plugins/a.py b/dosagelib/plugins/a.py
index 2a4ef5b70..51492ce4d 100644
--- a/dosagelib/plugins/a.py
+++ b/dosagelib/plugins/a.py
@@ -228,7 +228,7 @@ class Amya(WordPressScraper):
     url = 'http://www.amyachronicles.com/'
 
 
-class Angband(_ParserScraper):
+class Angband(ParserScraper):
     url = 'http://angband.calamarain.net/'
     stripUrl = url + '%s'
     imageSearch = '//img'
@@ -237,7 +237,7 @@ class Angband(_ParserScraper):
 
     def starter(self):
         page = self.getPage(self.url)
-        self.pages = page.xpath('//p/a[not(contains(@href, "cast"))]/@href')
+        self.pages = self.match(page, '//p/a[not(contains(@href, "cast"))]/@href')
         self.firstStripUrl = self.pages[0]
         return self.pages[-1]
 
@@ -267,7 +267,7 @@ class Annyseed(_ParserScraper):
         return tourl
 
 
-class AntiheroForHire(_ParserScraper):
+class AntiheroForHire(ParserScraper):
     stripUrl = 'https://www.giantrobot.club/antihero-for-hire/%s'
     firstStripUrl = stripUrl % '2016/6/8/entrance-vigil'
     url = firstStripUrl
@@ -278,7 +278,7 @@ class AntiheroForHire(_ParserScraper):
     def starter(self):
         # Build list of chapters for navigation
         page = self.getPage(self.url)
-        self.chapters = page.xpath('//ul[@class="archive-group-list"]//a[contains(@class, "archive-item-link")]/@href')
+        self.chapters = self.match(page, '//ul[d:class("archive-group-list")]//a[d:class("archive-item-link")]/@href')
         return self.chapters[0]
 
     def getPrevUrl(self, url, data):
@@ -314,7 +314,7 @@ class ArtificialIncident(WordPressWebcomic):
     firstStripUrl = stripUrl % 'issue-one-life-changing'
 
 
-class AstronomyPOTD(_ParserScraper):
+class AstronomyPOTD(ParserScraper):
     baseUrl = 'http://apod.nasa.gov/apod/'
     url = baseUrl + 'astropix.html'
     starter = bounceStarter
@@ -328,7 +328,7 @@ class AstronomyPOTD(_ParserScraper):
 
     def shouldSkipUrl(self, url, data):
         """Skip pages without images."""
-        return data.xpath('//iframe')  # videos
+        return self.match(data, '//iframe')  # videos
 
     def namer(self, image_url, page_url):
         return '%s-%s' % (page_url.split('/')[-1].split('.')[0][2:],
diff --git a/dosagelib/plugins/c.py b/dosagelib/plugins/c.py
index 27f7278d2..78b17399a 100644
--- a/dosagelib/plugins/c.py
+++ b/dosagelib/plugins/c.py
@@ -34,11 +34,11 @@ class CaptainSNES(_BasicScraper):
     help = 'Index format: yyyy/mm/dd/nnn-stripname'
 
 
-class CarryOn(_ParserScraper):
+class CarryOn(ParserScraper):
     url = 'http://www.hirezfox.com/km/co/'
     stripUrl = url + 'd/%s.html'
     firstStripUrl = stripUrl % '20040701'
-    imageSearch = '//div[@class="strip"]/img'
+    imageSearch = '//div[d:class("strip")]/img'
     prevSearch = '//a[text()="Previous Day"]'
     multipleImagesPerStrip = True
 
@@ -122,13 +122,13 @@ class CatAndGirl(_ParserScraper):
     prevSearch = '//a[d:class("pager--prev")]'
 
 
-class CatenaManor(_ParserScraper):
+class CatenaManor(ParserScraper):
     baseUrl = ('https://web.archive.org/web/20141027141116/'
                'http://catenamanor.com/')
     url = baseUrl + 'archives'
     stripUrl = baseUrl + '%s/'
     firstStripUrl = stripUrl % '2003/07'
-    imageSearch = '//img[@class="comicthumbnail"]'
+    imageSearch = '//img[d:class("comicthumbnail")]'
     multipleImagesPerStrip = True
     endOfLife = True
     strips: List[str] = []
@@ -136,7 +136,7 @@ class CatenaManor(_ParserScraper):
     def starter(self):
         # Retrieve archive links and select valid range
         archivePage = self.getPage(self.url)
-        archiveStrips = archivePage.xpath('//div[@id="archivepage"]//a')
+        archiveStrips = self.match(archivePage, '//div[@id="archivepage"]//a')
         valid = False
         for link in archiveStrips:
             if self.stripUrl % '2012/01' in link.get('href'):
diff --git a/dosagelib/plugins/comicfury.py b/dosagelib/plugins/comicfury.py
index 0a7a9c108..f5962db33 100644
--- a/dosagelib/plugins/comicfury.py
+++ b/dosagelib/plugins/comicfury.py
@@ -1,8 +1,8 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
-# Copyright (C) 2012-2014 Bastian Kleineidam
-# Copyright (C) 2015-2022 Tobias Gruetzmacher
-# Copyright (C) 2019-2020 Daniel Ring
+# SPDX-FileCopyrightText: © 2004 Tristan Seligmann and Jonathan Jacobs
+# SPDX-FileCopyrightText: © 2012 Bastian Kleineidam
+# SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher
+# SPDX-FileCopyrightText: © 2019 Daniel Ring
 import os
 
 from ..scraper import ParserScraper
@@ -79,7 +79,7 @@ class ComicFury(ParserScraper):
         num = parts[-1]
         if self.multipleImagesPerStrip:
             page = self.getPage(pageUrl)
-            images = page.xpath('//img[@class="comicsegmentimage"]/@src')
+            images = self.match(page, '//img[d:class("comicsegmentimage")]/@src')
             if len(images) > 1:
                 imageIndex = images.index(imageUrl) + 1
                 return "%s_%s-%d%s" % (self.prefix, num, imageIndex, ext)
@@ -88,8 +88,8 @@
     def shouldSkipUrl(self, url, data):
         """Skip pages without images."""
         # Videos on Underverse
-        return (data.xpath('//div[@id="comicimagewrap"]//video') and
-                not data.xpath('//div[@id="comicimagewrap"]//img'))
+        return (self.match(data, '//div[@id="comicimagewrap"]//video') and
+                not self.match(data, '//div[@id="comicimagewrap"]//img'))
 
     @classmethod
     def getmodules(cls):  # noqa: CFQ001
diff --git a/dosagelib/plugins/d.py b/dosagelib/plugins/d.py
index f7a2e1933..3bfa0cf7c 100644
--- a/dosagelib/plugins/d.py
+++ b/dosagelib/plugins/d.py
@@ -1,8 +1,8 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
-# Copyright (C) 2012-2014 Bastian Kleineidam
-# Copyright (C) 2015-2022 Tobias Gruetzmacher
-# Copyright (C) 2019-2020 Daniel Ring
+# SPDX-FileCopyrightText: © 2004 Tristan Seligmann and Jonathan Jacobs
+# SPDX-FileCopyrightText: © 2012 Bastian Kleineidam
+# SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher
+# SPDX-FileCopyrightText: © 2019 Daniel Ring
 from re import compile, escape
 
 from ..scraper import _BasicScraper, _ParserScraper, ParserScraper
@@ -328,19 +328,14 @@ class DreamKeepersPrelude(_ParserScraper):
     help = 'Index format: n'
 
 
-class DresdenCodak(_ParserScraper):
+class DresdenCodak(ParserScraper):
     url = 'http://dresdencodak.com/'
-    startUrl = url + 'cat/comic/'
     firstStripUrl = url + '2007/02/08/pom/'
     imageSearch = '//section[d:class("entry-content")]//img[d:class("aligncenter")]'
     prevSearch = '//a[img[contains(@src, "prev")]]'
     latestSearch = '//a[d:class("tc-grid-bg-link")]'
     starter = indirectStarter
 
-    # Blog and comic are mixed...
-    def shouldSkipUrl(self, url, data):
-        return not data.xpath(self.imageSearch)
-
 
 class DrFun(_ParserScraper):
     baseUrl = ('https://web.archive.org/web/20180726145737/'
diff --git a/dosagelib/plugins/derideal.py b/dosagelib/plugins/derideal.py
index 7b8d2e298..ca75a2e73 100644
--- a/dosagelib/plugins/derideal.py
+++ b/dosagelib/plugins/derideal.py
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2019-2022 Tobias Gruetzmacher
-# Copyright (C) 2019-2020 Daniel Ring
+# SPDX-FileCopyrightText: © 2019 Tobias Gruetzmacher
+# SPDX-FileCopyrightText: © 2019 Daniel Ring
 
 from ..scraper import ParserScraper
 from ..helpers import indirectStarter
@@ -27,7 +27,7 @@ class Derideal(ParserScraper):
 
     def starter(self):
         indexPage = self.getPage(self.url)
-        self.chapters = indexPage.xpath('//a[contains(text(), "Read this episode")]/@href')
+        self.chapters = self.match(indexPage, '//a[contains(text(), "Read this episode")]/@href')
         self.currentChapter = len(self.chapters)
         return indirectStarter(self)
 
diff --git a/dosagelib/plugins/e.py b/dosagelib/plugins/e.py
index 5329256d6..857776ec0 100644
--- a/dosagelib/plugins/e.py
+++ b/dosagelib/plugins/e.py
@@ -113,7 +113,7 @@ class Erfworld(ParserScraper):
 
     def shouldSkipUrl(self, url, data):
         """Skip pages without images."""
-        return not data.xpath(self.imageSearch)
+        return not self.match(data, self.imageSearch)
 
     def namer(self, imageUrl, pageUrl):
         # Fix inconsistent filenames
@@ -232,7 +232,7 @@ class ExtraFabulousComics(WordPressScraper):
         return '_'.join((pagepart, imagename))
 
     def shouldSkipUrl(self, url, data):
-        return data.xpath('//div[@id="comic"]//iframe')
+        return self.match(data, '//div[@id="comic"]//iframe')
 
 
 class ExtraLife(_BasicScraper):
diff --git a/dosagelib/plugins/f.py b/dosagelib/plugins/f.py
index 01c43da33..360b6ba39 100644
--- a/dosagelib/plugins/f.py
+++ b/dosagelib/plugins/f.py
@@ -140,7 +140,7 @@ class FoxDad(ParserScraper):
 
     def namer(self, imageUrl, pageUrl):
         page = self.getPage(pageUrl)
-        post = page.xpath('//li[@class="timestamp"]/a/@href')[0]
+        post = self.match(page, '//li[d:class("timestamp")]/a/@href')[0]
         post = post.replace('https://foxdad.com/post/', '')
         if '-consider-support' in post:
             post = post.split('-consider-support')[0]
@@ -216,7 +216,7 @@ class FriendsYouAreStuckWith(WordPressScraper):
 
     def namer(self, imageUrl, pageUrl):
         page = self.getPage(pageUrl)
-        strip = page.xpath('//div[@id="comic-wrap"]/@class')[0].replace('comic-id-', '')
+        strip = self.match(page, '//div[@id="comic-wrap"]/@class')[0].replace('comic-id-', '')
         return strip + '_' + imageUrl.rstrip('/').rsplit('/', 1)[-1]
 
 
diff --git a/dosagelib/plugins/gocomics.py b/dosagelib/plugins/gocomics.py
index 1faee4bdd..140c112b9 100644
--- a/dosagelib/plugins/gocomics.py
+++ b/dosagelib/plugins/gocomics.py
@@ -31,7 +31,7 @@ class GoComics(ParserScraper):
 
     def shouldSkipUrl(self, url, data):
         """Skip pages without images."""
-        return data.xpath('//img[contains(@src, "content-error-missing")]')
+        return self.match(data, '//img[contains(@src, "content-error-missing")]')
 
     @classmethod
     def getmodules(cls):  # noqa: CFQ001
diff --git a/dosagelib/plugins/kemonocafe.py b/dosagelib/plugins/kemonocafe.py
index 788ab1eaf..22692d85e 100644
--- a/dosagelib/plugins/kemonocafe.py
+++ b/dosagelib/plugins/kemonocafe.py
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2019-2022 Tobias Gruetzmacher
-# Copyright (C) 2019-2020 Daniel Ring
+# SPDX-FileCopyrightText: © 2019 Tobias Gruetzmacher
+# SPDX-FileCopyrightText: © 2019 Daniel Ring
 
 from ..scraper import ParserScraper
 
@@ -44,7 +44,7 @@ class KemonoCafe(ParserScraper):
         # Fix unordered filenames
         if 'addictivescience' in pageUrl:
             page = self.getPage(pageUrl)
-            num = int(page.xpath('//div[@id="comic-wrap"]/@class')[0].replace('comic-id-', ''))
+            num = int(self.match(page, '//div[@id="comic-wrap"]/@class')[0].replace('comic-id-', ''))
             filename = '%04d_%s' % (num, filename)
         elif 'CaughtInOrbit' in filename:
             filename = filename.replace('CaughtInOrbit', 'CIO')
diff --git a/dosagelib/plugins/l.py b/dosagelib/plugins/l.py
index e04f168f0..f2a536c19 100644
--- a/dosagelib/plugins/l.py
+++ b/dosagelib/plugins/l.py
@@ -38,7 +38,7 @@ class LazJonesAndTheMayfieldRegulatorsSideStories(LazJonesAndTheMayfieldRegulato
 
     def getPrevUrl(self, url, data):
         # Fix broken navigation links
-        if url == self.url and data.xpath(self.prevSearch + '/@href')[0] == self.stripUrl % 'summer00':
+        if url == self.url and self.match(data, self.prevSearch + '/@href')[0] == self.stripUrl % 'summer00':
             return self.stripUrl % 'summer21'
         return super(LazJonesAndTheMayfieldRegulators, self).getPrevUrl(url, data)
 
diff --git a/dosagelib/plugins/m.py b/dosagelib/plugins/m.py
index d69531e36..b54370f1b 100644
--- a/dosagelib/plugins/m.py
+++ b/dosagelib/plugins/m.py
@@ -9,7 +9,6 @@ from re import compile, IGNORECASE
 from ..helpers import indirectStarter
 from ..scraper import ParserScraper, _BasicScraper, _ParserScraper
 from ..util import tagre
-from ..xml import NS
 from .common import ComicControlScraper, WordPressScraper, WordPressWebcomic
 
 
@@ -153,7 +152,7 @@ class MonkeyUser(ParserScraper):
 
     def shouldSkipUrl(self, url, data):
         # videos
-        return data.xpath('//div[d:class("video-container")]', namespaces=NS)
+        return self.match(data, '//div[d:class("video-container")]')
 
 
 class MonsieurLeChien(ParserScraper):
diff --git a/dosagelib/plugins/p.py b/dosagelib/plugins/p.py
index cc5319aa4..0a2cf0037 100644
--- a/dosagelib/plugins/p.py
+++ b/dosagelib/plugins/p.py
@@ -166,7 +166,7 @@ class PHDComics(ParserScraper):
             # video
             self.stripUrl % '1880',
             self.stripUrl % '1669',
-        ) or data.xpath('//img[@id="comic" and contains(@src, "phd083123s")]')
+        ) or self.match(data, '//img[@id="comic" and contains(@src, "phd083123s")]')
 
 
 class Picklewhistle(ComicControlScraper):
diff --git a/dosagelib/plugins/r.py b/dosagelib/plugins/r.py
index 5a10455cc..b20714d3a 100644
--- a/dosagelib/plugins/r.py
+++ b/dosagelib/plugins/r.py
@@ -1,8 +1,8 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
-# Copyright (C) 2012-2014 Bastian Kleineidam
-# Copyright (C) 2015-2021 Tobias Gruetzmacher
-# Copyright (C) 2019-2020 Daniel Ring
+# SPDX-FileCopyrightText: © 2004 Tristan Seligmann and Jonathan Jacobs
+# SPDX-FileCopyrightText: © 2012 Bastian Kleineidam
+# SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher
+# SPDX-FileCopyrightText: © 2019 Daniel Ring
 from re import compile
 from urllib.parse import urljoin
 
@@ -121,7 +121,7 @@ class Requiem(WordPressScraper):
     firstStripUrl = stripUrl % '2004-06-07-3'
 
 
-class Replay(_ParserScraper):
+class Replay(ParserScraper):
     url = 'http://replaycomic.com/'
     stripUrl = url + 'comic/%s/'
     firstStripUrl = stripUrl % 'red-desert'
@@ -132,11 +132,11 @@ class Replay(_ParserScraper):
     def starter(self):
         # Retrieve archive page to identify chapters
         archivePage = self.getPage(self.url + 'archive')
-        archive = archivePage.xpath('//div[@class="comic-archive-chapter-wrap"]')
+        archive = self.match(archivePage, '//div[d:class("comic-archive-chapter-wrap")]')
         self.chapter = len(archive) - 1
         self.startOfChapter = []
         for archiveChapter in archive:
-            self.startOfChapter.append(archiveChapter.xpath('.//a')[0].get('href'))
+            self.startOfChapter.append(self.match(archiveChapter, './/a')[0].get('href'))
         return bounceStarter(self)
 
     def namer(self, imageUrl, pageUrl):
diff --git a/dosagelib/plugins/s.py b/dosagelib/plugins/s.py
index fb115b943..d14cbb546 100644
--- a/dosagelib/plugins/s.py
+++ b/dosagelib/plugins/s.py
@@ -435,7 +435,7 @@ class SpaceFurries(ParserScraper):
     def extract_image_urls(self, url, data):
         # Website requires JS, so build the list of image URLs manually
         imageurls = []
-        current = int(data.xpath('//input[@name="pagnum"]')[0].get('value'))
+        current = int(self.match(data, '//input[@name="pagnum"]')[0].get('value'))
         for page in reversed(range(1, current + 1)):
             imageurls.append(self.url + 'comics/' + str(page) + '.jpg')
         return imageurls
@@ -636,16 +636,16 @@ class StrongFemaleProtagonist(_ParserScraper):
         )
 
 
-class StupidFox(_ParserScraper):
+class StupidFox(ParserScraper):
     url = 'http://stupidfox.net/'
     stripUrl = url + '%s'
     firstStripUrl = stripUrl % 'hello'
-    imageSearch = '//div[@class="comicmid"]//img'
+    imageSearch = '//div[d:class("comicmid")]//img'
    prevSearch = '//a[@accesskey="p"]'
 
     def namer(self, imageUrl, pageUrl):
         page = self.getPage(pageUrl)
-        title = page.xpath(self.imageSearch + '/@title')[0].replace(' - ', '-').replace(' ', '-')
+        title = self.match(page, self.imageSearch + '/@title')[0].replace(' - ', '-').replace(' ', '-')
         return title + '.' + imageUrl.rsplit('.', 1)[-1]
 
 
diff --git a/dosagelib/plugins/shivaestudios.py b/dosagelib/plugins/shivaestudios.py
index 2f508cabe..6bedc28a7 100644
--- a/dosagelib/plugins/shivaestudios.py
+++ b/dosagelib/plugins/shivaestudios.py
@@ -19,7 +19,7 @@ class AlienDice(WordPressSpliced):
 
     def shouldSkipUrl(self, url, data):
         """Skip pages without images."""
-        return not data.xpath(self.imageSearch)
+        return not self.match(data, self.imageSearch)
 
     def getPrevUrl(self, url, data):
         # Fix broken navigation
diff --git a/dosagelib/plugins/tapas.py b/dosagelib/plugins/tapas.py
index f3c6088fb..68b1ee9ac 100644
--- a/dosagelib/plugins/tapas.py
+++ b/dosagelib/plugins/tapas.py
@@ -3,7 +3,6 @@
 # SPDX-FileCopyrightText: © 2019 Daniel Ring
 from ..output import out
 from ..scraper import ParserScraper
-from ..xml import NS
 
 
 class Tapas(ParserScraper):
@@ -21,7 +20,7 @@ class Tapas(ParserScraper):
     def starter(self):
         # Retrieve comic metadata from info page
         info = self.getPage(self.url)
-        series = info.xpath('//@data-series-id')[0]
+        series = self.match(info, '//@data-series-id')[0]
         # Retrieve comic metadata from API
         data = self.session.get(self.baseUrl + 'series/' + series + '/episodes?sort=NEWEST')
         data.raise_for_status()
@@ -43,7 +42,7 @@ class Tapas(ParserScraper):
         return self._cached_image_urls
 
     def shouldSkipUrl(self, url, data):
-        if data.xpath('//button[d:class("js-have-to-sign")]', namespaces=NS):
+        if self.match(data, '//button[d:class("js-have-to-sign")]'):
             out.warn(f'Nothing to download on "{url}", because a login is required.')
             return True
         return False
diff --git a/dosagelib/plugins/u.py b/dosagelib/plugins/u.py
index 8254a1dbd..e9e2300a0 100644
--- a/dosagelib/plugins/u.py
+++ b/dosagelib/plugins/u.py
@@ -107,7 +107,7 @@ class Unsounded(ParserScraper):
         return urls
 
     def extract_css_bg(self, page) -> str | None:
-        comicdivs = page.xpath('//div[@id="comic"]')
+        comicdivs = self.match(page, '//div[@id="comic"]')
         if comicdivs:
             style = comicdivs[0].attrib.get('style')
             if style:
diff --git a/dosagelib/plugins/v.py b/dosagelib/plugins/v.py
index 33e26b317..04b6a2a02 100644
--- a/dosagelib/plugins/v.py
+++ b/dosagelib/plugins/v.py
@@ -1,8 +1,8 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
-# Copyright (C) 2012-2014 Bastian Kleineidam
-# Copyright (C) 2015-2020 Tobias Gruetzmacher
-# Copyright (C) 2019-2020 Daniel Ring
+# SPDX-FileCopyrightText: © 2004 Tristan Seligmann and Jonathan Jacobs
+# SPDX-FileCopyrightText: © 2012 Bastian Kleineidam
+# SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher
+# SPDX-FileCopyrightText: © 2019 Daniel Ring
 
 from ..scraper import ParserScraper, _ParserScraper
 from ..helpers import bounceStarter, indirectStarter
@@ -44,15 +44,15 @@ class Vibe(ParserScraper):
     help = 'Index format: VIBEnnn (padded)'
 
 
-class VickiFox(_ParserScraper):
+class VickiFox(ParserScraper):
     url = 'http://www.vickifox.com/comic/strip'
     stripUrl = url + '?id=%s'
     firstStripUrl = stripUrl % '001'
     imageSearch = '//img[contains(@src, "comic/")]'
     prevSearch = '//button[@id="btnPrev"]/@value'
 
-    def getPrevUrl(self, url, data):
-        return self.stripUrl % self.getPage(url).xpath(self.prevSearch)[0]
+    def link_modifier(self, fromurl, tourl):
+        return self.stripUrl % tourl
 
 
 class ViiviJaWagner(_ParserScraper):
diff --git a/dosagelib/plugins/w.py b/dosagelib/plugins/w.py
index 0af93415b..11543ce0d 100644
--- a/dosagelib/plugins/w.py
+++ b/dosagelib/plugins/w.py
@@ -1,8 +1,8 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
-# Copyright (C) 2012-2014 Bastian Kleineidam
-# Copyright (C) 2015-2022 Tobias Gruetzmacher
-# Copyright (C) 2019-2020 Daniel Ring
+# SPDX-FileCopyrightText: © 2004 Tristan Seligmann and Jonathan Jacobs
+# SPDX-FileCopyrightText: © 2012 Bastian Kleineidam
+# SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher
+# SPDX-FileCopyrightText: © 2019 Daniel Ring
 from re import compile, escape, IGNORECASE
 
 from ..scraper import ParserScraper, _BasicScraper, _ParserScraper
@@ -17,7 +17,7 @@ class WapsiSquare(WordPressNaviIn):
 
     def shouldSkipUrl(self, url, data):
         """Skip pages without images."""
-        return data.xpath('//iframe')  # videos
+        return self.match(data, '//iframe')  # videos
 
 
 class WastedTalent(_ParserScraper):
diff --git a/dosagelib/plugins/webtoons.py b/dosagelib/plugins/webtoons.py
index 393f9d809..46fa5b9e3 100644
--- a/dosagelib/plugins/webtoons.py
+++ b/dosagelib/plugins/webtoons.py
@@ -24,9 +24,9 @@ class WebToons(ParserScraper):
             self.session.cookies.set(cookie, 'false', domain='webtoons.com')
         # Find current episode number
         listPage = self.getPage(self.listUrl)
-        currentEpisode = listPage.xpath('//div[@class="detail_lst"]/ul/li')[0].attrib['data-episode-no']
+        currentEpisode = self.match(listPage, '//div[d:class("detail_lst")]/ul/li')[0].attrib['data-episode-no']
         # Check for completed tag
-        self.endOfLife = (listPage.xpath('//div[@id="_asideDetail"]//span[@class="txt_ico_completed2"]') != [])
+        self.endOfLife = (self.match(listPage, '//div[@id="_asideDetail"]//span[d:class("txt_ico_completed2")]') != [])
         return self.stripUrl % currentEpisode
 
     def extract_image_urls(self, url, data):
diff --git a/dosagelib/plugins/wrongside.py b/dosagelib/plugins/wrongside.py
index 78bc4a080..ce75d38bf 100644
--- a/dosagelib/plugins/wrongside.py
+++ b/dosagelib/plugins/wrongside.py
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: MIT
-# Copyright (C) 2019-2022 Tobias Gruetzmacher
-# Copyright (C) 2019-2022 Daniel Ring
+# SPDX-FileCopyrightText: © 2019 Tobias Gruetzmacher
+# SPDX-FileCopyrightText: © 2019 Daniel Ring
 
 from ..scraper import ParserScraper
 from ..helpers import indirectStarter
@@ -15,21 +15,21 @@ class Wrongside(ParserScraper):
 
     def starter(self):
         archivePage = self.getPage(self.url)
-        chapterUrls = archivePage.xpath('//ul[@class="albThumbs"]//a/@href')
+        chapterUrls = self.match(archivePage, '//ul[d:class("albThumbs")]//a/@href')
         self.archive = []
         for chapterUrl in chapterUrls:
             chapterPage = self.getPage(chapterUrl)
-            self.archive.append(chapterPage.xpath('(//ul[@id="thumbnails"]//a/@href)[last()]')[0])
+            self.archive.append(self.match(chapterPage, '(//ul[@id="thumbnails"]//a/@href)[last()]')[0])
         return self.archive[0]
 
     def getPrevUrl(self, url, data):
-        if data.xpath(self.prevSearch) == [] and len(self.archive) > 0:
+        if self.match(data, self.prevSearch) == [] and len(self.archive) > 0:
             return self.archive.pop()
         return super(Wrongside, self).getPrevUrl(url, data)
 
     def namer(self, imageUrl, pageUrl):
         page = self.getPage(pageUrl)
-        title = page.xpath('//div[@class="browsePath"]/h2/text()')[0]
+        title = self.match(page, '//div[d:class("browsePath")]/h2/text()')[0]
         return title.replace('"', '') + '.' + imageUrl.rsplit('.', 1)[-1]
 
 
@@ -71,5 +71,5 @@ class WrongsideSideStories(ParserScraper):
 
     def namer(self, imageUrl, pageUrl):
         page = self.getPage(pageUrl)
-        title = page.xpath('//div[@class="browsePath"]/h2/text()')[0]
+        title = self.match(page, '//div[d:class("browsePath")]/h2/text()')[0]
         return title.replace('"', '') + '.' + imageUrl.rsplit('.', 1)[-1]
diff --git a/dosagelib/scraper.py b/dosagelib/scraper.py
index e9928c391..b0f436744 100644
--- a/dosagelib/scraper.py
+++ b/dosagelib/scraper.py
@@ -521,15 +521,10 @@ class ParserScraper(Scraper):
         return text.strip()
 
     def _matchPattern(self, data, patterns):
-        if self.css:
-            searchFun = data.cssselect
-        else:
-            def searchFun(s):
-                return data.xpath(s, namespaces=NS)
         patterns = makeSequence(patterns)
         for search in patterns:
             matched = False
-            for match in searchFun(search):
+            for match in self.match(data, search):
                 matched = True
                 yield match, search
 
@@ -537,6 +532,13 @@
                 # do not search other links if one pattern matched
                 break
 
+    def match(self, data, pattern):
+        """Match a pattern (XPath/CSS) against a page."""
+        if self.css:
+            return data.cssselect(pattern)
+        else:
+            return data.xpath(pattern, namespaces=NS)
+
     def getDisabledReasons(self):
         res = {}
         if self.css and cssselect is None:
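Not part of the patch: a minimal sketch of how a plugin calls the new ParserScraper.match() helper added in dosagelib/scraper.py above. ExampleComic and its URLs are hypothetical; the match() signature, the d:class() shorthand, and the shouldSkipUrl() hook are taken from the hunks in this diff.

# Illustrative sketch only -- not part of the patch. ExampleComic is hypothetical.
from dosagelib.scraper import ParserScraper


class ExampleComic(ParserScraper):
    url = 'http://example.com/comic/'  # hypothetical site
    stripUrl = url + '%s'
    firstStripUrl = stripUrl % 'first'
    imageSearch = '//div[d:class("comic")]//img'
    prevSearch = '//a[d:class("prev")]'

    def shouldSkipUrl(self, url, data):
        # match() replaces the direct data.xpath()/data.cssselect() calls seen in
        # the old code: it dispatches on self.css and passes the dosage XPath
        # namespaces (NS) for you, so plugins no longer import NS themselves.
        return not self.match(data, self.imageSearch)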