diff --git a/dosagelib/helpers.py b/dosagelib/helpers.py index 7d587cf17..d53e04cfb 100644 --- a/dosagelib/helpers.py +++ b/dosagelib/helpers.py @@ -62,9 +62,3 @@ def indirectStarter(self): data = self.getPage(url) newurl = self.fetchUrl(url, data, self.latestSearch) return self.link_modifier(url, newurl) - - -def xpath_class(name): - """Returns an XPath expressions which finds a tag which has a specified - class.""" - return 'contains(concat(" ", @class, " "), " %s ")' % name diff --git a/dosagelib/plugins/b.py b/dosagelib/plugins/b.py index 89cd80aa8..e71e4c08c 100644 --- a/dosagelib/plugins/b.py +++ b/dosagelib/plugins/b.py @@ -7,7 +7,7 @@ from re import compile, escape from ..util import tagre from ..scraper import _BasicScraper, _ParserScraper -from ..helpers import indirectStarter, xpath_class +from ..helpers import indirectStarter from .common import _ComicControlScraper, _WordPressScraper, _WPNavi, _WPNaviIn, _WPWebcomic @@ -71,7 +71,7 @@ class Baroquen(_BasicScraper): class Bearmageddon(_WordPressScraper): url = 'http://bearmageddon.com/bearmo/page-1/' firstStripUrl = url - latestSearch = '//a[%s]' % xpath_class('comic-nav-last') + latestSearch = '//a[d:class("comic-nav-last")]' starter = indirectStarter @@ -187,8 +187,8 @@ class BlankIt(_ParserScraper): url = 'http://blankitcomics.com/' firstStripUrl = url + 'comic/well-what-would-you-do' imageSearch = '//div[@id="comic"]//img' - prevSearch = '//a[%s]' % xpath_class('comic-nav-previous') - latestSearch = '//a[%s]' % xpath_class('comic-nav-last') + prevSearch = '//a[d:class("comic-nav-previous")]' + latestSearch = '//a[d:class("comic-nav-last")]' starter = indirectStarter @@ -235,7 +235,7 @@ class BMovieComic(_BasicScraper): class BobWhite(_ParserScraper): url = 'http://www.bobwhitecomics.com/' - imageSearch = '//span[%s]/img' % xpath_class('webcomic-object') + imageSearch = '//span[d:class("webcomic-object")]/img' prevSearch = '//a[@rel="previous"]' @@ -296,7 +296,6 @@ class ButImACatPerson(_WordPressScraper): endOfLife = True - class ButtercupFestival(_ParserScraper): url = 'http://www.buttercupfestival.com/' stripUrl = url + '%s.htm' diff --git a/dosagelib/plugins/clonemanga.py b/dosagelib/plugins/clonemanga.py index 5cc342d3f..30a4482a6 100644 --- a/dosagelib/plugins/clonemanga.py +++ b/dosagelib/plugins/clonemanga.py @@ -1,15 +1,15 @@ # SPDX-License-Identifier: MIT # Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2012-2014 Bastian Kleineidam -# Copyright (C) 2015-2019 Tobias Gruetzmacher -from ..helpers import indirectStarter, xpath_class +# Copyright (C) 2015-2020 Tobias Gruetzmacher +from ..helpers import indirectStarter from ..scraper import _ParserScraper from ..util import getQueryParams class CloneManga(_ParserScraper): baseUrl = 'http://manga.clone-army.org' - imageSearch = '//div[%s]//img' % xpath_class('subsectionContainer') + imageSearch = '//div[d:class("subsectionContainer")]//img' prevSearch = '//a[span[text()="<<"]]' latestSearch = '//a[span[text()=">|"]]' starter = indirectStarter diff --git a/dosagelib/plugins/comicfury.py b/dosagelib/plugins/comicfury.py index 32afe3827..f7134a36f 100644 --- a/dosagelib/plugins/comicfury.py +++ b/dosagelib/plugins/comicfury.py @@ -6,10 +6,10 @@ import os from ..scraper import _ParserScraper -from ..helpers import bounceStarter, xpath_class +from ..helpers import bounceStarter -XPATH_LINK = '//a[%s and contains(text(), "%s")]' -XPATH_IMG = '//div[{}]//a[img[contains(@alt, "%s")]]'.format(xpath_class('comicnav')) +XPATH_LINK = '//a[d:class("%s") and contains(text(), "%s")]' +XPATH_IMG = '//div[d:class("comicnav")]//a[img[contains(@alt, "%s")]]' class ComicFury(_ParserScraper): @@ -21,12 +21,12 @@ class ComicFury(_ParserScraper): # 137 (needs to be before the generic a@rel, because layout is wrong) '//a[contains(@title, "previous")]', '//a[@rel="prev"]', - XPATH_LINK % (xpath_class("comicnavlink"), "Previous"), + XPATH_LINK % ('comicnavlink', 'Previous'), XPATH_IMG % ('Previous'), # Art, ConsolersDLC, etc. u'//nav//a[contains(text(), "\u2039")]', # LatchkeyKingdom - '//a[%s and img[contains(@src, "Previous")]]' % xpath_class('navi'), + '//a[d:class("navi") and img[contains(@src, "Previous")]]', # RedSpot '//a[contains(text(), "Back")]', # KATRAN @@ -37,12 +37,12 @@ class ComicFury(_ParserScraper): # 137 (see above) '//a[contains(@title, "next")]', '//a[@rel="next"]', - XPATH_LINK % (xpath_class("comicnavlink"), "Next"), + XPATH_LINK % ('comicnavlink', 'Next'), XPATH_IMG % ('Next'), # Art, ConsolersDLC, etc. u'//nav//a[contains(text(), "\u203A")]', # LatchkeyKingdom - '//a[%s and img[contains(@src, "Next")]]' % xpath_class('navi'), + '//a[d:class("navi") and img[contains(@src, "Next")]]', # RedSpot, KATRAN '//a[contains(text(), "Next")]', ) diff --git a/dosagelib/plugins/common.py b/dosagelib/plugins/common.py index 37662eb63..f56063851 100644 --- a/dosagelib/plugins/common.py +++ b/dosagelib/plugins/common.py @@ -4,7 +4,6 @@ # Copyright (C) 2015-2020 Tobias Gruetzmacher # Copyright (C) 2019-2020 Daniel Ring from ..scraper import _ParserScraper -from ..helpers import indirectStarter, xpath_class # Common base classes for comics with the same structure (same hosting # software, for example) go here. Since those are shared by many modules, @@ -14,24 +13,24 @@ from ..helpers import indirectStarter, xpath_class class _WordPressScraper(_ParserScraper): imageSearch = '//div[@id="comic"]//img' - prevSearch = '//a[%s]' % xpath_class('comic-nav-previous') - nextSearch = '//a[%s]' % xpath_class('comic-nav-next') - latestSearch = '//a[%s]' % xpath_class('comic-nav-last') + prevSearch = '//a[d:class("comic-nav-previous")]' + nextSearch = '//a[d:class("comic-nav-next")]' + latestSearch = '//a[d:class("comic-nav-last")]' class _WPNavi(_WordPressScraper): - prevSearch = '//a[%s]' % xpath_class('navi-prev') + prevSearch = '//a[d:class("navi-prev")]' class _WPNaviIn(_WordPressScraper): - prevSearch = '//a[%s]' % xpath_class('navi-prev-in') + prevSearch = '//a[d:class("navi-prev-in")]' class _WPWebcomic(_WordPressScraper): - imageSearch = '//div[{}]//img'.format(xpath_class('webcomic-image')) - prevSearch = '//a[{}]'.format(xpath_class('previous-webcomic-link')) - nextSearch = '///a[{}]'.format(xpath_class('next-webcomic-link')) - latestSearch = '//a[{}]'.format(xpath_class('last-webcomic-link')) + imageSearch = '//div[d:class("webcomic-image")]//img' + prevSearch = '//a[d:class("previous-webcomic-link")]' + nextSearch = '///a[d:class("next-webcomic-link")]' + latestSearch = '//a[d:class("last-webcomic-link")]' class _ComicControlScraper(_ParserScraper): diff --git a/dosagelib/plugins/d.py b/dosagelib/plugins/d.py index 07429f3ae..05273a324 100644 --- a/dosagelib/plugins/d.py +++ b/dosagelib/plugins/d.py @@ -6,7 +6,7 @@ from re import compile, escape from ..scraper import _BasicScraper, _ParserScraper -from ..helpers import indirectStarter, bounceStarter, xpath_class +from ..helpers import indirectStarter, bounceStarter from ..util import tagre from .common import _ComicControlScraper, _WordPressScraper, _WPNaviIn, _WPWebcomic @@ -190,8 +190,8 @@ class Dilbert(_ParserScraper): stripUrl = url + 'strip/%s' firstStripUrl = stripUrl % '1989-04-16' starter = indirectStarter - prevSearch = '//div[%s]/a' % xpath_class('nav-left') - imageSearch = '//img[%s]' % xpath_class('img-comic') + prevSearch = '//div[d:class("nav-left")]/a' + imageSearch = '//img[d:class("img-comic")]' latestSearch = '//a[@class="img-comic-link"]' help = 'Index format: yyyy-mm-dd' @@ -260,14 +260,14 @@ class DominicDeegan(_ParserScraper): class DorkTower(_ParserScraper): url = 'http://www.dorktower.com/' firstStripUrl = url + '1997/01/01/shadis-magazine-strip-1/' - imageSearch = '//div[%s]//a/img' % xpath_class('entry-content') - prevSearch = '//a[%s][text()="Previous"]' % xpath_class('btn') + imageSearch = '//div[d:class("entry-content")]//a/img' + prevSearch = '//a[d:class("btn")][text()="Previous"]' class DoomsdayMyDear(_ParserScraper): url = 'http://doomsdaymydear.com/' - imageSearch = '//img[{}]'.format(xpath_class('attachment-full')) - prevSearch = '//a[{}]'.format(xpath_class('previous-webcomic-link')) + imageSearch = '//img[d:class("attachment-full")]' + prevSearch = '//a[d:class("previous-webcomic-link")]' class Draconia(_WPWebcomic): @@ -307,10 +307,9 @@ class DresdenCodak(_ParserScraper): url = 'http://dresdencodak.com/' startUrl = url + 'cat/comic/' firstStripUrl = url + '2007/02/08/pom/' - imageSearch = '//section[%s]//img[%s]' % ( - xpath_class('entry-content'), xpath_class('aligncenter')) + imageSearch = '//section[d:class("entry-content")]//img[d:class("aligncenter")]' prevSearch = '//a[img[contains(@src, "prev")]]' - latestSearch = '//a[%s]' % xpath_class('tc-grid-bg-link') + latestSearch = '//a[d:class("tc-grid-bg-link")]' starter = indirectStarter # Blog and comic are mixed... diff --git a/dosagelib/plugins/e.py b/dosagelib/plugins/e.py index 75984320f..c84970699 100644 --- a/dosagelib/plugins/e.py +++ b/dosagelib/plugins/e.py @@ -6,7 +6,7 @@ import os from re import compile, IGNORECASE -from ..helpers import bounceStarter, indirectStarter, xpath_class +from ..helpers import bounceStarter, indirectStarter from ..scraper import _BasicScraper, _ParserScraper from ..util import tagre from .common import _ComicControlScraper, _WordPressScraper, _WPNavi @@ -35,7 +35,7 @@ class EatLiver(_ParserScraper): url = 'http://www.eatliver.com/' starter = indirectStarter multipleImagesPerStrip = True - imageSearch = '//div[%s]//img' % xpath_class('post-content') + imageSearch = '//div[d:class("post-content")]//img' prevSearch = '//a[@rel="prev"]' latestSearch = '//a[@rel="bookmark"]' @@ -175,7 +175,7 @@ class Everblue(_ParserScraper): class EverybodyLovesEricRaymond(_ParserScraper): url = 'http://geekz.co.uk/lovesraymond/' firstStripUrl = url + 'archive/slashdotted' - imageSearch = '//div[%s]//img' % xpath_class('entry-content') + imageSearch = '//div[d:class("entry-content")]//img' prevSearch = '//a[@rel="prev"]' @@ -255,6 +255,6 @@ class ExtraOrdinary(_ParserScraper): url = 'https://www.exocomics.com/' stripUrl = url + '%s' firstStripUrl = stripUrl % '01' - prevSearch = '//a[%s]' % xpath_class('prev') - imageSearch = '//img[%s]' % xpath_class('image-style-main-comic') + prevSearch = '//a[d:class("prev")]' + imageSearch = '//img[d:class("image-style-main-comic")]' help = 'Index format: number' diff --git a/dosagelib/plugins/f.py b/dosagelib/plugins/f.py index d07e87974..57cda2b0a 100644 --- a/dosagelib/plugins/f.py +++ b/dosagelib/plugins/f.py @@ -7,7 +7,7 @@ from re import compile, escape from ..util import tagre from ..scraper import _BasicScraper, _ParserScraper -from ..helpers import indirectStarter, joinPathPartsNamer, xpath_class +from ..helpers import indirectStarter, joinPathPartsNamer from .common import _ComicControlScraper, _WPNaviIn, _WordPressScraper @@ -62,8 +62,8 @@ class FirstWorldProblems(_ParserScraper): 'http://bradcolbow.com/archive/C5/') stripUrl = url + '%s/' firstStripUrl = stripUrl % 'P10' - imageSearch = '//div[{}]//img'.format(xpath_class('entry')) - prevSearch = '//a[{}]'.format(xpath_class('prev')) + imageSearch = '//div[d:class("entry")]//img' + prevSearch = '//a[d:class("prev")]' multipleImagesPerStrip = True endOfLife = True @@ -83,7 +83,7 @@ class Flemcomics(_ParserScraper): 'http://www.flemcomics.com/') stripUrl = url + 'd/%s.html' firstStripUrl = stripUrl % '19980101' - imageSearch = '//img[{}]'.format(xpath_class('ksc')) + imageSearch = '//img[d:class("ksc")]' prevSearch = '//a[@rel="prev"]' endOfLife = True help = 'Index format: yyyymmdd' @@ -174,10 +174,10 @@ class FredoAndPidjin(_ParserScraper): url = 'https://www.pidjin.net/' stripUrl = url + '%s/' firstStripUrl = stripUrl % '2006/02/19/goofy-monday' - imageSearch = '//div[%s]//img' % xpath_class("episode") + imageSearch = '//div[d:class("episode")]//img' multipleImagesPerStrip = True - prevSearch = '//span[%s]/a' % xpath_class("prev") - latestSearch = '//section[%s]//a' % xpath_class("latest") + prevSearch = '//span[d:class("prev")]/a' + latestSearch = '//section[d:class("latest")]//a' starter = indirectStarter namer = joinPathPartsNamer((0, 1, 2)) diff --git a/dosagelib/plugins/gocomics.py b/dosagelib/plugins/gocomics.py index d51d0be54..f013c4421 100644 --- a/dosagelib/plugins/gocomics.py +++ b/dosagelib/plugins/gocomics.py @@ -3,14 +3,14 @@ # Copyright (C) 2012-2014 Bastian Kleineidam # Copyright (C) 2015-2020 Tobias Gruetzmacher from ..scraper import _ParserScraper -from ..helpers import indirectStarter, xpath_class +from ..helpers import indirectStarter class GoComics(_ParserScraper): url = 'https://www.gocomics.com/' - imageSearch = '//picture[{}]/img'.format(xpath_class('item-comic-image')) - prevSearch = '//a[{}]'.format(xpath_class('js-previous-comic')) - latestSearch = '//div[{}]//a'.format(xpath_class('gc-deck--cta-0')) + imageSearch = '//picture[d:class("item-comic-image")]/img' + prevSearch = '//a[d:class("js-previous-comic")]' + latestSearch = '//div[d:class("gc-deck--cta-0")]//a' starter = indirectStarter help = 'Index format: yyyy/mm/dd' diff --git a/dosagelib/plugins/j.py b/dosagelib/plugins/j.py index 5689be77a..c5bf06f67 100644 --- a/dosagelib/plugins/j.py +++ b/dosagelib/plugins/j.py @@ -6,7 +6,7 @@ from re import compile, escape from ..scraper import _BasicScraper from ..util import tagre -from ..helpers import indirectStarter, xpath_class +from ..helpers import indirectStarter from .common import _ComicControlScraper @@ -41,7 +41,7 @@ class JoeAndMonkey(_BasicScraper): class JohnnyWander(_ComicControlScraper): - imageSearch = ('//ul[%s]/li/@data-src' % xpath_class('cc-showbig'), + imageSearch = ('//ul[d:class("cc-showbig")]/li/@data-src', _ComicControlScraper.imageSearch) url = 'http://www.johnnywander.com/' diff --git a/dosagelib/plugins/m.py b/dosagelib/plugins/m.py index 5ca724ce1..5f530c59a 100644 --- a/dosagelib/plugins/m.py +++ b/dosagelib/plugins/m.py @@ -6,7 +6,7 @@ import json from re import compile, escape, IGNORECASE -from ..helpers import indirectStarter, xpath_class +from ..helpers import indirectStarter from ..scraper import _BasicScraper, _ParserScraper from ..util import tagre from .common import _ComicControlScraper, _WordPressScraper, _WPWebcomic @@ -74,7 +74,7 @@ class MarriedToTheSea(_ParserScraper): url = 'http://marriedtothesea.com/' stripUrl = url + '%s' firstStripUrl = stripUrl % '022806' - imageSearch = '//div[%s]//p/img' % xpath_class('jumbotron') + imageSearch = '//div[d:class("jumbotron")]//p/img' prevSearch = '//a[contains(text(), "Yesterday")]' help = 'Index format: mmddyy' diff --git a/dosagelib/plugins/n.py b/dosagelib/plugins/n.py index 677a7fdc9..7c86ca0df 100644 --- a/dosagelib/plugins/n.py +++ b/dosagelib/plugins/n.py @@ -6,7 +6,7 @@ from re import compile, escape from ..scraper import _BasicScraper, _ParserScraper -from ..helpers import indirectStarter, xpath_class +from ..helpers import indirectStarter from ..util import tagre from .common import _ComicControlScraper, _WordPressScraper, _WPNavi, _WPWebcomic @@ -134,7 +134,7 @@ class Nimona(_ParserScraper): 'http://gingerhaze.com/nimona/') stripUrl = url + 'comic/%s' firstStripUrl = stripUrl % "page-1" - imageSearch = '//div[{}]//img'.format(xpath_class('field-name-field-comic-page')) + imageSearch = '//div[d:class("field-name-field-comic-page")]//img' prevSearch = '//a[img[contains(@src, "/comicdrop_prev_label")]]' endOfLife = True diff --git a/dosagelib/plugins/p.py b/dosagelib/plugins/p.py index 89b2ae63c..d9f62c62a 100644 --- a/dosagelib/plugins/p.py +++ b/dosagelib/plugins/p.py @@ -6,7 +6,7 @@ from re import compile, escape from ..scraper import _BasicScraper, _ParserScraper -from ..helpers import bounceStarter, queryNamer, indirectStarter, xpath_class +from ..helpers import bounceStarter, queryNamer, indirectStarter from ..util import tagre from .common import _ComicControlScraper, _WordPressScraper, _WPNavi @@ -99,8 +99,8 @@ class PennyArcade(_ParserScraper): stripUrl = url + '%s' firstStripUrl = stripUrl % '1998/11/18' imageSearch = '//div[@id="comicFrame"]//img' - prevSearch = '//a[%s]' % xpath_class('btnPrev') - nextSearch = '//a[%s]' % xpath_class('btnNext') + prevSearch = '//a[d:class("btnPrev")]' + nextSearch = '//a[d:class("btnNext")]' starter = bounceStarter help = 'Index format: yyyy/mm/dd' @@ -231,7 +231,7 @@ class PokeyThePenguin(_ParserScraper): class PoorlyDrawnLines(_ParserScraper): url = 'http://poorlydrawnlines.com/comic/' firstStripUrl = url + 'campus-characters/' - imageSearch = '//div[%s]//img' % xpath_class('comic') + imageSearch = '//div[d:class("comic")]//img' prevSearch = '//a[@rel="prev"]' @@ -269,7 +269,7 @@ class PrinceOfSartar(_WPNavi): url = 'http://www.princeofsartar.com/' stripUrl = url + 'comic/%s/' firstStripUrl = stripUrl % 'introduction-chapter-1' - nextSearch = '//a[%s]' % xpath_class('navi-next') + nextSearch = '//a[d:class("navi-next")]' starter = bounceStarter help = 'Index format: name' diff --git a/dosagelib/plugins/q.py b/dosagelib/plugins/q.py index 4eed75a29..9e553e463 100644 --- a/dosagelib/plugins/q.py +++ b/dosagelib/plugins/q.py @@ -4,7 +4,6 @@ # Copyright (C) 2015-2020 Tobias Gruetzmacher # Copyright (C) 2019-2020 Daniel Ring from ..scraper import _ParserScraper -from ..helpers import xpath_class class QuantumVibe(_ParserScraper): @@ -28,6 +27,6 @@ class Qwantz(_ParserScraper): url = 'http://www.qwantz.com/index.php' stripUrl = url + '?comic=%s' firstStripUrl = stripUrl % '1' - imageSearch = '//img[{}]'.format(xpath_class('comic')) + imageSearch = '//img[d:class("comic")]' prevSearch = '//a[@rel="prev"]' help = 'Index format: n' diff --git a/dosagelib/plugins/r.py b/dosagelib/plugins/r.py index 4b1bfb6f0..e74830e4f 100644 --- a/dosagelib/plugins/r.py +++ b/dosagelib/plugins/r.py @@ -6,7 +6,7 @@ from re import compile from urllib.parse import urljoin -from ..helpers import bounceStarter, xpath_class +from ..helpers import bounceStarter from ..scraper import _BasicScraper, _ParserScraper from ..util import tagre from .common import _WordPressScraper, _WPWebcomic @@ -107,7 +107,7 @@ class RomanticallyApocalyptic(_ParserScraper): url = 'http://romanticallyapocalyptic.com/' stripUrl = url + '%s' firstStripUrl = stripUrl % '0' - imageSearch = '//div[%s]/center//img' % xpath_class('comicpanel') + imageSearch = '//div[d:class("comicpanel")]/center//img' prevSearch = '//a[@accesskey="p"]' help = 'Index format: n' adult = True diff --git a/dosagelib/plugins/s.py b/dosagelib/plugins/s.py index 485881b82..fc4984c3f 100644 --- a/dosagelib/plugins/s.py +++ b/dosagelib/plugins/s.py @@ -7,7 +7,7 @@ from re import compile, escape, IGNORECASE, sub from os.path import splitext from ..scraper import _BasicScraper, _ParserScraper -from ..helpers import indirectStarter, bounceStarter, joinPathPartsNamer, xpath_class +from ..helpers import indirectStarter, bounceStarter, joinPathPartsNamer from ..util import tagre from .common import _ComicControlScraper, _WordPressScraper, _WPNavi, _WPNaviIn, _WPWebcomic @@ -120,7 +120,7 @@ class SchoolBites(_ParserScraper): url = ('https://web.archive.org/web/20170215065523/' 'http://schoolbites.net/') stripUrl = url + 'd/%s.html' - imageSearch = '//img[{}]'.format(xpath_class('ksc')) + imageSearch = '//img[d:class("ksc")]' prevSearch = '//a[@rel="prev"]' endOfLife = True help = 'Index format: yyyymmdd' @@ -132,7 +132,7 @@ class Schuelert(_ParserScraper): stripUrl = url + 'index.php?paged=%s' firstStripUrl = stripUrl % '3' imageSearch = '//img[contains(@src, "wp-content")]' - prevSearch = '//span[{}]/a'.format(xpath_class('prevlink')) + prevSearch = '//span[d:class("prevlink")]/a' multipleImagesPerStrip = True endOfLife = True lang = 'de' @@ -143,7 +143,7 @@ class Science(_ParserScraper): 'http://sci-ence.org/%s/') url = stripUrl % 'new-york-comic-con-2013' firstStripUrl = stripUrl % 'periodic-table-element-ass' - prevSearch = '//a[{}]'.format(xpath_class('navi-prev')) + prevSearch = '//a[d:class("navi-prev")]' imageSearch = '//div[@class="comicpane"]//img' endOfLife = True @@ -159,7 +159,7 @@ class SequentialArt(_ParserScraper): url = 'https://www.collectedcurios.com/sequentialart.php' stripUrl = url + '?s=%s' firstStripUrl = stripUrl % '1' - imageSearch = '//img[{}]'.format(xpath_class('w3-image')) + imageSearch = '//img[d:class("w3-image")]' prevSearch = '//a[@id="backOne"]' help = 'Index format: name' @@ -286,9 +286,9 @@ class SluggyFreelance(_ParserScraper): url = 'http://sluggy.com/' stripUrl = 'http://archives.sluggy.com/book.php?chapter=%s' firstStripUrl = stripUrl % '1' - imageSearch = '//div[%s]/img/@data-src' % xpath_class('comic_content') - prevSearch = '//div[%s]/a' % xpath_class('previous') - latestSearch = '//a[%s]' % xpath_class('archives_link') + imageSearch = '//div[d:class("comic_content")]/img/@data-src' + prevSearch = '//div[d:class("previous")]/a' + latestSearch = '//a[d:class("archives_link")]' starter = indirectStarter multipleImagesPerStrip = True help = 'Index format: chapter' @@ -374,7 +374,7 @@ class SpaceJunkArlia(_ParserScraper): url = 'http://spacejunkarlia.com/' stripUrl = url + '?strip_id=%s' firstStripUrl = stripUrl % '0' - imageSearch = '//div[%s]/img' % xpath_class('content') + imageSearch = '//div[d:class("content")]/img' prevSearch = '//a[text()="<"]' help = 'Index format: number' @@ -382,7 +382,7 @@ class SpaceJunkArlia(_ParserScraper): class SpaceTrawler(_ParserScraper): url = 'https://www.baldwinpage.com/spacetrawler/' firstStripUrl = url + '2010/01/01/spacetrawler-4/' - imageSearch = '//img[%s]' % xpath_class('size-full') + imageSearch = '//img[d:class("size-full")]' prevSearch = '//a[@rel="prev"]' diff --git a/dosagelib/plugins/smackjeeves.py b/dosagelib/plugins/smackjeeves.py index 814160295..a490c5e5c 100644 --- a/dosagelib/plugins/smackjeeves.py +++ b/dosagelib/plugins/smackjeeves.py @@ -3,14 +3,13 @@ # Copyright (C) 2019-2020 Daniel Ring import re -from ..helpers import xpath_class from ..scraper import _ParserScraper class SmackJeeves(_ParserScraper): baseUrl = 'https://www.smackjeeves.com/discover/' apiBase = 'https://www.smackjeeves.com/api/discover/' - prevSearch = '//a[i[{}]]'.format(xpath_class('i-arrow-double-left-black')) + prevSearch = '//a[i[d:class("i-arrow-double-left-black")]]' imageSearch = re.compile("comicData:[^']*'([^']*)'", re.DOTALL) help = 'Index format: n' diff --git a/dosagelib/plugins/t.py b/dosagelib/plugins/t.py index 887d5562a..96a8d1417 100644 --- a/dosagelib/plugins/t.py +++ b/dosagelib/plugins/t.py @@ -10,7 +10,7 @@ except ImportError: from cached_property import cached_property from ..scraper import _BasicScraper, _ParserScraper -from ..helpers import indirectStarter, xpath_class +from ..helpers import indirectStarter from ..util import tagre from .common import _ComicControlScraper, _WordPressScraper, _WPNavi, _WPNaviIn, _WPWebcomic @@ -40,8 +40,8 @@ class TheBrads(_ParserScraper): 'http://bradcolbow.com/archive/C4/') stripUrl = url + '%s/' firstStripUrl = stripUrl % 'P125' - imageSearch = '//div[{}]//img'.format(xpath_class('entry')) - prevSearch = '//a[{}]'.format(xpath_class('prev')) + imageSearch = '//div[d:class("entry")]//img' + prevSearch = '//a[d:class("prev")]' multipleImagesPerStrip = True endOfLife = True @@ -120,7 +120,7 @@ class TheLandscaper(_ParserScraper): 'http://landscaper.visual-assault.net/comic/%s') url = stripUrl % 'latest' firstStripUrl = stripUrl % '1' - imageSearch = '//article[{}]//img[1]'.format(xpath_class('comic')) + imageSearch = '//article[d:class("comic")]//img[1]' prevSearch = '//a[contains(text(), "Previous")]' endOfLife = True @@ -294,8 +294,8 @@ class TumbleDryComics(_WordPressScraper): class Turnoff(_ParserScraper): name = 'turnoff' url = 'https://turnoff.us/' - imageSearch = '//article[%s]//img' % xpath_class('post-content') - prevSearch = '//div[%s]//a' % xpath_class('prev') + imageSearch = '//article[d:class("post-content")]//img' + prevSearch = '//div[d:class("prev")]//a' stripUrl = url + 'geek/%s' firstStripUrl = stripUrl % 'tcp-buddies' multipleImagesPerStrip = True @@ -341,8 +341,8 @@ class Twokinds(_ParserScraper): url = 'http://twokinds.keenspot.com/' stripUrl = url + 'comic/%s/' firstStripUrl = stripUrl % '1' - imageSearch = '//article[%s]//img' % xpath_class('comic') - prevSearch = '//a[%s]' % xpath_class('navprev') + imageSearch = '//article[d:class("comic")]//img' + prevSearch = '//a[d:class("navprev")]' help = 'Index format: n (unpadded)' diff --git a/dosagelib/plugins/u.py b/dosagelib/plugins/u.py index d0a5b8ad3..0ef199e12 100644 --- a/dosagelib/plugins/u.py +++ b/dosagelib/plugins/u.py @@ -6,7 +6,7 @@ from re import compile from ..scraper import _BasicScraper, _ParserScraper -from ..helpers import indirectStarter, xpath_class +from ..helpers import indirectStarter from ..util import tagre from .common import _ComicControlScraper, _WordPressScraper, _WPNavi @@ -46,7 +46,7 @@ class Unsounded(_ParserScraper): stripUrl = url + 'comic/ch%s/ch%s_%s.html' firstStripUrl = stripUrl % ('01', '01', '01') imageSearch = '//img[contains(@src, "pageart/")]' - prevSearch = '//a[%s]' % xpath_class('back') + prevSearch = '//a[d:class("back")]' latestSearch = '//div[@id="chapter_box"][1]//a[last()]' multipleImagesPerStrip = True starter = indirectStarter diff --git a/dosagelib/plugins/v.py b/dosagelib/plugins/v.py index c83fddc73..7fb909d73 100644 --- a/dosagelib/plugins/v.py +++ b/dosagelib/plugins/v.py @@ -6,7 +6,7 @@ from re import compile from ..scraper import _BasicScraper, _ParserScraper -from ..helpers import bounceStarter, indirectStarter, xpath_class +from ..helpers import bounceStarter, indirectStarter from ..util import tagre @@ -71,8 +71,8 @@ class VictimsOfTheSystem(_BasicScraper): class ViiviJaWagner(_ParserScraper): url = 'http://www.hs.fi/viivijawagner/' imageSearch = '//meta[@property="og:image"]/@content' - prevSearch = '//a[%s]' % xpath_class('prev') - latestSearch = '//div[%s]//a' % xpath_class('cartoon-content') + prevSearch = '//a[d:class("prev")]' + latestSearch = '//div[d:class("cartoon-content")]//a' starter = indirectStarter lang = 'fi' diff --git a/dosagelib/plugins/w.py b/dosagelib/plugins/w.py index 59e0ea29b..33c949791 100644 --- a/dosagelib/plugins/w.py +++ b/dosagelib/plugins/w.py @@ -7,7 +7,7 @@ from re import compile, escape, IGNORECASE from ..scraper import _BasicScraper, _ParserScraper from ..util import tagre -from ..helpers import bounceStarter, indirectStarter, xpath_class +from ..helpers import bounceStarter, indirectStarter from .common import _ComicControlScraper, _WPNavi, _WPNaviIn, _WPWebcomic @@ -28,8 +28,8 @@ class WastedTalent(_BasicScraper): class WebcomicName(_ParserScraper): url = 'https://webcomicname.com/' - imageSearch = '//figure[{}]//img'.format(xpath_class('tmblr-full')) - prevSearch = '//a[{}]'.format(xpath_class('next')) + imageSearch = '//figure[d:class("tmblr-full")]//img' + prevSearch = '//a[d:class("next")]' multipleImagesPerStrip = True @@ -38,10 +38,10 @@ class WebDesignerCOTW(_ParserScraper): url = baseUrl + 'category/comics/' starter = indirectStarter firstStripUrl = baseUrl + '2009/11/comics-of-the-week-1/' - imageSearch = '//article[%s]//img' % xpath_class('article-content') + imageSearch = '//article[d:class("article-content")]//img' multipleImagesPerStrip = True - prevSearch = '//a[span[%s]]' % xpath_class('icon-right-small') - latestSearch = '//a[%s]' % xpath_class('anim-link') + prevSearch = '//a[span[d:class("icon-right-small")]]' + latestSearch = '//a[d:class("anim-link")]' def shouldSkipUrl(self, url, data): """Skip non-comic URLs.""" diff --git a/dosagelib/plugins/z.py b/dosagelib/plugins/z.py index 456c73e7a..2cd2583f1 100644 --- a/dosagelib/plugins/z.py +++ b/dosagelib/plugins/z.py @@ -1,12 +1,12 @@ # SPDX-License-Identifier: MIT # Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2012-2014 Bastian Kleineidam -# Copyright (C) 2015-2019 Tobias Gruetzmacher +# Copyright (C) 2015-2020 Tobias Gruetzmacher from re import compile, escape from ..scraper import _BasicScraper, _ParserScraper from ..util import tagre -from ..helpers import bounceStarter, joinPathPartsNamer, xpath_class +from ..helpers import bounceStarter, joinPathPartsNamer from .common import _WPNavi @@ -21,8 +21,8 @@ class Zapiro(_ParserScraper): url = 'http://mg.co.za/zapiro/' starter = bounceStarter imageSearch = '//div[@id="cartoon"]/img' - prevSearch = '//a[%s]' % xpath_class('left') - nextSearch = '//a[%s]' % xpath_class('right') + prevSearch = '//a[d:class("left")]' + nextSearch = '//a[d:class("right")]' namer = joinPathPartsNamer((-1,), ()) @@ -31,8 +31,8 @@ class ZenPencils(_WPNavi): multipleImagesPerStrip = True firstStripUrl = url + 'comic/1-ralph-waldo-emerson-make-them-cry/' starter = bounceStarter - prevSearch = '//a[%s]' % xpath_class('navi-prev') - nextSearch = '//a[%s]' % xpath_class('navi-next') + prevSearch = '//a[d:class("navi-prev")]' + nextSearch = '//a[d:class("navi-next")]' class ZombieHunters(_BasicScraper): diff --git a/dosagelib/scraper.py b/dosagelib/scraper.py index f1ab5376f..33b37790c 100644 --- a/dosagelib/scraper.py +++ b/dosagelib/scraper.py @@ -26,6 +26,7 @@ from .util import (get_page, makeSequence, get_system_uid, tagre, normaliseURL, from .comic import ComicStrip from .output import out from .events import getHandler +from .xml import NS ARCHIVE_ORG_URL = re.compile(r'https?://web\.archive\.org/web/[^/]*/') @@ -434,10 +435,6 @@ class _ParserScraper(Scraper): XML_DECL = re.compile( r'^(<\?xml[^>]+)\s+encoding\s*=\s*["\'][^"\']*["\'](\s*\?>|)', re.U) - NS = { - "re": "http://exslt.org/regular-expressions" - } - # Switch between CSS and XPath selectors for this class. Since CSS needs # another Python module, XPath is the default for now. css = False @@ -519,7 +516,7 @@ class _ParserScraper(Scraper): searchFun = data.cssselect else: def searchFun(s): - return data.xpath(s, namespaces=self.NS) + return data.xpath(s, namespaces=NS) patterns = makeSequence(patterns) for search in patterns: matched = False diff --git a/dosagelib/xml.py b/dosagelib/xml.py new file mode 100644 index 000000000..bdac1f73e --- /dev/null +++ b/dosagelib/xml.py @@ -0,0 +1,20 @@ +# SPDX-License-Identifier: MIT +# Copyright (C) 2020 Tobias Gruetzmacher +from lxml import etree + + +NS = { + 'd': 'https://dosage.rocks/xpath', + 're': 'http://exslt.org/regular-expressions' +} + + +def find_by_class(context, cls): + attributes = context.context_node.attrib + if 'class' in attributes: + return cls in attributes['class'].split(' ') + return False + + +dosagens = etree.FunctionNamespace(NS['d']) +dosagens['class'] = find_by_class diff --git a/tests/httpmocks.py b/tests/httpmocks.py index 3e833b78e..4a50d38b6 100644 --- a/tests/httpmocks.py +++ b/tests/httpmocks.py @@ -14,7 +14,7 @@ def _file(name): @lru_cache() -def _content(name): +def content(name): with gzip.open(_file(name + '.html.gz'), 'r') as f: return f.read() @@ -26,7 +26,7 @@ def _img(name): def page(url, pagename): - add(GET, url, _content(pagename)) + add(GET, url, content(pagename)) def png(url, name='empty'): diff --git a/tests/test_xml.py b/tests/test_xml.py new file mode 100644 index 000000000..0172aa6f1 --- /dev/null +++ b/tests/test_xml.py @@ -0,0 +1,25 @@ +# SPDX-License-Identifier: MIT +# Copyright (C) 2020 Tobias Gruetzmacher + +from lxml import html + +from dosagelib.xml import NS + +import httpmocks + + +tree = html.document_fromstring(httpmocks.content('zp-222')) + + +class TestXML: + def xpath(self, path): + return tree.xpath(path, namespaces=NS) + + def test_class_ext(self): + assert len(self.xpath('//li[d:class("menu-item-3773")]')) == 1 + assert len(self.xpath('//ul[d:class("menu")]')) == 1 + assert len(self.xpath('//li[d:class("menu-item-object-custom")]')) == 2 + assert len(self.xpath('//li[d:class("menu-item")]')) == 25 + + def test_re_ext(self): + assert len(self.xpath(r'//img[re:test(@src, "posters.*jpg")]')) == 1