Merge pull request #169 from webcomics/xpath-ext

Replace xpath_class function with an XPath extension
This commit is contained in:
Tobias Gruetzmacher 2020-08-03 22:18:52 +02:00 committed by GitHub
commit 912b30191d
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
26 changed files with 148 additions and 117 deletions

View file

@ -62,9 +62,3 @@ def indirectStarter(self):
data = self.getPage(url) data = self.getPage(url)
newurl = self.fetchUrl(url, data, self.latestSearch) newurl = self.fetchUrl(url, data, self.latestSearch)
return self.link_modifier(url, newurl) return self.link_modifier(url, newurl)
def xpath_class(name):
    """Return an XPath predicate which matches a tag that has the given class.

    The ``class`` attribute is padded with a space on each side and the
    name is searched for with surrounding spaces, so it only matches as a
    whole space-delimited token (``"nav"`` does not match ``class="navi"``).

    :param name: class name to look for. It is inserted verbatim inside a
        double-quoted XPath string literal, so it must not contain ``"``.
    :returns: predicate text meant to be placed inside ``[...]``, for
        example ``'//a[%s]' % xpath_class('prev')``.
    """
    return 'contains(concat(" ", @class, " "), " %s ")' % name

View file

@ -7,7 +7,7 @@ from re import compile, escape
from ..util import tagre from ..util import tagre
from ..scraper import _BasicScraper, _ParserScraper from ..scraper import _BasicScraper, _ParserScraper
from ..helpers import indirectStarter, xpath_class from ..helpers import indirectStarter
from .common import _ComicControlScraper, _WordPressScraper, _WPNavi, _WPNaviIn, _WPWebcomic from .common import _ComicControlScraper, _WordPressScraper, _WPNavi, _WPNaviIn, _WPWebcomic
@ -71,7 +71,7 @@ class Baroquen(_BasicScraper):
class Bearmageddon(_WordPressScraper): class Bearmageddon(_WordPressScraper):
url = 'http://bearmageddon.com/bearmo/page-1/' url = 'http://bearmageddon.com/bearmo/page-1/'
firstStripUrl = url firstStripUrl = url
latestSearch = '//a[%s]' % xpath_class('comic-nav-last') latestSearch = '//a[d:class("comic-nav-last")]'
starter = indirectStarter starter = indirectStarter
@ -187,8 +187,8 @@ class BlankIt(_ParserScraper):
url = 'http://blankitcomics.com/' url = 'http://blankitcomics.com/'
firstStripUrl = url + 'comic/well-what-would-you-do' firstStripUrl = url + 'comic/well-what-would-you-do'
imageSearch = '//div[@id="comic"]//img' imageSearch = '//div[@id="comic"]//img'
prevSearch = '//a[%s]' % xpath_class('comic-nav-previous') prevSearch = '//a[d:class("comic-nav-previous")]'
latestSearch = '//a[%s]' % xpath_class('comic-nav-last') latestSearch = '//a[d:class("comic-nav-last")]'
starter = indirectStarter starter = indirectStarter
@ -235,7 +235,7 @@ class BMovieComic(_BasicScraper):
class BobWhite(_ParserScraper): class BobWhite(_ParserScraper):
url = 'http://www.bobwhitecomics.com/' url = 'http://www.bobwhitecomics.com/'
imageSearch = '//span[%s]/img' % xpath_class('webcomic-object') imageSearch = '//span[d:class("webcomic-object")]/img'
prevSearch = '//a[@rel="previous"]' prevSearch = '//a[@rel="previous"]'
@ -296,7 +296,6 @@ class ButImACatPerson(_WordPressScraper):
endOfLife = True endOfLife = True
class ButtercupFestival(_ParserScraper): class ButtercupFestival(_ParserScraper):
url = 'http://www.buttercupfestival.com/' url = 'http://www.buttercupfestival.com/'
stripUrl = url + '%s.htm' stripUrl = url + '%s.htm'

View file

@ -1,15 +1,15 @@
# SPDX-License-Identifier: MIT # SPDX-License-Identifier: MIT
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam # Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2019 Tobias Gruetzmacher # Copyright (C) 2015-2020 Tobias Gruetzmacher
from ..helpers import indirectStarter, xpath_class from ..helpers import indirectStarter
from ..scraper import _ParserScraper from ..scraper import _ParserScraper
from ..util import getQueryParams from ..util import getQueryParams
class CloneManga(_ParserScraper): class CloneManga(_ParserScraper):
baseUrl = 'http://manga.clone-army.org' baseUrl = 'http://manga.clone-army.org'
imageSearch = '//div[%s]//img' % xpath_class('subsectionContainer') imageSearch = '//div[d:class("subsectionContainer")]//img'
prevSearch = '//a[span[text()="<<"]]' prevSearch = '//a[span[text()="<<"]]'
latestSearch = '//a[span[text()=">|"]]' latestSearch = '//a[span[text()=">|"]]'
starter = indirectStarter starter = indirectStarter

View file

@ -6,10 +6,10 @@
import os import os
from ..scraper import _ParserScraper from ..scraper import _ParserScraper
from ..helpers import bounceStarter, xpath_class from ..helpers import bounceStarter
XPATH_LINK = '//a[%s and contains(text(), "%s")]' XPATH_LINK = '//a[d:class("%s") and contains(text(), "%s")]'
XPATH_IMG = '//div[{}]//a[img[contains(@alt, "%s")]]'.format(xpath_class('comicnav')) XPATH_IMG = '//div[d:class("comicnav")]//a[img[contains(@alt, "%s")]]'
class ComicFury(_ParserScraper): class ComicFury(_ParserScraper):
@ -21,12 +21,12 @@ class ComicFury(_ParserScraper):
# 137 (needs to be before the generic a@rel, because layout is wrong) # 137 (needs to be before the generic a@rel, because layout is wrong)
'//a[contains(@title, "previous")]', '//a[contains(@title, "previous")]',
'//a[@rel="prev"]', '//a[@rel="prev"]',
XPATH_LINK % (xpath_class("comicnavlink"), "Previous"), XPATH_LINK % ('comicnavlink', 'Previous'),
XPATH_IMG % ('Previous'), XPATH_IMG % ('Previous'),
# Art, ConsolersDLC, etc. # Art, ConsolersDLC, etc.
u'//nav//a[contains(text(), "\u2039")]', u'//nav//a[contains(text(), "\u2039")]',
# LatchkeyKingdom # LatchkeyKingdom
'//a[%s and img[contains(@src, "Previous")]]' % xpath_class('navi'), '//a[d:class("navi") and img[contains(@src, "Previous")]]',
# RedSpot # RedSpot
'//a[contains(text(), "Back")]', '//a[contains(text(), "Back")]',
# KATRAN # KATRAN
@ -37,12 +37,12 @@ class ComicFury(_ParserScraper):
# 137 (see above) # 137 (see above)
'//a[contains(@title, "next")]', '//a[contains(@title, "next")]',
'//a[@rel="next"]', '//a[@rel="next"]',
XPATH_LINK % (xpath_class("comicnavlink"), "Next"), XPATH_LINK % ('comicnavlink', 'Next'),
XPATH_IMG % ('Next'), XPATH_IMG % ('Next'),
# Art, ConsolersDLC, etc. # Art, ConsolersDLC, etc.
u'//nav//a[contains(text(), "\u203A")]', u'//nav//a[contains(text(), "\u203A")]',
# LatchkeyKingdom # LatchkeyKingdom
'//a[%s and img[contains(@src, "Next")]]' % xpath_class('navi'), '//a[d:class("navi") and img[contains(@src, "Next")]]',
# RedSpot, KATRAN # RedSpot, KATRAN
'//a[contains(text(), "Next")]', '//a[contains(text(), "Next")]',
) )

View file

@ -4,7 +4,6 @@
# Copyright (C) 2015-2020 Tobias Gruetzmacher # Copyright (C) 2015-2020 Tobias Gruetzmacher
# Copyright (C) 2019-2020 Daniel Ring # Copyright (C) 2019-2020 Daniel Ring
from ..scraper import _ParserScraper from ..scraper import _ParserScraper
from ..helpers import indirectStarter, xpath_class
# Common base classes for comics with the same structure (same hosting # Common base classes for comics with the same structure (same hosting
# software, for example) go here. Since those are shared by many modules, # software, for example) go here. Since those are shared by many modules,
@ -14,24 +13,24 @@ from ..helpers import indirectStarter, xpath_class
class _WordPressScraper(_ParserScraper): class _WordPressScraper(_ParserScraper):
imageSearch = '//div[@id="comic"]//img' imageSearch = '//div[@id="comic"]//img'
prevSearch = '//a[%s]' % xpath_class('comic-nav-previous') prevSearch = '//a[d:class("comic-nav-previous")]'
nextSearch = '//a[%s]' % xpath_class('comic-nav-next') nextSearch = '//a[d:class("comic-nav-next")]'
latestSearch = '//a[%s]' % xpath_class('comic-nav-last') latestSearch = '//a[d:class("comic-nav-last")]'
class _WPNavi(_WordPressScraper): class _WPNavi(_WordPressScraper):
prevSearch = '//a[%s]' % xpath_class('navi-prev') prevSearch = '//a[d:class("navi-prev")]'
class _WPNaviIn(_WordPressScraper): class _WPNaviIn(_WordPressScraper):
prevSearch = '//a[%s]' % xpath_class('navi-prev-in') prevSearch = '//a[d:class("navi-prev-in")]'
# Shared selectors for sites whose comic image container and navigation
# links carry "*webcomic*" CSS classes.
# NOTE(review): presumably the "Webcomic" WordPress plugin - confirm.
class _WPWebcomic(_WordPressScraper):
    imageSearch = '//div[d:class("webcomic-image")]//img'
    prevSearch = '//a[d:class("previous-webcomic-link")]'
    # Previously started with '///a': after '//' XPath requires a step, so
    # the third slash made the whole expression invalid.
    nextSearch = '//a[d:class("next-webcomic-link")]'
    latestSearch = '//a[d:class("last-webcomic-link")]'
class _ComicControlScraper(_ParserScraper): class _ComicControlScraper(_ParserScraper):

View file

@ -6,7 +6,7 @@
from re import compile, escape from re import compile, escape
from ..scraper import _BasicScraper, _ParserScraper from ..scraper import _BasicScraper, _ParserScraper
from ..helpers import indirectStarter, bounceStarter, xpath_class from ..helpers import indirectStarter, bounceStarter
from ..util import tagre from ..util import tagre
from .common import _ComicControlScraper, _WordPressScraper, _WPNaviIn, _WPWebcomic from .common import _ComicControlScraper, _WordPressScraper, _WPNaviIn, _WPWebcomic
@ -190,8 +190,8 @@ class Dilbert(_ParserScraper):
stripUrl = url + 'strip/%s' stripUrl = url + 'strip/%s'
firstStripUrl = stripUrl % '1989-04-16' firstStripUrl = stripUrl % '1989-04-16'
starter = indirectStarter starter = indirectStarter
prevSearch = '//div[%s]/a' % xpath_class('nav-left') prevSearch = '//div[d:class("nav-left")]/a'
imageSearch = '//img[%s]' % xpath_class('img-comic') imageSearch = '//img[d:class("img-comic")]'
latestSearch = '//a[@class="img-comic-link"]' latestSearch = '//a[@class="img-comic-link"]'
help = 'Index format: yyyy-mm-dd' help = 'Index format: yyyy-mm-dd'
@ -260,14 +260,14 @@ class DominicDeegan(_ParserScraper):
class DorkTower(_ParserScraper): class DorkTower(_ParserScraper):
url = 'http://www.dorktower.com/' url = 'http://www.dorktower.com/'
firstStripUrl = url + '1997/01/01/shadis-magazine-strip-1/' firstStripUrl = url + '1997/01/01/shadis-magazine-strip-1/'
imageSearch = '//div[%s]//a/img' % xpath_class('entry-content') imageSearch = '//div[d:class("entry-content")]//a/img'
prevSearch = '//a[%s][text()="Previous"]' % xpath_class('btn') prevSearch = '//a[d:class("btn")][text()="Previous"]'
class DoomsdayMyDear(_ParserScraper): class DoomsdayMyDear(_ParserScraper):
url = 'http://doomsdaymydear.com/' url = 'http://doomsdaymydear.com/'
imageSearch = '//img[{}]'.format(xpath_class('attachment-full')) imageSearch = '//img[d:class("attachment-full")]'
prevSearch = '//a[{}]'.format(xpath_class('previous-webcomic-link')) prevSearch = '//a[d:class("previous-webcomic-link")]'
class Draconia(_WPWebcomic): class Draconia(_WPWebcomic):
@ -307,10 +307,9 @@ class DresdenCodak(_ParserScraper):
url = 'http://dresdencodak.com/' url = 'http://dresdencodak.com/'
startUrl = url + 'cat/comic/' startUrl = url + 'cat/comic/'
firstStripUrl = url + '2007/02/08/pom/' firstStripUrl = url + '2007/02/08/pom/'
imageSearch = '//section[%s]//img[%s]' % ( imageSearch = '//section[d:class("entry-content")]//img[d:class("aligncenter")]'
xpath_class('entry-content'), xpath_class('aligncenter'))
prevSearch = '//a[img[contains(@src, "prev")]]' prevSearch = '//a[img[contains(@src, "prev")]]'
latestSearch = '//a[%s]' % xpath_class('tc-grid-bg-link') latestSearch = '//a[d:class("tc-grid-bg-link")]'
starter = indirectStarter starter = indirectStarter
# Blog and comic are mixed... # Blog and comic are mixed...

View file

@ -6,7 +6,7 @@
import os import os
from re import compile, IGNORECASE from re import compile, IGNORECASE
from ..helpers import bounceStarter, indirectStarter, xpath_class from ..helpers import bounceStarter, indirectStarter
from ..scraper import _BasicScraper, _ParserScraper from ..scraper import _BasicScraper, _ParserScraper
from ..util import tagre from ..util import tagre
from .common import _ComicControlScraper, _WordPressScraper, _WPNavi from .common import _ComicControlScraper, _WordPressScraper, _WPNavi
@ -35,7 +35,7 @@ class EatLiver(_ParserScraper):
url = 'http://www.eatliver.com/' url = 'http://www.eatliver.com/'
starter = indirectStarter starter = indirectStarter
multipleImagesPerStrip = True multipleImagesPerStrip = True
imageSearch = '//div[%s]//img' % xpath_class('post-content') imageSearch = '//div[d:class("post-content")]//img'
prevSearch = '//a[@rel="prev"]' prevSearch = '//a[@rel="prev"]'
latestSearch = '//a[@rel="bookmark"]' latestSearch = '//a[@rel="bookmark"]'
@ -175,7 +175,7 @@ class Everblue(_ParserScraper):
class EverybodyLovesEricRaymond(_ParserScraper): class EverybodyLovesEricRaymond(_ParserScraper):
url = 'http://geekz.co.uk/lovesraymond/' url = 'http://geekz.co.uk/lovesraymond/'
firstStripUrl = url + 'archive/slashdotted' firstStripUrl = url + 'archive/slashdotted'
imageSearch = '//div[%s]//img' % xpath_class('entry-content') imageSearch = '//div[d:class("entry-content")]//img'
prevSearch = '//a[@rel="prev"]' prevSearch = '//a[@rel="prev"]'
@ -255,6 +255,6 @@ class ExtraOrdinary(_ParserScraper):
url = 'https://www.exocomics.com/' url = 'https://www.exocomics.com/'
stripUrl = url + '%s' stripUrl = url + '%s'
firstStripUrl = stripUrl % '01' firstStripUrl = stripUrl % '01'
prevSearch = '//a[%s]' % xpath_class('prev') prevSearch = '//a[d:class("prev")]'
imageSearch = '//img[%s]' % xpath_class('image-style-main-comic') imageSearch = '//img[d:class("image-style-main-comic")]'
help = 'Index format: number' help = 'Index format: number'

View file

@ -7,7 +7,7 @@ from re import compile, escape
from ..util import tagre from ..util import tagre
from ..scraper import _BasicScraper, _ParserScraper from ..scraper import _BasicScraper, _ParserScraper
from ..helpers import indirectStarter, joinPathPartsNamer, xpath_class from ..helpers import indirectStarter, joinPathPartsNamer
from .common import _ComicControlScraper, _WPNaviIn, _WordPressScraper from .common import _ComicControlScraper, _WPNaviIn, _WordPressScraper
@ -62,8 +62,8 @@ class FirstWorldProblems(_ParserScraper):
'http://bradcolbow.com/archive/C5/') 'http://bradcolbow.com/archive/C5/')
stripUrl = url + '%s/' stripUrl = url + '%s/'
firstStripUrl = stripUrl % 'P10' firstStripUrl = stripUrl % 'P10'
imageSearch = '//div[{}]//img'.format(xpath_class('entry')) imageSearch = '//div[d:class("entry")]//img'
prevSearch = '//a[{}]'.format(xpath_class('prev')) prevSearch = '//a[d:class("prev")]'
multipleImagesPerStrip = True multipleImagesPerStrip = True
endOfLife = True endOfLife = True
@ -83,7 +83,7 @@ class Flemcomics(_ParserScraper):
'http://www.flemcomics.com/') 'http://www.flemcomics.com/')
stripUrl = url + 'd/%s.html' stripUrl = url + 'd/%s.html'
firstStripUrl = stripUrl % '19980101' firstStripUrl = stripUrl % '19980101'
imageSearch = '//img[{}]'.format(xpath_class('ksc')) imageSearch = '//img[d:class("ksc")]'
prevSearch = '//a[@rel="prev"]' prevSearch = '//a[@rel="prev"]'
endOfLife = True endOfLife = True
help = 'Index format: yyyymmdd' help = 'Index format: yyyymmdd'
@ -174,10 +174,10 @@ class FredoAndPidjin(_ParserScraper):
url = 'https://www.pidjin.net/' url = 'https://www.pidjin.net/'
stripUrl = url + '%s/' stripUrl = url + '%s/'
firstStripUrl = stripUrl % '2006/02/19/goofy-monday' firstStripUrl = stripUrl % '2006/02/19/goofy-monday'
imageSearch = '//div[%s]//img' % xpath_class("episode") imageSearch = '//div[d:class("episode")]//img'
multipleImagesPerStrip = True multipleImagesPerStrip = True
prevSearch = '//span[%s]/a' % xpath_class("prev") prevSearch = '//span[d:class("prev")]/a'
latestSearch = '//section[%s]//a' % xpath_class("latest") latestSearch = '//section[d:class("latest")]//a'
starter = indirectStarter starter = indirectStarter
namer = joinPathPartsNamer((0, 1, 2)) namer = joinPathPartsNamer((0, 1, 2))

View file

@ -3,14 +3,14 @@
# Copyright (C) 2012-2014 Bastian Kleineidam # Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2020 Tobias Gruetzmacher # Copyright (C) 2015-2020 Tobias Gruetzmacher
from ..scraper import _ParserScraper from ..scraper import _ParserScraper
from ..helpers import indirectStarter, xpath_class from ..helpers import indirectStarter
class GoComics(_ParserScraper): class GoComics(_ParserScraper):
url = 'https://www.gocomics.com/' url = 'https://www.gocomics.com/'
imageSearch = '//picture[{}]/img'.format(xpath_class('item-comic-image')) imageSearch = '//picture[d:class("item-comic-image")]/img'
prevSearch = '//a[{}]'.format(xpath_class('js-previous-comic')) prevSearch = '//a[d:class("js-previous-comic")]'
latestSearch = '//div[{}]//a'.format(xpath_class('gc-deck--cta-0')) latestSearch = '//div[d:class("gc-deck--cta-0")]//a'
starter = indirectStarter starter = indirectStarter
help = 'Index format: yyyy/mm/dd' help = 'Index format: yyyy/mm/dd'

View file

@ -6,7 +6,7 @@ from re import compile, escape
from ..scraper import _BasicScraper from ..scraper import _BasicScraper
from ..util import tagre from ..util import tagre
from ..helpers import indirectStarter, xpath_class from ..helpers import indirectStarter
from .common import _ComicControlScraper from .common import _ComicControlScraper
@ -41,7 +41,7 @@ class JoeAndMonkey(_BasicScraper):
class JohnnyWander(_ComicControlScraper): class JohnnyWander(_ComicControlScraper):
imageSearch = ('//ul[%s]/li/@data-src' % xpath_class('cc-showbig'), imageSearch = ('//ul[d:class("cc-showbig")]/li/@data-src',
_ComicControlScraper.imageSearch) _ComicControlScraper.imageSearch)
url = 'http://www.johnnywander.com/' url = 'http://www.johnnywander.com/'

View file

@ -6,7 +6,7 @@
import json import json
from re import compile, escape, IGNORECASE from re import compile, escape, IGNORECASE
from ..helpers import indirectStarter, xpath_class from ..helpers import indirectStarter
from ..scraper import _BasicScraper, _ParserScraper from ..scraper import _BasicScraper, _ParserScraper
from ..util import tagre from ..util import tagre
from .common import _ComicControlScraper, _WordPressScraper, _WPWebcomic from .common import _ComicControlScraper, _WordPressScraper, _WPWebcomic
@ -74,7 +74,7 @@ class MarriedToTheSea(_ParserScraper):
url = 'http://marriedtothesea.com/' url = 'http://marriedtothesea.com/'
stripUrl = url + '%s' stripUrl = url + '%s'
firstStripUrl = stripUrl % '022806' firstStripUrl = stripUrl % '022806'
imageSearch = '//div[%s]//p/img' % xpath_class('jumbotron') imageSearch = '//div[d:class("jumbotron")]//p/img'
prevSearch = '//a[contains(text(), "Yesterday")]' prevSearch = '//a[contains(text(), "Yesterday")]'
help = 'Index format: mmddyy' help = 'Index format: mmddyy'

View file

@ -6,7 +6,7 @@
from re import compile, escape from re import compile, escape
from ..scraper import _BasicScraper, _ParserScraper from ..scraper import _BasicScraper, _ParserScraper
from ..helpers import indirectStarter, xpath_class from ..helpers import indirectStarter
from ..util import tagre from ..util import tagre
from .common import _ComicControlScraper, _WordPressScraper, _WPNavi, _WPWebcomic from .common import _ComicControlScraper, _WordPressScraper, _WPNavi, _WPWebcomic
@ -134,7 +134,7 @@ class Nimona(_ParserScraper):
'http://gingerhaze.com/nimona/') 'http://gingerhaze.com/nimona/')
stripUrl = url + 'comic/%s' stripUrl = url + 'comic/%s'
firstStripUrl = stripUrl % "page-1" firstStripUrl = stripUrl % "page-1"
imageSearch = '//div[{}]//img'.format(xpath_class('field-name-field-comic-page')) imageSearch = '//div[d:class("field-name-field-comic-page")]//img'
prevSearch = '//a[img[contains(@src, "/comicdrop_prev_label")]]' prevSearch = '//a[img[contains(@src, "/comicdrop_prev_label")]]'
endOfLife = True endOfLife = True

View file

@ -6,7 +6,7 @@
from re import compile, escape from re import compile, escape
from ..scraper import _BasicScraper, _ParserScraper from ..scraper import _BasicScraper, _ParserScraper
from ..helpers import bounceStarter, queryNamer, indirectStarter, xpath_class from ..helpers import bounceStarter, queryNamer, indirectStarter
from ..util import tagre from ..util import tagre
from .common import _ComicControlScraper, _WordPressScraper, _WPNavi from .common import _ComicControlScraper, _WordPressScraper, _WPNavi
@ -99,8 +99,8 @@ class PennyArcade(_ParserScraper):
stripUrl = url + '%s' stripUrl = url + '%s'
firstStripUrl = stripUrl % '1998/11/18' firstStripUrl = stripUrl % '1998/11/18'
imageSearch = '//div[@id="comicFrame"]//img' imageSearch = '//div[@id="comicFrame"]//img'
prevSearch = '//a[%s]' % xpath_class('btnPrev') prevSearch = '//a[d:class("btnPrev")]'
nextSearch = '//a[%s]' % xpath_class('btnNext') nextSearch = '//a[d:class("btnNext")]'
starter = bounceStarter starter = bounceStarter
help = 'Index format: yyyy/mm/dd' help = 'Index format: yyyy/mm/dd'
@ -231,7 +231,7 @@ class PokeyThePenguin(_ParserScraper):
class PoorlyDrawnLines(_ParserScraper): class PoorlyDrawnLines(_ParserScraper):
url = 'http://poorlydrawnlines.com/comic/' url = 'http://poorlydrawnlines.com/comic/'
firstStripUrl = url + 'campus-characters/' firstStripUrl = url + 'campus-characters/'
imageSearch = '//div[%s]//img' % xpath_class('comic') imageSearch = '//div[d:class("comic")]//img'
prevSearch = '//a[@rel="prev"]' prevSearch = '//a[@rel="prev"]'
@ -269,7 +269,7 @@ class PrinceOfSartar(_WPNavi):
url = 'http://www.princeofsartar.com/' url = 'http://www.princeofsartar.com/'
stripUrl = url + 'comic/%s/' stripUrl = url + 'comic/%s/'
firstStripUrl = stripUrl % 'introduction-chapter-1' firstStripUrl = stripUrl % 'introduction-chapter-1'
nextSearch = '//a[%s]' % xpath_class('navi-next') nextSearch = '//a[d:class("navi-next")]'
starter = bounceStarter starter = bounceStarter
help = 'Index format: name' help = 'Index format: name'

View file

@ -4,7 +4,6 @@
# Copyright (C) 2015-2020 Tobias Gruetzmacher # Copyright (C) 2015-2020 Tobias Gruetzmacher
# Copyright (C) 2019-2020 Daniel Ring # Copyright (C) 2019-2020 Daniel Ring
from ..scraper import _ParserScraper from ..scraper import _ParserScraper
from ..helpers import xpath_class
class QuantumVibe(_ParserScraper): class QuantumVibe(_ParserScraper):
@ -28,6 +27,6 @@ class Qwantz(_ParserScraper):
url = 'http://www.qwantz.com/index.php' url = 'http://www.qwantz.com/index.php'
stripUrl = url + '?comic=%s' stripUrl = url + '?comic=%s'
firstStripUrl = stripUrl % '1' firstStripUrl = stripUrl % '1'
imageSearch = '//img[{}]'.format(xpath_class('comic')) imageSearch = '//img[d:class("comic")]'
prevSearch = '//a[@rel="prev"]' prevSearch = '//a[@rel="prev"]'
help = 'Index format: n' help = 'Index format: n'

View file

@ -6,7 +6,7 @@
from re import compile from re import compile
from urllib.parse import urljoin from urllib.parse import urljoin
from ..helpers import bounceStarter, xpath_class from ..helpers import bounceStarter
from ..scraper import _BasicScraper, _ParserScraper from ..scraper import _BasicScraper, _ParserScraper
from ..util import tagre from ..util import tagre
from .common import _WordPressScraper, _WPWebcomic from .common import _WordPressScraper, _WPWebcomic
@ -107,7 +107,7 @@ class RomanticallyApocalyptic(_ParserScraper):
url = 'http://romanticallyapocalyptic.com/' url = 'http://romanticallyapocalyptic.com/'
stripUrl = url + '%s' stripUrl = url + '%s'
firstStripUrl = stripUrl % '0' firstStripUrl = stripUrl % '0'
imageSearch = '//div[%s]/center//img' % xpath_class('comicpanel') imageSearch = '//div[d:class("comicpanel")]/center//img'
prevSearch = '//a[@accesskey="p"]' prevSearch = '//a[@accesskey="p"]'
help = 'Index format: n' help = 'Index format: n'
adult = True adult = True

View file

@ -7,7 +7,7 @@ from re import compile, escape, IGNORECASE, sub
from os.path import splitext from os.path import splitext
from ..scraper import _BasicScraper, _ParserScraper from ..scraper import _BasicScraper, _ParserScraper
from ..helpers import indirectStarter, bounceStarter, joinPathPartsNamer, xpath_class from ..helpers import indirectStarter, bounceStarter, joinPathPartsNamer
from ..util import tagre from ..util import tagre
from .common import _ComicControlScraper, _WordPressScraper, _WPNavi, _WPNaviIn, _WPWebcomic from .common import _ComicControlScraper, _WordPressScraper, _WPNavi, _WPNaviIn, _WPWebcomic
@ -120,7 +120,7 @@ class SchoolBites(_ParserScraper):
url = ('https://web.archive.org/web/20170215065523/' url = ('https://web.archive.org/web/20170215065523/'
'http://schoolbites.net/') 'http://schoolbites.net/')
stripUrl = url + 'd/%s.html' stripUrl = url + 'd/%s.html'
imageSearch = '//img[{}]'.format(xpath_class('ksc')) imageSearch = '//img[d:class("ksc")]'
prevSearch = '//a[@rel="prev"]' prevSearch = '//a[@rel="prev"]'
endOfLife = True endOfLife = True
help = 'Index format: yyyymmdd' help = 'Index format: yyyymmdd'
@ -132,7 +132,7 @@ class Schuelert(_ParserScraper):
stripUrl = url + 'index.php?paged=%s' stripUrl = url + 'index.php?paged=%s'
firstStripUrl = stripUrl % '3' firstStripUrl = stripUrl % '3'
imageSearch = '//img[contains(@src, "wp-content")]' imageSearch = '//img[contains(@src, "wp-content")]'
prevSearch = '//span[{}]/a'.format(xpath_class('prevlink')) prevSearch = '//span[d:class("prevlink")]/a'
multipleImagesPerStrip = True multipleImagesPerStrip = True
endOfLife = True endOfLife = True
lang = 'de' lang = 'de'
@ -143,7 +143,7 @@ class Science(_ParserScraper):
'http://sci-ence.org/%s/') 'http://sci-ence.org/%s/')
url = stripUrl % 'new-york-comic-con-2013' url = stripUrl % 'new-york-comic-con-2013'
firstStripUrl = stripUrl % 'periodic-table-element-ass' firstStripUrl = stripUrl % 'periodic-table-element-ass'
prevSearch = '//a[{}]'.format(xpath_class('navi-prev')) prevSearch = '//a[d:class("navi-prev")]'
imageSearch = '//div[@class="comicpane"]//img' imageSearch = '//div[@class="comicpane"]//img'
endOfLife = True endOfLife = True
@ -159,7 +159,7 @@ class SequentialArt(_ParserScraper):
url = 'https://www.collectedcurios.com/sequentialart.php' url = 'https://www.collectedcurios.com/sequentialart.php'
stripUrl = url + '?s=%s' stripUrl = url + '?s=%s'
firstStripUrl = stripUrl % '1' firstStripUrl = stripUrl % '1'
imageSearch = '//img[{}]'.format(xpath_class('w3-image')) imageSearch = '//img[d:class("w3-image")]'
prevSearch = '//a[@id="backOne"]' prevSearch = '//a[@id="backOne"]'
help = 'Index format: name' help = 'Index format: name'
@ -286,9 +286,9 @@ class SluggyFreelance(_ParserScraper):
url = 'http://sluggy.com/' url = 'http://sluggy.com/'
stripUrl = 'http://archives.sluggy.com/book.php?chapter=%s' stripUrl = 'http://archives.sluggy.com/book.php?chapter=%s'
firstStripUrl = stripUrl % '1' firstStripUrl = stripUrl % '1'
imageSearch = '//div[%s]/img/@data-src' % xpath_class('comic_content') imageSearch = '//div[d:class("comic_content")]/img/@data-src'
prevSearch = '//div[%s]/a' % xpath_class('previous') prevSearch = '//div[d:class("previous")]/a'
latestSearch = '//a[%s]' % xpath_class('archives_link') latestSearch = '//a[d:class("archives_link")]'
starter = indirectStarter starter = indirectStarter
multipleImagesPerStrip = True multipleImagesPerStrip = True
help = 'Index format: chapter' help = 'Index format: chapter'
@ -374,7 +374,7 @@ class SpaceJunkArlia(_ParserScraper):
url = 'http://spacejunkarlia.com/' url = 'http://spacejunkarlia.com/'
stripUrl = url + '?strip_id=%s' stripUrl = url + '?strip_id=%s'
firstStripUrl = stripUrl % '0' firstStripUrl = stripUrl % '0'
imageSearch = '//div[%s]/img' % xpath_class('content') imageSearch = '//div[d:class("content")]/img'
prevSearch = '//a[text()="<"]' prevSearch = '//a[text()="<"]'
help = 'Index format: number' help = 'Index format: number'
@ -382,7 +382,7 @@ class SpaceJunkArlia(_ParserScraper):
class SpaceTrawler(_ParserScraper): class SpaceTrawler(_ParserScraper):
url = 'https://www.baldwinpage.com/spacetrawler/' url = 'https://www.baldwinpage.com/spacetrawler/'
firstStripUrl = url + '2010/01/01/spacetrawler-4/' firstStripUrl = url + '2010/01/01/spacetrawler-4/'
imageSearch = '//img[%s]' % xpath_class('size-full') imageSearch = '//img[d:class("size-full")]'
prevSearch = '//a[@rel="prev"]' prevSearch = '//a[@rel="prev"]'

View file

@ -3,14 +3,13 @@
# Copyright (C) 2019-2020 Daniel Ring # Copyright (C) 2019-2020 Daniel Ring
import re import re
from ..helpers import xpath_class
from ..scraper import _ParserScraper from ..scraper import _ParserScraper
class SmackJeeves(_ParserScraper): class SmackJeeves(_ParserScraper):
baseUrl = 'https://www.smackjeeves.com/discover/' baseUrl = 'https://www.smackjeeves.com/discover/'
apiBase = 'https://www.smackjeeves.com/api/discover/' apiBase = 'https://www.smackjeeves.com/api/discover/'
prevSearch = '//a[i[{}]]'.format(xpath_class('i-arrow-double-left-black')) prevSearch = '//a[i[d:class("i-arrow-double-left-black")]]'
imageSearch = re.compile("comicData:[^']*'([^']*)'", re.DOTALL) imageSearch = re.compile("comicData:[^']*'([^']*)'", re.DOTALL)
help = 'Index format: n' help = 'Index format: n'

View file

@ -10,7 +10,7 @@ except ImportError:
from cached_property import cached_property from cached_property import cached_property
from ..scraper import _BasicScraper, _ParserScraper from ..scraper import _BasicScraper, _ParserScraper
from ..helpers import indirectStarter, xpath_class from ..helpers import indirectStarter
from ..util import tagre from ..util import tagre
from .common import _ComicControlScraper, _WordPressScraper, _WPNavi, _WPNaviIn, _WPWebcomic from .common import _ComicControlScraper, _WordPressScraper, _WPNavi, _WPNaviIn, _WPWebcomic
@ -40,8 +40,8 @@ class TheBrads(_ParserScraper):
'http://bradcolbow.com/archive/C4/') 'http://bradcolbow.com/archive/C4/')
stripUrl = url + '%s/' stripUrl = url + '%s/'
firstStripUrl = stripUrl % 'P125' firstStripUrl = stripUrl % 'P125'
imageSearch = '//div[{}]//img'.format(xpath_class('entry')) imageSearch = '//div[d:class("entry")]//img'
prevSearch = '//a[{}]'.format(xpath_class('prev')) prevSearch = '//a[d:class("prev")]'
multipleImagesPerStrip = True multipleImagesPerStrip = True
endOfLife = True endOfLife = True
@ -120,7 +120,7 @@ class TheLandscaper(_ParserScraper):
'http://landscaper.visual-assault.net/comic/%s') 'http://landscaper.visual-assault.net/comic/%s')
url = stripUrl % 'latest' url = stripUrl % 'latest'
firstStripUrl = stripUrl % '1' firstStripUrl = stripUrl % '1'
imageSearch = '//article[{}]//img[1]'.format(xpath_class('comic')) imageSearch = '//article[d:class("comic")]//img[1]'
prevSearch = '//a[contains(text(), "Previous")]' prevSearch = '//a[contains(text(), "Previous")]'
endOfLife = True endOfLife = True
@ -294,8 +294,8 @@ class TumbleDryComics(_WordPressScraper):
class Turnoff(_ParserScraper): class Turnoff(_ParserScraper):
name = 'turnoff' name = 'turnoff'
url = 'https://turnoff.us/' url = 'https://turnoff.us/'
imageSearch = '//article[%s]//img' % xpath_class('post-content') imageSearch = '//article[d:class("post-content")]//img'
prevSearch = '//div[%s]//a' % xpath_class('prev') prevSearch = '//div[d:class("prev")]//a'
stripUrl = url + 'geek/%s' stripUrl = url + 'geek/%s'
firstStripUrl = stripUrl % 'tcp-buddies' firstStripUrl = stripUrl % 'tcp-buddies'
multipleImagesPerStrip = True multipleImagesPerStrip = True
@ -341,8 +341,8 @@ class Twokinds(_ParserScraper):
url = 'http://twokinds.keenspot.com/' url = 'http://twokinds.keenspot.com/'
stripUrl = url + 'comic/%s/' stripUrl = url + 'comic/%s/'
firstStripUrl = stripUrl % '1' firstStripUrl = stripUrl % '1'
imageSearch = '//article[%s]//img' % xpath_class('comic') imageSearch = '//article[d:class("comic")]//img'
prevSearch = '//a[%s]' % xpath_class('navprev') prevSearch = '//a[d:class("navprev")]'
help = 'Index format: n (unpadded)' help = 'Index format: n (unpadded)'

View file

@ -6,7 +6,7 @@
from re import compile from re import compile
from ..scraper import _BasicScraper, _ParserScraper from ..scraper import _BasicScraper, _ParserScraper
from ..helpers import indirectStarter, xpath_class from ..helpers import indirectStarter
from ..util import tagre from ..util import tagre
from .common import _ComicControlScraper, _WordPressScraper, _WPNavi from .common import _ComicControlScraper, _WordPressScraper, _WPNavi
@ -46,7 +46,7 @@ class Unsounded(_ParserScraper):
stripUrl = url + 'comic/ch%s/ch%s_%s.html' stripUrl = url + 'comic/ch%s/ch%s_%s.html'
firstStripUrl = stripUrl % ('01', '01', '01') firstStripUrl = stripUrl % ('01', '01', '01')
imageSearch = '//img[contains(@src, "pageart/")]' imageSearch = '//img[contains(@src, "pageart/")]'
prevSearch = '//a[%s]' % xpath_class('back') prevSearch = '//a[d:class("back")]'
latestSearch = '//div[@id="chapter_box"][1]//a[last()]' latestSearch = '//div[@id="chapter_box"][1]//a[last()]'
multipleImagesPerStrip = True multipleImagesPerStrip = True
starter = indirectStarter starter = indirectStarter

View file

@ -6,7 +6,7 @@
from re import compile from re import compile
from ..scraper import _BasicScraper, _ParserScraper from ..scraper import _BasicScraper, _ParserScraper
from ..helpers import bounceStarter, indirectStarter, xpath_class from ..helpers import bounceStarter, indirectStarter
from ..util import tagre from ..util import tagre
@ -71,8 +71,8 @@ class VictimsOfTheSystem(_BasicScraper):
class ViiviJaWagner(_ParserScraper): class ViiviJaWagner(_ParserScraper):
url = 'http://www.hs.fi/viivijawagner/' url = 'http://www.hs.fi/viivijawagner/'
imageSearch = '//meta[@property="og:image"]/@content' imageSearch = '//meta[@property="og:image"]/@content'
prevSearch = '//a[%s]' % xpath_class('prev') prevSearch = '//a[d:class("prev")]'
latestSearch = '//div[%s]//a' % xpath_class('cartoon-content') latestSearch = '//div[d:class("cartoon-content")]//a'
starter = indirectStarter starter = indirectStarter
lang = 'fi' lang = 'fi'

View file

@ -7,7 +7,7 @@ from re import compile, escape, IGNORECASE
from ..scraper import _BasicScraper, _ParserScraper from ..scraper import _BasicScraper, _ParserScraper
from ..util import tagre from ..util import tagre
from ..helpers import bounceStarter, indirectStarter, xpath_class from ..helpers import bounceStarter, indirectStarter
from .common import _ComicControlScraper, _WPNavi, _WPNaviIn, _WPWebcomic from .common import _ComicControlScraper, _WPNavi, _WPNaviIn, _WPWebcomic
@ -28,8 +28,8 @@ class WastedTalent(_BasicScraper):
class WebcomicName(_ParserScraper): class WebcomicName(_ParserScraper):
url = 'https://webcomicname.com/' url = 'https://webcomicname.com/'
imageSearch = '//figure[{}]//img'.format(xpath_class('tmblr-full')) imageSearch = '//figure[d:class("tmblr-full")]//img'
prevSearch = '//a[{}]'.format(xpath_class('next')) prevSearch = '//a[d:class("next")]'
multipleImagesPerStrip = True multipleImagesPerStrip = True
@ -38,10 +38,10 @@ class WebDesignerCOTW(_ParserScraper):
url = baseUrl + 'category/comics/' url = baseUrl + 'category/comics/'
starter = indirectStarter starter = indirectStarter
firstStripUrl = baseUrl + '2009/11/comics-of-the-week-1/' firstStripUrl = baseUrl + '2009/11/comics-of-the-week-1/'
imageSearch = '//article[%s]//img' % xpath_class('article-content') imageSearch = '//article[d:class("article-content")]//img'
multipleImagesPerStrip = True multipleImagesPerStrip = True
prevSearch = '//a[span[%s]]' % xpath_class('icon-right-small') prevSearch = '//a[span[d:class("icon-right-small")]]'
latestSearch = '//a[%s]' % xpath_class('anim-link') latestSearch = '//a[d:class("anim-link")]'
def shouldSkipUrl(self, url, data): def shouldSkipUrl(self, url, data):
"""Skip non-comic URLs.""" """Skip non-comic URLs."""

View file

@ -1,12 +1,12 @@
# SPDX-License-Identifier: MIT # SPDX-License-Identifier: MIT
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam # Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2019 Tobias Gruetzmacher # Copyright (C) 2015-2020 Tobias Gruetzmacher
from re import compile, escape from re import compile, escape
from ..scraper import _BasicScraper, _ParserScraper from ..scraper import _BasicScraper, _ParserScraper
from ..util import tagre from ..util import tagre
from ..helpers import bounceStarter, joinPathPartsNamer, xpath_class from ..helpers import bounceStarter, joinPathPartsNamer
from .common import _WPNavi from .common import _WPNavi
@ -21,8 +21,8 @@ class Zapiro(_ParserScraper):
url = 'http://mg.co.za/zapiro/' url = 'http://mg.co.za/zapiro/'
starter = bounceStarter starter = bounceStarter
imageSearch = '//div[@id="cartoon"]/img' imageSearch = '//div[@id="cartoon"]/img'
prevSearch = '//a[%s]' % xpath_class('left') prevSearch = '//a[d:class("left")]'
nextSearch = '//a[%s]' % xpath_class('right') nextSearch = '//a[d:class("right")]'
namer = joinPathPartsNamer((-1,), ()) namer = joinPathPartsNamer((-1,), ())
@ -31,8 +31,8 @@ class ZenPencils(_WPNavi):
multipleImagesPerStrip = True multipleImagesPerStrip = True
firstStripUrl = url + 'comic/1-ralph-waldo-emerson-make-them-cry/' firstStripUrl = url + 'comic/1-ralph-waldo-emerson-make-them-cry/'
starter = bounceStarter starter = bounceStarter
prevSearch = '//a[%s]' % xpath_class('navi-prev') prevSearch = '//a[d:class("navi-prev")]'
nextSearch = '//a[%s]' % xpath_class('navi-next') nextSearch = '//a[d:class("navi-next")]'
class ZombieHunters(_BasicScraper): class ZombieHunters(_BasicScraper):

View file

@ -26,6 +26,7 @@ from .util import (get_page, makeSequence, get_system_uid, tagre, normaliseURL,
from .comic import ComicStrip from .comic import ComicStrip
from .output import out from .output import out
from .events import getHandler from .events import getHandler
from .xml import NS
ARCHIVE_ORG_URL = re.compile(r'https?://web\.archive\.org/web/[^/]*/') ARCHIVE_ORG_URL = re.compile(r'https?://web\.archive\.org/web/[^/]*/')
@ -434,10 +435,6 @@ class _ParserScraper(Scraper):
XML_DECL = re.compile( XML_DECL = re.compile(
r'^(<\?xml[^>]+)\s+encoding\s*=\s*["\'][^"\']*["\'](\s*\?>|)', re.U) r'^(<\?xml[^>]+)\s+encoding\s*=\s*["\'][^"\']*["\'](\s*\?>|)', re.U)
NS = {
"re": "http://exslt.org/regular-expressions"
}
# Switch between CSS and XPath selectors for this class. Since CSS needs # Switch between CSS and XPath selectors for this class. Since CSS needs
# another Python module, XPath is the default for now. # another Python module, XPath is the default for now.
css = False css = False
@ -519,7 +516,7 @@ class _ParserScraper(Scraper):
searchFun = data.cssselect searchFun = data.cssselect
else: else:
def searchFun(s): def searchFun(s):
return data.xpath(s, namespaces=self.NS) return data.xpath(s, namespaces=NS)
patterns = makeSequence(patterns) patterns = makeSequence(patterns)
for search in patterns: for search in patterns:
matched = False matched = False

20
dosagelib/xml.py Normal file
View file

@ -0,0 +1,20 @@
# SPDX-License-Identifier: MIT
# Copyright (C) 2020 Tobias Gruetzmacher
from lxml import etree
# Prefix -> namespace URI map, passed as ``namespaces=NS`` when evaluating
# XPath expressions.
NS = dict(
    # Project-specific XPath extension functions (e.g. ``d:class``).
    d='https://dosage.rocks/xpath',
    # EXSLT regular-expression functions (e.g. ``re:test``).
    re='http://exslt.org/regular-expressions',
)
def find_by_class(context, cls):
    """XPath extension: test whether the context node carries class *cls*.

    Meant to be called from an XPath predicate; the XPath engine supplies
    *context* as the first argument.

    :param context: evaluation context; only ``context.context_node.attrib``
        is read.
    :param cls: class name to look for, compared as one whole token.
    :returns: ``True`` if *cls* is one of the whitespace-separated tokens of
        the node's ``class`` attribute, ``False`` otherwise (including when
        the attribute is missing).
    """
    # split() without a separator breaks on any run of whitespace, so class
    # lists separated by tabs, newlines or repeated spaces are tokenized
    # correctly and never produce empty tokens.
    return cls in context.context_node.attrib.get('class', '').split()
# Register find_by_class as the XPath function ``class`` in the namespace
# NS['d'], so that expressions evaluated with ``namespaces=NS`` can call it
# as ``d:class("name")``. Runs as a side effect of importing this module.
dosagens = etree.FunctionNamespace(NS['d'])
dosagens['class'] = find_by_class

View file

@ -14,7 +14,7 @@ def _file(name):
@lru_cache() @lru_cache()
def _content(name): def content(name):
with gzip.open(_file(name + '.html.gz'), 'r') as f: with gzip.open(_file(name + '.html.gz'), 'r') as f:
return f.read() return f.read()
@ -26,7 +26,7 @@ def _img(name):
def page(url, pagename): def page(url, pagename):
add(GET, url, _content(pagename)) add(GET, url, content(pagename))
def png(url, name='empty'): def png(url, name='empty'):

25
tests/test_xml.py Normal file
View file

@ -0,0 +1,25 @@
# SPDX-License-Identifier: MIT
# Copyright (C) 2020 Tobias Gruetzmacher
from lxml import html
from dosagelib.xml import NS
import httpmocks
# Parse the 'zp-222' fixture page once at import time; every test in this
# module queries this shared document.
tree = html.document_fromstring(httpmocks.content('zp-222'))
class TestXML:
    """Checks for the XPath extension namespaces exported by dosagelib.xml."""

    def xpath(self, path):
        """Evaluate *path* against the shared ``tree`` with ``NS`` bound."""
        return tree.xpath(path, namespaces=NS)

    def test_class_ext(self):
        """``d:class`` selects the expected number of fixture elements."""
        expected_counts = (
            ('//li[d:class("menu-item-3773")]', 1),
            ('//ul[d:class("menu")]', 1),
            ('//li[d:class("menu-item-object-custom")]', 2),
            ('//li[d:class("menu-item")]', 25),
        )
        for query, count in expected_counts:
            assert len(self.xpath(query)) == count

    def test_re_ext(self):
        """The EXSLT ``re:test`` function is usable through the ``re`` prefix."""
        hits = self.xpath(r'//img[re:test(@src, "posters.*jpg")]')
        assert len(hits) == 1