Merge pull request #169 from webcomics/xpath-ext

Replace xpath_class function with an XPath extension
This commit is contained in:
Tobias Gruetzmacher 2020-08-03 22:18:52 +02:00 committed by GitHub
commit 912b30191d
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
26 changed files with 148 additions and 117 deletions

View file

@ -62,9 +62,3 @@ def indirectStarter(self):
data = self.getPage(url)
newurl = self.fetchUrl(url, data, self.latestSearch)
return self.link_modifier(url, newurl)
def xpath_class(name):
    """Build an XPath predicate matching elements that carry a given class.

    The ``class`` attribute is wrapped in spaces before the comparison so
    that *name* only matches as a complete, space-delimited token and not
    as a substring of a longer class name.
    """
    predicate = 'contains(concat(" ", @class, " "), " %s ")'
    return predicate % name

View file

@ -7,7 +7,7 @@ from re import compile, escape
from ..util import tagre
from ..scraper import _BasicScraper, _ParserScraper
from ..helpers import indirectStarter, xpath_class
from ..helpers import indirectStarter
from .common import _ComicControlScraper, _WordPressScraper, _WPNavi, _WPNaviIn, _WPWebcomic
@ -71,7 +71,7 @@ class Baroquen(_BasicScraper):
class Bearmageddon(_WordPressScraper):
url = 'http://bearmageddon.com/bearmo/page-1/'
firstStripUrl = url
latestSearch = '//a[%s]' % xpath_class('comic-nav-last')
latestSearch = '//a[d:class("comic-nav-last")]'
starter = indirectStarter
@ -187,8 +187,8 @@ class BlankIt(_ParserScraper):
url = 'http://blankitcomics.com/'
firstStripUrl = url + 'comic/well-what-would-you-do'
imageSearch = '//div[@id="comic"]//img'
prevSearch = '//a[%s]' % xpath_class('comic-nav-previous')
latestSearch = '//a[%s]' % xpath_class('comic-nav-last')
prevSearch = '//a[d:class("comic-nav-previous")]'
latestSearch = '//a[d:class("comic-nav-last")]'
starter = indirectStarter
@ -235,7 +235,7 @@ class BMovieComic(_BasicScraper):
class BobWhite(_ParserScraper):
url = 'http://www.bobwhitecomics.com/'
imageSearch = '//span[%s]/img' % xpath_class('webcomic-object')
imageSearch = '//span[d:class("webcomic-object")]/img'
prevSearch = '//a[@rel="previous"]'
@ -296,7 +296,6 @@ class ButImACatPerson(_WordPressScraper):
endOfLife = True
class ButtercupFestival(_ParserScraper):
url = 'http://www.buttercupfestival.com/'
stripUrl = url + '%s.htm'

View file

@ -1,15 +1,15 @@
# SPDX-License-Identifier: MIT
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2019 Tobias Gruetzmacher
from ..helpers import indirectStarter, xpath_class
# Copyright (C) 2015-2020 Tobias Gruetzmacher
from ..helpers import indirectStarter
from ..scraper import _ParserScraper
from ..util import getQueryParams
class CloneManga(_ParserScraper):
baseUrl = 'http://manga.clone-army.org'
imageSearch = '//div[%s]//img' % xpath_class('subsectionContainer')
imageSearch = '//div[d:class("subsectionContainer")]//img'
prevSearch = '//a[span[text()="<<"]]'
latestSearch = '//a[span[text()=">|"]]'
starter = indirectStarter

View file

@ -6,10 +6,10 @@
import os
from ..scraper import _ParserScraper
from ..helpers import bounceStarter, xpath_class
from ..helpers import bounceStarter
XPATH_LINK = '//a[%s and contains(text(), "%s")]'
XPATH_IMG = '//div[{}]//a[img[contains(@alt, "%s")]]'.format(xpath_class('comicnav'))
XPATH_LINK = '//a[d:class("%s") and contains(text(), "%s")]'
XPATH_IMG = '//div[d:class("comicnav")]//a[img[contains(@alt, "%s")]]'
class ComicFury(_ParserScraper):
@ -21,12 +21,12 @@ class ComicFury(_ParserScraper):
# 137 (needs to be before the generic a@rel, because layout is wrong)
'//a[contains(@title, "previous")]',
'//a[@rel="prev"]',
XPATH_LINK % (xpath_class("comicnavlink"), "Previous"),
XPATH_LINK % ('comicnavlink', 'Previous'),
XPATH_IMG % ('Previous'),
# Art, ConsolersDLC, etc.
u'//nav//a[contains(text(), "\u2039")]',
# LatchkeyKingdom
'//a[%s and img[contains(@src, "Previous")]]' % xpath_class('navi'),
'//a[d:class("navi") and img[contains(@src, "Previous")]]',
# RedSpot
'//a[contains(text(), "Back")]',
# KATRAN
@ -37,12 +37,12 @@ class ComicFury(_ParserScraper):
# 137 (see above)
'//a[contains(@title, "next")]',
'//a[@rel="next"]',
XPATH_LINK % (xpath_class("comicnavlink"), "Next"),
XPATH_LINK % ('comicnavlink', 'Next'),
XPATH_IMG % ('Next'),
# Art, ConsolersDLC, etc.
u'//nav//a[contains(text(), "\u203A")]',
# LatchkeyKingdom
'//a[%s and img[contains(@src, "Next")]]' % xpath_class('navi'),
'//a[d:class("navi") and img[contains(@src, "Next")]]',
# RedSpot, KATRAN
'//a[contains(text(), "Next")]',
)

View file

@ -4,7 +4,6 @@
# Copyright (C) 2015-2020 Tobias Gruetzmacher
# Copyright (C) 2019-2020 Daniel Ring
from ..scraper import _ParserScraper
from ..helpers import indirectStarter, xpath_class
# Common base classes for comics with the same structure (same hosting
# software, for example) go here. Since those are shared by many modules,
@ -14,24 +13,24 @@ from ..helpers import indirectStarter, xpath_class
class _WordPressScraper(_ParserScraper):
imageSearch = '//div[@id="comic"]//img'
prevSearch = '//a[%s]' % xpath_class('comic-nav-previous')
nextSearch = '//a[%s]' % xpath_class('comic-nav-next')
latestSearch = '//a[%s]' % xpath_class('comic-nav-last')
prevSearch = '//a[d:class("comic-nav-previous")]'
nextSearch = '//a[d:class("comic-nav-next")]'
latestSearch = '//a[d:class("comic-nav-last")]'
class _WPNavi(_WordPressScraper):
prevSearch = '//a[%s]' % xpath_class('navi-prev')
prevSearch = '//a[d:class("navi-prev")]'
class _WPNaviIn(_WordPressScraper):
prevSearch = '//a[%s]' % xpath_class('navi-prev-in')
prevSearch = '//a[d:class("navi-prev-in")]'
class _WPWebcomic(_WordPressScraper):
imageSearch = '//div[{}]//img'.format(xpath_class('webcomic-image'))
prevSearch = '//a[{}]'.format(xpath_class('previous-webcomic-link'))
nextSearch = '///a[{}]'.format(xpath_class('next-webcomic-link'))
latestSearch = '//a[{}]'.format(xpath_class('last-webcomic-link'))
imageSearch = '//div[d:class("webcomic-image")]//img'
prevSearch = '//a[d:class("previous-webcomic-link")]'
nextSearch = '//a[d:class("next-webcomic-link")]'
latestSearch = '//a[d:class("last-webcomic-link")]'
class _ComicControlScraper(_ParserScraper):

View file

@ -6,7 +6,7 @@
from re import compile, escape
from ..scraper import _BasicScraper, _ParserScraper
from ..helpers import indirectStarter, bounceStarter, xpath_class
from ..helpers import indirectStarter, bounceStarter
from ..util import tagre
from .common import _ComicControlScraper, _WordPressScraper, _WPNaviIn, _WPWebcomic
@ -190,8 +190,8 @@ class Dilbert(_ParserScraper):
stripUrl = url + 'strip/%s'
firstStripUrl = stripUrl % '1989-04-16'
starter = indirectStarter
prevSearch = '//div[%s]/a' % xpath_class('nav-left')
imageSearch = '//img[%s]' % xpath_class('img-comic')
prevSearch = '//div[d:class("nav-left")]/a'
imageSearch = '//img[d:class("img-comic")]'
latestSearch = '//a[@class="img-comic-link"]'
help = 'Index format: yyyy-mm-dd'
@ -260,14 +260,14 @@ class DominicDeegan(_ParserScraper):
class DorkTower(_ParserScraper):
url = 'http://www.dorktower.com/'
firstStripUrl = url + '1997/01/01/shadis-magazine-strip-1/'
imageSearch = '//div[%s]//a/img' % xpath_class('entry-content')
prevSearch = '//a[%s][text()="Previous"]' % xpath_class('btn')
imageSearch = '//div[d:class("entry-content")]//a/img'
prevSearch = '//a[d:class("btn")][text()="Previous"]'
class DoomsdayMyDear(_ParserScraper):
url = 'http://doomsdaymydear.com/'
imageSearch = '//img[{}]'.format(xpath_class('attachment-full'))
prevSearch = '//a[{}]'.format(xpath_class('previous-webcomic-link'))
imageSearch = '//img[d:class("attachment-full")]'
prevSearch = '//a[d:class("previous-webcomic-link")]'
class Draconia(_WPWebcomic):
@ -307,10 +307,9 @@ class DresdenCodak(_ParserScraper):
url = 'http://dresdencodak.com/'
startUrl = url + 'cat/comic/'
firstStripUrl = url + '2007/02/08/pom/'
imageSearch = '//section[%s]//img[%s]' % (
xpath_class('entry-content'), xpath_class('aligncenter'))
imageSearch = '//section[d:class("entry-content")]//img[d:class("aligncenter")]'
prevSearch = '//a[img[contains(@src, "prev")]]'
latestSearch = '//a[%s]' % xpath_class('tc-grid-bg-link')
latestSearch = '//a[d:class("tc-grid-bg-link")]'
starter = indirectStarter
# Blog and comic are mixed...

View file

@ -6,7 +6,7 @@
import os
from re import compile, IGNORECASE
from ..helpers import bounceStarter, indirectStarter, xpath_class
from ..helpers import bounceStarter, indirectStarter
from ..scraper import _BasicScraper, _ParserScraper
from ..util import tagre
from .common import _ComicControlScraper, _WordPressScraper, _WPNavi
@ -35,7 +35,7 @@ class EatLiver(_ParserScraper):
url = 'http://www.eatliver.com/'
starter = indirectStarter
multipleImagesPerStrip = True
imageSearch = '//div[%s]//img' % xpath_class('post-content')
imageSearch = '//div[d:class("post-content")]//img'
prevSearch = '//a[@rel="prev"]'
latestSearch = '//a[@rel="bookmark"]'
@ -175,7 +175,7 @@ class Everblue(_ParserScraper):
class EverybodyLovesEricRaymond(_ParserScraper):
url = 'http://geekz.co.uk/lovesraymond/'
firstStripUrl = url + 'archive/slashdotted'
imageSearch = '//div[%s]//img' % xpath_class('entry-content')
imageSearch = '//div[d:class("entry-content")]//img'
prevSearch = '//a[@rel="prev"]'
@ -255,6 +255,6 @@ class ExtraOrdinary(_ParserScraper):
url = 'https://www.exocomics.com/'
stripUrl = url + '%s'
firstStripUrl = stripUrl % '01'
prevSearch = '//a[%s]' % xpath_class('prev')
imageSearch = '//img[%s]' % xpath_class('image-style-main-comic')
prevSearch = '//a[d:class("prev")]'
imageSearch = '//img[d:class("image-style-main-comic")]'
help = 'Index format: number'

View file

@ -7,7 +7,7 @@ from re import compile, escape
from ..util import tagre
from ..scraper import _BasicScraper, _ParserScraper
from ..helpers import indirectStarter, joinPathPartsNamer, xpath_class
from ..helpers import indirectStarter, joinPathPartsNamer
from .common import _ComicControlScraper, _WPNaviIn, _WordPressScraper
@ -62,8 +62,8 @@ class FirstWorldProblems(_ParserScraper):
'http://bradcolbow.com/archive/C5/')
stripUrl = url + '%s/'
firstStripUrl = stripUrl % 'P10'
imageSearch = '//div[{}]//img'.format(xpath_class('entry'))
prevSearch = '//a[{}]'.format(xpath_class('prev'))
imageSearch = '//div[d:class("entry")]//img'
prevSearch = '//a[d:class("prev")]'
multipleImagesPerStrip = True
endOfLife = True
@ -83,7 +83,7 @@ class Flemcomics(_ParserScraper):
'http://www.flemcomics.com/')
stripUrl = url + 'd/%s.html'
firstStripUrl = stripUrl % '19980101'
imageSearch = '//img[{}]'.format(xpath_class('ksc'))
imageSearch = '//img[d:class("ksc")]'
prevSearch = '//a[@rel="prev"]'
endOfLife = True
help = 'Index format: yyyymmdd'
@ -174,10 +174,10 @@ class FredoAndPidjin(_ParserScraper):
url = 'https://www.pidjin.net/'
stripUrl = url + '%s/'
firstStripUrl = stripUrl % '2006/02/19/goofy-monday'
imageSearch = '//div[%s]//img' % xpath_class("episode")
imageSearch = '//div[d:class("episode")]//img'
multipleImagesPerStrip = True
prevSearch = '//span[%s]/a' % xpath_class("prev")
latestSearch = '//section[%s]//a' % xpath_class("latest")
prevSearch = '//span[d:class("prev")]/a'
latestSearch = '//section[d:class("latest")]//a'
starter = indirectStarter
namer = joinPathPartsNamer((0, 1, 2))

View file

@ -3,14 +3,14 @@
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2020 Tobias Gruetzmacher
from ..scraper import _ParserScraper
from ..helpers import indirectStarter, xpath_class
from ..helpers import indirectStarter
class GoComics(_ParserScraper):
url = 'https://www.gocomics.com/'
imageSearch = '//picture[{}]/img'.format(xpath_class('item-comic-image'))
prevSearch = '//a[{}]'.format(xpath_class('js-previous-comic'))
latestSearch = '//div[{}]//a'.format(xpath_class('gc-deck--cta-0'))
imageSearch = '//picture[d:class("item-comic-image")]/img'
prevSearch = '//a[d:class("js-previous-comic")]'
latestSearch = '//div[d:class("gc-deck--cta-0")]//a'
starter = indirectStarter
help = 'Index format: yyyy/mm/dd'

View file

@ -6,7 +6,7 @@ from re import compile, escape
from ..scraper import _BasicScraper
from ..util import tagre
from ..helpers import indirectStarter, xpath_class
from ..helpers import indirectStarter
from .common import _ComicControlScraper
@ -41,7 +41,7 @@ class JoeAndMonkey(_BasicScraper):
class JohnnyWander(_ComicControlScraper):
imageSearch = ('//ul[%s]/li/@data-src' % xpath_class('cc-showbig'),
imageSearch = ('//ul[d:class("cc-showbig")]/li/@data-src',
_ComicControlScraper.imageSearch)
url = 'http://www.johnnywander.com/'

View file

@ -6,7 +6,7 @@
import json
from re import compile, escape, IGNORECASE
from ..helpers import indirectStarter, xpath_class
from ..helpers import indirectStarter
from ..scraper import _BasicScraper, _ParserScraper
from ..util import tagre
from .common import _ComicControlScraper, _WordPressScraper, _WPWebcomic
@ -74,7 +74,7 @@ class MarriedToTheSea(_ParserScraper):
url = 'http://marriedtothesea.com/'
stripUrl = url + '%s'
firstStripUrl = stripUrl % '022806'
imageSearch = '//div[%s]//p/img' % xpath_class('jumbotron')
imageSearch = '//div[d:class("jumbotron")]//p/img'
prevSearch = '//a[contains(text(), "Yesterday")]'
help = 'Index format: mmddyy'

View file

@ -6,7 +6,7 @@
from re import compile, escape
from ..scraper import _BasicScraper, _ParserScraper
from ..helpers import indirectStarter, xpath_class
from ..helpers import indirectStarter
from ..util import tagre
from .common import _ComicControlScraper, _WordPressScraper, _WPNavi, _WPWebcomic
@ -134,7 +134,7 @@ class Nimona(_ParserScraper):
'http://gingerhaze.com/nimona/')
stripUrl = url + 'comic/%s'
firstStripUrl = stripUrl % "page-1"
imageSearch = '//div[{}]//img'.format(xpath_class('field-name-field-comic-page'))
imageSearch = '//div[d:class("field-name-field-comic-page")]//img'
prevSearch = '//a[img[contains(@src, "/comicdrop_prev_label")]]'
endOfLife = True

View file

@ -6,7 +6,7 @@
from re import compile, escape
from ..scraper import _BasicScraper, _ParserScraper
from ..helpers import bounceStarter, queryNamer, indirectStarter, xpath_class
from ..helpers import bounceStarter, queryNamer, indirectStarter
from ..util import tagre
from .common import _ComicControlScraper, _WordPressScraper, _WPNavi
@ -99,8 +99,8 @@ class PennyArcade(_ParserScraper):
stripUrl = url + '%s'
firstStripUrl = stripUrl % '1998/11/18'
imageSearch = '//div[@id="comicFrame"]//img'
prevSearch = '//a[%s]' % xpath_class('btnPrev')
nextSearch = '//a[%s]' % xpath_class('btnNext')
prevSearch = '//a[d:class("btnPrev")]'
nextSearch = '//a[d:class("btnNext")]'
starter = bounceStarter
help = 'Index format: yyyy/mm/dd'
@ -231,7 +231,7 @@ class PokeyThePenguin(_ParserScraper):
class PoorlyDrawnLines(_ParserScraper):
url = 'http://poorlydrawnlines.com/comic/'
firstStripUrl = url + 'campus-characters/'
imageSearch = '//div[%s]//img' % xpath_class('comic')
imageSearch = '//div[d:class("comic")]//img'
prevSearch = '//a[@rel="prev"]'
@ -269,7 +269,7 @@ class PrinceOfSartar(_WPNavi):
url = 'http://www.princeofsartar.com/'
stripUrl = url + 'comic/%s/'
firstStripUrl = stripUrl % 'introduction-chapter-1'
nextSearch = '//a[%s]' % xpath_class('navi-next')
nextSearch = '//a[d:class("navi-next")]'
starter = bounceStarter
help = 'Index format: name'

View file

@ -4,7 +4,6 @@
# Copyright (C) 2015-2020 Tobias Gruetzmacher
# Copyright (C) 2019-2020 Daniel Ring
from ..scraper import _ParserScraper
from ..helpers import xpath_class
class QuantumVibe(_ParserScraper):
@ -28,6 +27,6 @@ class Qwantz(_ParserScraper):
url = 'http://www.qwantz.com/index.php'
stripUrl = url + '?comic=%s'
firstStripUrl = stripUrl % '1'
imageSearch = '//img[{}]'.format(xpath_class('comic'))
imageSearch = '//img[d:class("comic")]'
prevSearch = '//a[@rel="prev"]'
help = 'Index format: n'

View file

@ -6,7 +6,7 @@
from re import compile
from urllib.parse import urljoin
from ..helpers import bounceStarter, xpath_class
from ..helpers import bounceStarter
from ..scraper import _BasicScraper, _ParserScraper
from ..util import tagre
from .common import _WordPressScraper, _WPWebcomic
@ -107,7 +107,7 @@ class RomanticallyApocalyptic(_ParserScraper):
url = 'http://romanticallyapocalyptic.com/'
stripUrl = url + '%s'
firstStripUrl = stripUrl % '0'
imageSearch = '//div[%s]/center//img' % xpath_class('comicpanel')
imageSearch = '//div[d:class("comicpanel")]/center//img'
prevSearch = '//a[@accesskey="p"]'
help = 'Index format: n'
adult = True

View file

@ -7,7 +7,7 @@ from re import compile, escape, IGNORECASE, sub
from os.path import splitext
from ..scraper import _BasicScraper, _ParserScraper
from ..helpers import indirectStarter, bounceStarter, joinPathPartsNamer, xpath_class
from ..helpers import indirectStarter, bounceStarter, joinPathPartsNamer
from ..util import tagre
from .common import _ComicControlScraper, _WordPressScraper, _WPNavi, _WPNaviIn, _WPWebcomic
@ -120,7 +120,7 @@ class SchoolBites(_ParserScraper):
url = ('https://web.archive.org/web/20170215065523/'
'http://schoolbites.net/')
stripUrl = url + 'd/%s.html'
imageSearch = '//img[{}]'.format(xpath_class('ksc'))
imageSearch = '//img[d:class("ksc")]'
prevSearch = '//a[@rel="prev"]'
endOfLife = True
help = 'Index format: yyyymmdd'
@ -132,7 +132,7 @@ class Schuelert(_ParserScraper):
stripUrl = url + 'index.php?paged=%s'
firstStripUrl = stripUrl % '3'
imageSearch = '//img[contains(@src, "wp-content")]'
prevSearch = '//span[{}]/a'.format(xpath_class('prevlink'))
prevSearch = '//span[d:class("prevlink")]/a'
multipleImagesPerStrip = True
endOfLife = True
lang = 'de'
@ -143,7 +143,7 @@ class Science(_ParserScraper):
'http://sci-ence.org/%s/')
url = stripUrl % 'new-york-comic-con-2013'
firstStripUrl = stripUrl % 'periodic-table-element-ass'
prevSearch = '//a[{}]'.format(xpath_class('navi-prev'))
prevSearch = '//a[d:class("navi-prev")]'
imageSearch = '//div[@class="comicpane"]//img'
endOfLife = True
@ -159,7 +159,7 @@ class SequentialArt(_ParserScraper):
url = 'https://www.collectedcurios.com/sequentialart.php'
stripUrl = url + '?s=%s'
firstStripUrl = stripUrl % '1'
imageSearch = '//img[{}]'.format(xpath_class('w3-image'))
imageSearch = '//img[d:class("w3-image")]'
prevSearch = '//a[@id="backOne"]'
help = 'Index format: name'
@ -286,9 +286,9 @@ class SluggyFreelance(_ParserScraper):
url = 'http://sluggy.com/'
stripUrl = 'http://archives.sluggy.com/book.php?chapter=%s'
firstStripUrl = stripUrl % '1'
imageSearch = '//div[%s]/img/@data-src' % xpath_class('comic_content')
prevSearch = '//div[%s]/a' % xpath_class('previous')
latestSearch = '//a[%s]' % xpath_class('archives_link')
imageSearch = '//div[d:class("comic_content")]/img/@data-src'
prevSearch = '//div[d:class("previous")]/a'
latestSearch = '//a[d:class("archives_link")]'
starter = indirectStarter
multipleImagesPerStrip = True
help = 'Index format: chapter'
@ -374,7 +374,7 @@ class SpaceJunkArlia(_ParserScraper):
url = 'http://spacejunkarlia.com/'
stripUrl = url + '?strip_id=%s'
firstStripUrl = stripUrl % '0'
imageSearch = '//div[%s]/img' % xpath_class('content')
imageSearch = '//div[d:class("content")]/img'
prevSearch = '//a[text()="<"]'
help = 'Index format: number'
@ -382,7 +382,7 @@ class SpaceJunkArlia(_ParserScraper):
class SpaceTrawler(_ParserScraper):
url = 'https://www.baldwinpage.com/spacetrawler/'
firstStripUrl = url + '2010/01/01/spacetrawler-4/'
imageSearch = '//img[%s]' % xpath_class('size-full')
imageSearch = '//img[d:class("size-full")]'
prevSearch = '//a[@rel="prev"]'

View file

@ -3,14 +3,13 @@
# Copyright (C) 2019-2020 Daniel Ring
import re
from ..helpers import xpath_class
from ..scraper import _ParserScraper
class SmackJeeves(_ParserScraper):
baseUrl = 'https://www.smackjeeves.com/discover/'
apiBase = 'https://www.smackjeeves.com/api/discover/'
prevSearch = '//a[i[{}]]'.format(xpath_class('i-arrow-double-left-black'))
prevSearch = '//a[i[d:class("i-arrow-double-left-black")]]'
imageSearch = re.compile("comicData:[^']*'([^']*)'", re.DOTALL)
help = 'Index format: n'

View file

@ -10,7 +10,7 @@ except ImportError:
from cached_property import cached_property
from ..scraper import _BasicScraper, _ParserScraper
from ..helpers import indirectStarter, xpath_class
from ..helpers import indirectStarter
from ..util import tagre
from .common import _ComicControlScraper, _WordPressScraper, _WPNavi, _WPNaviIn, _WPWebcomic
@ -40,8 +40,8 @@ class TheBrads(_ParserScraper):
'http://bradcolbow.com/archive/C4/')
stripUrl = url + '%s/'
firstStripUrl = stripUrl % 'P125'
imageSearch = '//div[{}]//img'.format(xpath_class('entry'))
prevSearch = '//a[{}]'.format(xpath_class('prev'))
imageSearch = '//div[d:class("entry")]//img'
prevSearch = '//a[d:class("prev")]'
multipleImagesPerStrip = True
endOfLife = True
@ -120,7 +120,7 @@ class TheLandscaper(_ParserScraper):
'http://landscaper.visual-assault.net/comic/%s')
url = stripUrl % 'latest'
firstStripUrl = stripUrl % '1'
imageSearch = '//article[{}]//img[1]'.format(xpath_class('comic'))
imageSearch = '//article[d:class("comic")]//img[1]'
prevSearch = '//a[contains(text(), "Previous")]'
endOfLife = True
@ -294,8 +294,8 @@ class TumbleDryComics(_WordPressScraper):
class Turnoff(_ParserScraper):
name = 'turnoff'
url = 'https://turnoff.us/'
imageSearch = '//article[%s]//img' % xpath_class('post-content')
prevSearch = '//div[%s]//a' % xpath_class('prev')
imageSearch = '//article[d:class("post-content")]//img'
prevSearch = '//div[d:class("prev")]//a'
stripUrl = url + 'geek/%s'
firstStripUrl = stripUrl % 'tcp-buddies'
multipleImagesPerStrip = True
@ -341,8 +341,8 @@ class Twokinds(_ParserScraper):
url = 'http://twokinds.keenspot.com/'
stripUrl = url + 'comic/%s/'
firstStripUrl = stripUrl % '1'
imageSearch = '//article[%s]//img' % xpath_class('comic')
prevSearch = '//a[%s]' % xpath_class('navprev')
imageSearch = '//article[d:class("comic")]//img'
prevSearch = '//a[d:class("navprev")]'
help = 'Index format: n (unpadded)'

View file

@ -6,7 +6,7 @@
from re import compile
from ..scraper import _BasicScraper, _ParserScraper
from ..helpers import indirectStarter, xpath_class
from ..helpers import indirectStarter
from ..util import tagre
from .common import _ComicControlScraper, _WordPressScraper, _WPNavi
@ -46,7 +46,7 @@ class Unsounded(_ParserScraper):
stripUrl = url + 'comic/ch%s/ch%s_%s.html'
firstStripUrl = stripUrl % ('01', '01', '01')
imageSearch = '//img[contains(@src, "pageart/")]'
prevSearch = '//a[%s]' % xpath_class('back')
prevSearch = '//a[d:class("back")]'
latestSearch = '//div[@id="chapter_box"][1]//a[last()]'
multipleImagesPerStrip = True
starter = indirectStarter

View file

@ -6,7 +6,7 @@
from re import compile
from ..scraper import _BasicScraper, _ParserScraper
from ..helpers import bounceStarter, indirectStarter, xpath_class
from ..helpers import bounceStarter, indirectStarter
from ..util import tagre
@ -71,8 +71,8 @@ class VictimsOfTheSystem(_BasicScraper):
class ViiviJaWagner(_ParserScraper):
url = 'http://www.hs.fi/viivijawagner/'
imageSearch = '//meta[@property="og:image"]/@content'
prevSearch = '//a[%s]' % xpath_class('prev')
latestSearch = '//div[%s]//a' % xpath_class('cartoon-content')
prevSearch = '//a[d:class("prev")]'
latestSearch = '//div[d:class("cartoon-content")]//a'
starter = indirectStarter
lang = 'fi'

View file

@ -7,7 +7,7 @@ from re import compile, escape, IGNORECASE
from ..scraper import _BasicScraper, _ParserScraper
from ..util import tagre
from ..helpers import bounceStarter, indirectStarter, xpath_class
from ..helpers import bounceStarter, indirectStarter
from .common import _ComicControlScraper, _WPNavi, _WPNaviIn, _WPWebcomic
@ -28,8 +28,8 @@ class WastedTalent(_BasicScraper):
class WebcomicName(_ParserScraper):
url = 'https://webcomicname.com/'
imageSearch = '//figure[{}]//img'.format(xpath_class('tmblr-full'))
prevSearch = '//a[{}]'.format(xpath_class('next'))
imageSearch = '//figure[d:class("tmblr-full")]//img'
prevSearch = '//a[d:class("next")]'
multipleImagesPerStrip = True
@ -38,10 +38,10 @@ class WebDesignerCOTW(_ParserScraper):
url = baseUrl + 'category/comics/'
starter = indirectStarter
firstStripUrl = baseUrl + '2009/11/comics-of-the-week-1/'
imageSearch = '//article[%s]//img' % xpath_class('article-content')
imageSearch = '//article[d:class("article-content")]//img'
multipleImagesPerStrip = True
prevSearch = '//a[span[%s]]' % xpath_class('icon-right-small')
latestSearch = '//a[%s]' % xpath_class('anim-link')
prevSearch = '//a[span[d:class("icon-right-small")]]'
latestSearch = '//a[d:class("anim-link")]'
def shouldSkipUrl(self, url, data):
"""Skip non-comic URLs."""

View file

@ -1,12 +1,12 @@
# SPDX-License-Identifier: MIT
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2019 Tobias Gruetzmacher
# Copyright (C) 2015-2020 Tobias Gruetzmacher
from re import compile, escape
from ..scraper import _BasicScraper, _ParserScraper
from ..util import tagre
from ..helpers import bounceStarter, joinPathPartsNamer, xpath_class
from ..helpers import bounceStarter, joinPathPartsNamer
from .common import _WPNavi
@ -21,8 +21,8 @@ class Zapiro(_ParserScraper):
url = 'http://mg.co.za/zapiro/'
starter = bounceStarter
imageSearch = '//div[@id="cartoon"]/img'
prevSearch = '//a[%s]' % xpath_class('left')
nextSearch = '//a[%s]' % xpath_class('right')
prevSearch = '//a[d:class("left")]'
nextSearch = '//a[d:class("right")]'
namer = joinPathPartsNamer((-1,), ())
@ -31,8 +31,8 @@ class ZenPencils(_WPNavi):
multipleImagesPerStrip = True
firstStripUrl = url + 'comic/1-ralph-waldo-emerson-make-them-cry/'
starter = bounceStarter
prevSearch = '//a[%s]' % xpath_class('navi-prev')
nextSearch = '//a[%s]' % xpath_class('navi-next')
prevSearch = '//a[d:class("navi-prev")]'
nextSearch = '//a[d:class("navi-next")]'
class ZombieHunters(_BasicScraper):

View file

@ -26,6 +26,7 @@ from .util import (get_page, makeSequence, get_system_uid, tagre, normaliseURL,
from .comic import ComicStrip
from .output import out
from .events import getHandler
from .xml import NS
ARCHIVE_ORG_URL = re.compile(r'https?://web\.archive\.org/web/[^/]*/')
@ -434,10 +435,6 @@ class _ParserScraper(Scraper):
XML_DECL = re.compile(
r'^(<\?xml[^>]+)\s+encoding\s*=\s*["\'][^"\']*["\'](\s*\?>|)', re.U)
NS = {
"re": "http://exslt.org/regular-expressions"
}
# Switch between CSS and XPath selectors for this class. Since CSS needs
# another Python module, XPath is the default for now.
css = False
@ -519,7 +516,7 @@ class _ParserScraper(Scraper):
searchFun = data.cssselect
else:
def searchFun(s):
return data.xpath(s, namespaces=self.NS)
return data.xpath(s, namespaces=NS)
patterns = makeSequence(patterns)
for search in patterns:
matched = False

20
dosagelib/xml.py Normal file
View file

@ -0,0 +1,20 @@
# SPDX-License-Identifier: MIT
# Copyright (C) 2020 Tobias Gruetzmacher
from lxml import etree
NS = {
'd': 'https://dosage.rocks/xpath',
're': 'http://exslt.org/regular-expressions'
}
def find_by_class(context, cls):
    """XPath extension function: does the context node carry class *cls*?

    Intended to be called by lxml as ``d:class("name")`` inside a predicate.

    :param context: evaluation context supplied by lxml; only its
        ``context_node.attrib`` mapping is read.
    :param cls: the class name to look for.
    :returns: ``True`` if *cls* is one of the whitespace-separated tokens of
        the node's ``class`` attribute, ``False`` otherwise (including when
        the node has no ``class`` attribute at all).
    """
    attributes = context.context_node.attrib
    if 'class' not in attributes:
        return False
    # split() without an argument splits on any run of whitespace, so class
    # lists separated by tabs, newlines or repeated spaces are handled and
    # no empty tokens are produced.
    return cls in attributes['class'].split()
dosagens = etree.FunctionNamespace(NS['d'])
dosagens['class'] = find_by_class

View file

@ -14,7 +14,7 @@ def _file(name):
@lru_cache()
def _content(name):
def content(name):
with gzip.open(_file(name + '.html.gz'), 'r') as f:
return f.read()
@ -26,7 +26,7 @@ def _img(name):
def page(url, pagename):
add(GET, url, _content(pagename))
add(GET, url, content(pagename))
def png(url, name='empty'):

25
tests/test_xml.py Normal file
View file

@ -0,0 +1,25 @@
# SPDX-License-Identifier: MIT
# Copyright (C) 2020 Tobias Gruetzmacher
from lxml import html
from dosagelib.xml import NS
import httpmocks
tree = html.document_fromstring(httpmocks.content('zp-222'))
class TestXML:
    """Exercise the XPath extension functions reachable through ``NS``."""

    def xpath(self, path):
        """Evaluate *path* on the module-level fixture tree with ``NS``."""
        return tree.xpath(path, namespaces=NS)

    def test_class_ext(self):
        """``d:class`` matches whole class tokens only."""
        expectations = (
            ('//li[d:class("menu-item-3773")]', 1),
            ('//ul[d:class("menu")]', 1),
            ('//li[d:class("menu-item-object-custom")]', 2),
            ('//li[d:class("menu-item")]', 25),
        )
        for path, count in expectations:
            assert len(self.xpath(path)) == count

    def test_re_ext(self):
        """The ``re`` prefix is usable for regular-expression tests."""
        matches = self.xpath(r'//img[re:test(@src, "posters.*jpg")]')
        assert len(matches) == 1