Merge pull request #169 from webcomics/xpath-ext
Replace xpath_class function with an XPath extension
This commit is contained in:
commit
912b30191d
26 changed files with 148 additions and 117 deletions
|
@ -62,9 +62,3 @@ def indirectStarter(self):
|
|||
data = self.getPage(url)
|
||||
newurl = self.fetchUrl(url, data, self.latestSearch)
|
||||
return self.link_modifier(url, newurl)
|
||||
|
||||
|
||||
def xpath_class(name):
    """Return an XPath predicate which matches tags having a given class.

    The returned string is meant to be placed inside square brackets, for
    example ``'//a[%s]' % xpath_class('comic-nav-last')``.

    The ``class`` attribute is run through ``normalize-space()`` before the
    comparison, so class names separated by tabs, newlines or repeated
    spaces are found as well, not only those separated by a single space.
    """
    return 'contains(concat(" ", normalize-space(@class), " "), " %s ")' % name
|
||||
|
|
|
@ -7,7 +7,7 @@ from re import compile, escape
|
|||
|
||||
from ..util import tagre
|
||||
from ..scraper import _BasicScraper, _ParserScraper
|
||||
from ..helpers import indirectStarter, xpath_class
|
||||
from ..helpers import indirectStarter
|
||||
from .common import _ComicControlScraper, _WordPressScraper, _WPNavi, _WPNaviIn, _WPWebcomic
|
||||
|
||||
|
||||
|
@ -71,7 +71,7 @@ class Baroquen(_BasicScraper):
|
|||
class Bearmageddon(_WordPressScraper):
|
||||
url = 'http://bearmageddon.com/bearmo/page-1/'
|
||||
firstStripUrl = url
|
||||
latestSearch = '//a[%s]' % xpath_class('comic-nav-last')
|
||||
latestSearch = '//a[d:class("comic-nav-last")]'
|
||||
starter = indirectStarter
|
||||
|
||||
|
||||
|
@ -187,8 +187,8 @@ class BlankIt(_ParserScraper):
|
|||
url = 'http://blankitcomics.com/'
|
||||
firstStripUrl = url + 'comic/well-what-would-you-do'
|
||||
imageSearch = '//div[@id="comic"]//img'
|
||||
prevSearch = '//a[%s]' % xpath_class('comic-nav-previous')
|
||||
latestSearch = '//a[%s]' % xpath_class('comic-nav-last')
|
||||
prevSearch = '//a[d:class("comic-nav-previous")]'
|
||||
latestSearch = '//a[d:class("comic-nav-last")]'
|
||||
starter = indirectStarter
|
||||
|
||||
|
||||
|
@ -235,7 +235,7 @@ class BMovieComic(_BasicScraper):
|
|||
|
||||
class BobWhite(_ParserScraper):
|
||||
url = 'http://www.bobwhitecomics.com/'
|
||||
imageSearch = '//span[%s]/img' % xpath_class('webcomic-object')
|
||||
imageSearch = '//span[d:class("webcomic-object")]/img'
|
||||
prevSearch = '//a[@rel="previous"]'
|
||||
|
||||
|
||||
|
@ -296,7 +296,6 @@ class ButImACatPerson(_WordPressScraper):
|
|||
endOfLife = True
|
||||
|
||||
|
||||
|
||||
class ButtercupFestival(_ParserScraper):
|
||||
url = 'http://www.buttercupfestival.com/'
|
||||
stripUrl = url + '%s.htm'
|
||||
|
|
|
@ -1,15 +1,15 @@
|
|||
# SPDX-License-Identifier: MIT
|
||||
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
|
||||
# Copyright (C) 2012-2014 Bastian Kleineidam
|
||||
# Copyright (C) 2015-2019 Tobias Gruetzmacher
|
||||
from ..helpers import indirectStarter, xpath_class
|
||||
# Copyright (C) 2015-2020 Tobias Gruetzmacher
|
||||
from ..helpers import indirectStarter
|
||||
from ..scraper import _ParserScraper
|
||||
from ..util import getQueryParams
|
||||
|
||||
|
||||
class CloneManga(_ParserScraper):
|
||||
baseUrl = 'http://manga.clone-army.org'
|
||||
imageSearch = '//div[%s]//img' % xpath_class('subsectionContainer')
|
||||
imageSearch = '//div[d:class("subsectionContainer")]//img'
|
||||
prevSearch = '//a[span[text()="<<"]]'
|
||||
latestSearch = '//a[span[text()=">|"]]'
|
||||
starter = indirectStarter
|
||||
|
|
|
@ -6,10 +6,10 @@
|
|||
import os
|
||||
|
||||
from ..scraper import _ParserScraper
|
||||
from ..helpers import bounceStarter, xpath_class
|
||||
from ..helpers import bounceStarter
|
||||
|
||||
XPATH_LINK = '//a[%s and contains(text(), "%s")]'
|
||||
XPATH_IMG = '//div[{}]//a[img[contains(@alt, "%s")]]'.format(xpath_class('comicnav'))
|
||||
XPATH_LINK = '//a[d:class("%s") and contains(text(), "%s")]'
|
||||
XPATH_IMG = '//div[d:class("comicnav")]//a[img[contains(@alt, "%s")]]'
|
||||
|
||||
|
||||
class ComicFury(_ParserScraper):
|
||||
|
@ -21,12 +21,12 @@ class ComicFury(_ParserScraper):
|
|||
# 137 (needs to be before the generic a@rel, because layout is wrong)
|
||||
'//a[contains(@title, "previous")]',
|
||||
'//a[@rel="prev"]',
|
||||
XPATH_LINK % (xpath_class("comicnavlink"), "Previous"),
|
||||
XPATH_LINK % ('comicnavlink', 'Previous'),
|
||||
XPATH_IMG % ('Previous'),
|
||||
# Art, ConsolersDLC, etc.
|
||||
u'//nav//a[contains(text(), "\u2039")]',
|
||||
# LatchkeyKingdom
|
||||
'//a[%s and img[contains(@src, "Previous")]]' % xpath_class('navi'),
|
||||
'//a[d:class("navi") and img[contains(@src, "Previous")]]',
|
||||
# RedSpot
|
||||
'//a[contains(text(), "Back")]',
|
||||
# KATRAN
|
||||
|
@ -37,12 +37,12 @@ class ComicFury(_ParserScraper):
|
|||
# 137 (see above)
|
||||
'//a[contains(@title, "next")]',
|
||||
'//a[@rel="next"]',
|
||||
XPATH_LINK % (xpath_class("comicnavlink"), "Next"),
|
||||
XPATH_LINK % ('comicnavlink', 'Next'),
|
||||
XPATH_IMG % ('Next'),
|
||||
# Art, ConsolersDLC, etc.
|
||||
u'//nav//a[contains(text(), "\u203A")]',
|
||||
# LatchkeyKingdom
|
||||
'//a[%s and img[contains(@src, "Next")]]' % xpath_class('navi'),
|
||||
'//a[d:class("navi") and img[contains(@src, "Next")]]',
|
||||
# RedSpot, KATRAN
|
||||
'//a[contains(text(), "Next")]',
|
||||
)
|
||||
|
|
|
@ -4,7 +4,6 @@
|
|||
# Copyright (C) 2015-2020 Tobias Gruetzmacher
|
||||
# Copyright (C) 2019-2020 Daniel Ring
|
||||
from ..scraper import _ParserScraper
|
||||
from ..helpers import indirectStarter, xpath_class
|
||||
|
||||
# Common base classes for comics with the same structure (same hosting
|
||||
# software, for example) go here. Since those are shared by many modules,
|
||||
|
@ -14,24 +13,24 @@ from ..helpers import indirectStarter, xpath_class
|
|||
|
||||
class _WordPressScraper(_ParserScraper):
|
||||
imageSearch = '//div[@id="comic"]//img'
|
||||
prevSearch = '//a[%s]' % xpath_class('comic-nav-previous')
|
||||
nextSearch = '//a[%s]' % xpath_class('comic-nav-next')
|
||||
latestSearch = '//a[%s]' % xpath_class('comic-nav-last')
|
||||
prevSearch = '//a[d:class("comic-nav-previous")]'
|
||||
nextSearch = '//a[d:class("comic-nav-next")]'
|
||||
latestSearch = '//a[d:class("comic-nav-last")]'
|
||||
|
||||
|
||||
class _WPNavi(_WordPressScraper):
|
||||
prevSearch = '//a[%s]' % xpath_class('navi-prev')
|
||||
prevSearch = '//a[d:class("navi-prev")]'
|
||||
|
||||
|
||||
class _WPNaviIn(_WordPressScraper):
|
||||
prevSearch = '//a[%s]' % xpath_class('navi-prev-in')
|
||||
prevSearch = '//a[d:class("navi-prev-in")]'
|
||||
|
||||
|
||||
class _WPWebcomic(_WordPressScraper):
|
||||
imageSearch = '//div[{}]//img'.format(xpath_class('webcomic-image'))
|
||||
prevSearch = '//a[{}]'.format(xpath_class('previous-webcomic-link'))
|
||||
nextSearch = '///a[{}]'.format(xpath_class('next-webcomic-link'))
|
||||
latestSearch = '//a[{}]'.format(xpath_class('last-webcomic-link'))
|
||||
imageSearch = '//div[d:class("webcomic-image")]//img'
|
||||
prevSearch = '//a[d:class("previous-webcomic-link")]'
|
||||
nextSearch = '//a[d:class("next-webcomic-link")]'
|
||||
latestSearch = '//a[d:class("last-webcomic-link")]'
|
||||
|
||||
|
||||
class _ComicControlScraper(_ParserScraper):
|
||||
|
|
|
@ -6,7 +6,7 @@
|
|||
from re import compile, escape
|
||||
|
||||
from ..scraper import _BasicScraper, _ParserScraper
|
||||
from ..helpers import indirectStarter, bounceStarter, xpath_class
|
||||
from ..helpers import indirectStarter, bounceStarter
|
||||
from ..util import tagre
|
||||
from .common import _ComicControlScraper, _WordPressScraper, _WPNaviIn, _WPWebcomic
|
||||
|
||||
|
@ -190,8 +190,8 @@ class Dilbert(_ParserScraper):
|
|||
stripUrl = url + 'strip/%s'
|
||||
firstStripUrl = stripUrl % '1989-04-16'
|
||||
starter = indirectStarter
|
||||
prevSearch = '//div[%s]/a' % xpath_class('nav-left')
|
||||
imageSearch = '//img[%s]' % xpath_class('img-comic')
|
||||
prevSearch = '//div[d:class("nav-left")]/a'
|
||||
imageSearch = '//img[d:class("img-comic")]'
|
||||
latestSearch = '//a[@class="img-comic-link"]'
|
||||
help = 'Index format: yyyy-mm-dd'
|
||||
|
||||
|
@ -260,14 +260,14 @@ class DominicDeegan(_ParserScraper):
|
|||
class DorkTower(_ParserScraper):
|
||||
url = 'http://www.dorktower.com/'
|
||||
firstStripUrl = url + '1997/01/01/shadis-magazine-strip-1/'
|
||||
imageSearch = '//div[%s]//a/img' % xpath_class('entry-content')
|
||||
prevSearch = '//a[%s][text()="Previous"]' % xpath_class('btn')
|
||||
imageSearch = '//div[d:class("entry-content")]//a/img'
|
||||
prevSearch = '//a[d:class("btn")][text()="Previous"]'
|
||||
|
||||
|
||||
class DoomsdayMyDear(_ParserScraper):
|
||||
url = 'http://doomsdaymydear.com/'
|
||||
imageSearch = '//img[{}]'.format(xpath_class('attachment-full'))
|
||||
prevSearch = '//a[{}]'.format(xpath_class('previous-webcomic-link'))
|
||||
imageSearch = '//img[d:class("attachment-full")]'
|
||||
prevSearch = '//a[d:class("previous-webcomic-link")]'
|
||||
|
||||
|
||||
class Draconia(_WPWebcomic):
|
||||
|
@ -307,10 +307,9 @@ class DresdenCodak(_ParserScraper):
|
|||
url = 'http://dresdencodak.com/'
|
||||
startUrl = url + 'cat/comic/'
|
||||
firstStripUrl = url + '2007/02/08/pom/'
|
||||
imageSearch = '//section[%s]//img[%s]' % (
|
||||
xpath_class('entry-content'), xpath_class('aligncenter'))
|
||||
imageSearch = '//section[d:class("entry-content")]//img[d:class("aligncenter")]'
|
||||
prevSearch = '//a[img[contains(@src, "prev")]]'
|
||||
latestSearch = '//a[%s]' % xpath_class('tc-grid-bg-link')
|
||||
latestSearch = '//a[d:class("tc-grid-bg-link")]'
|
||||
starter = indirectStarter
|
||||
|
||||
# Blog and comic are mixed...
|
||||
|
|
|
@ -6,7 +6,7 @@
|
|||
import os
|
||||
from re import compile, IGNORECASE
|
||||
|
||||
from ..helpers import bounceStarter, indirectStarter, xpath_class
|
||||
from ..helpers import bounceStarter, indirectStarter
|
||||
from ..scraper import _BasicScraper, _ParserScraper
|
||||
from ..util import tagre
|
||||
from .common import _ComicControlScraper, _WordPressScraper, _WPNavi
|
||||
|
@ -35,7 +35,7 @@ class EatLiver(_ParserScraper):
|
|||
url = 'http://www.eatliver.com/'
|
||||
starter = indirectStarter
|
||||
multipleImagesPerStrip = True
|
||||
imageSearch = '//div[%s]//img' % xpath_class('post-content')
|
||||
imageSearch = '//div[d:class("post-content")]//img'
|
||||
prevSearch = '//a[@rel="prev"]'
|
||||
latestSearch = '//a[@rel="bookmark"]'
|
||||
|
||||
|
@ -175,7 +175,7 @@ class Everblue(_ParserScraper):
|
|||
class EverybodyLovesEricRaymond(_ParserScraper):
|
||||
url = 'http://geekz.co.uk/lovesraymond/'
|
||||
firstStripUrl = url + 'archive/slashdotted'
|
||||
imageSearch = '//div[%s]//img' % xpath_class('entry-content')
|
||||
imageSearch = '//div[d:class("entry-content")]//img'
|
||||
prevSearch = '//a[@rel="prev"]'
|
||||
|
||||
|
||||
|
@ -255,6 +255,6 @@ class ExtraOrdinary(_ParserScraper):
|
|||
url = 'https://www.exocomics.com/'
|
||||
stripUrl = url + '%s'
|
||||
firstStripUrl = stripUrl % '01'
|
||||
prevSearch = '//a[%s]' % xpath_class('prev')
|
||||
imageSearch = '//img[%s]' % xpath_class('image-style-main-comic')
|
||||
prevSearch = '//a[d:class("prev")]'
|
||||
imageSearch = '//img[d:class("image-style-main-comic")]'
|
||||
help = 'Index format: number'
|
||||
|
|
|
@ -7,7 +7,7 @@ from re import compile, escape
|
|||
|
||||
from ..util import tagre
|
||||
from ..scraper import _BasicScraper, _ParserScraper
|
||||
from ..helpers import indirectStarter, joinPathPartsNamer, xpath_class
|
||||
from ..helpers import indirectStarter, joinPathPartsNamer
|
||||
from .common import _ComicControlScraper, _WPNaviIn, _WordPressScraper
|
||||
|
||||
|
||||
|
@ -62,8 +62,8 @@ class FirstWorldProblems(_ParserScraper):
|
|||
'http://bradcolbow.com/archive/C5/')
|
||||
stripUrl = url + '%s/'
|
||||
firstStripUrl = stripUrl % 'P10'
|
||||
imageSearch = '//div[{}]//img'.format(xpath_class('entry'))
|
||||
prevSearch = '//a[{}]'.format(xpath_class('prev'))
|
||||
imageSearch = '//div[d:class("entry")]//img'
|
||||
prevSearch = '//a[d:class("prev")]'
|
||||
multipleImagesPerStrip = True
|
||||
endOfLife = True
|
||||
|
||||
|
@ -83,7 +83,7 @@ class Flemcomics(_ParserScraper):
|
|||
'http://www.flemcomics.com/')
|
||||
stripUrl = url + 'd/%s.html'
|
||||
firstStripUrl = stripUrl % '19980101'
|
||||
imageSearch = '//img[{}]'.format(xpath_class('ksc'))
|
||||
imageSearch = '//img[d:class("ksc")]'
|
||||
prevSearch = '//a[@rel="prev"]'
|
||||
endOfLife = True
|
||||
help = 'Index format: yyyymmdd'
|
||||
|
@ -174,10 +174,10 @@ class FredoAndPidjin(_ParserScraper):
|
|||
url = 'https://www.pidjin.net/'
|
||||
stripUrl = url + '%s/'
|
||||
firstStripUrl = stripUrl % '2006/02/19/goofy-monday'
|
||||
imageSearch = '//div[%s]//img' % xpath_class("episode")
|
||||
imageSearch = '//div[d:class("episode")]//img'
|
||||
multipleImagesPerStrip = True
|
||||
prevSearch = '//span[%s]/a' % xpath_class("prev")
|
||||
latestSearch = '//section[%s]//a' % xpath_class("latest")
|
||||
prevSearch = '//span[d:class("prev")]/a'
|
||||
latestSearch = '//section[d:class("latest")]//a'
|
||||
starter = indirectStarter
|
||||
namer = joinPathPartsNamer((0, 1, 2))
|
||||
|
||||
|
|
|
@ -3,14 +3,14 @@
|
|||
# Copyright (C) 2012-2014 Bastian Kleineidam
|
||||
# Copyright (C) 2015-2020 Tobias Gruetzmacher
|
||||
from ..scraper import _ParserScraper
|
||||
from ..helpers import indirectStarter, xpath_class
|
||||
from ..helpers import indirectStarter
|
||||
|
||||
|
||||
class GoComics(_ParserScraper):
|
||||
url = 'https://www.gocomics.com/'
|
||||
imageSearch = '//picture[{}]/img'.format(xpath_class('item-comic-image'))
|
||||
prevSearch = '//a[{}]'.format(xpath_class('js-previous-comic'))
|
||||
latestSearch = '//div[{}]//a'.format(xpath_class('gc-deck--cta-0'))
|
||||
imageSearch = '//picture[d:class("item-comic-image")]/img'
|
||||
prevSearch = '//a[d:class("js-previous-comic")]'
|
||||
latestSearch = '//div[d:class("gc-deck--cta-0")]//a'
|
||||
starter = indirectStarter
|
||||
help = 'Index format: yyyy/mm/dd'
|
||||
|
||||
|
|
|
@ -6,7 +6,7 @@ from re import compile, escape
|
|||
|
||||
from ..scraper import _BasicScraper
|
||||
from ..util import tagre
|
||||
from ..helpers import indirectStarter, xpath_class
|
||||
from ..helpers import indirectStarter
|
||||
from .common import _ComicControlScraper
|
||||
|
||||
|
||||
|
@ -41,7 +41,7 @@ class JoeAndMonkey(_BasicScraper):
|
|||
|
||||
|
||||
class JohnnyWander(_ComicControlScraper):
|
||||
imageSearch = ('//ul[%s]/li/@data-src' % xpath_class('cc-showbig'),
|
||||
imageSearch = ('//ul[d:class("cc-showbig")]/li/@data-src',
|
||||
_ComicControlScraper.imageSearch)
|
||||
url = 'http://www.johnnywander.com/'
|
||||
|
||||
|
|
|
@ -6,7 +6,7 @@
|
|||
import json
|
||||
from re import compile, escape, IGNORECASE
|
||||
|
||||
from ..helpers import indirectStarter, xpath_class
|
||||
from ..helpers import indirectStarter
|
||||
from ..scraper import _BasicScraper, _ParserScraper
|
||||
from ..util import tagre
|
||||
from .common import _ComicControlScraper, _WordPressScraper, _WPWebcomic
|
||||
|
@ -74,7 +74,7 @@ class MarriedToTheSea(_ParserScraper):
|
|||
url = 'http://marriedtothesea.com/'
|
||||
stripUrl = url + '%s'
|
||||
firstStripUrl = stripUrl % '022806'
|
||||
imageSearch = '//div[%s]//p/img' % xpath_class('jumbotron')
|
||||
imageSearch = '//div[d:class("jumbotron")]//p/img'
|
||||
prevSearch = '//a[contains(text(), "Yesterday")]'
|
||||
help = 'Index format: mmddyy'
|
||||
|
||||
|
|
|
@ -6,7 +6,7 @@
|
|||
from re import compile, escape
|
||||
|
||||
from ..scraper import _BasicScraper, _ParserScraper
|
||||
from ..helpers import indirectStarter, xpath_class
|
||||
from ..helpers import indirectStarter
|
||||
from ..util import tagre
|
||||
from .common import _ComicControlScraper, _WordPressScraper, _WPNavi, _WPWebcomic
|
||||
|
||||
|
@ -134,7 +134,7 @@ class Nimona(_ParserScraper):
|
|||
'http://gingerhaze.com/nimona/')
|
||||
stripUrl = url + 'comic/%s'
|
||||
firstStripUrl = stripUrl % "page-1"
|
||||
imageSearch = '//div[{}]//img'.format(xpath_class('field-name-field-comic-page'))
|
||||
imageSearch = '//div[d:class("field-name-field-comic-page")]//img'
|
||||
prevSearch = '//a[img[contains(@src, "/comicdrop_prev_label")]]'
|
||||
endOfLife = True
|
||||
|
||||
|
|
|
@ -6,7 +6,7 @@
|
|||
from re import compile, escape
|
||||
|
||||
from ..scraper import _BasicScraper, _ParserScraper
|
||||
from ..helpers import bounceStarter, queryNamer, indirectStarter, xpath_class
|
||||
from ..helpers import bounceStarter, queryNamer, indirectStarter
|
||||
from ..util import tagre
|
||||
from .common import _ComicControlScraper, _WordPressScraper, _WPNavi
|
||||
|
||||
|
@ -99,8 +99,8 @@ class PennyArcade(_ParserScraper):
|
|||
stripUrl = url + '%s'
|
||||
firstStripUrl = stripUrl % '1998/11/18'
|
||||
imageSearch = '//div[@id="comicFrame"]//img'
|
||||
prevSearch = '//a[%s]' % xpath_class('btnPrev')
|
||||
nextSearch = '//a[%s]' % xpath_class('btnNext')
|
||||
prevSearch = '//a[d:class("btnPrev")]'
|
||||
nextSearch = '//a[d:class("btnNext")]'
|
||||
starter = bounceStarter
|
||||
help = 'Index format: yyyy/mm/dd'
|
||||
|
||||
|
@ -231,7 +231,7 @@ class PokeyThePenguin(_ParserScraper):
|
|||
class PoorlyDrawnLines(_ParserScraper):
|
||||
url = 'http://poorlydrawnlines.com/comic/'
|
||||
firstStripUrl = url + 'campus-characters/'
|
||||
imageSearch = '//div[%s]//img' % xpath_class('comic')
|
||||
imageSearch = '//div[d:class("comic")]//img'
|
||||
prevSearch = '//a[@rel="prev"]'
|
||||
|
||||
|
||||
|
@ -269,7 +269,7 @@ class PrinceOfSartar(_WPNavi):
|
|||
url = 'http://www.princeofsartar.com/'
|
||||
stripUrl = url + 'comic/%s/'
|
||||
firstStripUrl = stripUrl % 'introduction-chapter-1'
|
||||
nextSearch = '//a[%s]' % xpath_class('navi-next')
|
||||
nextSearch = '//a[d:class("navi-next")]'
|
||||
starter = bounceStarter
|
||||
help = 'Index format: name'
|
||||
|
||||
|
|
|
@ -4,7 +4,6 @@
|
|||
# Copyright (C) 2015-2020 Tobias Gruetzmacher
|
||||
# Copyright (C) 2019-2020 Daniel Ring
|
||||
from ..scraper import _ParserScraper
|
||||
from ..helpers import xpath_class
|
||||
|
||||
|
||||
class QuantumVibe(_ParserScraper):
|
||||
|
@ -28,6 +27,6 @@ class Qwantz(_ParserScraper):
|
|||
url = 'http://www.qwantz.com/index.php'
|
||||
stripUrl = url + '?comic=%s'
|
||||
firstStripUrl = stripUrl % '1'
|
||||
imageSearch = '//img[{}]'.format(xpath_class('comic'))
|
||||
imageSearch = '//img[d:class("comic")]'
|
||||
prevSearch = '//a[@rel="prev"]'
|
||||
help = 'Index format: n'
|
||||
|
|
|
@ -6,7 +6,7 @@
|
|||
from re import compile
|
||||
from urllib.parse import urljoin
|
||||
|
||||
from ..helpers import bounceStarter, xpath_class
|
||||
from ..helpers import bounceStarter
|
||||
from ..scraper import _BasicScraper, _ParserScraper
|
||||
from ..util import tagre
|
||||
from .common import _WordPressScraper, _WPWebcomic
|
||||
|
@ -107,7 +107,7 @@ class RomanticallyApocalyptic(_ParserScraper):
|
|||
url = 'http://romanticallyapocalyptic.com/'
|
||||
stripUrl = url + '%s'
|
||||
firstStripUrl = stripUrl % '0'
|
||||
imageSearch = '//div[%s]/center//img' % xpath_class('comicpanel')
|
||||
imageSearch = '//div[d:class("comicpanel")]/center//img'
|
||||
prevSearch = '//a[@accesskey="p"]'
|
||||
help = 'Index format: n'
|
||||
adult = True
|
||||
|
|
|
@ -7,7 +7,7 @@ from re import compile, escape, IGNORECASE, sub
|
|||
from os.path import splitext
|
||||
|
||||
from ..scraper import _BasicScraper, _ParserScraper
|
||||
from ..helpers import indirectStarter, bounceStarter, joinPathPartsNamer, xpath_class
|
||||
from ..helpers import indirectStarter, bounceStarter, joinPathPartsNamer
|
||||
from ..util import tagre
|
||||
from .common import _ComicControlScraper, _WordPressScraper, _WPNavi, _WPNaviIn, _WPWebcomic
|
||||
|
||||
|
@ -120,7 +120,7 @@ class SchoolBites(_ParserScraper):
|
|||
url = ('https://web.archive.org/web/20170215065523/'
|
||||
'http://schoolbites.net/')
|
||||
stripUrl = url + 'd/%s.html'
|
||||
imageSearch = '//img[{}]'.format(xpath_class('ksc'))
|
||||
imageSearch = '//img[d:class("ksc")]'
|
||||
prevSearch = '//a[@rel="prev"]'
|
||||
endOfLife = True
|
||||
help = 'Index format: yyyymmdd'
|
||||
|
@ -132,7 +132,7 @@ class Schuelert(_ParserScraper):
|
|||
stripUrl = url + 'index.php?paged=%s'
|
||||
firstStripUrl = stripUrl % '3'
|
||||
imageSearch = '//img[contains(@src, "wp-content")]'
|
||||
prevSearch = '//span[{}]/a'.format(xpath_class('prevlink'))
|
||||
prevSearch = '//span[d:class("prevlink")]/a'
|
||||
multipleImagesPerStrip = True
|
||||
endOfLife = True
|
||||
lang = 'de'
|
||||
|
@ -143,7 +143,7 @@ class Science(_ParserScraper):
|
|||
'http://sci-ence.org/%s/')
|
||||
url = stripUrl % 'new-york-comic-con-2013'
|
||||
firstStripUrl = stripUrl % 'periodic-table-element-ass'
|
||||
prevSearch = '//a[{}]'.format(xpath_class('navi-prev'))
|
||||
prevSearch = '//a[d:class("navi-prev")]'
|
||||
imageSearch = '//div[@class="comicpane"]//img'
|
||||
endOfLife = True
|
||||
|
||||
|
@ -159,7 +159,7 @@ class SequentialArt(_ParserScraper):
|
|||
url = 'https://www.collectedcurios.com/sequentialart.php'
|
||||
stripUrl = url + '?s=%s'
|
||||
firstStripUrl = stripUrl % '1'
|
||||
imageSearch = '//img[{}]'.format(xpath_class('w3-image'))
|
||||
imageSearch = '//img[d:class("w3-image")]'
|
||||
prevSearch = '//a[@id="backOne"]'
|
||||
help = 'Index format: name'
|
||||
|
||||
|
@ -286,9 +286,9 @@ class SluggyFreelance(_ParserScraper):
|
|||
url = 'http://sluggy.com/'
|
||||
stripUrl = 'http://archives.sluggy.com/book.php?chapter=%s'
|
||||
firstStripUrl = stripUrl % '1'
|
||||
imageSearch = '//div[%s]/img/@data-src' % xpath_class('comic_content')
|
||||
prevSearch = '//div[%s]/a' % xpath_class('previous')
|
||||
latestSearch = '//a[%s]' % xpath_class('archives_link')
|
||||
imageSearch = '//div[d:class("comic_content")]/img/@data-src'
|
||||
prevSearch = '//div[d:class("previous")]/a'
|
||||
latestSearch = '//a[d:class("archives_link")]'
|
||||
starter = indirectStarter
|
||||
multipleImagesPerStrip = True
|
||||
help = 'Index format: chapter'
|
||||
|
@ -374,7 +374,7 @@ class SpaceJunkArlia(_ParserScraper):
|
|||
url = 'http://spacejunkarlia.com/'
|
||||
stripUrl = url + '?strip_id=%s'
|
||||
firstStripUrl = stripUrl % '0'
|
||||
imageSearch = '//div[%s]/img' % xpath_class('content')
|
||||
imageSearch = '//div[d:class("content")]/img'
|
||||
prevSearch = '//a[text()="<"]'
|
||||
help = 'Index format: number'
|
||||
|
||||
|
@ -382,7 +382,7 @@ class SpaceJunkArlia(_ParserScraper):
|
|||
class SpaceTrawler(_ParserScraper):
|
||||
url = 'https://www.baldwinpage.com/spacetrawler/'
|
||||
firstStripUrl = url + '2010/01/01/spacetrawler-4/'
|
||||
imageSearch = '//img[%s]' % xpath_class('size-full')
|
||||
imageSearch = '//img[d:class("size-full")]'
|
||||
prevSearch = '//a[@rel="prev"]'
|
||||
|
||||
|
||||
|
|
|
@ -3,14 +3,13 @@
|
|||
# Copyright (C) 2019-2020 Daniel Ring
|
||||
import re
|
||||
|
||||
from ..helpers import xpath_class
|
||||
from ..scraper import _ParserScraper
|
||||
|
||||
|
||||
class SmackJeeves(_ParserScraper):
|
||||
baseUrl = 'https://www.smackjeeves.com/discover/'
|
||||
apiBase = 'https://www.smackjeeves.com/api/discover/'
|
||||
prevSearch = '//a[i[{}]]'.format(xpath_class('i-arrow-double-left-black'))
|
||||
prevSearch = '//a[i[d:class("i-arrow-double-left-black")]]'
|
||||
imageSearch = re.compile("comicData:[^']*'([^']*)'", re.DOTALL)
|
||||
help = 'Index format: n'
|
||||
|
||||
|
|
|
@ -10,7 +10,7 @@ except ImportError:
|
|||
from cached_property import cached_property
|
||||
|
||||
from ..scraper import _BasicScraper, _ParserScraper
|
||||
from ..helpers import indirectStarter, xpath_class
|
||||
from ..helpers import indirectStarter
|
||||
from ..util import tagre
|
||||
from .common import _ComicControlScraper, _WordPressScraper, _WPNavi, _WPNaviIn, _WPWebcomic
|
||||
|
||||
|
@ -40,8 +40,8 @@ class TheBrads(_ParserScraper):
|
|||
'http://bradcolbow.com/archive/C4/')
|
||||
stripUrl = url + '%s/'
|
||||
firstStripUrl = stripUrl % 'P125'
|
||||
imageSearch = '//div[{}]//img'.format(xpath_class('entry'))
|
||||
prevSearch = '//a[{}]'.format(xpath_class('prev'))
|
||||
imageSearch = '//div[d:class("entry")]//img'
|
||||
prevSearch = '//a[d:class("prev")]'
|
||||
multipleImagesPerStrip = True
|
||||
endOfLife = True
|
||||
|
||||
|
@ -120,7 +120,7 @@ class TheLandscaper(_ParserScraper):
|
|||
'http://landscaper.visual-assault.net/comic/%s')
|
||||
url = stripUrl % 'latest'
|
||||
firstStripUrl = stripUrl % '1'
|
||||
imageSearch = '//article[{}]//img[1]'.format(xpath_class('comic'))
|
||||
imageSearch = '//article[d:class("comic")]//img[1]'
|
||||
prevSearch = '//a[contains(text(), "Previous")]'
|
||||
endOfLife = True
|
||||
|
||||
|
@ -294,8 +294,8 @@ class TumbleDryComics(_WordPressScraper):
|
|||
class Turnoff(_ParserScraper):
|
||||
name = 'turnoff'
|
||||
url = 'https://turnoff.us/'
|
||||
imageSearch = '//article[%s]//img' % xpath_class('post-content')
|
||||
prevSearch = '//div[%s]//a' % xpath_class('prev')
|
||||
imageSearch = '//article[d:class("post-content")]//img'
|
||||
prevSearch = '//div[d:class("prev")]//a'
|
||||
stripUrl = url + 'geek/%s'
|
||||
firstStripUrl = stripUrl % 'tcp-buddies'
|
||||
multipleImagesPerStrip = True
|
||||
|
@ -341,8 +341,8 @@ class Twokinds(_ParserScraper):
|
|||
url = 'http://twokinds.keenspot.com/'
|
||||
stripUrl = url + 'comic/%s/'
|
||||
firstStripUrl = stripUrl % '1'
|
||||
imageSearch = '//article[%s]//img' % xpath_class('comic')
|
||||
prevSearch = '//a[%s]' % xpath_class('navprev')
|
||||
imageSearch = '//article[d:class("comic")]//img'
|
||||
prevSearch = '//a[d:class("navprev")]'
|
||||
help = 'Index format: n (unpadded)'
|
||||
|
||||
|
||||
|
|
|
@ -6,7 +6,7 @@
|
|||
from re import compile
|
||||
|
||||
from ..scraper import _BasicScraper, _ParserScraper
|
||||
from ..helpers import indirectStarter, xpath_class
|
||||
from ..helpers import indirectStarter
|
||||
from ..util import tagre
|
||||
from .common import _ComicControlScraper, _WordPressScraper, _WPNavi
|
||||
|
||||
|
@ -46,7 +46,7 @@ class Unsounded(_ParserScraper):
|
|||
stripUrl = url + 'comic/ch%s/ch%s_%s.html'
|
||||
firstStripUrl = stripUrl % ('01', '01', '01')
|
||||
imageSearch = '//img[contains(@src, "pageart/")]'
|
||||
prevSearch = '//a[%s]' % xpath_class('back')
|
||||
prevSearch = '//a[d:class("back")]'
|
||||
latestSearch = '//div[@id="chapter_box"][1]//a[last()]'
|
||||
multipleImagesPerStrip = True
|
||||
starter = indirectStarter
|
||||
|
|
|
@ -6,7 +6,7 @@
|
|||
from re import compile
|
||||
|
||||
from ..scraper import _BasicScraper, _ParserScraper
|
||||
from ..helpers import bounceStarter, indirectStarter, xpath_class
|
||||
from ..helpers import bounceStarter, indirectStarter
|
||||
from ..util import tagre
|
||||
|
||||
|
||||
|
@ -71,8 +71,8 @@ class VictimsOfTheSystem(_BasicScraper):
|
|||
class ViiviJaWagner(_ParserScraper):
|
||||
url = 'http://www.hs.fi/viivijawagner/'
|
||||
imageSearch = '//meta[@property="og:image"]/@content'
|
||||
prevSearch = '//a[%s]' % xpath_class('prev')
|
||||
latestSearch = '//div[%s]//a' % xpath_class('cartoon-content')
|
||||
prevSearch = '//a[d:class("prev")]'
|
||||
latestSearch = '//div[d:class("cartoon-content")]//a'
|
||||
starter = indirectStarter
|
||||
lang = 'fi'
|
||||
|
||||
|
|
|
@ -7,7 +7,7 @@ from re import compile, escape, IGNORECASE
|
|||
|
||||
from ..scraper import _BasicScraper, _ParserScraper
|
||||
from ..util import tagre
|
||||
from ..helpers import bounceStarter, indirectStarter, xpath_class
|
||||
from ..helpers import bounceStarter, indirectStarter
|
||||
from .common import _ComicControlScraper, _WPNavi, _WPNaviIn, _WPWebcomic
|
||||
|
||||
|
||||
|
@ -28,8 +28,8 @@ class WastedTalent(_BasicScraper):
|
|||
|
||||
class WebcomicName(_ParserScraper):
|
||||
url = 'https://webcomicname.com/'
|
||||
imageSearch = '//figure[{}]//img'.format(xpath_class('tmblr-full'))
|
||||
prevSearch = '//a[{}]'.format(xpath_class('next'))
|
||||
imageSearch = '//figure[d:class("tmblr-full")]//img'
|
||||
prevSearch = '//a[d:class("next")]'
|
||||
multipleImagesPerStrip = True
|
||||
|
||||
|
||||
|
@ -38,10 +38,10 @@ class WebDesignerCOTW(_ParserScraper):
|
|||
url = baseUrl + 'category/comics/'
|
||||
starter = indirectStarter
|
||||
firstStripUrl = baseUrl + '2009/11/comics-of-the-week-1/'
|
||||
imageSearch = '//article[%s]//img' % xpath_class('article-content')
|
||||
imageSearch = '//article[d:class("article-content")]//img'
|
||||
multipleImagesPerStrip = True
|
||||
prevSearch = '//a[span[%s]]' % xpath_class('icon-right-small')
|
||||
latestSearch = '//a[%s]' % xpath_class('anim-link')
|
||||
prevSearch = '//a[span[d:class("icon-right-small")]]'
|
||||
latestSearch = '//a[d:class("anim-link")]'
|
||||
|
||||
def shouldSkipUrl(self, url, data):
|
||||
"""Skip non-comic URLs."""
|
||||
|
|
|
@ -1,12 +1,12 @@
|
|||
# SPDX-License-Identifier: MIT
|
||||
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
|
||||
# Copyright (C) 2012-2014 Bastian Kleineidam
|
||||
# Copyright (C) 2015-2019 Tobias Gruetzmacher
|
||||
# Copyright (C) 2015-2020 Tobias Gruetzmacher
|
||||
from re import compile, escape
|
||||
|
||||
from ..scraper import _BasicScraper, _ParserScraper
|
||||
from ..util import tagre
|
||||
from ..helpers import bounceStarter, joinPathPartsNamer, xpath_class
|
||||
from ..helpers import bounceStarter, joinPathPartsNamer
|
||||
from .common import _WPNavi
|
||||
|
||||
|
||||
|
@ -21,8 +21,8 @@ class Zapiro(_ParserScraper):
|
|||
url = 'http://mg.co.za/zapiro/'
|
||||
starter = bounceStarter
|
||||
imageSearch = '//div[@id="cartoon"]/img'
|
||||
prevSearch = '//a[%s]' % xpath_class('left')
|
||||
nextSearch = '//a[%s]' % xpath_class('right')
|
||||
prevSearch = '//a[d:class("left")]'
|
||||
nextSearch = '//a[d:class("right")]'
|
||||
namer = joinPathPartsNamer((-1,), ())
|
||||
|
||||
|
||||
|
@ -31,8 +31,8 @@ class ZenPencils(_WPNavi):
|
|||
multipleImagesPerStrip = True
|
||||
firstStripUrl = url + 'comic/1-ralph-waldo-emerson-make-them-cry/'
|
||||
starter = bounceStarter
|
||||
prevSearch = '//a[%s]' % xpath_class('navi-prev')
|
||||
nextSearch = '//a[%s]' % xpath_class('navi-next')
|
||||
prevSearch = '//a[d:class("navi-prev")]'
|
||||
nextSearch = '//a[d:class("navi-next")]'
|
||||
|
||||
|
||||
class ZombieHunters(_BasicScraper):
|
||||
|
|
|
@ -26,6 +26,7 @@ from .util import (get_page, makeSequence, get_system_uid, tagre, normaliseURL,
|
|||
from .comic import ComicStrip
|
||||
from .output import out
|
||||
from .events import getHandler
|
||||
from .xml import NS
|
||||
|
||||
|
||||
ARCHIVE_ORG_URL = re.compile(r'https?://web\.archive\.org/web/[^/]*/')
|
||||
|
@ -434,10 +435,6 @@ class _ParserScraper(Scraper):
|
|||
XML_DECL = re.compile(
|
||||
r'^(<\?xml[^>]+)\s+encoding\s*=\s*["\'][^"\']*["\'](\s*\?>|)', re.U)
|
||||
|
||||
NS = {
|
||||
"re": "http://exslt.org/regular-expressions"
|
||||
}
|
||||
|
||||
# Switch between CSS and XPath selectors for this class. Since CSS needs
|
||||
# another Python module, XPath is the default for now.
|
||||
css = False
|
||||
|
@ -519,7 +516,7 @@ class _ParserScraper(Scraper):
|
|||
searchFun = data.cssselect
|
||||
else:
|
||||
def searchFun(s):
|
||||
return data.xpath(s, namespaces=self.NS)
|
||||
return data.xpath(s, namespaces=NS)
|
||||
patterns = makeSequence(patterns)
|
||||
for search in patterns:
|
||||
matched = False
|
||||
|
|
20
dosagelib/xml.py
Normal file
20
dosagelib/xml.py
Normal file
|
@ -0,0 +1,20 @@
|
|||
# SPDX-License-Identifier: MIT
|
||||
# Copyright (C) 2020 Tobias Gruetzmacher
|
||||
from lxml import etree
|
||||
|
||||
|
||||
# Namespace prefixes that may be used inside XPath expressions.
NS = {
    # Prefix of the extension functions defined in this module.
    'd': 'https://dosage.rocks/xpath',
    # EXSLT regular expression functions, e.g. re:test().
    're': 'http://exslt.org/regular-expressions',
}
|
||||
|
||||
|
||||
def find_by_class(context, cls):
    """XPath extension function: does the context node carry class *cls*?

    Used in selectors as ``d:class("name")``.

    :param context: evaluation context; only ``context.context_node.attrib``
        is read.
    :param cls: the class name to look for.
    :returns: ``True`` if *cls* is one of the whitespace-separated tokens of
        the node's ``class`` attribute, ``False`` otherwise (including when
        the attribute is missing).
    """
    class_attr = context.context_node.attrib.get('class')
    if class_attr is None:
        return False
    # split() without an argument splits on any run of whitespace, so class
    # lists separated by tabs, newlines or several spaces are handled too.
    return cls in class_attr.split()
|
||||
|
||||
|
||||
# Make find_by_class callable from XPath expressions as d:class(...), where
# "d" is the prefix mapped to this namespace URI in NS.
dosagens = etree.FunctionNamespace(NS['d'])
dosagens['class'] = find_by_class
|
|
@ -14,7 +14,7 @@ def _file(name):
|
|||
|
||||
|
||||
@lru_cache()
|
||||
def _content(name):
|
||||
def content(name):
|
||||
with gzip.open(_file(name + '.html.gz'), 'r') as f:
|
||||
return f.read()
|
||||
|
||||
|
@ -26,7 +26,7 @@ def _img(name):
|
|||
|
||||
|
||||
def page(url, pagename):
|
||||
add(GET, url, _content(pagename))
|
||||
add(GET, url, content(pagename))
|
||||
|
||||
|
||||
def png(url, name='empty'):
|
||||
|
|
25
tests/test_xml.py
Normal file
25
tests/test_xml.py
Normal file
|
@ -0,0 +1,25 @@
|
|||
# SPDX-License-Identifier: MIT
|
||||
# Copyright (C) 2020 Tobias Gruetzmacher
|
||||
|
||||
from lxml import html
|
||||
|
||||
from dosagelib.xml import NS
|
||||
|
||||
import httpmocks
|
||||
|
||||
|
||||
tree = html.document_fromstring(httpmocks.content('zp-222'))
|
||||
|
||||
|
||||
class TestXML:
    """Tests for the XPath extensions reachable through ``NS``."""

    def xpath(self, path):
        """Evaluate *path* on the module-level test document."""
        return tree.xpath(path, namespaces=NS)

    def test_class_ext(self):
        """Check the number of elements found via ``d:class``."""
        expected_counts = (
            ('//li[d:class("menu-item-3773")]', 1),
            ('//ul[d:class("menu")]', 1),
            ('//li[d:class("menu-item-object-custom")]', 2),
            ('//li[d:class("menu-item")]', 25),
        )
        for path, count in expected_counts:
            assert len(self.xpath(path)) == count

    def test_re_ext(self):
        """Check that the EXSLT ``re:test`` function is usable."""
        matches = self.xpath(r'//img[re:test(@src, "posters.*jpg")]')
        assert len(matches) == 1
|
Loading…
Reference in a new issue