Fix Tapastic scraper
This commit is contained in:
parent
e30f63ec4d
commit
cb8f67911d
1 changed file with 5 additions and 22 deletions
|
@@ -5,13 +5,15 @@ import json
|
||||||
import re
|
import re
|
||||||
|
|
||||||
from ..scraper import _ParserScraper
|
from ..scraper import _ParserScraper
|
||||||
|
from ..helpers import indirectStarter
|
||||||
|
|
||||||
|
|
||||||
class Tapastic(_ParserScraper):
|
class Tapastic(_ParserScraper):
|
||||||
baseUrl = 'https://tapas.io/'
|
baseUrl = 'https://tapas.io/'
|
||||||
imageSearch = '//article[@class="ep-contents"]//img'
|
imageSearch = '//article[contains(@class, "js-episode-article")]//img/@data-src'
|
||||||
episodeIdSearch = re.compile(r'episodeId : (\d+),')
|
prevSearch = '//a[contains(@class, "js-prev-ep-btn")]'
|
||||||
episodeListSearch = re.compile(r'episodeList : (.*),')
|
latestSearch = '//ul[contains(@class, "js-episode-list")]//a'
|
||||||
|
starter = indirectStarter
|
||||||
multipleImagesPerStrip = True
|
multipleImagesPerStrip = True
|
||||||
|
|
||||||
def __init__(self, name, url):
|
def __init__(self, name, url):
|
||||||
|
@@ -19,30 +21,11 @@ class Tapastic(_ParserScraper):
|
||||||
self.url = self.baseUrl + 'series/' + url
|
self.url = self.baseUrl + 'series/' + url
|
||||||
self.stripUrl = self.baseUrl + 'episode/%s'
|
self.stripUrl = self.baseUrl + 'episode/%s'
|
||||||
|
|
||||||
def starter(self):
|
|
||||||
# Retrieve series data object
|
|
||||||
seriesPage = self.getPage(self.url)
|
|
||||||
dataScript = seriesPage.xpath('//script[contains(text(), "var _data")]')[0].text
|
|
||||||
# Extract episode list
|
|
||||||
currentEpisode = self.episodeIdSearch.findall(dataScript)[0]
|
|
||||||
self.episodeList = json.loads(self.episodeListSearch.findall(dataScript)[0])
|
|
||||||
return self.stripUrl % currentEpisode
|
|
||||||
|
|
||||||
def fetchUrls(self, url, data, urlSearch):
|
def fetchUrls(self, url, data, urlSearch):
|
||||||
# Save link order for position-based filenames
|
# Save link order for position-based filenames
|
||||||
self.imageUrls = super().fetchUrls(url, data, urlSearch)
|
self.imageUrls = super().fetchUrls(url, data, urlSearch)
|
||||||
# Update firstStripUrl with the correct episode title
|
|
||||||
if int(url.rsplit('/', 1)[-1]) == self.episodeList[0]['id']:
|
|
||||||
self.firstStripUrl = url
|
|
||||||
return self.imageUrls
|
return self.imageUrls
|
||||||
|
|
||||||
def getPrevUrl(self, url, data):
|
|
||||||
episodeId = int(url.rsplit('/', 1)[-1])
|
|
||||||
index = [i for i, ep in enumerate(self.episodeList) if ep['id'] == episodeId][0]
|
|
||||||
if index == 0:
|
|
||||||
return None
|
|
||||||
return self.stripUrl % str(self.episodeList[index - 1]['id'])
|
|
||||||
|
|
||||||
def namer(self, imageUrl, pageUrl):
|
def namer(self, imageUrl, pageUrl):
|
||||||
# Construct filename from episode number and image position on page
|
# Construct filename from episode number and image position on page
|
||||||
episodeNum = pageUrl.rsplit('/', 1)[-1]
|
episodeNum = pageUrl.rsplit('/', 1)[-1]
|
||||||
|
|
Loading…
Reference in a new issue