Fix Tapastic scraper

This commit is contained in:
Techwolf 2020-04-07 01:18:20 -07:00
parent e30f63ec4d
commit cb8f67911d

View file

@@ -5,13 +5,15 @@ import json
import re import re
from ..scraper import _ParserScraper from ..scraper import _ParserScraper
from ..helpers import indirectStarter
class Tapastic(_ParserScraper): class Tapastic(_ParserScraper):
baseUrl = 'https://tapas.io/' baseUrl = 'https://tapas.io/'
imageSearch = '//article[@class="ep-contents"]//img' imageSearch = '//article[contains(@class, "js-episode-article")]//img/@data-src'
episodeIdSearch = re.compile(r'episodeId : (\d+),') prevSearch = '//a[contains(@class, "js-prev-ep-btn")]'
episodeListSearch = re.compile(r'episodeList : (.*),') latestSearch = '//ul[contains(@class, "js-episode-list")]//a'
starter = indirectStarter
multipleImagesPerStrip = True multipleImagesPerStrip = True
def __init__(self, name, url): def __init__(self, name, url):
@@ -19,30 +21,11 @@ class Tapastic(_ParserScraper):
self.url = self.baseUrl + 'series/' + url self.url = self.baseUrl + 'series/' + url
self.stripUrl = self.baseUrl + 'episode/%s' self.stripUrl = self.baseUrl + 'episode/%s'
def starter(self):
# Retrieve series data object
seriesPage = self.getPage(self.url)
dataScript = seriesPage.xpath('//script[contains(text(), "var _data")]')[0].text
# Extract episode list
currentEpisode = self.episodeIdSearch.findall(dataScript)[0]
self.episodeList = json.loads(self.episodeListSearch.findall(dataScript)[0])
return self.stripUrl % currentEpisode
def fetchUrls(self, url, data, urlSearch): def fetchUrls(self, url, data, urlSearch):
# Save link order for position-based filenames # Save link order for position-based filenames
self.imageUrls = super().fetchUrls(url, data, urlSearch) self.imageUrls = super().fetchUrls(url, data, urlSearch)
# Update firstStripUrl with the correct episode title
if int(url.rsplit('/', 1)[-1]) == self.episodeList[0]['id']:
self.firstStripUrl = url
return self.imageUrls return self.imageUrls
def getPrevUrl(self, url, data):
episodeId = int(url.rsplit('/', 1)[-1])
index = [i for i, ep in enumerate(self.episodeList) if ep['id'] == episodeId][0]
if index == 0:
return None
return self.stripUrl % str(self.episodeList[index - 1]['id'])
def namer(self, imageUrl, pageUrl): def namer(self, imageUrl, pageUrl):
# Construct filename from episode number and image position on page # Construct filename from episode number and image position on page
episodeNum = pageUrl.rsplit('/', 1)[-1] episodeNum = pageUrl.rsplit('/', 1)[-1]