Convert starters and other helpers to new interface.
This allows those starters to work with future scrapers.
This commit is contained in:
parent
4265053846
commit
2567bd4e57
7 changed files with 37 additions and 37 deletions
|
@ -1,7 +1,7 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
||||
# Copyright (C) 2012-2014 Bastian Kleineidam
|
||||
from .util import fetchUrl, getPageContent, getQueryParams
|
||||
from .util import getQueryParams
|
||||
|
||||
def queryNamer(paramName, usePageUrl=False):
|
||||
"""Get name from URL query part."""
|
||||
|
@ -30,10 +30,10 @@ def bounceStarter(url, nextSearch):
|
|||
@classmethod
|
||||
def _starter(cls):
|
||||
"""Get bounced start URL."""
|
||||
data, baseUrl = getPageContent(url, cls.session)
|
||||
url1 = fetchUrl(url, data, baseUrl, cls.prevSearch)
|
||||
data, baseUrl = getPageContent(url1, cls.session)
|
||||
return fetchUrl(url1, data, baseUrl, nextSearch)
|
||||
data = cls.getPage(url)
|
||||
url1 = cls.fetchUrl(url, data, cls.prevSearch)
|
||||
data = cls.getPage(url1)
|
||||
return cls.fetchUrl(url1, data, nextSearch)
|
||||
return _starter
|
||||
|
||||
|
||||
|
@ -42,6 +42,6 @@ def indirectStarter(url, latestSearch):
|
|||
@classmethod
|
||||
def _starter(cls):
|
||||
"""Get indirect start URL."""
|
||||
data, baseUrl = getPageContent(url, cls.session)
|
||||
return fetchUrl(url, data, baseUrl, latestSearch)
|
||||
data = cls.getPage(url)
|
||||
return cls.fetchUrl(url, data, latestSearch)
|
||||
return _starter
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
# Copyright (C) 2012-2014 Bastian Kleineidam
|
||||
from re import compile
|
||||
from ..scraper import make_scraper
|
||||
from ..util import tagre, getQueryParams, fetchUrl, getPageContent
|
||||
from ..util import tagre, getQueryParams
|
||||
|
||||
|
||||
_linkTag = tagre("a", "href", r'([^"]+)')
|
||||
|
@ -25,15 +25,15 @@ def add(name, shortName, imageFolder=None, lastStrip=None):
|
|||
@classmethod
|
||||
def _starter(cls):
|
||||
# first, try hopping to previous and next comic
|
||||
data, _baseUrl = getPageContent(baseUrl, cls.session)
|
||||
data = cls.getPage(baseUrl)
|
||||
try:
|
||||
url = fetchUrl(baseUrl, data, _baseUrl, _prevSearch)
|
||||
url = cls.fetchUrl(baseUrl, data, _prevSearch)
|
||||
except ValueError:
|
||||
# no previous link found, try hopping to last comic
|
||||
return fetchUrl(baseUrl, data, _baseUrl, _lastSearch)
|
||||
return cls.fetchUrl(baseUrl, data, _lastSearch)
|
||||
else:
|
||||
data, _baseUrl = getPageContent(url, cls.session)
|
||||
return fetchUrl(url, data, _baseUrl, _nextSearch)
|
||||
data = cls.getPage(url)
|
||||
return cls.fetchUrl(url, data, _nextSearch)
|
||||
|
||||
attrs = dict(
|
||||
name='CloneManga/' + name,
|
||||
|
|
|
@ -4,7 +4,7 @@
|
|||
|
||||
from re import compile
|
||||
from ..scraper import make_scraper, Genre
|
||||
from ..util import tagre, fetchUrl, getPageContent
|
||||
from ..util import tagre
|
||||
|
||||
# note: adding the compile() functions inside add() is a major performance hog
|
||||
_imageSearch = compile(tagre("img", "src", r'(https://s3\.amazonaws\.com/media\.drunkduck\.com/[^"]+)', before="page-image"))
|
||||
|
@ -27,15 +27,15 @@ def add(name, path):
|
|||
@classmethod
|
||||
def _starter(cls):
|
||||
# first, try hopping to previous and next comic
|
||||
data, baseUrl = getPageContent(_url, cls.session)
|
||||
data = cls.getPage(_url)
|
||||
try:
|
||||
url = fetchUrl(_url, data, baseUrl, _prevSearch)
|
||||
url = cls.fetchUrl(_url, data, _prevSearch)
|
||||
except ValueError:
|
||||
# no previous link found, try hopping to last comic
|
||||
return fetchUrl(_url, data, baseUrl, _lastSearch)
|
||||
return cls.fetchUrl(_url, data, _lastSearch)
|
||||
else:
|
||||
data, baseUrl = getPageContent(url, cls.session)
|
||||
return fetchUrl(url, data, baseUrl, _nextSearch)
|
||||
data = cls.getPage(url)
|
||||
return cls.fetchUrl(url, data, _nextSearch)
|
||||
|
||||
attrs = dict(
|
||||
name = 'DrunkDuck/' + name,
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
|
||||
from re import compile, escape
|
||||
from ..scraper import _BasicScraper
|
||||
from ..util import tagre, getPageContent, fetchUrls
|
||||
from ..util import tagre
|
||||
from ..helpers import bounceStarter
|
||||
|
||||
|
||||
|
@ -21,9 +21,9 @@ class HagarTheHorrible(_BasicScraper):
|
|||
def starter(cls):
|
||||
"""Return last gallery link."""
|
||||
url = 'http://www.hagardunor.net/comics.php'
|
||||
content = getPageContent(url, cls.session)[0]
|
||||
data = cls.getPage(url)
|
||||
pattern = compile(tagre("a", "href", cls.prevUrl))
|
||||
for starturl in fetchUrls(url, content, url, pattern):
|
||||
for starturl in cls.fetchUrls(url, data, pattern):
|
||||
pass
|
||||
return starturl
|
||||
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
from re import compile, escape
|
||||
from ..scraper import _BasicScraper
|
||||
from ..helpers import bounceStarter, queryNamer, indirectStarter
|
||||
from ..util import tagre, fetchUrl, getPageContent
|
||||
from ..util import tagre
|
||||
|
||||
|
||||
class PandyLand(_BasicScraper):
|
||||
|
@ -104,10 +104,10 @@ class PennyArcade(_BasicScraper):
|
|||
@classmethod
|
||||
def starter(cls):
|
||||
"""Get bounced start URL."""
|
||||
data, baseUrl = getPageContent(cls.url, cls.session)
|
||||
url1 = fetchUrl(cls.url, data, baseUrl, cls.prevSearch)
|
||||
data, baseUrl = getPageContent(url1, cls.session)
|
||||
url2 = fetchUrl(url1, data, baseUrl, cls.nextSearch)
|
||||
data = cls.getPage(cls.url)
|
||||
url1 = cls.fetchUrl(cls.url, data, cls.prevSearch)
|
||||
data = cls.getPage(url1)
|
||||
url2 = cls.fetchUrl(url1, data, cls.nextSearch)
|
||||
return cls.prevUrlModifier(url2)
|
||||
|
||||
@classmethod
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
# Copyright (C) 2012-2014 Bastian Kleineidam
|
||||
from re import compile
|
||||
from ..scraper import make_scraper
|
||||
from ..util import tagre, quote, fetchUrl, case_insensitive_re, getPageContent
|
||||
from ..util import tagre, quote, case_insensitive_re
|
||||
|
||||
# SmackJeeves is a crawler's nightmare - users are allowed to edit HTML directly.
|
||||
# That's why there are so many different search patterns.
|
||||
|
@ -45,11 +45,11 @@ def add(name, url, description, adult, bounce):
|
|||
def _starter(cls):
|
||||
"""Get start URL."""
|
||||
url1 = modifier(url)
|
||||
data, baseUrl = getPageContent(url1, cls.session)
|
||||
url2 = fetchUrl(url1, data, baseUrl, cls.prevSearch)
|
||||
data = cls.getPage(url1)
|
||||
url2 = cls.fetchUrl(url1, data, cls.prevSearch)
|
||||
if bounce:
|
||||
data, baseUrl = getPageContent(url2, cls.session)
|
||||
url3 = fetchUrl(url2, data, baseUrl, _nextSearch)
|
||||
data = cls.getPage(url2)
|
||||
url3 = cls.fetchUrl(url2, data, _nextSearch)
|
||||
return modifier(url3)
|
||||
return modifier(url2)
|
||||
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
from re import compile, escape, IGNORECASE
|
||||
from ..scraper import _BasicScraper
|
||||
from ..helpers import indirectStarter
|
||||
from ..util import tagre, fetchUrl, getPageContent
|
||||
from ..util import tagre
|
||||
|
||||
|
||||
class TheBrads(_BasicScraper):
|
||||
|
@ -223,11 +223,11 @@ class TheThinHLine(_BasicScraper):
|
|||
|
||||
indirectImageSearch = compile(tagre('a', 'href', r'(%simage/\d+)' % rurl))
|
||||
|
||||
def getComicStrip(self, url, data, baseUrl):
|
||||
def getComicStrip(self, url, data):
|
||||
"""The comic strip image is in a separate page."""
|
||||
pageUrl = fetchUrl(url, data, baseUrl, self.indirectImageSearch)
|
||||
pageData, pageBaseUrl = getPageContent(pageUrl, self.session)
|
||||
return super(TheThinHLine, self).getComicStrip(pageUrl, pageData, pageBaseUrl)
|
||||
pageUrl = self.fetchUrl(url, data, self.indirectImageSearch)
|
||||
pageData = self.getPage(pageUrl)
|
||||
return super(TheThinHLine, self).getComicStrip(pageUrl, pageData)
|
||||
|
||||
@classmethod
|
||||
def namer(cls, imageUrl, pageUrl):
|
||||
|
|
Loading…
Reference in a new issue