# -*- coding: utf-8 -*-
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2019 Tobias Gruetzmacher

from __future__ import absolute_import, division, print_function

from .util import getQueryParams


def queryNamer(param, use_page_url=False):
    """Get name from URL query part."""
    def _namer(self, image_url, page_url):
        """Get URL query part."""
        url = page_url if use_page_url else image_url
        return getQueryParams(url)[param][0]
    return _namer
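
# Illustrative usage (an assumption, not part of this module): a scraper class
# could set
#     namer = queryNamer('p')
# so that an image URL like "http://www.example.com/strips/view.php?p=42" is
# named "42". With use_page_url=True, the parameter is read from the page URL
# instead of the image URL.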


def regexNamer(regex, use_page_url=False):
    """Get name from regular expression."""
    def _namer(self, image_url, page_url):
        """Get first regular expression group."""
        url = page_url if use_page_url else image_url
        mo = regex.search(url)
        if mo:
            return mo.group(1)
    return _namer
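
# Illustrative usage (an assumption, not part of this module):
#     namer = regexNamer(re.compile(r'/strips/(\d+)\.png'))
# names "http://www.example.com/strips/123.png" as "123". A precompiled
# pattern is expected; when it does not match, _namer returns None.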


def joinPathPartsNamer(pageurlparts, imageurlparts=(-1,), joinchar='_'):
    """Get name by joining selected URL path parts with joinchar (an
    underscore by default)."""
    def _namer(self, imageurl, pageurl):
        # Split the URLs and drop the scheme and host name
        pageurlsplit = pageurl.split('/')[3:]
        imageurlsplit = imageurl.split('/')[3:]
        joinparts = ([pageurlsplit[i] for i in pageurlparts] +
                     [imageurlsplit[i] for i in imageurlparts])
        return joinchar.join(joinparts)
    return _namer
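
# Illustrative example (an assumption, not part of this module):
#     namer = joinPathPartsNamer((-2, -1))
# with page URL "https://example.com/comic/chapter-3/page-7" and image URL
# "https://example.com/img/strip-0042.png" yields the name
# "chapter-3_page-7_strip-0042.png"; the indices select parts after the scheme
# and host have been dropped.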


def bounceStarter(self):
    """Get start URL by "bouncing" back and forth one time.

    This needs the url, prevSearch and nextSearch properties to be defined on
    the class.
    """
    data = self.getPage(self.url)
    prevurl = self.fetchUrl(self.url, data, self.prevSearch)
    prevurl = self.link_modifier(self.url, prevurl)
    data = self.getPage(prevurl)
    nexturl = self.fetchUrl(prevurl, data, self.nextSearch)
    return self.link_modifier(prevurl, nexturl)
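
# Illustrative usage (an assumption, not part of this module): a scraper whose
# front page already shows the newest strip could set
#     starter = bounceStarter
# so the start URL becomes the stable permalink of the latest strip (found by
# going one step back and then forward again) instead of the front page.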


def indirectStarter(self):
    """Get start URL by indirection.

    This is useful for comics where the latest comic can't be reached at a
    stable URL. If the class has an attribute 'startUrl', this page is fetched
    first, otherwise the page at 'url' is fetched. After that, the attribute
    'latestSearch' is used on the page content to find the latest strip.
    """
    url = self.startUrl if hasattr(self, "startUrl") else self.url
    data = self.getPage(url)
    newurl = self.fetchUrl(url, data, self.latestSearch)
    return self.link_modifier(url, newurl)
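
# Illustrative usage (an assumption, not part of this module):
#     starter = indirectStarter
# together with a latestSearch pattern lets a scraper start from an archive or
# overview page ('startUrl' or 'url') and jump to the newest strip from there.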


def xpath_class(name):
    """Return an XPath expression that matches a tag with the specified
    class."""
    return 'contains(concat(" ", @class, " "), " %s ")' % name
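
# Example (derived from the return value above): xpath_class('comic') produces
#     contains(concat(" ", @class, " "), " comic ")
# which can be embedded in a larger expression such as
#     '//div[%s]//img' % xpath_class('comic')
# to match elements whose class attribute contains "comic" as a whole word.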