dosage/dosagelib/helpers.py

100 lines
3.2 KiB
Python
Raw Normal View History

# -*- coding: iso-8859-1 -*-
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012 Bastian Kleineidam
2012-06-20 19:58:13 +00:00
import re
2012-10-11 10:03:12 +00:00
from .util import fetchUrl, getQueryParams
from .scraper import _BasicScraper
2012-06-20 19:58:13 +00:00
def queryNamer(paramName, usePageUrl=False):
    """Get name from URL query part.

    @param paramName: name of the query parameter to read; element 0 of
        the value stored under that name is returned as the comic name.
    @param usePageUrl: if true the parameter is read from the page URL,
        otherwise (the default) from the image URL.
    @return: a C{staticmethod} wrapping C{_namer(imageUrl, pageUrl)}.
    """
    @staticmethod
    def _namer(imageUrl, pageUrl):
        # Pick the URL with an explicit conditional instead of indexing a
        # tuple with the flag, so any truthy/falsy flag value works.
        url = pageUrl if usePageUrl else imageUrl
        return getQueryParams(url)[paramName][0]
    return _namer
def regexNamer(regex):
    """Get name from regular expression.

    @param regex: compiled pattern with at least one capture group; it is
        searched against the image URL only (the page URL is ignored).
    @return: a C{staticmethod} wrapping C{_namer(imageUrl, pageUrl)}, which
        yields the first capture group, or None if the pattern is not found.
    """
    @staticmethod
    def _namer(imageUrl, pageUrl):
        match = regex.search(imageUrl)
        if not match:
            # No match: no name can be derived from this image URL.
            return None
        return match.group(1)
    return _namer
def bounceStarter(latestUrl, nextSearch):
    """Get start URL by "bouncing" back and forth one time.

    The returned class method first follows the class's C{prevSearch}
    pattern from C{latestUrl}, then follows C{nextSearch} from the page
    found that way, and returns the resulting URL.

    @param latestUrl: URL of the page the bounce starts from.
    @param nextSearch: compiled regex used for the forward step.
    @return: a C{classmethod} wrapping C{_starter(cls)}.
    @raise ValueError: (from C{_starter}) if either step finds no URL.
    """
    @classmethod
    def _starter(cls):
        prevUrl = fetchUrl(latestUrl, cls.prevSearch)
        if not prevUrl:
            raise ValueError("could not find prevSearch pattern %r in %s" % (cls.prevSearch.pattern, latestUrl))
        startUrl = fetchUrl(prevUrl, nextSearch)
        if not startUrl:
            # nextSearch was applied to prevUrl, so that is the URL to
            # report (the message used to name latestUrl by mistake).
            raise ValueError("could not find nextSearch pattern %r in %s" % (nextSearch.pattern, prevUrl))
        return startUrl
    return _starter
def indirectStarter(baseUrl, latestSearch):
    """Get start URL by indirection.

    @param baseUrl: URL of the page that is searched.
    @param latestSearch: compiled regex passed to fetchUrl for that page.
    @return: a C{staticmethod} wrapping C{_starter()}, which returns the
        URL found or raises ValueError when fetchUrl yields nothing.
    """
    @staticmethod
    def _starter():
        found = fetchUrl(baseUrl, latestSearch)
        if found:
            return found
        raise ValueError("could not find latestSearch pattern %r in %s" % (latestSearch.pattern, baseUrl))
    return _starter
class IndirectLatestMixin(object):
    '''
    Mixin for comics whose latest strip is reached through a link on some
    base page. The base page may also link only to the start of the latest
    chapter (or use a similar scheme): when a 'nextSearch' regex is defined,
    'next' links are followed from the linked page until none is found.

    @type baseUrl: C{string}
    @cvar baseUrl: the URL where the link to the latest comic is found.
    @type latestSearch: C{regex}
    @cvar latestSearch: a compiled regex for finding the 'latest' URL.
    @type nextSearch: C{regex}
    @cvar nextSearch: a compiled regex for finding the 'next' URL.
    '''

    # Cached result of getLatestUrl(); stays None until the first lookup.
    __latestUrl = None

    def getLatestUrl(self):
        """Get latest comic URL."""
        if self.__latestUrl:
            # Already resolved on an earlier call.
            return self.__latestUrl
        self.__latestUrl = fetchUrl(self.baseUrl, self.latestSearch)
        if hasattr(self, "nextSearch"):
            # Walk forward, remembering each page reached, until no
            # further 'next' link is found.
            while True:
                followingUrl = fetchUrl(self.__latestUrl, self.nextSearch)
                if not followingUrl:
                    break
                self.__latestUrl = followingUrl
        return self.__latestUrl

    latestUrl = property(getLatestUrl)
class _PHPScraper(_BasicScraper):
    """
    Scraper for comics using phpComic/CUSP.

    This provides an easy way to define scrapers for webcomics using phpComic.
    The class reads C{basePath} and C{latestUrl} but defines neither, so
    they have to be supplied elsewhere (presumably by each subclass).
    """
    # URL template: basePath followed by the daily.php query, with a '%s'
    # placeholder left in for the index.
    imageUrl = property(lambda self: self.basePath + 'daily.php?date=%s')
    # basePath is a literal URL prefix, so it is escaped before being put
    # into the pattern; otherwise '.', '?' and similar characters in the
    # URL would be interpreted as regex metacharacters.
    imageSearch = property(lambda self: re.compile(r'<img alt=[^>]+ src="(%scomics/\d{6}\..+?)">' % (re.escape(self.basePath),)))
    help = 'Index format: yymmdd'

    @classmethod
    def starter(cls):
        """Get starter URL."""
        return cls.basePath + cls.latestUrl