Simplify the fetchUrl code.

This commit is contained in:
Bastian Kleineidam 2013-02-11 19:43:46 +01:00
parent ae81b88acd
commit 67836942d8
6 changed files with 55 additions and 87 deletions

View file

@ -1,7 +1,7 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2013 Bastian Kleineidam
from .util import fetchUrl, getQueryParams
from .util import fetchUrl, getPageContent, getQueryParams
def queryNamer(paramName, usePageUrl=False):
"""Get name from URL query part."""
@ -29,24 +29,18 @@ def bounceStarter(url, nextSearch):
@classmethod
def _starter(cls):
"""Get bounced start URL."""
url1 = fetchUrl(url, cls.prevSearch, session=cls.session)
if not url1:
raise ValueError("could not find prevSearch pattern %r in %s" % (cls.prevSearch.pattern, url))
url2 = fetchUrl(url1, nextSearch, session=cls.session)
if not url2:
raise ValueError("could not find nextSearch pattern %r in %s" % (nextSearch.pattern, url1))
return url2
data, baseUrl = getPageContent(url, session=cls.session)
url1 = fetchUrl(url, data, baseUrl, cls.prevSearch)
data, baseUrl = getPageContent(url1, session=cls.session)
return fetchUrl(url1, data, baseUrl, nextSearch)
return _starter
def indirectStarter(baseUrl, latestSearch):
def indirectStarter(url, latestSearch):
"""Get start URL by indirection."""
@classmethod
def _starter(cls):
"""Get indirect start URL."""
url = fetchUrl(baseUrl, latestSearch, session=cls.session)
if not url:
raise ValueError("could not find latestSearch pattern %r in %s" % (latestSearch.pattern, baseUrl))
return url
data, baseUrl = getPageContent(url, session=cls.session)
return fetchUrl(url, data, baseUrl, latestSearch)
return _starter

View file

@ -3,7 +3,7 @@
# Copyright (C) 2012-2013 Bastian Kleineidam
from re import compile
from ..scraper import make_scraper
from ..util import tagre, getQueryParams, fetchUrl
from ..util import tagre, getQueryParams, fetchUrl, getPageContent
_linkTag = tagre("a", "href", r'([^"]+)')
@ -25,17 +25,15 @@ def add(name, shortName, imageFolder=None, lastStrip=None):
@classmethod
def _starter(cls):
# first, try hopping to previous and next comic
url = fetchUrl(baseUrl, _prevSearch)
if not url:
data, _baseUrl = getPageContent(baseUrl, session=cls.session)
try:
url = fetchUrl(baseUrl, data, _baseUrl, _prevSearch)
except ValueError:
# no previous link found, try hopping to last comic
url = fetchUrl(baseUrl, _lastSearch)
if not url:
raise ValueError("could not find lastSearch pattern %r in %s" % (_lastSearch.pattern, baseUrl))
return url
url = fetchUrl(url, _nextSearch)
if not url:
raise ValueError("could not find nextSearch pattern %r in %s" % (_nextSearch.pattern, url))
return url
return fetchUrl(baseUrl, data, _baseUrl, _lastSearch)
else:
data, _baseUrl = getPageContent(url, session=cls.session)
return fetchUrl(url, data, _baseUrl, _nextSearch)
attrs = dict(
name='CloneManga/' + name,

View file

@ -4,7 +4,7 @@
from re import compile
from ..scraper import make_scraper
from ..util import tagre, fetchUrl
from ..util import tagre, fetchUrl, getPageContent
# note: adding the compile() functions inside add() is a major performance hog
_imageSearch = compile(tagre("img", "src", r'(http://media\.drunkduck\.com\.s3\.amazonaws\.com:80/[^"]+)', before="page-image"))
@ -26,17 +26,15 @@ def add(name, path):
@classmethod
def _starter(cls):
# first, try hopping to previous and next comic
url = fetchUrl(_url, _prevSearch)
if not url:
data, baseUrl = getPageContent(_url, session=cls.session)
try:
url = fetchUrl(_url, data, baseUrl, _prevSearch)
except ValueError:
# no previous link found, try hopping to last comic
url = fetchUrl(_url, _lastSearch)
if not url:
raise ValueError("could not find lastSearch pattern %r in %s" % (_lastSearch.pattern, _url))
return url
url = fetchUrl(url, _nextSearch)
if not url:
raise ValueError("could not find nextSearch pattern %r in %s" % (_nextSearch.pattern, url))
return url
return fetchUrl(_url, data, baseUrl, _lastSearch)
else:
data, baseUrl = getPageContent(url, session=cls.session)
return fetchUrl(url, data, baseUrl, _nextSearch)
globals()[classname] = make_scraper(classname,
name = 'DrunkDuck/' + name,

View file

@ -3,7 +3,7 @@
# Copyright (C) 2012-2013 Bastian Kleineidam
from re import compile
from ..scraper import make_scraper
from ..util import tagre, quote, fetchUrl, case_insensitive_re
from ..util import tagre, quote, fetchUrl, case_insensitive_re, getPageContent
_imageSearch = compile(tagre("img", "src", r'([^"]+)', after='id="comic_image"'))
_linkSearch = tagre("a", "href", r'([^>"]*/comics/\d+/[^>"]*)', quote='"?')
@ -30,15 +30,14 @@ def add(name, url, description, adult, bounce):
@classmethod
def _starter(cls):
"""Get start URL."""
url1 = fetchUrl(modifier(url), cls.prevSearch, session=cls.session)
if not url1:
raise ValueError("could not find prevSearch pattern %r in %s" % (cls.prevSearch.pattern, modifier(url)))
url1 = modifier(url)
data, baseUrl = getPageContent(url1, session=cls.session)
url2 = fetchUrl(url1, data, baseUrl, cls.prevSearch)
if bounce:
url2 = fetchUrl(modifier(url1), _nextSearch, session=cls.session)
if not url2:
raise ValueError("could not find nextSearch pattern %r in %s" % (_nextSearch.pattern, modifier(url1)))
return modifier(url2)
return modifier(url1)
data, baseUrl = getPageContent(url2, session=cls.session)
url3 = fetchUrl(url2, data, baseUrl, _nextSearch)
return modifier(url3)
return modifier(url2)
@classmethod
def namer(cls, imageUrl, pageUrl):

View file

@ -3,7 +3,7 @@
# Copyright (C) 2012-2013 Bastian Kleineidam
import requests
from . import loader
from .util import fetchUrls
from .util import fetchUrl, fetchUrls, getPageContent
from .comic import ComicStrip
from .output import out
@ -62,7 +62,8 @@ class _BasicScraper(object):
def getStrip(self, url):
"""Get comic strip for given URL."""
imageUrls = fetchUrls(url, self.imageSearch, session=self.session)[0]
data, baseUrl = getPageContent(url, session=self.session)
imageUrls = set(fetchUrls(url, data, baseUrl, self.imageSearch))
if len(imageUrls) > 1 and not self.multipleImagesPerStrip:
out.warn("found %d images instead of 1 with %s" % (len(imageUrls), self.imageSearch.pattern))
return self.getComicStrip(url, imageUrls)
@ -97,12 +98,13 @@ class _BasicScraper(object):
retrieving the given number of strips."""
seen_urls = set()
while url:
imageUrls, prevUrl = fetchUrls(url, self.imageSearch,
self.prevSearch, session=self.session)
data, baseUrl = getPageContent(url, session=self.session)
imageUrls = set(fetchUrls(url, data, baseUrl, self.imageSearch))
yield self.getComicStrip(url, imageUrls)
prevUrl = fetchUrl(url, data, baseUrl, self.prevSearch)
prevUrl = self.prevUrlModifier(prevUrl)
out.debug("Matched previous URL %s" % prevUrl)
seen_urls.add(url)
yield self.getComicStrip(url, imageUrls)
if prevUrl in seen_urls:
# avoid recursive URL loops
out.warn("Already seen previous URL %r" % prevUrl)

View file

@ -99,8 +99,7 @@ def getPageContent(url, max_content_bytes=MaxContentBytes, session=None):
"""Get text content of given URL."""
check_robotstxt(url)
# read page data
page = urlopen(url, max_content_bytes=max_content_bytes,
session=session)
page = urlopen(url, max_content_bytes=max_content_bytes, session=session)
data = page.text
# determine base URL
baseUrl = None
@ -117,45 +116,23 @@ def getImageObject(url, referrer, max_content_bytes=MaxImageBytes):
return urlopen(url, referrer=referrer, max_content_bytes=max_content_bytes)
def fetchUrl(url, urlSearch, session=None):
"""Search for given URL pattern in a HTML page."""
data, baseUrl = getPageContent(url, session=session)
match = urlSearch.search(data)
if match:
def fetchUrls(url, data, baseUrl, urlSearch):
"""Search all entries for given URL pattern in a HTML page."""
searchUrls = []
for match in urlSearch.finditer(data):
searchUrl = match.group(1)
if not searchUrl:
raise ValueError("Match empty URL at %s with pattern %s" % (url, urlSearch.pattern))
out.debug('matched URL %r' % searchUrl)
return normaliseURL(urlparse.urljoin(baseUrl, searchUrl))
return None
raise ValueError("Pattern %s matched empty URL at %s." % (urlSearch.pattern, url))
out.debug('matched URL %r with pattern %s' % (searchUrl, urlSearch.pattern))
searchUrls.append(normaliseURL(urlparse.urljoin(baseUrl, searchUrl)))
if not searchUrls:
raise ValueError("Pattern %s not found at URL %s with data %r." % (urlSearch.pattern, url, data))
return searchUrls
def fetchUrls(url, imageSearch, prevSearch=None, session=None):
"""Search for given image and previous URL pattern in a HTML page."""
data, baseUrl = getPageContent(url, session=session)
# match images
imageUrls = set()
for match in imageSearch.finditer(data):
imageUrl = match.group(1)
if not imageUrl:
raise ValueError("Match empty image URL at %s with pattern %s" % (url, imageSearch.pattern))
out.debug('matched image URL %r with pattern %s' % (imageUrl, imageSearch.pattern))
imageUrls.add(normaliseURL(urlparse.urljoin(baseUrl, imageUrl)))
if not imageUrls:
out.warn("no images found at %s with pattern %s" % (url, imageSearch.pattern))
if prevSearch is not None:
# match previous URL
match = prevSearch.search(data)
if match:
prevUrl = match.group(1)
if not prevUrl:
raise ValueError("Match empty previous URL at %s with pattern %s" % (url, prevSearch.pattern))
prevUrl = normaliseURL(urlparse.urljoin(baseUrl, prevUrl))
else:
out.debug('no previous URL %s at %s' % (prevSearch.pattern, url))
prevUrl = None
return imageUrls, prevUrl
return imageUrls, None
def fetchUrl(url, data, baseUrl, urlSearch):
"""Search first entry for given URL pattern in a HTML page."""
return fetchUrls(url, data, baseUrl, urlSearch)[0]
def unescape(text):