Simplify the fetchUrl code.

This commit is contained in:
Bastian Kleineidam 2013-02-11 19:43:46 +01:00
parent ae81b88acd
commit 67836942d8
6 changed files with 55 additions and 87 deletions

View file

@@ -1,7 +1,7 @@
# -*- coding: iso-8859-1 -*- # -*- coding: iso-8859-1 -*-
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2013 Bastian Kleineidam # Copyright (C) 2012-2013 Bastian Kleineidam
from .util import fetchUrl, getQueryParams from .util import fetchUrl, getPageContent, getQueryParams
def queryNamer(paramName, usePageUrl=False): def queryNamer(paramName, usePageUrl=False):
"""Get name from URL query part.""" """Get name from URL query part."""
@@ -29,24 +29,18 @@ def bounceStarter(url, nextSearch):
@classmethod @classmethod
def _starter(cls): def _starter(cls):
"""Get bounced start URL.""" """Get bounced start URL."""
url1 = fetchUrl(url, cls.prevSearch, session=cls.session) data, baseUrl = getPageContent(url, session=cls.session)
if not url1: url1 = fetchUrl(url, data, baseUrl, cls.prevSearch)
raise ValueError("could not find prevSearch pattern %r in %s" % (cls.prevSearch.pattern, url)) data, baseUrl = getPageContent(url1, session=cls.session)
url2 = fetchUrl(url1, nextSearch, session=cls.session) return fetchUrl(url1, data, baseUrl, nextSearch)
if not url2:
raise ValueError("could not find nextSearch pattern %r in %s" % (nextSearch.pattern, url1))
return url2
return _starter return _starter
def indirectStarter(baseUrl, latestSearch): def indirectStarter(url, latestSearch):
"""Get start URL by indirection.""" """Get start URL by indirection."""
@classmethod @classmethod
def _starter(cls): def _starter(cls):
"""Get indirect start URL.""" """Get indirect start URL."""
url = fetchUrl(baseUrl, latestSearch, session=cls.session) data, baseUrl = getPageContent(url, session=cls.session)
if not url: return fetchUrl(url, data, baseUrl, latestSearch)
raise ValueError("could not find latestSearch pattern %r in %s" % (latestSearch.pattern, baseUrl))
return url
return _starter return _starter

View file

@@ -3,7 +3,7 @@
# Copyright (C) 2012-2013 Bastian Kleineidam # Copyright (C) 2012-2013 Bastian Kleineidam
from re import compile from re import compile
from ..scraper import make_scraper from ..scraper import make_scraper
from ..util import tagre, getQueryParams, fetchUrl from ..util import tagre, getQueryParams, fetchUrl, getPageContent
_linkTag = tagre("a", "href", r'([^"]+)') _linkTag = tagre("a", "href", r'([^"]+)')
@@ -25,17 +25,15 @@ def add(name, shortName, imageFolder=None, lastStrip=None):
@classmethod @classmethod
def _starter(cls): def _starter(cls):
# first, try hopping to previous and next comic # first, try hopping to previous and next comic
url = fetchUrl(baseUrl, _prevSearch) data, _baseUrl = getPageContent(baseUrl, session=cls.session)
if not url: try:
url = fetchUrl(baseUrl, data, _baseUrl, _prevSearch)
except ValueError:
# no previous link found, try hopping to last comic # no previous link found, try hopping to last comic
url = fetchUrl(baseUrl, _lastSearch) return fetchUrl(baseUrl, data, _baseUrl, _lastSearch)
if not url: else:
raise ValueError("could not find lastSearch pattern %r in %s" % (_lastSearch.pattern, baseUrl)) data, _baseUrl = getPageContent(url, session=cls.session)
return url return fetchUrl(url, data, _baseUrl, _nextSearch)
url = fetchUrl(url, _nextSearch)
if not url:
raise ValueError("could not find nextSearch pattern %r in %s" % (_nextSearch.pattern, url))
return url
attrs = dict( attrs = dict(
name='CloneManga/' + name, name='CloneManga/' + name,

View file

@@ -4,7 +4,7 @@
from re import compile from re import compile
from ..scraper import make_scraper from ..scraper import make_scraper
from ..util import tagre, fetchUrl from ..util import tagre, fetchUrl, getPageContent
# note: adding the compile() functions inside add() is a major performance hog # note: adding the compile() functions inside add() is a major performance hog
_imageSearch = compile(tagre("img", "src", r'(http://media\.drunkduck\.com\.s3\.amazonaws\.com:80/[^"]+)', before="page-image")) _imageSearch = compile(tagre("img", "src", r'(http://media\.drunkduck\.com\.s3\.amazonaws\.com:80/[^"]+)', before="page-image"))
@@ -26,17 +26,15 @@ def add(name, path):
@classmethod @classmethod
def _starter(cls): def _starter(cls):
# first, try hopping to previous and next comic # first, try hopping to previous and next comic
url = fetchUrl(_url, _prevSearch) data, baseUrl = getPageContent(_url, session=cls.session)
if not url: try:
url = fetchUrl(_url, data, baseUrl, _prevSearch)
except ValueError:
# no previous link found, try hopping to last comic # no previous link found, try hopping to last comic
url = fetchUrl(_url, _lastSearch) return fetchUrl(_url, data, baseUrl, _lastSearch)
if not url: else:
raise ValueError("could not find lastSearch pattern %r in %s" % (_lastSearch.pattern, _url)) data, baseUrl = getPageContent(url, session=cls.session)
return url return fetchUrl(url, data, baseUrl, _nextSearch)
url = fetchUrl(url, _nextSearch)
if not url:
raise ValueError("could not find nextSearch pattern %r in %s" % (_nextSearch.pattern, url))
return url
globals()[classname] = make_scraper(classname, globals()[classname] = make_scraper(classname,
name = 'DrunkDuck/' + name, name = 'DrunkDuck/' + name,

View file

@@ -3,7 +3,7 @@
# Copyright (C) 2012-2013 Bastian Kleineidam # Copyright (C) 2012-2013 Bastian Kleineidam
from re import compile from re import compile
from ..scraper import make_scraper from ..scraper import make_scraper
from ..util import tagre, quote, fetchUrl, case_insensitive_re from ..util import tagre, quote, fetchUrl, case_insensitive_re, getPageContent
_imageSearch = compile(tagre("img", "src", r'([^"]+)', after='id="comic_image"')) _imageSearch = compile(tagre("img", "src", r'([^"]+)', after='id="comic_image"'))
_linkSearch = tagre("a", "href", r'([^>"]*/comics/\d+/[^>"]*)', quote='"?') _linkSearch = tagre("a", "href", r'([^>"]*/comics/\d+/[^>"]*)', quote='"?')
@@ -30,15 +30,14 @@ def add(name, url, description, adult, bounce):
@classmethod @classmethod
def _starter(cls): def _starter(cls):
"""Get start URL.""" """Get start URL."""
url1 = fetchUrl(modifier(url), cls.prevSearch, session=cls.session) url1 = modifier(url)
if not url1: data, baseUrl = getPageContent(url1, session=cls.session)
raise ValueError("could not find prevSearch pattern %r in %s" % (cls.prevSearch.pattern, modifier(url))) url2 = fetchUrl(url1, data, baseUrl, cls.prevSearch)
if bounce: if bounce:
url2 = fetchUrl(modifier(url1), _nextSearch, session=cls.session) data, baseUrl = getPageContent(url2, session=cls.session)
if not url2: url3 = fetchUrl(url2, data, baseUrl, _nextSearch)
raise ValueError("could not find nextSearch pattern %r in %s" % (_nextSearch.pattern, modifier(url1))) return modifier(url3)
return modifier(url2) return modifier(url2)
return modifier(url1)
@classmethod @classmethod
def namer(cls, imageUrl, pageUrl): def namer(cls, imageUrl, pageUrl):

View file

@@ -3,7 +3,7 @@
# Copyright (C) 2012-2013 Bastian Kleineidam # Copyright (C) 2012-2013 Bastian Kleineidam
import requests import requests
from . import loader from . import loader
from .util import fetchUrls from .util import fetchUrl, fetchUrls, getPageContent
from .comic import ComicStrip from .comic import ComicStrip
from .output import out from .output import out
@@ -62,7 +62,8 @@ class _BasicScraper(object):
def getStrip(self, url): def getStrip(self, url):
"""Get comic strip for given URL.""" """Get comic strip for given URL."""
imageUrls = fetchUrls(url, self.imageSearch, session=self.session)[0] data, baseUrl = getPageContent(url, session=self.session)
imageUrls = set(fetchUrls(url, data, baseUrl, self.imageSearch))
if len(imageUrls) > 1 and not self.multipleImagesPerStrip: if len(imageUrls) > 1 and not self.multipleImagesPerStrip:
out.warn("found %d images instead of 1 with %s" % (len(imageUrls), self.imageSearch.pattern)) out.warn("found %d images instead of 1 with %s" % (len(imageUrls), self.imageSearch.pattern))
return self.getComicStrip(url, imageUrls) return self.getComicStrip(url, imageUrls)
@@ -97,12 +98,13 @@ class _BasicScraper(object):
retrieving the given number of strips.""" retrieving the given number of strips."""
seen_urls = set() seen_urls = set()
while url: while url:
imageUrls, prevUrl = fetchUrls(url, self.imageSearch, data, baseUrl = getPageContent(url, session=self.session)
self.prevSearch, session=self.session) imageUrls = set(fetchUrls(url, data, baseUrl, self.imageSearch))
yield self.getComicStrip(url, imageUrls)
prevUrl = fetchUrl(url, data, baseUrl, self.prevSearch)
prevUrl = self.prevUrlModifier(prevUrl) prevUrl = self.prevUrlModifier(prevUrl)
out.debug("Matched previous URL %s" % prevUrl) out.debug("Matched previous URL %s" % prevUrl)
seen_urls.add(url) seen_urls.add(url)
yield self.getComicStrip(url, imageUrls)
if prevUrl in seen_urls: if prevUrl in seen_urls:
# avoid recursive URL loops # avoid recursive URL loops
out.warn("Already seen previous URL %r" % prevUrl) out.warn("Already seen previous URL %r" % prevUrl)

View file

@@ -99,8 +99,7 @@ def getPageContent(url, max_content_bytes=MaxContentBytes, session=None):
"""Get text content of given URL.""" """Get text content of given URL."""
check_robotstxt(url) check_robotstxt(url)
# read page data # read page data
page = urlopen(url, max_content_bytes=max_content_bytes, page = urlopen(url, max_content_bytes=max_content_bytes, session=session)
session=session)
data = page.text data = page.text
# determine base URL # determine base URL
baseUrl = None baseUrl = None
@@ -117,45 +116,23 @@ def getImageObject(url, referrer, max_content_bytes=MaxImageBytes):
return urlopen(url, referrer=referrer, max_content_bytes=max_content_bytes) return urlopen(url, referrer=referrer, max_content_bytes=max_content_bytes)
def fetchUrl(url, urlSearch, session=None): def fetchUrls(url, data, baseUrl, urlSearch):
"""Search for given URL pattern in a HTML page.""" """Search all entries for given URL pattern in a HTML page."""
data, baseUrl = getPageContent(url, session=session) searchUrls = []
match = urlSearch.search(data) for match in urlSearch.finditer(data):
if match:
searchUrl = match.group(1) searchUrl = match.group(1)
if not searchUrl: if not searchUrl:
raise ValueError("Match empty URL at %s with pattern %s" % (url, urlSearch.pattern)) raise ValueError("Pattern %s matched empty URL at %s." % (urlSearch.pattern, url))
out.debug('matched URL %r' % searchUrl) out.debug('matched URL %r with pattern %s' % (searchUrl, urlSearch.pattern))
return normaliseURL(urlparse.urljoin(baseUrl, searchUrl)) searchUrls.append(normaliseURL(urlparse.urljoin(baseUrl, searchUrl)))
return None if not searchUrls:
raise ValueError("Pattern %s not found at URL %s with data %r." % (urlSearch.pattern, url, data))
return searchUrls
def fetchUrls(url, imageSearch, prevSearch=None, session=None): def fetchUrl(url, data, baseUrl, urlSearch):
"""Search for given image and previous URL pattern in a HTML page.""" """Search first entry for given URL pattern in a HTML page."""
data, baseUrl = getPageContent(url, session=session) return fetchUrls(url, data, baseUrl, urlSearch)[0]
# match images
imageUrls = set()
for match in imageSearch.finditer(data):
imageUrl = match.group(1)
if not imageUrl:
raise ValueError("Match empty image URL at %s with pattern %s" % (url, imageSearch.pattern))
out.debug('matched image URL %r with pattern %s' % (imageUrl, imageSearch.pattern))
imageUrls.add(normaliseURL(urlparse.urljoin(baseUrl, imageUrl)))
if not imageUrls:
out.warn("no images found at %s with pattern %s" % (url, imageSearch.pattern))
if prevSearch is not None:
# match previous URL
match = prevSearch.search(data)
if match:
prevUrl = match.group(1)
if not prevUrl:
raise ValueError("Match empty previous URL at %s with pattern %s" % (url, prevSearch.pattern))
prevUrl = normaliseURL(urlparse.urljoin(baseUrl, prevUrl))
else:
out.debug('no previous URL %s at %s' % (prevSearch.pattern, url))
prevUrl = None
return imageUrls, prevUrl
return imageUrls, None
def unescape(text): def unescape(text):