Simplify the fetchUrl code.
This commit is contained in:
parent
ae81b88acd
commit
67836942d8
6 changed files with 55 additions and 87 deletions
|
@ -1,7 +1,7 @@
|
|||
# -*- coding: iso-8859-1 -*-
|
||||
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
||||
# Copyright (C) 2012-2013 Bastian Kleineidam
|
||||
from .util import fetchUrl, getQueryParams
|
||||
from .util import fetchUrl, getPageContent, getQueryParams
|
||||
|
||||
def queryNamer(paramName, usePageUrl=False):
|
||||
"""Get name from URL query part."""
|
||||
|
@ -29,24 +29,18 @@ def bounceStarter(url, nextSearch):
|
|||
@classmethod
|
||||
def _starter(cls):
|
||||
"""Get bounced start URL."""
|
||||
url1 = fetchUrl(url, cls.prevSearch, session=cls.session)
|
||||
if not url1:
|
||||
raise ValueError("could not find prevSearch pattern %r in %s" % (cls.prevSearch.pattern, url))
|
||||
url2 = fetchUrl(url1, nextSearch, session=cls.session)
|
||||
if not url2:
|
||||
raise ValueError("could not find nextSearch pattern %r in %s" % (nextSearch.pattern, url1))
|
||||
return url2
|
||||
data, baseUrl = getPageContent(url, session=cls.session)
|
||||
url1 = fetchUrl(url, data, baseUrl, cls.prevSearch)
|
||||
data, baseUrl = getPageContent(url1, session=cls.session)
|
||||
return fetchUrl(url1, data, baseUrl, nextSearch)
|
||||
return _starter
|
||||
|
||||
|
||||
def indirectStarter(baseUrl, latestSearch):
|
||||
def indirectStarter(url, latestSearch):
|
||||
"""Get start URL by indirection."""
|
||||
@classmethod
|
||||
def _starter(cls):
|
||||
"""Get indirect start URL."""
|
||||
url = fetchUrl(baseUrl, latestSearch, session=cls.session)
|
||||
if not url:
|
||||
raise ValueError("could not find latestSearch pattern %r in %s" % (latestSearch.pattern, baseUrl))
|
||||
return url
|
||||
data, baseUrl = getPageContent(url, session=cls.session)
|
||||
return fetchUrl(url, data, baseUrl, latestSearch)
|
||||
return _starter
|
||||
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
# Copyright (C) 2012-2013 Bastian Kleineidam
|
||||
from re import compile
|
||||
from ..scraper import make_scraper
|
||||
from ..util import tagre, getQueryParams, fetchUrl
|
||||
from ..util import tagre, getQueryParams, fetchUrl, getPageContent
|
||||
|
||||
|
||||
_linkTag = tagre("a", "href", r'([^"]+)')
|
||||
|
@ -25,17 +25,15 @@ def add(name, shortName, imageFolder=None, lastStrip=None):
|
|||
@classmethod
|
||||
def _starter(cls):
|
||||
# first, try hopping to previous and next comic
|
||||
url = fetchUrl(baseUrl, _prevSearch)
|
||||
if not url:
|
||||
data, _baseUrl = getPageContent(baseUrl, session=cls.session)
|
||||
try:
|
||||
url = fetchUrl(baseUrl, data, _baseUrl, _prevSearch)
|
||||
except ValueError:
|
||||
# no previous link found, try hopping to last comic
|
||||
url = fetchUrl(baseUrl, _lastSearch)
|
||||
if not url:
|
||||
raise ValueError("could not find lastSearch pattern %r in %s" % (_lastSearch.pattern, baseUrl))
|
||||
return url
|
||||
url = fetchUrl(url, _nextSearch)
|
||||
if not url:
|
||||
raise ValueError("could not find nextSearch pattern %r in %s" % (_nextSearch.pattern, url))
|
||||
return url
|
||||
return fetchUrl(baseUrl, data, _baseUrl, _lastSearch)
|
||||
else:
|
||||
data, _baseUrl = getPageContent(url, session=cls.session)
|
||||
return fetchUrl(url, data, _baseUrl, _nextSearch)
|
||||
|
||||
attrs = dict(
|
||||
name='CloneManga/' + name,
|
||||
|
|
|
@ -4,7 +4,7 @@
|
|||
|
||||
from re import compile
|
||||
from ..scraper import make_scraper
|
||||
from ..util import tagre, fetchUrl
|
||||
from ..util import tagre, fetchUrl, getPageContent
|
||||
|
||||
# note: adding the compile() functions inside add() is a major performance hog
|
||||
_imageSearch = compile(tagre("img", "src", r'(http://media\.drunkduck\.com\.s3\.amazonaws\.com:80/[^"]+)', before="page-image"))
|
||||
|
@ -26,17 +26,15 @@ def add(name, path):
|
|||
@classmethod
|
||||
def _starter(cls):
|
||||
# first, try hopping to previous and next comic
|
||||
url = fetchUrl(_url, _prevSearch)
|
||||
if not url:
|
||||
data, baseUrl = getPageContent(_url, session=cls.session)
|
||||
try:
|
||||
url = fetchUrl(_url, data, baseUrl, _prevSearch)
|
||||
except ValueError:
|
||||
# no previous link found, try hopping to last comic
|
||||
url = fetchUrl(_url, _lastSearch)
|
||||
if not url:
|
||||
raise ValueError("could not find lastSearch pattern %r in %s" % (_lastSearch.pattern, _url))
|
||||
return url
|
||||
url = fetchUrl(url, _nextSearch)
|
||||
if not url:
|
||||
raise ValueError("could not find nextSearch pattern %r in %s" % (_nextSearch.pattern, url))
|
||||
return url
|
||||
return fetchUrl(_url, data, baseUrl, _lastSearch)
|
||||
else:
|
||||
data, baseUrl = getPageContent(url, session=cls.session)
|
||||
return fetchUrl(url, data, baseUrl, _nextSearch)
|
||||
|
||||
globals()[classname] = make_scraper(classname,
|
||||
name = 'DrunkDuck/' + name,
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
# Copyright (C) 2012-2013 Bastian Kleineidam
|
||||
from re import compile
|
||||
from ..scraper import make_scraper
|
||||
from ..util import tagre, quote, fetchUrl, case_insensitive_re
|
||||
from ..util import tagre, quote, fetchUrl, case_insensitive_re, getPageContent
|
||||
|
||||
_imageSearch = compile(tagre("img", "src", r'([^"]+)', after='id="comic_image"'))
|
||||
_linkSearch = tagre("a", "href", r'([^>"]*/comics/\d+/[^>"]*)', quote='"?')
|
||||
|
@ -30,15 +30,14 @@ def add(name, url, description, adult, bounce):
|
|||
@classmethod
|
||||
def _starter(cls):
|
||||
"""Get start URL."""
|
||||
url1 = fetchUrl(modifier(url), cls.prevSearch, session=cls.session)
|
||||
if not url1:
|
||||
raise ValueError("could not find prevSearch pattern %r in %s" % (cls.prevSearch.pattern, modifier(url)))
|
||||
url1 = modifier(url)
|
||||
data, baseUrl = getPageContent(url1, session=cls.session)
|
||||
url2 = fetchUrl(url1, data, baseUrl, cls.prevSearch)
|
||||
if bounce:
|
||||
url2 = fetchUrl(modifier(url1), _nextSearch, session=cls.session)
|
||||
if not url2:
|
||||
raise ValueError("could not find nextSearch pattern %r in %s" % (_nextSearch.pattern, modifier(url1)))
|
||||
data, baseUrl = getPageContent(url2, session=cls.session)
|
||||
url3 = fetchUrl(url2, data, baseUrl, _nextSearch)
|
||||
return modifier(url3)
|
||||
return modifier(url2)
|
||||
return modifier(url1)
|
||||
|
||||
@classmethod
|
||||
def namer(cls, imageUrl, pageUrl):
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
# Copyright (C) 2012-2013 Bastian Kleineidam
|
||||
import requests
|
||||
from . import loader
|
||||
from .util import fetchUrls
|
||||
from .util import fetchUrl, fetchUrls, getPageContent
|
||||
from .comic import ComicStrip
|
||||
from .output import out
|
||||
|
||||
|
@ -62,7 +62,8 @@ class _BasicScraper(object):
|
|||
|
||||
def getStrip(self, url):
|
||||
"""Get comic strip for given URL."""
|
||||
imageUrls = fetchUrls(url, self.imageSearch, session=self.session)[0]
|
||||
data, baseUrl = getPageContent(url, session=self.session)
|
||||
imageUrls = set(fetchUrls(url, data, baseUrl, self.imageSearch))
|
||||
if len(imageUrls) > 1 and not self.multipleImagesPerStrip:
|
||||
out.warn("found %d images instead of 1 with %s" % (len(imageUrls), self.imageSearch.pattern))
|
||||
return self.getComicStrip(url, imageUrls)
|
||||
|
@ -97,12 +98,13 @@ class _BasicScraper(object):
|
|||
retrieving the given number of strips."""
|
||||
seen_urls = set()
|
||||
while url:
|
||||
imageUrls, prevUrl = fetchUrls(url, self.imageSearch,
|
||||
self.prevSearch, session=self.session)
|
||||
data, baseUrl = getPageContent(url, session=self.session)
|
||||
imageUrls = set(fetchUrls(url, data, baseUrl, self.imageSearch))
|
||||
yield self.getComicStrip(url, imageUrls)
|
||||
prevUrl = fetchUrl(url, data, baseUrl, self.prevSearch)
|
||||
prevUrl = self.prevUrlModifier(prevUrl)
|
||||
out.debug("Matched previous URL %s" % prevUrl)
|
||||
seen_urls.add(url)
|
||||
yield self.getComicStrip(url, imageUrls)
|
||||
if prevUrl in seen_urls:
|
||||
# avoid recursive URL loops
|
||||
out.warn("Already seen previous URL %r" % prevUrl)
|
||||
|
|
|
@ -99,8 +99,7 @@ def getPageContent(url, max_content_bytes=MaxContentBytes, session=None):
|
|||
"""Get text content of given URL."""
|
||||
check_robotstxt(url)
|
||||
# read page data
|
||||
page = urlopen(url, max_content_bytes=max_content_bytes,
|
||||
session=session)
|
||||
page = urlopen(url, max_content_bytes=max_content_bytes, session=session)
|
||||
data = page.text
|
||||
# determine base URL
|
||||
baseUrl = None
|
||||
|
@ -117,45 +116,23 @@ def getImageObject(url, referrer, max_content_bytes=MaxImageBytes):
|
|||
return urlopen(url, referrer=referrer, max_content_bytes=max_content_bytes)
|
||||
|
||||
|
||||
def fetchUrl(url, urlSearch, session=None):
|
||||
"""Search for given URL pattern in a HTML page."""
|
||||
data, baseUrl = getPageContent(url, session=session)
|
||||
match = urlSearch.search(data)
|
||||
if match:
|
||||
def fetchUrls(url, data, baseUrl, urlSearch):
|
||||
"""Search all entries for given URL pattern in a HTML page."""
|
||||
searchUrls = []
|
||||
for match in urlSearch.finditer(data):
|
||||
searchUrl = match.group(1)
|
||||
if not searchUrl:
|
||||
raise ValueError("Match empty URL at %s with pattern %s" % (url, urlSearch.pattern))
|
||||
out.debug('matched URL %r' % searchUrl)
|
||||
return normaliseURL(urlparse.urljoin(baseUrl, searchUrl))
|
||||
return None
|
||||
raise ValueError("Pattern %s matched empty URL at %s." % (urlSearch.pattern, url))
|
||||
out.debug('matched URL %r with pattern %s' % (searchUrl, urlSearch.pattern))
|
||||
searchUrls.append(normaliseURL(urlparse.urljoin(baseUrl, searchUrl)))
|
||||
if not searchUrls:
|
||||
raise ValueError("Pattern %s not found at URL %s with data %r." % (urlSearch.pattern, url, data))
|
||||
return searchUrls
|
||||
|
||||
|
||||
def fetchUrls(url, imageSearch, prevSearch=None, session=None):
|
||||
"""Search for given image and previous URL pattern in a HTML page."""
|
||||
data, baseUrl = getPageContent(url, session=session)
|
||||
# match images
|
||||
imageUrls = set()
|
||||
for match in imageSearch.finditer(data):
|
||||
imageUrl = match.group(1)
|
||||
if not imageUrl:
|
||||
raise ValueError("Match empty image URL at %s with pattern %s" % (url, imageSearch.pattern))
|
||||
out.debug('matched image URL %r with pattern %s' % (imageUrl, imageSearch.pattern))
|
||||
imageUrls.add(normaliseURL(urlparse.urljoin(baseUrl, imageUrl)))
|
||||
if not imageUrls:
|
||||
out.warn("no images found at %s with pattern %s" % (url, imageSearch.pattern))
|
||||
if prevSearch is not None:
|
||||
# match previous URL
|
||||
match = prevSearch.search(data)
|
||||
if match:
|
||||
prevUrl = match.group(1)
|
||||
if not prevUrl:
|
||||
raise ValueError("Match empty previous URL at %s with pattern %s" % (url, prevSearch.pattern))
|
||||
prevUrl = normaliseURL(urlparse.urljoin(baseUrl, prevUrl))
|
||||
else:
|
||||
out.debug('no previous URL %s at %s' % (prevSearch.pattern, url))
|
||||
prevUrl = None
|
||||
return imageUrls, prevUrl
|
||||
return imageUrls, None
|
||||
def fetchUrl(url, data, baseUrl, urlSearch):
|
||||
"""Search first entry for given URL pattern in a HTML page."""
|
||||
return fetchUrls(url, data, baseUrl, urlSearch)[0]
|
||||
|
||||
|
||||
def unescape(text):
|
||||
|
|
Loading…
Reference in a new issue