Move all regular expression operations into the new class.

- Move fetchUrls, fetchUrl and fetchText.
- Move base URL handling.
This commit is contained in:
Tobias Gruetzmacher 2014-07-23 20:53:59 +02:00
parent fde1fdced6
commit 0e03eca8f0
2 changed files with 48 additions and 52 deletions

View file

@ -5,8 +5,14 @@ import requests
import time import time
import random import random
import os import os
import re
try:
from urllib.parse import urljoin
except ImportError:
from urlparse import urljoin
from . import loader, configuration, util from . import loader, configuration, util
from .util import (makeSequence, get_system_uid, urlopen, getDirname) from .util import (getPageContent, makeSequence, get_system_uid, urlopen,
getDirname, unescape, tagre, normaliseURL)
from .comic import ComicStrip from .comic import ComicStrip
from .output import out from .output import out
from .events import getHandler from .events import getHandler
@ -315,20 +321,55 @@ class _BasicScraper(Scraper):
any). any).
""" """
BASE_SEARCH = re.compile(tagre("base", "href", '([^"]*)'))
@classmethod @classmethod
def getPage(cls, url): def getPage(cls, url):
content, baseUrl = util.getPageContent(url, cls.session) content = getPageContent(url, cls.session)
# determine base URL
baseUrl = None
match = cls.BASE_SEARCH.search(content)
if match:
baseUrl = match.group(1)
else:
baseUrl = url
return (content, baseUrl) return (content, baseUrl)
@classmethod @classmethod
def fetchUrls(cls, url, data, urlSearch): def fetchUrls(cls, url, data, urlSearch):
"""Search all entries for given URL pattern(s) in a HTML page.""" """Search all entries for given URL pattern(s) in a HTML page."""
return util.fetchUrls(url, data[0], data[1], urlSearch) searchUrls = []
searches = makeSequence(urlSearch)
for search in searches:
for match in search.finditer(data[0]):
searchUrl = match.group(1)
if not searchUrl:
raise ValueError("Pattern %s matched empty URL at %s." % (search.pattern, url))
out.debug(u'matched URL %r with pattern %s' % (searchUrl, search.pattern))
searchUrls.append(normaliseURL(urljoin(data[1], searchUrl)))
if searchUrls:
# do not search other links if one pattern matched
break
if not searchUrls:
patterns = [x.pattern for x in searches]
raise ValueError("Patterns %s not found at URL %s." % (patterns, url))
return searchUrls
@classmethod @classmethod
def fetchText(cls, url, data, textSearch, optional): def fetchText(cls, url, data, textSearch, optional):
"""Search text entry for given text pattern in a HTML page.""" """Search text entry for given text pattern in a HTML page."""
return util.fetchText(url, data[0], textSearch, optional) if textSearch:
match = textSearch.search(data[0])
if match:
text = match.group(1)
out.debug(u'matched text %r with pattern %s' % (text, textSearch.pattern))
return unescape(text).strip()
if optional:
return None
else:
raise ValueError("Pattern %s not found at URL %s." % (textSearch.pattern, url))
else:
return None
def find_scraperclasses(comic, multiple_allowed=False): def find_scraperclasses(comic, multiple_allowed=False):

View file

@ -7,9 +7,9 @@ try:
except ImportError: except ImportError:
from urllib import quote as url_quote, unquote as url_unquote from urllib import quote as url_quote, unquote as url_unquote
try: try:
from urllib.parse import urlparse, urlunparse, urljoin, urlsplit from urllib.parse import urlparse, urlunparse, urlsplit
except ImportError: except ImportError:
from urlparse import urlparse, urlunparse, urljoin, urlsplit from urlparse import urlparse, urlunparse, urlsplit
try: try:
from urllib import robotparser from urllib import robotparser
except ImportError: except ImportError:
@ -176,8 +176,6 @@ def case_insensitive_re(name):
return "".join("[%s%s]" % (c.lower(), c.upper()) for c in name) return "".join("[%s%s]" % (c.lower(), c.upper()) for c in name)
baseSearch = re.compile(tagre("base", "href", '([^"]*)'))
def isValidPageContent(data): def isValidPageContent(data):
"""Check if page content is empty or has error messages.""" """Check if page content is empty or has error messages."""
# The python requests library sometimes returns empty data. # The python requests library sometimes returns empty data.
@ -203,14 +201,7 @@ def getPageContent(url, session, max_content_bytes=MaxContentBytes):
if not isValidPageContent(data): if not isValidPageContent(data):
raise ValueError("Got invalid page content from %s: %r" % (url, data)) raise ValueError("Got invalid page content from %s: %r" % (url, data))
out.debug(u"Got page content %r" % data, level=3) out.debug(u"Got page content %r" % data, level=3)
# determine base URL return data
baseUrl = None
match = baseSearch.search(data)
if match:
baseUrl = match.group(1)
else:
baseUrl = url
return data, baseUrl
def getImageObject(url, referrer, session, max_content_bytes=MaxImageBytes): def getImageObject(url, referrer, session, max_content_bytes=MaxImageBytes):
@ -226,42 +217,6 @@ def makeSequence(item):
return (item,) return (item,)
def fetchUrls(url, data, baseUrl, urlSearch):
"""Search all entries for given URL pattern(s) in a HTML page."""
searchUrls = []
searches = makeSequence(urlSearch)
for search in searches:
for match in search.finditer(data):
searchUrl = match.group(1)
if not searchUrl:
raise ValueError("Pattern %s matched empty URL at %s." % (search.pattern, url))
out.debug(u'matched URL %r with pattern %s' % (searchUrl, search.pattern))
searchUrls.append(normaliseURL(urljoin(baseUrl, searchUrl)))
if searchUrls:
# do not search other links if one pattern matched
break
if not searchUrls:
patterns = [x.pattern for x in searches]
raise ValueError("Patterns %s not found at URL %s." % (patterns, url))
return searchUrls
def fetchUrl(url, data, baseUrl, urlSearch):
"""Search first URL entry for given URL pattern in a HTML page."""
return fetchUrls(url, data, baseUrl, urlSearch)[0]
def fetchText(url, data, textSearch, optional=False):
"""Search text entry for given text pattern in a HTML page."""#
match = textSearch.search(data)
if match:
text = match.group(1)
out.debug(u'matched text %r with pattern %s' % (text, textSearch.pattern))
return text
if not optional:
raise ValueError("Pattern %s not found at URL %s." % (textSearch.pattern, url))
_htmlparser = HTMLParser() _htmlparser = HTMLParser()
def unescape(text): def unescape(text):
"""Replace HTML entities and character references.""" """Replace HTML entities and character references."""