Move all regular expression operations into the new class.

- Move fetchUrls, fetchUrl and fetchText.
- Move base URL handling.
This commit is contained in:
Tobias Gruetzmacher 2014-07-23 20:53:59 +02:00
parent fde1fdced6
commit 0e03eca8f0
2 changed files with 48 additions and 52 deletions

View file

@ -5,8 +5,14 @@ import requests
import time import time
import random import random
import os import os
import re
try:
from urllib.parse import urljoin
except ImportError:
from urlparse import urljoin
from . import loader, configuration, util from . import loader, configuration, util
from .util import (makeSequence, get_system_uid, urlopen, getDirname) from .util import (getPageContent, makeSequence, get_system_uid, urlopen,
getDirname, unescape, tagre, normaliseURL)
from .comic import ComicStrip from .comic import ComicStrip
from .output import out from .output import out
from .events import getHandler from .events import getHandler
@ -315,20 +321,55 @@ class _BasicScraper(Scraper):
any). any).
""" """
BASE_SEARCH = re.compile(tagre("base", "href", '([^"]*)'))
@classmethod @classmethod
def getPage(cls, url): def getPage(cls, url):
content, baseUrl = util.getPageContent(url, cls.session) content = getPageContent(url, cls.session)
# determine base URL
baseUrl = None
match = cls.BASE_SEARCH.search(content)
if match:
baseUrl = match.group(1)
else:
baseUrl = url
return (content, baseUrl) return (content, baseUrl)
@classmethod @classmethod
def fetchUrls(cls, url, data, urlSearch): def fetchUrls(cls, url, data, urlSearch):
"""Search all entries for given URL pattern(s) in a HTML page.""" """Search all entries for given URL pattern(s) in a HTML page."""
return util.fetchUrls(url, data[0], data[1], urlSearch) searchUrls = []
searches = makeSequence(urlSearch)
for search in searches:
for match in search.finditer(data[0]):
searchUrl = match.group(1)
if not searchUrl:
raise ValueError("Pattern %s matched empty URL at %s." % (search.pattern, url))
out.debug(u'matched URL %r with pattern %s' % (searchUrl, search.pattern))
searchUrls.append(normaliseURL(urljoin(data[1], searchUrl)))
if searchUrls:
# do not search other links if one pattern matched
break
if not searchUrls:
patterns = [x.pattern for x in searches]
raise ValueError("Patterns %s not found at URL %s." % (patterns, url))
return searchUrls
@classmethod @classmethod
def fetchText(cls, url, data, textSearch, optional): def fetchText(cls, url, data, textSearch, optional):
"""Search text entry for given text pattern in a HTML page.""" """Search text entry for given text pattern in a HTML page."""
return util.fetchText(url, data[0], textSearch, optional) if textSearch:
match = textSearch.search(data[0])
if match:
text = match.group(1)
out.debug(u'matched text %r with pattern %s' % (text, textSearch.pattern))
return unescape(text).strip()
if optional:
return None
else:
raise ValueError("Pattern %s not found at URL %s." % (textSearch.pattern, url))
else:
return None
def find_scraperclasses(comic, multiple_allowed=False): def find_scraperclasses(comic, multiple_allowed=False):

View file

@ -7,9 +7,9 @@ try:
except ImportError: except ImportError:
from urllib import quote as url_quote, unquote as url_unquote from urllib import quote as url_quote, unquote as url_unquote
try: try:
from urllib.parse import urlparse, urlunparse, urljoin, urlsplit from urllib.parse import urlparse, urlunparse, urlsplit
except ImportError: except ImportError:
from urlparse import urlparse, urlunparse, urljoin, urlsplit from urlparse import urlparse, urlunparse, urlsplit
try: try:
from urllib import robotparser from urllib import robotparser
except ImportError: except ImportError:
@ -176,8 +176,6 @@ def case_insensitive_re(name):
return "".join("[%s%s]" % (c.lower(), c.upper()) for c in name) return "".join("[%s%s]" % (c.lower(), c.upper()) for c in name)
baseSearch = re.compile(tagre("base", "href", '([^"]*)'))
def isValidPageContent(data): def isValidPageContent(data):
"""Check if page content is empty or has error messages.""" """Check if page content is empty or has error messages."""
# The python requests library sometimes returns empty data. # The python requests library sometimes returns empty data.
@ -203,14 +201,7 @@ def getPageContent(url, session, max_content_bytes=MaxContentBytes):
if not isValidPageContent(data): if not isValidPageContent(data):
raise ValueError("Got invalid page content from %s: %r" % (url, data)) raise ValueError("Got invalid page content from %s: %r" % (url, data))
out.debug(u"Got page content %r" % data, level=3) out.debug(u"Got page content %r" % data, level=3)
# determine base URL return data
baseUrl = None
match = baseSearch.search(data)
if match:
baseUrl = match.group(1)
else:
baseUrl = url
return data, baseUrl
def getImageObject(url, referrer, session, max_content_bytes=MaxImageBytes): def getImageObject(url, referrer, session, max_content_bytes=MaxImageBytes):
@ -226,42 +217,6 @@ def makeSequence(item):
return (item,) return (item,)
def fetchUrls(url, data, baseUrl, urlSearch):
"""Search all entries for given URL pattern(s) in a HTML page."""
searchUrls = []
searches = makeSequence(urlSearch)
for search in searches:
for match in search.finditer(data):
searchUrl = match.group(1)
if not searchUrl:
raise ValueError("Pattern %s matched empty URL at %s." % (search.pattern, url))
out.debug(u'matched URL %r with pattern %s' % (searchUrl, search.pattern))
searchUrls.append(normaliseURL(urljoin(baseUrl, searchUrl)))
if searchUrls:
# do not search other links if one pattern matched
break
if not searchUrls:
patterns = [x.pattern for x in searches]
raise ValueError("Patterns %s not found at URL %s." % (patterns, url))
return searchUrls
def fetchUrl(url, data, baseUrl, urlSearch):
"""Search first URL entry for given URL pattern in a HTML page."""
return fetchUrls(url, data, baseUrl, urlSearch)[0]
def fetchText(url, data, textSearch, optional=False):
"""Search text entry for given text pattern in a HTML page."""#
match = textSearch.search(data)
if match:
text = match.group(1)
out.debug(u'matched text %r with pattern %s' % (text, textSearch.pattern))
return text
if not optional:
raise ValueError("Pattern %s not found at URL %s." % (textSearch.pattern, url))
_htmlparser = HTMLParser() _htmlparser = HTMLParser()
def unescape(text): def unescape(text):
"""Replace HTML entities and character references.""" """Replace HTML entities and character references."""