From 0e03eca8f0c58c4fd516fa36e89661833f609b57 Mon Sep 17 00:00:00 2001
From: Tobias Gruetzmacher <tobias-git@23.gs>
Date: Wed, 23 Jul 2014 20:53:59 +0200
Subject: [PATCH] Move all regular expression operations into the new class. -
 Move fetchUrls, fetchUrl and fetchText. - Move base URL handling.

---
 dosagelib/scraper.py | 49 ++++++++++++++++++++++++++++++++++++++----
 dosagelib/util.py    | 51 +++-----------------------------------------
 2 files changed, 48 insertions(+), 52 deletions(-)

diff --git a/dosagelib/scraper.py b/dosagelib/scraper.py
index a246ccae6..ef9541a4e 100644
--- a/dosagelib/scraper.py
+++ b/dosagelib/scraper.py
@@ -5,8 +5,14 @@ import requests
 import time
 import random
 import os
+import re
+try:
+    from urllib.parse import urljoin
+except ImportError:
+    from urlparse import urljoin
 from . import loader, configuration, util
-from .util import (makeSequence, get_system_uid, urlopen, getDirname)
+from .util import (getPageContent, makeSequence, get_system_uid, urlopen,
+        getDirname, unescape, tagre, normaliseURL)
 from .comic import ComicStrip
 from .output import out
 from .events import getHandler
@@ -315,20 +321,55 @@ class _BasicScraper(Scraper):
     any).
     """
 
+    BASE_SEARCH = re.compile(tagre("base", "href", '([^"]*)'))
+
     @classmethod
     def getPage(cls, url):
-        content, baseUrl = util.getPageContent(url, cls.session)
+        content = getPageContent(url, cls.session)
+        # determine base URL
+        baseUrl = None
+        match = cls.BASE_SEARCH.search(content)
+        if match:
+            baseUrl = match.group(1)
+        else:
+            baseUrl = url
         return (content, baseUrl)
 
     @classmethod
     def fetchUrls(cls, url, data, urlSearch):
         """Search all entries for given URL pattern(s) in a HTML page."""
-        return util.fetchUrls(url, data[0], data[1], urlSearch)
+        searchUrls = []
+        searches = makeSequence(urlSearch)
+        for search in searches:
+            for match in search.finditer(data[0]):
+                searchUrl = match.group(1)
+                if not searchUrl:
+                    raise ValueError("Pattern %s matched empty URL at %s." % (search.pattern, url))
+                out.debug(u'matched URL %r with pattern %s' % (searchUrl, search.pattern))
+                searchUrls.append(normaliseURL(urljoin(data[1], searchUrl)))
+            if searchUrls:
+                # do not search other links if one pattern matched
+                break
+        if not searchUrls:
+            patterns = [x.pattern for x in searches]
+            raise ValueError("Patterns %s not found at URL %s." % (patterns, url))
+        return searchUrls
 
     @classmethod
     def fetchText(cls, url, data, textSearch, optional):
         """Search text entry for given text pattern in a HTML page."""
-        return util.fetchText(url, data[0], textSearch, optional)
+        if textSearch:
+            match = textSearch.search(data[0])
+            if match:
+                text = match.group(1)
+                out.debug(u'matched text %r with pattern %s' % (text, textSearch.pattern))
+                return unescape(text).strip()
+            if optional:
+                return None
+            else:
+                raise ValueError("Pattern %s not found at URL %s." % (textSearch.pattern, url))
+        else:
+            return None
 
 
 def find_scraperclasses(comic, multiple_allowed=False):
diff --git a/dosagelib/util.py b/dosagelib/util.py
index d1201806c..d586da69a 100644
--- a/dosagelib/util.py
+++ b/dosagelib/util.py
@@ -7,9 +7,9 @@ try:
 except ImportError:
     from urllib import quote as url_quote, unquote as url_unquote
 try:
-    from urllib.parse import urlparse, urlunparse, urljoin, urlsplit
+    from urllib.parse import urlparse, urlunparse, urlsplit
 except ImportError:
-    from urlparse import urlparse, urlunparse, urljoin, urlsplit
+    from urlparse import urlparse, urlunparse, urlsplit
 try:
     from urllib import robotparser
 except ImportError:
@@ -176,8 +176,6 @@ def case_insensitive_re(name):
     return "".join("[%s%s]" % (c.lower(), c.upper()) for c in name)
 
 
-baseSearch = re.compile(tagre("base", "href", '([^"]*)'))
-
 def isValidPageContent(data):
     """Check if page content is empty or has error messages."""
     # The python requests library sometimes returns empty data.
@@ -203,14 +201,7 @@ def getPageContent(url, session, max_content_bytes=MaxContentBytes):
     if not isValidPageContent(data):
         raise ValueError("Got invalid page content from %s: %r" % (url, data))
     out.debug(u"Got page content %r" % data, level=3)
-    # determine base URL
-    baseUrl = None
-    match = baseSearch.search(data)
-    if match:
-        baseUrl = match.group(1)
-    else:
-        baseUrl = url
-    return data, baseUrl
+    return data
 
 
 def getImageObject(url, referrer, session, max_content_bytes=MaxImageBytes):
@@ -226,42 +217,6 @@ def makeSequence(item):
     return (item,)
 
 
-def fetchUrls(url, data, baseUrl, urlSearch):
-    """Search all entries for given URL pattern(s) in a HTML page."""
-    searchUrls = []
-    searches = makeSequence(urlSearch)
-    for search in searches:
-        for match in search.finditer(data):
-            searchUrl = match.group(1)
-            if not searchUrl:
-                raise ValueError("Pattern %s matched empty URL at %s." % (search.pattern, url))
-            out.debug(u'matched URL %r with pattern %s' % (searchUrl, search.pattern))
-            searchUrls.append(normaliseURL(urljoin(baseUrl, searchUrl)))
-        if searchUrls:
-            # do not search other links if one pattern matched
-            break
-    if not searchUrls:
-        patterns = [x.pattern for x in searches]
-        raise ValueError("Patterns %s not found at URL %s." % (patterns, url))
-    return searchUrls
-
-
-def fetchUrl(url, data, baseUrl, urlSearch):
-    """Search first URL entry for given URL pattern in a HTML page."""
-    return fetchUrls(url, data, baseUrl, urlSearch)[0]
-
-
-def fetchText(url, data, textSearch, optional=False):
-    """Search text entry for given text pattern in a HTML page."""#
-    match = textSearch.search(data)
-    if match:
-        text = match.group(1)
-        out.debug(u'matched text %r with pattern %s' % (text, textSearch.pattern))
-        return text
-    if not optional:
-        raise ValueError("Pattern %s not found at URL %s." % (textSearch.pattern, url))
-
-
 _htmlparser = HTMLParser()
 def unescape(text):
     """Replace HTML entities and character references."""