From 0e03eca8f0c58c4fd516fa36e89661833f609b57 Mon Sep 17 00:00:00 2001 From: Tobias Gruetzmacher Date: Wed, 23 Jul 2014 20:53:59 +0200 Subject: [PATCH] Move all regular expression operations into the new class. - Move fetchUrls, fetchUrl and fetchText. - Move base URL handling. --- dosagelib/scraper.py | 49 ++++++++++++++++++++++++++++++++++++++---- dosagelib/util.py | 51 +++----------------------------------------- 2 files changed, 48 insertions(+), 52 deletions(-) diff --git a/dosagelib/scraper.py b/dosagelib/scraper.py index a246ccae6..ef9541a4e 100644 --- a/dosagelib/scraper.py +++ b/dosagelib/scraper.py @@ -5,8 +5,14 @@ import requests import time import random import os +import re +try: + from urllib.parse import urljoin +except ImportError: + from urlparse import urljoin from . import loader, configuration, util -from .util import (makeSequence, get_system_uid, urlopen, getDirname) +from .util import (getPageContent, makeSequence, get_system_uid, urlopen, + getDirname, unescape, tagre, normaliseURL) from .comic import ComicStrip from .output import out from .events import getHandler @@ -315,20 +321,55 @@ class _BasicScraper(Scraper): any). """ + BASE_SEARCH = re.compile(tagre("base", "href", '([^"]*)')) + @classmethod def getPage(cls, url): - content, baseUrl = util.getPageContent(url, cls.session) + content = getPageContent(url, cls.session) + # determine base URL + baseUrl = None + match = cls.BASE_SEARCH.search(content) + if match: + baseUrl = match.group(1) + else: + baseUrl = url return (content, baseUrl) @classmethod def fetchUrls(cls, url, data, urlSearch): """Search all entries for given URL pattern(s) in a HTML page.""" - return util.fetchUrls(url, data[0], data[1], urlSearch) + searchUrls = [] + searches = makeSequence(urlSearch) + for search in searches: + for match in search.finditer(data[0]): + searchUrl = match.group(1) + if not searchUrl: + raise ValueError("Pattern %s matched empty URL at %s." 
% (search.pattern, url)) + out.debug(u'matched URL %r with pattern %s' % (searchUrl, search.pattern)) + searchUrls.append(normaliseURL(urljoin(data[1], searchUrl))) + if searchUrls: + # do not search other links if one pattern matched + break + if not searchUrls: + patterns = [x.pattern for x in searches] + raise ValueError("Patterns %s not found at URL %s." % (patterns, url)) + return searchUrls @classmethod def fetchText(cls, url, data, textSearch, optional): """Search text entry for given text pattern in a HTML page.""" - return util.fetchText(url, data[0], textSearch, optional) + if textSearch: + match = textSearch.search(data[0]) + if match: + text = match.group(1) + out.debug(u'matched text %r with pattern %s' % (text, textSearch.pattern)) + return unescape(text).strip() + if optional: + return None + else: + raise ValueError("Pattern %s not found at URL %s." % (textSearch.pattern, url)) + else: + return None def find_scraperclasses(comic, multiple_allowed=False): diff --git a/dosagelib/util.py b/dosagelib/util.py index d1201806c..d586da69a 100644 --- a/dosagelib/util.py +++ b/dosagelib/util.py @@ -7,9 +7,9 @@ try: except ImportError: from urllib import quote as url_quote, unquote as url_unquote try: - from urllib.parse import urlparse, urlunparse, urljoin, urlsplit + from urllib.parse import urlparse, urlunparse, urlsplit except ImportError: - from urlparse import urlparse, urlunparse, urljoin, urlsplit + from urlparse import urlparse, urlunparse, urlsplit try: from urllib import robotparser except ImportError: @@ -176,8 +176,6 @@ def case_insensitive_re(name): return "".join("[%s%s]" % (c.lower(), c.upper()) for c in name) -baseSearch = re.compile(tagre("base", "href", '([^"]*)')) - def isValidPageContent(data): """Check if page content is empty or has error messages.""" # The python requests library sometimes returns empty data. 
@@ -203,14 +201,7 @@ def getPageContent(url, session, max_content_bytes=MaxContentBytes): if not isValidPageContent(data): raise ValueError("Got invalid page content from %s: %r" % (url, data)) out.debug(u"Got page content %r" % data, level=3) - # determine base URL - baseUrl = None - match = baseSearch.search(data) - if match: - baseUrl = match.group(1) - else: - baseUrl = url - return data, baseUrl + return data def getImageObject(url, referrer, session, max_content_bytes=MaxImageBytes): @@ -226,42 +217,6 @@ def makeSequence(item): return (item,) -def fetchUrls(url, data, baseUrl, urlSearch): - """Search all entries for given URL pattern(s) in a HTML page.""" - searchUrls = [] - searches = makeSequence(urlSearch) - for search in searches: - for match in search.finditer(data): - searchUrl = match.group(1) - if not searchUrl: - raise ValueError("Pattern %s matched empty URL at %s." % (search.pattern, url)) - out.debug(u'matched URL %r with pattern %s' % (searchUrl, search.pattern)) - searchUrls.append(normaliseURL(urljoin(baseUrl, searchUrl))) - if searchUrls: - # do not search other links if one pattern matched - break - if not searchUrls: - patterns = [x.pattern for x in searches] - raise ValueError("Patterns %s not found at URL %s." % (patterns, url)) - return searchUrls - - -def fetchUrl(url, data, baseUrl, urlSearch): - """Search first URL entry for given URL pattern in a HTML page.""" - return fetchUrls(url, data, baseUrl, urlSearch)[0] - - -def fetchText(url, data, textSearch, optional=False): - """Search text entry for given text pattern in a HTML page."""# - match = textSearch.search(data) - if match: - text = match.group(1) - out.debug(u'matched text %r with pattern %s' % (text, textSearch.pattern)) - return text - if not optional: - raise ValueError("Pattern %s not found at URL %s." % (textSearch.pattern, url)) - - _htmlparser = HTMLParser() def unescape(text): """Replace HTML entities and character references."""