From 8768ff07b6b71fb26647aea04052892d712dc27f Mon Sep 17 00:00:00 2001 From: Tobias Gruetzmacher Date: Wed, 6 Apr 2016 22:22:22 +0200 Subject: [PATCH] Fix AhoiPolloi, be a bit smarter about encoding. HTML character encoding in the context of HTTP is quite tricky to get right and honestly, I'm not sure if I did get it right this time. But I think, the current behaviour matches best what web browsers try to do: 1. Let Requests figure out the content from the HTTP header. This overrides everything else. We need to "trick" LXML to accept our decision if the document contains an XML declaration which might disagree with the HTTP header. 2. If the HTTP headers don't specify any encoding, let LXML guess the encoding and be done with it. --- dosagelib/plugins/a.py | 12 +++--------- dosagelib/scraper.py | 22 ++++++++++++++++++---- dosagelib/util.py | 17 ++++++++--------- 3 files changed, 29 insertions(+), 22 deletions(-) diff --git a/dosagelib/plugins/a.py b/dosagelib/plugins/a.py index 09822595e..855e97a64 100644 --- a/dosagelib/plugins/a.py +++ b/dosagelib/plugins/a.py @@ -88,22 +88,16 @@ class AGirlAndHerFed(_BasicScraper): help = 'Index format: nnn' -class AhoiPolloi(_BasicScraper): +class AhoiPolloi(_ParserScraper): url = 'http://ahoipolloi.blogger.de/' stripUrl = url + '?day=%s' firstStripUrl = stripUrl % '20060306' multipleImagesPerStrip = True lang = 'de' - imageSearch = compile(tagre('img', 'src', - r'(/static/antville/ahoipolloi/images/[^"]+)')) - prevSearch = compile(tagre('a', 'href', - r'(http://ahoipolloi\.blogger\.de/\?day=\d+)')) + imageSearch = '//img[contains(@src, "/static/antville/ahoipolloi/")]' + prevSearch = '//a[contains(@href, "/?day=")]' help = 'Index format: yyyymmdd' - @classmethod - def namer(cls, imageUrl, pageUrl): - return imageUrl.rsplit('/', 1)[1] - class AhoyEarth(_ParserScraper): url = 'http://www.ahoyearth.com/' diff --git a/dosagelib/scraper.py b/dosagelib/scraper.py index 171d13e62..d3d6eed99 100644 --- a/dosagelib/scraper.py +++ 
b/dosagelib/scraper.py @@ -29,8 +29,8 @@ except ImportError: pycountry = None from . import loader, configuration, languages -from .util import (getPageContent, makeSequence, get_system_uid, urlopen, - getDirname, unescape, tagre, normaliseURL, +from .util import (get_page, makeSequence, get_system_uid, urlopen, getDirname, + unescape, tagre, normaliseURL, prettyMatcherList, requests_session) from .comic import ComicStrip from .output import out @@ -361,7 +361,7 @@ class _BasicScraper(Scraper): @classmethod def getPage(cls, url): - content = getPageContent(url, cls.session) + content = get_page(url, cls.session).text # determine base URL baseUrl = None match = cls.BASE_SEARCH.search(content) @@ -430,13 +430,27 @@ class _ParserScraper(Scraper): of the HTML element and returns that. """ + # Taken directly from LXML + XML_DECL = re.compile( + r'^(<\?xml[^>]+)\s+encoding\s*=\s*["\'][^"\']*["\'](\s*\?>|)', re.U) + # Switch between CSS and XPath selectors for this class. Since CSS needs + # another Python module, XPath is the default for now. css = False @classmethod def getPage(cls, url): - tree = html.document_fromstring(getPageContent(url, cls.session)) + page = get_page(url, cls.session) + if page.encoding: + # Requests figured out the encoding, so we can deliver Unicode to + # LXML. Unfortunately, LXML feels betrayed if there is still an XML + # declaration with (probably wrong!) encoding at the top of the + # document. Web browsers ignore such if the encoding was specified + # in the HTTP header and so do we. 
+ text = cls.XML_DECL.sub(r'\1\2', page.text, count=1) + tree = html.document_fromstring(text) + else: + tree = html.document_fromstring(page.content) tree.make_links_absolute(url) return tree diff --git a/dosagelib/util.py b/dosagelib/util.py index 6968dc1dc..f29fba554 100644 --- a/dosagelib/util.py +++ b/dosagelib/util.py @@ -185,14 +185,13 @@ def case_insensitive_re(name): return "".join("[%s%s]" % (c.lower(), c.upper()) for c in name) -def getPageContent(url, session, max_content_bytes=MaxContentBytes): +def get_page(url, session, max_content_bytes=MaxContentBytes): """Get text content of given URL.""" check_robotstxt(url, session) # read page data page = urlopen(url, session, max_content_bytes=max_content_bytes) - data = page.text - out.debug(u"Got page content %r" % data, level=3) - return data + out.debug(u"Got page content %r" % page.content, level=3) + return page def getImageObject(url, referrer, session, max_content_bytes=MaxImageBytes): @@ -437,7 +436,7 @@ def strtimezone(): zone = time.altzone else: zone = time.timezone - return "%+04d" % (-zone//3600) + return "%+04d" % (-zone // 3600) def rfc822date(indate): @@ -477,12 +476,12 @@ def strsize(b): if b < 1024 * 1024: return "%.2fKB" % (float(b) / 1024) if b < 1024 * 1024 * 10: - return "%.2fMB" % (float(b) / (1024*1024)) + return "%.2fMB" % (float(b) / (1024 * 1024)) if b < 1024 * 1024 * 1024: - return "%.1fMB" % (float(b) / (1024*1024)) + return "%.1fMB" % (float(b) / (1024 * 1024)) if b < 1024 * 1024 * 1024 * 10: - return "%.2fGB" % (float(b) / (1024*1024*1024)) - return "%.1fGB" % (float(b) / (1024*1024*1024)) + return "%.2fGB" % (float(b) / (1024 * 1024 * 1024)) + return "%.1fGB" % (float(b) / (1024 * 1024 * 1024)) def getDirname(name):