Fix TheWhiteboard
This commit is contained in:
parent
b055a8574f
commit
ed3acd2d2f
1 changed file with 6 additions and 4 deletions
|
@@ -109,13 +109,15 @@ class TheThinHLine(_TumblrScraper):
|
||||||
|
|
||||||
|
|
||||||
class TheWhiteboard(_ParserScraper):
|
class TheWhiteboard(_ParserScraper):
|
||||||
BROKEN_PAGE_MIDDLE = compile(r'</body></html><')
|
BROKEN_PAGE_MIDDLE = compile(r'</body></html>\n<')
|
||||||
url = 'http://www.the-whiteboard.com/'
|
url = 'http://www.the-whiteboard.com/'
|
||||||
imageSearch = '//center/img'
|
stripUrl = url + 'auto%s.html'
|
||||||
prevSearch = '//a[text()="previous"]'
|
firstStripUrl = stripUrl % 'wb001'
|
||||||
|
imageSearch = '//img[contains(@src, "auto")]'
|
||||||
|
prevSearch = '//a[.//img[contains(@src, "previous")]]'
|
||||||
|
|
||||||
# Another ugly hack :(
|
|
||||||
def _parse_page(self, data):
|
def _parse_page(self, data):
|
||||||
|
# Ugly hack to fix broken HTML
|
||||||
data = self.BROKEN_PAGE_MIDDLE.sub('<', data)
|
data = self.BROKEN_PAGE_MIDDLE.sub('<', data)
|
||||||
return super(TheWhiteboard, self)._parse_page(data)
|
return super(TheWhiteboard, self)._parse_page(data)
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue