Fix TheWhiteboard
This commit is contained in:
parent
b055a8574f
commit
ed3acd2d2f
1 changed file with 6 additions and 4 deletions
|
@@ -109,13 +109,15 @@ class TheThinHLine(_TumblrScraper):
|
|||
|
||||
|
||||
class TheWhiteboard(_ParserScraper):
|
||||
BROKEN_PAGE_MIDDLE = compile(r'</body></html><')
|
||||
BROKEN_PAGE_MIDDLE = compile(r'</body></html>\n<')
|
||||
url = 'http://www.the-whiteboard.com/'
|
||||
imageSearch = '//center/img'
|
||||
prevSearch = '//a[text()="previous"]'
|
||||
stripUrl = url + 'auto%s.html'
|
||||
firstStripUrl = stripUrl % 'wb001'
|
||||
imageSearch = '//img[contains(@src, "auto")]'
|
||||
prevSearch = '//a[.//img[contains(@src, "previous")]]'
|
||||
|
||||
# Another ugly hack :(
|
||||
def _parse_page(self, data):
|
||||
# Ugly hack to fix broken HTML
|
||||
data = self.BROKEN_PAGE_MIDDLE.sub('<', data)
|
||||
return super(TheWhiteboard, self)._parse_page(data)
|
||||
|
||||
|
|
Loading…
Reference in a new issue