diff --git a/dosagelib/plugins/creators.py b/dosagelib/plugins/creators.py index f609dc6f5..d90823a36 100644 --- a/dosagelib/plugins/creators.py +++ b/dosagelib/plugins/creators.py @@ -1,17 +1,18 @@ -# -*- coding: iso-8859-1 -*- +# -*- coding: utf-8 -*- # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2012-2014 Bastian Kleineidam -# Copyright (C) 2015 Tobias Gruetzmacher +# Copyright (C) 2015-2016 Tobias Gruetzmacher + +from __future__ import absolute_import, division, print_function -from re import compile from ..scraper import _ParserScraper -from ..util import tagre + class _Creators(_ParserScraper): - url = 'http://www.creators.com/comics/' - imageSearch = '//td/a[@class="z"]' - prevSearch = '//a[contains(@class,"time_l")]' - help = 'Index format: n' + url = 'https://www.creators.com/features/' + imageSearch = '//a[contains(@class,"fancybox")]/img' + prevSearch = '//a[@id="nav_prev"]' + latestSearch = '//div[contains(@class,"caption")]/a' @classmethod def getName(cls): @@ -19,125 +20,156 @@ class _Creators(_ParserScraper): @classmethod def starter(cls): - return cls.url + cls.path + '.html' + start = cls.url + cls.path + data = cls.getPage(start) + return cls.fetchUrl(start, data, cls.latestSearch) - def getIndexStripUrl(self, index): - return self.url + self.path + '/%s.html' % index class _CreatorsEs(_Creators): lang = 'es' - def shouldSkipUrl(self, url, data): - """Images are 404...""" - return url in ( - self.url + 'heathcliff-spanish/139736.html' - ) # Some comics are not listed on the "all" page (too old?) -class WinnieThePooh(_Creators): - path = u'winnie-the-pooh' +class CafeconLeche(_Creators): + path = 'cafe-con-leche' -class Recess(_Creators): - path = u'recess' -class NaturalSelection(_Creators): - path = u'natural-selection' +class DonaldDuck(_Creators): + path = 'donald-duck' + + +class Flare(_Creators): + path = 'flare' + class FlightDeck(_Creators): - path = u'flight-deck' + path = 'flight-deck' -# do not edit anything below since these entries are generated from scripts/update_plugins.sh + +class GirlsAndSports(_Creators): + path = 'girls-and-sports' + + +class GirlsandSportsSpanish(_CreatorsEs): + path = 'girls-and-sports-spanish' + + +class HomeOffice(_Creators): + path = 'stay-at-home-dad' + + +class HopeAndDeath(_Creators): + path = 'hope-and-death' + + +class MickeyMouse(_Creators): + path = 'mickey-mouse' + + +class NaturalSelection(_Creators): + path = 'natural-selection' + + +class OffCenter(_Creators): + path = 'off-center' + + +class Recess(_Creators): + path = 'recess' + + +class Rugrats(_Creators): + path = 'rugrats' + + +class RugratsSpanish(_CreatorsEs): + path = 'rugrats-spanish' + + +class TheQuigmans(_Creators): + path = 'the-quigmans' + + +class WinnieThePooh(_Creators): + path = 'winnie-the-pooh' + + +# do not edit anything below since these entries are generated from +# scripts/update_plugins.sh # DO NOT REMOVE # Agnes has a duplicate in gocomics # AndyCapp has a duplicate in gocomics -class Archie(_Creators): - path = u'archie' +class AndyMarlette(_Creators): + path = 'andy-marlette' + + +class Archie(_Creators): + path = 'archie' + + +class ArchieSpanish(_CreatorsEs): + path = 'archie-spanish' -class ArchieinSpanish(_CreatorsEs): - path = u'archie-spanish' # AskShagg has a duplicate in gocomics # BC has a duplicate in gocomics -class BCinSpanish(_CreatorsEs): - path = u'bc-spanish' - # BallardStreet has a duplicate in gocomics -class CafeconLeche(_Creators): - path = u'cafe-con-leche' - +# BobGorrell has a duplicate in gocomics +# ChipBok has a duplicate in gocomics +# ChrisBritt has a duplicate in gocomics # ChuckleBros has a duplicate in gocomics # DaddysHome has a duplicate in gocomics # DiamondLil has a duplicate in gocomics # DogEatDoug has a duplicate in gocomics -# DogsofCKennel has a duplicate in gocomics -class DonaldDuck(_Creators): - path = u'donald-duck' - -class Doodles(_Creators): - path = u'doodles' - -class Flare(_Creators): - path = u'flare' - -class FlightDeck(_Creators): - path = u'flight-deck' - -# FloandFriends has a duplicate in gocomics +# DogsOfCKennel has a duplicate in gocomics +# FloAndFriends has a duplicate in gocomics # ForHeavensSake has a duplicate in gocomics # FreeRange has a duplicate in gocomics -class GirlsAndSports(_Creators): - path = u'girls-and-sports' - -class GirlsandSportsinSpanish(_CreatorsEs): - path = u'girls-and-sports-spanish' - +# GaryMarkstein has a duplicate in gocomics +# GaryVarvel has a duplicate in gocomics # Heathcliff has a duplicate in gocomics -class HeathcliffinSpanish(_CreatorsEs): - path = u'heathcliff-spanish' +class HeathcliffSpanish(_CreatorsEs): + path = 'heathcliff-spanish' -# HerbandJamaal has a duplicate in gocomics -class HomeOffice(_Creators): - path = u'stay-at-home-dad' - -class HopeAndDeath(_Creators): - path = u'hope-and-death' +# HerbAndJamaal has a duplicate in gocomics +# JohnDeering has a duplicate in gocomics +# KenCatalino has a duplicate in gocomics # LibertyMeadows has a duplicate in gocomics class LongStoryShort(_Creators): - path = u'long-story-short' + path = 'long-story-short' -class MickeyMouse(_Creators): - path = u'mickey-mouse' +# MarshallRamsey has a duplicate in gocomics +# MichaelRamirez has a duplicate in gocomics +# MikeLuckovich has a duplicate in gocomics # Momma has a duplicate in gocomics +class Mossprints(_Creators): + path = 'mossprints' + + # NestHeads has a duplicate in gocomics -class OffCenter(_Creators): - path = u'off-center' - -# OnaClaireDay has a duplicate in gocomics # OneBigHappy has a duplicate in gocomics +# PaulSzep has a duplicate in gocomics # Rubes has a duplicate in gocomics -class Rugrats(_Creators): - path = u'rugrats' - -class RugratsinSpanish(_CreatorsEs): - path = u'rugrats-spanish' - # ScaryGary has a duplicate in gocomics # SpeedBump has a duplicate in gocomics +# SteveBenson has a duplicate in gocomics +# SteveBreen has a duplicate in gocomics +# SteveKelley has a duplicate in gocomics # StrangeBrew has a duplicate in gocomics # TheBarn has a duplicate in gocomics -# TheDinetteSet has a duplicate in gocomics -# TheMeaningofLila has a duplicate in gocomics +# TheMeaningOfLila has a duplicate in gocomics # TheOtherCoast has a duplicate in gocomics -class TheQuigmans(_Creators): - path = u'the-quigmans' +class TomStiglich(_Creators): + path = 'tom-stiglich' -class TheWizardofIdinSpanish(_CreatorsEs): - path = u'wizard-of-id-spanish' -# ThinLines has a duplicate in gocomics # WeePals has a duplicate in gocomics -# WizardofId has a duplicate in gocomics -# WorkingitOut has a duplicate in gocomics +# WizardOfId has a duplicate in gocomics +class WizardOfIdSpanish(_CreatorsEs): + path = 'wizard-of-id-spanish' + + +# WorkingItOut has a duplicate in gocomics # ZackHill has a duplicate in gocomics diff --git a/scripts/creators.py b/scripts/creators.py index 6e7ab816c..e7bfcae02 100755 --- a/scripts/creators.py +++ b/scripts/creators.py @@ -1,55 +1,66 @@ #!/usr/bin/env python +# -*- coding: utf-8 -*- # Copyright (C) 2012-2014 Bastian Kleineidam +# Copyright (C) 2015-2016 Tobias Gruetzmacher """ -Script to get a list of creators.com comics and save the info in a JSON file for further processing. +Script to get a list of creators.com comics and save the info in a JSON file +for further processing. """ -from __future__ import print_function -import re +from __future__ import absolute_import, division, print_function + import codecs import sys import os + import requests -sys.path.append(os.path.join(os.path.dirname(__file__), "..")) -from dosagelib.util import getPageContent, asciify, unescape, tagre +from lxml import html + +sys.path.append(os.path.join(os.path.dirname(__file__), "..")) # noqa +from dosagelib.util import get_page from dosagelib.scraper import get_scraperclasses -from scriptutil import contains_case_insensitive, capfirst, save_result, load_result, truncate_name +from scriptutil import (contains_case_insensitive, save_result, load_result, + truncate_name, format_name) json_file = __file__.replace(".py", ".json") -url_matcher = re.compile(tagre("a", "href", r'/comics/([^/]+)\.html') + r'([^<]+)') - # names of comics to exclude exclude_comics = [ + 'Doodles', # no images ] + def handle_url(url, session, res): - """Parse one search result page.""" + """Parse one listing page.""" print("Parsing", url, file=sys.stderr) try: - data = getPageContent(url, session) + data = html.document_fromstring(get_page(url, session).text) + data.make_links_absolute(url) except IOError as msg: print("ERROR:", msg, file=sys.stderr) return - for match in url_matcher.finditer(data): - url = match.group(1) - name = unescape(match.group(2)) - name = asciify(name.replace('&', 'And').replace('@', 'At')) - name = capfirst(name) + + for comicdiv in data.cssselect('ul.all-test li'): + comiclink = comicdiv.cssselect('a')[0] + comicurl = comiclink.attrib['href'] + name = format_name(comicdiv.cssselect('p strong')[0].text) if name in exclude_comics: continue if contains_case_insensitive(res, name): # we cannot handle two comics that only differ in case - print("INFO: skipping possible duplicate", repr(name), file=sys.stderr) + print("INFO: skipping possible duplicate", repr(name), + file=sys.stderr) continue - res[name] = url + + res[name] = comicurl.rsplit('/', 1)[1] def get_results(): """Parse all search result pages.""" # store info in a dictionary {name -> shortname} res = {} - session = requests.Session() - handle_url('http://www.creators.com/comics/cat-seeall.html', session, res) + sess = requests.Session() + handle_url('https://www.creators.com/categories/comics/all', sess, res) + handle_url('https://www.creators.com/categories/cartoons/all', sess, res) save_result(res, json_file) @@ -68,14 +79,13 @@ def print_results(args): min_comics, filename = args with codecs.open(filename, 'a', 'utf-8') as fp: for name, path in sorted(load_result(json_file).items()): - if name in exclude_comics: - continue lang = 'Es' if name.lower().endswith('spanish') else '' if has_gocomics_comic(name): - fp.write(u'# %s has a duplicate in gocomics\n' % truncate_name(name)) + fp.write(u'# %s has a duplicate in gocomics\n' % + truncate_name(name)) else: - fp.write(u"class %s(_Creators%s):\n path = %r\n\n" % - (truncate_name(name), lang, path)) + fp.write(u"class %s(_Creators%s):\n path = %r\n\n\n" % + (truncate_name(name), lang, path)) if __name__ == '__main__':