Fix creators module.

This commit is contained in:
Tobias Gruetzmacher 2016-04-07 00:20:03 +02:00
parent 8768ff07b6
commit 0033a8046b
2 changed files with 150 additions and 108 deletions

View file

@ -1,17 +1,18 @@
# -*- coding: iso-8859-1 -*- # -*- coding: utf-8 -*-
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs # Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam # Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015 Tobias Gruetzmacher # Copyright (C) 2015-2016 Tobias Gruetzmacher
from __future__ import absolute_import, division, print_function
from re import compile
from ..scraper import _ParserScraper from ..scraper import _ParserScraper
from ..util import tagre
class _Creators(_ParserScraper): class _Creators(_ParserScraper):
url = 'http://www.creators.com/comics/' url = 'https://www.creators.com/features/'
imageSearch = '//td/a[@class="z"]' imageSearch = '//a[contains(@class,"fancybox")]/img'
prevSearch = '//a[contains(@class,"time_l")]' prevSearch = '//a[@id="nav_prev"]'
help = 'Index format: n' latestSearch = '//div[contains(@class,"caption")]/a'
@classmethod @classmethod
def getName(cls): def getName(cls):
@ -19,125 +20,156 @@ class _Creators(_ParserScraper):
@classmethod @classmethod
def starter(cls): def starter(cls):
return cls.url + cls.path + '.html' start = cls.url + cls.path
data = cls.getPage(start)
return cls.fetchUrl(start, data, cls.latestSearch)
# Build the address of a single strip page: the site base URL (self.url),
# followed by this comic's path (self.path) and "/<index>.html".
# Note that "%" binds tighter than "+", so only the '/%s.html' literal is
# formatted with index before the three parts are concatenated.
def getIndexStripUrl(self, index):
return self.url + self.path + '/%s.html' % index
class _CreatorsEs(_Creators): class _CreatorsEs(_Creators):
lang = 'es' lang = 'es'
def shouldSkipUrl(self, url, data):
    """Tell whether a strip page is known to be broken and must be skipped.

    The images on the listed pages are 404, so those pages cannot be
    fetched successfully.

    url is the address of the strip page under consideration. data is
    accepted for interface compatibility and is not used here.

    Returns True only when url is exactly one of the known-broken pages.
    """
    # The trailing comma is essential: without it the parentheses merely
    # group a single string and ``in`` degrades to a substring test, so any
    # fragment of the address (even the empty string) would be skipped.
    return url in (
        self.url + 'heathcliff-spanish/139736.html',
    )
# Some comics are not listed on the "all" page (too old?) # Some comics are not listed on the "all" page (too old?)
class WinnieThePooh(_Creators): class CafeconLeche(_Creators):
path = u'winnie-the-pooh' path = 'cafe-con-leche'
class Recess(_Creators):
path = u'recess'
class NaturalSelection(_Creators): class DonaldDuck(_Creators):
path = u'natural-selection' path = 'donald-duck'
class Flare(_Creators):
path = 'flare'
class FlightDeck(_Creators): class FlightDeck(_Creators):
path = u'flight-deck' path = 'flight-deck'
# do not edit anything below since these entries are generated from scripts/update_plugins.sh
class GirlsAndSports(_Creators):
path = 'girls-and-sports'
class GirlsandSportsSpanish(_CreatorsEs):
path = 'girls-and-sports-spanish'
class HomeOffice(_Creators):
path = 'stay-at-home-dad'
class HopeAndDeath(_Creators):
path = 'hope-and-death'
class MickeyMouse(_Creators):
path = 'mickey-mouse'
class NaturalSelection(_Creators):
path = 'natural-selection'
class OffCenter(_Creators):
path = 'off-center'
class Recess(_Creators):
path = 'recess'
class Rugrats(_Creators):
path = 'rugrats'
class RugratsSpanish(_CreatorsEs):
path = 'rugrats-spanish'
class TheQuigmans(_Creators):
path = 'the-quigmans'
class WinnieThePooh(_Creators):
path = 'winnie-the-pooh'
# do not edit anything below since these entries are generated from
# scripts/update_plugins.sh
# DO NOT REMOVE # DO NOT REMOVE
# Agnes has a duplicate in gocomics # Agnes has a duplicate in gocomics
# AndyCapp has a duplicate in gocomics # AndyCapp has a duplicate in gocomics
class Archie(_Creators): class AndyMarlette(_Creators):
path = u'archie' path = 'andy-marlette'
class Archie(_Creators):
path = 'archie'
class ArchieSpanish(_CreatorsEs):
path = 'archie-spanish'
class ArchieinSpanish(_CreatorsEs):
path = u'archie-spanish'
# AskShagg has a duplicate in gocomics # AskShagg has a duplicate in gocomics
# BC has a duplicate in gocomics # BC has a duplicate in gocomics
class BCinSpanish(_CreatorsEs):
path = u'bc-spanish'
# BallardStreet has a duplicate in gocomics # BallardStreet has a duplicate in gocomics
class CafeconLeche(_Creators): # BobGorrell has a duplicate in gocomics
path = u'cafe-con-leche' # ChipBok has a duplicate in gocomics
# ChrisBritt has a duplicate in gocomics
# ChuckleBros has a duplicate in gocomics # ChuckleBros has a duplicate in gocomics
# DaddysHome has a duplicate in gocomics # DaddysHome has a duplicate in gocomics
# DiamondLil has a duplicate in gocomics # DiamondLil has a duplicate in gocomics
# DogEatDoug has a duplicate in gocomics # DogEatDoug has a duplicate in gocomics
# DogsofCKennel has a duplicate in gocomics # DogsOfCKennel has a duplicate in gocomics
class DonaldDuck(_Creators): # FloAndFriends has a duplicate in gocomics
path = u'donald-duck'
class Doodles(_Creators):
path = u'doodles'
class Flare(_Creators):
path = u'flare'
class FlightDeck(_Creators):
path = u'flight-deck'
# FloandFriends has a duplicate in gocomics
# ForHeavensSake has a duplicate in gocomics # ForHeavensSake has a duplicate in gocomics
# FreeRange has a duplicate in gocomics # FreeRange has a duplicate in gocomics
class GirlsAndSports(_Creators): # GaryMarkstein has a duplicate in gocomics
path = u'girls-and-sports' # GaryVarvel has a duplicate in gocomics
class GirlsandSportsinSpanish(_CreatorsEs):
path = u'girls-and-sports-spanish'
# Heathcliff has a duplicate in gocomics # Heathcliff has a duplicate in gocomics
class HeathcliffinSpanish(_CreatorsEs): class HeathcliffSpanish(_CreatorsEs):
path = u'heathcliff-spanish' path = 'heathcliff-spanish'
# HerbandJamaal has a duplicate in gocomics
class HomeOffice(_Creators):
path = u'stay-at-home-dad'
class HopeAndDeath(_Creators):
path = u'hope-and-death'
# HerbAndJamaal has a duplicate in gocomics
# JohnDeering has a duplicate in gocomics
# KenCatalino has a duplicate in gocomics
# LibertyMeadows has a duplicate in gocomics # LibertyMeadows has a duplicate in gocomics
class LongStoryShort(_Creators): class LongStoryShort(_Creators):
path = u'long-story-short' path = 'long-story-short'
class MickeyMouse(_Creators):
path = u'mickey-mouse'
# MarshallRamsey has a duplicate in gocomics
# MichaelRamirez has a duplicate in gocomics
# MikeLuckovich has a duplicate in gocomics
# Momma has a duplicate in gocomics # Momma has a duplicate in gocomics
class Mossprints(_Creators):
path = 'mossprints'
# NestHeads has a duplicate in gocomics # NestHeads has a duplicate in gocomics
class OffCenter(_Creators):
path = u'off-center'
# OnaClaireDay has a duplicate in gocomics
# OneBigHappy has a duplicate in gocomics # OneBigHappy has a duplicate in gocomics
# PaulSzep has a duplicate in gocomics
# Rubes has a duplicate in gocomics # Rubes has a duplicate in gocomics
class Rugrats(_Creators):
path = u'rugrats'
class RugratsinSpanish(_CreatorsEs):
path = u'rugrats-spanish'
# ScaryGary has a duplicate in gocomics # ScaryGary has a duplicate in gocomics
# SpeedBump has a duplicate in gocomics # SpeedBump has a duplicate in gocomics
# SteveBenson has a duplicate in gocomics
# SteveBreen has a duplicate in gocomics
# SteveKelley has a duplicate in gocomics
# StrangeBrew has a duplicate in gocomics # StrangeBrew has a duplicate in gocomics
# TheBarn has a duplicate in gocomics # TheBarn has a duplicate in gocomics
# TheDinetteSet has a duplicate in gocomics # TheMeaningOfLila has a duplicate in gocomics
# TheMeaningofLila has a duplicate in gocomics
# TheOtherCoast has a duplicate in gocomics # TheOtherCoast has a duplicate in gocomics
class TheQuigmans(_Creators): class TomStiglich(_Creators):
path = u'the-quigmans' path = 'tom-stiglich'
class TheWizardofIdinSpanish(_CreatorsEs):
path = u'wizard-of-id-spanish'
# ThinLines has a duplicate in gocomics
# WeePals has a duplicate in gocomics # WeePals has a duplicate in gocomics
# WizardofId has a duplicate in gocomics # WizardOfId has a duplicate in gocomics
# WorkingitOut has a duplicate in gocomics class WizardOfIdSpanish(_CreatorsEs):
path = 'wizard-of-id-spanish'
# WorkingItOut has a duplicate in gocomics
# ZackHill has a duplicate in gocomics # ZackHill has a duplicate in gocomics

View file

@ -1,55 +1,66 @@
#!/usr/bin/env python #!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright (C) 2012-2014 Bastian Kleineidam # Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2016 Tobias Gruetzmacher
""" """
Script to get a list of creators.com comics and save the info in a JSON file for further processing. Script to get a list of creators.com comics and save the info in a JSON file
for further processing.
""" """
from __future__ import print_function from __future__ import absolute_import, division, print_function
import re
import codecs import codecs
import sys import sys
import os import os
import requests import requests
sys.path.append(os.path.join(os.path.dirname(__file__), "..")) from lxml import html
from dosagelib.util import getPageContent, asciify, unescape, tagre
sys.path.append(os.path.join(os.path.dirname(__file__), "..")) # noqa
from dosagelib.util import get_page
from dosagelib.scraper import get_scraperclasses from dosagelib.scraper import get_scraperclasses
from scriptutil import contains_case_insensitive, capfirst, save_result, load_result, truncate_name from scriptutil import (contains_case_insensitive, save_result, load_result,
truncate_name, format_name)
json_file = __file__.replace(".py", ".json") json_file = __file__.replace(".py", ".json")
url_matcher = re.compile(tagre("a", "href", r'/comics/([^/]+)\.html') + r'<strong>([^<]+)</strong>')
# names of comics to exclude # names of comics to exclude
exclude_comics = [ exclude_comics = [
'Doodles', # no images
] ]
def handle_url(url, session, res): def handle_url(url, session, res):
"""Parse one search result page.""" """Parse one listing page."""
print("Parsing", url, file=sys.stderr) print("Parsing", url, file=sys.stderr)
try: try:
data = getPageContent(url, session) data = html.document_fromstring(get_page(url, session).text)
data.make_links_absolute(url)
except IOError as msg: except IOError as msg:
print("ERROR:", msg, file=sys.stderr) print("ERROR:", msg, file=sys.stderr)
return return
for match in url_matcher.finditer(data):
url = match.group(1) for comicdiv in data.cssselect('ul.all-test li'):
name = unescape(match.group(2)) comiclink = comicdiv.cssselect('a')[0]
name = asciify(name.replace('&', 'And').replace('@', 'At')) comicurl = comiclink.attrib['href']
name = capfirst(name) name = format_name(comicdiv.cssselect('p strong')[0].text)
if name in exclude_comics: if name in exclude_comics:
continue continue
if contains_case_insensitive(res, name): if contains_case_insensitive(res, name):
# we cannot handle two comics that only differ in case # we cannot handle two comics that only differ in case
print("INFO: skipping possible duplicate", repr(name), file=sys.stderr) print("INFO: skipping possible duplicate", repr(name),
file=sys.stderr)
continue continue
res[name] = url
res[name] = comicurl.rsplit('/', 1)[1]
def get_results(): def get_results():
"""Parse all search result pages.""" """Parse all search result pages."""
# store info in a dictionary {name -> shortname} # store info in a dictionary {name -> shortname}
res = {} res = {}
session = requests.Session() sess = requests.Session()
handle_url('http://www.creators.com/comics/cat-seeall.html', session, res) handle_url('https://www.creators.com/categories/comics/all', sess, res)
handle_url('https://www.creators.com/categories/cartoons/all', sess, res)
save_result(res, json_file) save_result(res, json_file)
@ -68,13 +79,12 @@ def print_results(args):
min_comics, filename = args min_comics, filename = args
with codecs.open(filename, 'a', 'utf-8') as fp: with codecs.open(filename, 'a', 'utf-8') as fp:
for name, path in sorted(load_result(json_file).items()): for name, path in sorted(load_result(json_file).items()):
if name in exclude_comics:
continue
lang = 'Es' if name.lower().endswith('spanish') else '' lang = 'Es' if name.lower().endswith('spanish') else ''
if has_gocomics_comic(name): if has_gocomics_comic(name):
fp.write(u'# %s has a duplicate in gocomics\n' % truncate_name(name)) fp.write(u'# %s has a duplicate in gocomics\n' %
truncate_name(name))
else: else:
fp.write(u"class %s(_Creators%s):\n path = %r\n\n" % fp.write(u"class %s(_Creators%s):\n path = %r\n\n\n" %
(truncate_name(name), lang, path)) (truncate_name(name), lang, path))