Fix creators module.
This commit is contained in:
parent
8768ff07b6
commit
0033a8046b
2 changed files with 150 additions and 108 deletions
|
@ -1,17 +1,18 @@
|
||||||
# -*- coding: iso-8859-1 -*-
|
# -*- coding: utf-8 -*-
|
||||||
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
|
||||||
# Copyright (C) 2012-2014 Bastian Kleineidam
|
# Copyright (C) 2012-2014 Bastian Kleineidam
|
||||||
# Copyright (C) 2015 Tobias Gruetzmacher
|
# Copyright (C) 2015-2016 Tobias Gruetzmacher
|
||||||
|
|
||||||
|
from __future__ import absolute_import, division, print_function
|
||||||
|
|
||||||
from re import compile
|
|
||||||
from ..scraper import _ParserScraper
|
from ..scraper import _ParserScraper
|
||||||
from ..util import tagre
|
|
||||||
|
|
||||||
class _Creators(_ParserScraper):
|
class _Creators(_ParserScraper):
|
||||||
url = 'http://www.creators.com/comics/'
|
url = 'https://www.creators.com/features/'
|
||||||
imageSearch = '//td/a[@class="z"]'
|
imageSearch = '//a[contains(@class,"fancybox")]/img'
|
||||||
prevSearch = '//a[contains(@class,"time_l")]'
|
prevSearch = '//a[@id="nav_prev"]'
|
||||||
help = 'Index format: n'
|
latestSearch = '//div[contains(@class,"caption")]/a'
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def getName(cls):
|
def getName(cls):
|
||||||
|
@ -19,125 +20,156 @@ class _Creators(_ParserScraper):
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def starter(cls):
|
def starter(cls):
|
||||||
return cls.url + cls.path + '.html'
|
start = cls.url + cls.path
|
||||||
|
data = cls.getPage(start)
|
||||||
|
return cls.fetchUrl(start, data, cls.latestSearch)
|
||||||
|
|
||||||
def getIndexStripUrl(self, index):
|
|
||||||
return self.url + self.path + '/%s.html' % index
|
|
||||||
|
|
||||||
class _CreatorsEs(_Creators):
|
class _CreatorsEs(_Creators):
|
||||||
lang = 'es'
|
lang = 'es'
|
||||||
|
|
||||||
def shouldSkipUrl(self, url, data):
|
|
||||||
"""Images are 404..."""
|
|
||||||
return url in (
|
|
||||||
self.url + 'heathcliff-spanish/139736.html'
|
|
||||||
)
|
|
||||||
|
|
||||||
# Some comics are not listed on the "all" page (too old?)
|
# Some comics are not listed on the "all" page (too old?)
|
||||||
class WinnieThePooh(_Creators):
|
class CafeconLeche(_Creators):
|
||||||
path = u'winnie-the-pooh'
|
path = 'cafe-con-leche'
|
||||||
|
|
||||||
class Recess(_Creators):
|
|
||||||
path = u'recess'
|
|
||||||
|
|
||||||
class NaturalSelection(_Creators):
|
class DonaldDuck(_Creators):
|
||||||
path = u'natural-selection'
|
path = 'donald-duck'
|
||||||
|
|
||||||
|
|
||||||
|
class Flare(_Creators):
|
||||||
|
path = 'flare'
|
||||||
|
|
||||||
|
|
||||||
class FlightDeck(_Creators):
|
class FlightDeck(_Creators):
|
||||||
path = u'flight-deck'
|
path = 'flight-deck'
|
||||||
|
|
||||||
# do not edit anything below since these entries are generated from scripts/update_plugins.sh
|
|
||||||
|
class GirlsAndSports(_Creators):
|
||||||
|
path = 'girls-and-sports'
|
||||||
|
|
||||||
|
|
||||||
|
class GirlsandSportsSpanish(_CreatorsEs):
|
||||||
|
path = 'girls-and-sports-spanish'
|
||||||
|
|
||||||
|
|
||||||
|
class HomeOffice(_Creators):
|
||||||
|
path = 'stay-at-home-dad'
|
||||||
|
|
||||||
|
|
||||||
|
class HopeAndDeath(_Creators):
|
||||||
|
path = 'hope-and-death'
|
||||||
|
|
||||||
|
|
||||||
|
class MickeyMouse(_Creators):
|
||||||
|
path = 'mickey-mouse'
|
||||||
|
|
||||||
|
|
||||||
|
class NaturalSelection(_Creators):
|
||||||
|
path = 'natural-selection'
|
||||||
|
|
||||||
|
|
||||||
|
class OffCenter(_Creators):
|
||||||
|
path = 'off-center'
|
||||||
|
|
||||||
|
|
||||||
|
class Recess(_Creators):
|
||||||
|
path = 'recess'
|
||||||
|
|
||||||
|
|
||||||
|
class Rugrats(_Creators):
|
||||||
|
path = 'rugrats'
|
||||||
|
|
||||||
|
|
||||||
|
class RugratsSpanish(_CreatorsEs):
|
||||||
|
path = 'rugrats-spanish'
|
||||||
|
|
||||||
|
|
||||||
|
class TheQuigmans(_Creators):
|
||||||
|
path = 'the-quigmans'
|
||||||
|
|
||||||
|
|
||||||
|
class WinnieThePooh(_Creators):
|
||||||
|
path = 'winnie-the-pooh'
|
||||||
|
|
||||||
|
|
||||||
|
# do not edit anything below since these entries are generated from
|
||||||
|
# scripts/update_plugins.sh
|
||||||
# DO NOT REMOVE
|
# DO NOT REMOVE
|
||||||
# Agnes has a duplicate in gocomics
|
# Agnes has a duplicate in gocomics
|
||||||
# AndyCapp has a duplicate in gocomics
|
# AndyCapp has a duplicate in gocomics
|
||||||
class Archie(_Creators):
|
class AndyMarlette(_Creators):
|
||||||
path = u'archie'
|
path = 'andy-marlette'
|
||||||
|
|
||||||
|
|
||||||
|
class Archie(_Creators):
|
||||||
|
path = 'archie'
|
||||||
|
|
||||||
|
|
||||||
|
class ArchieSpanish(_CreatorsEs):
|
||||||
|
path = 'archie-spanish'
|
||||||
|
|
||||||
class ArchieinSpanish(_CreatorsEs):
|
|
||||||
path = u'archie-spanish'
|
|
||||||
|
|
||||||
# AskShagg has a duplicate in gocomics
|
# AskShagg has a duplicate in gocomics
|
||||||
# BC has a duplicate in gocomics
|
# BC has a duplicate in gocomics
|
||||||
class BCinSpanish(_CreatorsEs):
|
|
||||||
path = u'bc-spanish'
|
|
||||||
|
|
||||||
# BallardStreet has a duplicate in gocomics
|
# BallardStreet has a duplicate in gocomics
|
||||||
class CafeconLeche(_Creators):
|
# BobGorrell has a duplicate in gocomics
|
||||||
path = u'cafe-con-leche'
|
# ChipBok has a duplicate in gocomics
|
||||||
|
# ChrisBritt has a duplicate in gocomics
|
||||||
# ChuckleBros has a duplicate in gocomics
|
# ChuckleBros has a duplicate in gocomics
|
||||||
# DaddysHome has a duplicate in gocomics
|
# DaddysHome has a duplicate in gocomics
|
||||||
# DiamondLil has a duplicate in gocomics
|
# DiamondLil has a duplicate in gocomics
|
||||||
# DogEatDoug has a duplicate in gocomics
|
# DogEatDoug has a duplicate in gocomics
|
||||||
# DogsofCKennel has a duplicate in gocomics
|
# DogsOfCKennel has a duplicate in gocomics
|
||||||
class DonaldDuck(_Creators):
|
# FloAndFriends has a duplicate in gocomics
|
||||||
path = u'donald-duck'
|
|
||||||
|
|
||||||
class Doodles(_Creators):
|
|
||||||
path = u'doodles'
|
|
||||||
|
|
||||||
class Flare(_Creators):
|
|
||||||
path = u'flare'
|
|
||||||
|
|
||||||
class FlightDeck(_Creators):
|
|
||||||
path = u'flight-deck'
|
|
||||||
|
|
||||||
# FloandFriends has a duplicate in gocomics
|
|
||||||
# ForHeavensSake has a duplicate in gocomics
|
# ForHeavensSake has a duplicate in gocomics
|
||||||
# FreeRange has a duplicate in gocomics
|
# FreeRange has a duplicate in gocomics
|
||||||
class GirlsAndSports(_Creators):
|
# GaryMarkstein has a duplicate in gocomics
|
||||||
path = u'girls-and-sports'
|
# GaryVarvel has a duplicate in gocomics
|
||||||
|
|
||||||
class GirlsandSportsinSpanish(_CreatorsEs):
|
|
||||||
path = u'girls-and-sports-spanish'
|
|
||||||
|
|
||||||
# Heathcliff has a duplicate in gocomics
|
# Heathcliff has a duplicate in gocomics
|
||||||
class HeathcliffinSpanish(_CreatorsEs):
|
class HeathcliffSpanish(_CreatorsEs):
|
||||||
path = u'heathcliff-spanish'
|
path = 'heathcliff-spanish'
|
||||||
|
|
||||||
# HerbandJamaal has a duplicate in gocomics
|
|
||||||
class HomeOffice(_Creators):
|
|
||||||
path = u'stay-at-home-dad'
|
|
||||||
|
|
||||||
class HopeAndDeath(_Creators):
|
|
||||||
path = u'hope-and-death'
|
|
||||||
|
|
||||||
|
# HerbAndJamaal has a duplicate in gocomics
|
||||||
|
# JohnDeering has a duplicate in gocomics
|
||||||
|
# KenCatalino has a duplicate in gocomics
|
||||||
# LibertyMeadows has a duplicate in gocomics
|
# LibertyMeadows has a duplicate in gocomics
|
||||||
class LongStoryShort(_Creators):
|
class LongStoryShort(_Creators):
|
||||||
path = u'long-story-short'
|
path = 'long-story-short'
|
||||||
|
|
||||||
class MickeyMouse(_Creators):
|
|
||||||
path = u'mickey-mouse'
|
|
||||||
|
|
||||||
|
# MarshallRamsey has a duplicate in gocomics
|
||||||
|
# MichaelRamirez has a duplicate in gocomics
|
||||||
|
# MikeLuckovich has a duplicate in gocomics
|
||||||
# Momma has a duplicate in gocomics
|
# Momma has a duplicate in gocomics
|
||||||
|
class Mossprints(_Creators):
|
||||||
|
path = 'mossprints'
|
||||||
|
|
||||||
|
|
||||||
# NestHeads has a duplicate in gocomics
|
# NestHeads has a duplicate in gocomics
|
||||||
class OffCenter(_Creators):
|
|
||||||
path = u'off-center'
|
|
||||||
|
|
||||||
# OnaClaireDay has a duplicate in gocomics
|
|
||||||
# OneBigHappy has a duplicate in gocomics
|
# OneBigHappy has a duplicate in gocomics
|
||||||
|
# PaulSzep has a duplicate in gocomics
|
||||||
# Rubes has a duplicate in gocomics
|
# Rubes has a duplicate in gocomics
|
||||||
class Rugrats(_Creators):
|
|
||||||
path = u'rugrats'
|
|
||||||
|
|
||||||
class RugratsinSpanish(_CreatorsEs):
|
|
||||||
path = u'rugrats-spanish'
|
|
||||||
|
|
||||||
# ScaryGary has a duplicate in gocomics
|
# ScaryGary has a duplicate in gocomics
|
||||||
# SpeedBump has a duplicate in gocomics
|
# SpeedBump has a duplicate in gocomics
|
||||||
|
# SteveBenson has a duplicate in gocomics
|
||||||
|
# SteveBreen has a duplicate in gocomics
|
||||||
|
# SteveKelley has a duplicate in gocomics
|
||||||
# StrangeBrew has a duplicate in gocomics
|
# StrangeBrew has a duplicate in gocomics
|
||||||
# TheBarn has a duplicate in gocomics
|
# TheBarn has a duplicate in gocomics
|
||||||
# TheDinetteSet has a duplicate in gocomics
|
# TheMeaningOfLila has a duplicate in gocomics
|
||||||
# TheMeaningofLila has a duplicate in gocomics
|
|
||||||
# TheOtherCoast has a duplicate in gocomics
|
# TheOtherCoast has a duplicate in gocomics
|
||||||
class TheQuigmans(_Creators):
|
class TomStiglich(_Creators):
|
||||||
path = u'the-quigmans'
|
path = 'tom-stiglich'
|
||||||
|
|
||||||
class TheWizardofIdinSpanish(_CreatorsEs):
|
|
||||||
path = u'wizard-of-id-spanish'
|
|
||||||
|
|
||||||
# ThinLines has a duplicate in gocomics
|
|
||||||
# WeePals has a duplicate in gocomics
|
# WeePals has a duplicate in gocomics
|
||||||
# WizardofId has a duplicate in gocomics
|
# WizardOfId has a duplicate in gocomics
|
||||||
# WorkingitOut has a duplicate in gocomics
|
class WizardOfIdSpanish(_CreatorsEs):
|
||||||
|
path = 'wizard-of-id-spanish'
|
||||||
|
|
||||||
|
|
||||||
|
# WorkingItOut has a duplicate in gocomics
|
||||||
# ZackHill has a duplicate in gocomics
|
# ZackHill has a duplicate in gocomics
|
||||||
|
|
|
@ -1,55 +1,66 @@
|
||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
# Copyright (C) 2012-2014 Bastian Kleineidam
|
# Copyright (C) 2012-2014 Bastian Kleineidam
|
||||||
|
# Copyright (C) 2015-2016 Tobias Gruetzmacher
|
||||||
"""
|
"""
|
||||||
Script to get a list of creators.com comics and save the info in a JSON file for further processing.
|
Script to get a list of creators.com comics and save the info in a JSON file
|
||||||
|
for further processing.
|
||||||
"""
|
"""
|
||||||
from __future__ import print_function
|
from __future__ import absolute_import, division, print_function
|
||||||
import re
|
|
||||||
import codecs
|
import codecs
|
||||||
import sys
|
import sys
|
||||||
import os
|
import os
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
|
from lxml import html
|
||||||
from dosagelib.util import getPageContent, asciify, unescape, tagre
|
|
||||||
|
sys.path.append(os.path.join(os.path.dirname(__file__), "..")) # noqa
|
||||||
|
from dosagelib.util import get_page
|
||||||
from dosagelib.scraper import get_scraperclasses
|
from dosagelib.scraper import get_scraperclasses
|
||||||
from scriptutil import contains_case_insensitive, capfirst, save_result, load_result, truncate_name
|
from scriptutil import (contains_case_insensitive, save_result, load_result,
|
||||||
|
truncate_name, format_name)
|
||||||
|
|
||||||
json_file = __file__.replace(".py", ".json")
|
json_file = __file__.replace(".py", ".json")
|
||||||
|
|
||||||
url_matcher = re.compile(tagre("a", "href", r'/comics/([^/]+)\.html') + r'<strong>([^<]+)</strong>')
|
|
||||||
|
|
||||||
# names of comics to exclude
|
# names of comics to exclude
|
||||||
exclude_comics = [
|
exclude_comics = [
|
||||||
|
'Doodles', # no images
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
def handle_url(url, session, res):
|
def handle_url(url, session, res):
|
||||||
"""Parse one search result page."""
|
"""Parse one listing page."""
|
||||||
print("Parsing", url, file=sys.stderr)
|
print("Parsing", url, file=sys.stderr)
|
||||||
try:
|
try:
|
||||||
data = getPageContent(url, session)
|
data = html.document_fromstring(get_page(url, session).text)
|
||||||
|
data.make_links_absolute(url)
|
||||||
except IOError as msg:
|
except IOError as msg:
|
||||||
print("ERROR:", msg, file=sys.stderr)
|
print("ERROR:", msg, file=sys.stderr)
|
||||||
return
|
return
|
||||||
for match in url_matcher.finditer(data):
|
|
||||||
url = match.group(1)
|
for comicdiv in data.cssselect('ul.all-test li'):
|
||||||
name = unescape(match.group(2))
|
comiclink = comicdiv.cssselect('a')[0]
|
||||||
name = asciify(name.replace('&', 'And').replace('@', 'At'))
|
comicurl = comiclink.attrib['href']
|
||||||
name = capfirst(name)
|
name = format_name(comicdiv.cssselect('p strong')[0].text)
|
||||||
if name in exclude_comics:
|
if name in exclude_comics:
|
||||||
continue
|
continue
|
||||||
if contains_case_insensitive(res, name):
|
if contains_case_insensitive(res, name):
|
||||||
# we cannot handle two comics that only differ in case
|
# we cannot handle two comics that only differ in case
|
||||||
print("INFO: skipping possible duplicate", repr(name), file=sys.stderr)
|
print("INFO: skipping possible duplicate", repr(name),
|
||||||
|
file=sys.stderr)
|
||||||
continue
|
continue
|
||||||
res[name] = url
|
|
||||||
|
res[name] = comicurl.rsplit('/', 1)[1]
|
||||||
|
|
||||||
|
|
||||||
def get_results():
|
def get_results():
|
||||||
"""Parse all search result pages."""
|
"""Parse all search result pages."""
|
||||||
# store info in a dictionary {name -> shortname}
|
# store info in a dictionary {name -> shortname}
|
||||||
res = {}
|
res = {}
|
||||||
session = requests.Session()
|
sess = requests.Session()
|
||||||
handle_url('http://www.creators.com/comics/cat-seeall.html', session, res)
|
handle_url('https://www.creators.com/categories/comics/all', sess, res)
|
||||||
|
handle_url('https://www.creators.com/categories/cartoons/all', sess, res)
|
||||||
save_result(res, json_file)
|
save_result(res, json_file)
|
||||||
|
|
||||||
|
|
||||||
|
@ -68,14 +79,13 @@ def print_results(args):
|
||||||
min_comics, filename = args
|
min_comics, filename = args
|
||||||
with codecs.open(filename, 'a', 'utf-8') as fp:
|
with codecs.open(filename, 'a', 'utf-8') as fp:
|
||||||
for name, path in sorted(load_result(json_file).items()):
|
for name, path in sorted(load_result(json_file).items()):
|
||||||
if name in exclude_comics:
|
|
||||||
continue
|
|
||||||
lang = 'Es' if name.lower().endswith('spanish') else ''
|
lang = 'Es' if name.lower().endswith('spanish') else ''
|
||||||
if has_gocomics_comic(name):
|
if has_gocomics_comic(name):
|
||||||
fp.write(u'# %s has a duplicate in gocomics\n' % truncate_name(name))
|
fp.write(u'# %s has a duplicate in gocomics\n' %
|
||||||
|
truncate_name(name))
|
||||||
else:
|
else:
|
||||||
fp.write(u"class %s(_Creators%s):\n path = %r\n\n" %
|
fp.write(u"class %s(_Creators%s):\n path = %r\n\n\n" %
|
||||||
(truncate_name(name), lang, path))
|
(truncate_name(name), lang, path))
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
|
Loading…
Reference in a new issue