Refactor update helpers: Remove duplicate code.

Tobias Gruetzmacher 2016-04-14 22:22:37 +02:00
parent 497653c448
commit dab5aef094
9 changed files with 344 additions and 497 deletions

View file

@@ -9,97 +9,35 @@ processing.
 """
 from __future__ import absolute_import, division, print_function

-import codecs
-import sys
-import os
-import requests
-from lxml import html
-sys.path.append(os.path.join(os.path.dirname(__file__), ".."))  # noqa
-from dosagelib.util import get_page
-from dosagelib.scraper import get_scrapers
-from scriptutil import (contains_case_insensitive, save_result, load_result,
-                        truncate_name, format_name)
-
-json_file = __file__.replace(".py", ".json")
-
-# names of comics to exclude
-exclude_comics = [
-    "HagartheHorrible",  # better source available
-]
-
-
-def handle_url(url, session, res):
-    """Parse one search result page."""
-    print("Parsing", url, file=sys.stderr)
-    try:
-        data = html.document_fromstring(get_page(url, session).text)
-        data.make_links_absolute(url)
-    except IOError as msg:
-        print("ERROR:", msg, file=sys.stderr)
-        return
-    for comiclink in data.cssselect('a.comic-icon'):
-        path = comiclink.attrib['href']
-        name = format_name(comiclink.attrib['title'])
-        if name in exclude_comics:
-            continue
-        if contains_case_insensitive(res, name):
-            # we cannot handle two comics that only differ in case
-            print("INFO: skipping possible duplicate", repr(name), file=sys.stderr)
-            continue
-        res[name] = path.rsplit('/', 2)[1]
-    if not res:
-        print("ERROR:", "did not match any comics", file=sys.stderr)
-
-
-def get_results():
-    """Parse all search result pages."""
-    # store info in a dictionary {name -> shortname}
-    res = {}
-    session = requests.Session()
-    handle_url('http://www.arcamax.com/comics', session, res)
-    save_result(res, json_file)
-
-
-def find_dups(name):
-    """Check if comic name already exists."""
-    names = [
-        ("Creators/%s" % name).lower(),
-        ("DrunkDuck/%s" % name).lower(),
-        ("GoComics/%s" % name).lower(),
-        ("KeenSpot/%s" % name).lower(),
-        ("ComicGenesis/%s" % name).lower(),
-        ("SmackJeeves/%s" % name).lower(),
-    ]
-    for scraperobj in get_scrapers():
-        lname = scraperobj.name.lower()
-        if lname in names or lname == name.lower():
-            return scraperobj.name
-    return None
-
-
-def first_lower(x):
-    return x[0].lower()
-
-
-def print_results(args):
-    """Print all comics that have at least the given number of minimum comic strips."""
-    min_comics, filename = args
-    with codecs.open(filename, 'a', 'utf-8') as fp:
-        data = load_result(json_file)
-        for name, path in sorted(data.items(), key=first_lower):
-            dup = find_dups(name)
-            if dup is not None:
-                fp.write(u"# %s has a duplicate in %s\n" % (name, dup))
-            else:
-                fp.write(u"\n\nclass %s(_Arcamax):\n    path = %r\n" % (
-                    truncate_name(name), path))
+from scriptutil import ComicListUpdater
+
+
+class ArcamaxUpdater(ComicListUpdater):
+    dup_templates = ("Creators/%s", "DrunkDuck/%s", "GoComics/%s",
+                     "KeenSpot/%s", "ComicGenesis/%s", "SmackJeeves/%s")
+
+    # names of comics to exclude
+    excluded_comics = (
+        "HagartheHorrible",  # better source available
+    )
+
+    def handle_url(self, url):
+        """Parse one search result page."""
+        data = self.get_url(url)
+
+        for comiclink in data.cssselect('a.comic-icon'):
+            path = comiclink.attrib['href']
+            name = comiclink.attrib['title']
+
+            self.add_comic(name, path.rsplit('/', 2)[1])
+
+    def collect_results(self):
+        """Parse all search result pages."""
+        self.handle_url('http://www.arcamax.com/comics')
+
+    def get_classdef(self, name, entry):
+        return u"class %s(_Arcamax):\n    path = %r" % (name, entry)


 if __name__ == '__main__':
-    if len(sys.argv) > 1:
-        print_results(sys.argv[1:])
-    else:
-        get_results()
+    ArcamaxUpdater(__file__).run()
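After this rewrite, get_classdef() is the only per-site code left in the output phase; the shared base class (see scriptutil.py later in this commit) wraps its return value in blank lines and appends it to the target file. For the Arcamax updater the emitted stub would look roughly like this (comic title and path are invented for illustration):

# Illustrative output of ArcamaxUpdater.get_classdef() for a
# hypothetical comic "Foo and Bar" with path "fooandbar":

class FooAndBar(_Arcamax):
    path = 'fooandbar'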

View file

@@ -9,213 +9,148 @@ processing.
 """
 from __future__ import absolute_import, division, print_function

-import codecs
 import sys
-import os
-import requests
-from lxml import html
-sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))  # noqa
-from dosagelib.util import get_page
-from dosagelib.scraper import get_scraperclasses
-from scriptutil import (contains_case_insensitive, save_result, load_result,
-                        truncate_name, format_name)
+from scriptutil import ComicListUpdater

-# Absolute minumum number of pages a comic may have (restrict search space)
-MIN_COMICS = 90
-
-json_file = __file__.replace(".py", ".json")
-
-# names of comics to exclude
-exclude_comics = [
-    # unsuitable navigation
-    "AlfdisAndGunnora",
-    "AnAmericanNerdInAnimatedTokyo",
-    "AngryAlien",
-    "BoozerAndStoner",
-    "Bonejangles",
-    "ConradStory",
-    "Crossing",
-    "ChristianHumberReloaded",
-    "CorkAndBlotto",
-    "Democomix",
-    "ErraticBeatComics",
-    "EnergyWielders",
-    "EvilBearorg",
-    "Fiascos",
-    "FateOfTheBlueStar",
-    "FPK",
-    "Fanartgyle",
-    "FrigginRandom",
-    "GoodbyeKitty",
-    "HighlyExperiMental",
-    "IfAndCanBeFlowers",
-    "JournalismStory",
-    "JohnsonSuperior",
-    "Keel",
-    "JudgeDredBasset",
-    "LomeathAndHuilii",
-    "MNPB",
-    "LucidsDream",
-    "MadDog",
-    "Minebreakers",
-    "MoonlightValley",
-    "MyImmortalFool",
-    "NATO",
-    "NothingFits",
-    "OptimisticFishermenAndPessimisticFishermen",
-    "Old2G",
-    "NothingFitsArtBlog",
-    "OutToLunchTheStingRayWhoreStory",
-    "Pandemonium",
-    "Pewfell",
-    "ProjectX",
-    "Ratantia",
-    "RealLifeTrips",
-    "Sandgate",
-    "Secondpuberty",
-    "Seconds",
-    "SlightlyEccentricOrigins",
-    "StardustTheCat",
-    "StrangerThanFiction",
-    "TalamakGreatAdventure",
-    "TheBattalion",
-    "TheDailyProblem",
-    "TheMansionOfE",
-    "ThePainter",
-    "TheSeekers",
-    "TheTrialsOfKlahadOfTheAbyss",
-    "TheStickmen",
-    "ThornsInOurSide",
-    "TopHeavyVeryBustyPinUpsForAdults",
-    "USBUnlimitedSimulatedBody",
-    "TylerHumanRecycler",
-    "UAF",
-    "WhenPigsFly",
-    "YeOldeLegotimeTheatre",
-    # no content
-    "Angst",
-    # images gone
-    "BaseballCapsAndTiaras",
-    "CROSSWORLDSNEXUS",
-    "Fathead",
-    "KevinZombie",
-    "KindergardenCrisIs",
-    "NoSongsForTheDead",
-    "RequiemShadowbornPariah",
-    "TezzleAndZeek",
-    # broken HTML
-    "CrossingOver",
-    # unique html
-    "IKilledTheHero",
-    "PowerOfPower",
-    "Schizmatic",
-    "WakeTheSleepers",
-    "WeightOfEternity",
-]
-
-
-def handle_url(url, session, res):
-    """Parse one search result page."""
-    print("Parsing", url, file=sys.stderr)
-    try:
-        data = html.document_fromstring(get_page(url, session).text)
-        data.make_links_absolute(url)
-    except IOError as msg:
-        print("ERROR:", msg, file=sys.stderr)
-        return
-    num = 999
-    for comicdiv in data.cssselect('div.searchresult'):
-        comiclink = comicdiv.cssselect('h3 a')[0]
-        comicurl = comiclink.attrib['href']
-        name = format_name(comiclink.text)
-        if contains_case_insensitive(res, name):
-            # we cannot handle two comics that only differ in case
-            print("INFO: skipping possible duplicate", repr(name),
-                  file=sys.stderr)
-            continue
-        info = comicdiv.cssselect('span.comicinfo')
-        # find out how many images this comic has
-        num = int(info[1].text.strip())
-        # find activity
-        active = info[6].text.strip().lower() == "active"
-        lang = info[7].text.strip().lower()
-        res[name] = [comicurl, num, active, lang]
-    return num
-
-
-def get_results():
-    """Parse all search result pages."""
-    # store info in a dictionary {name -> shortname}
-    res = {}
-    session = requests.Session()
-    # Sort by page count, so we can abort when we get under some threshold.
-    baseUrl = ('http://comicfury.com/search.php?search=1&webcomics=1&query=' +
-               '&worder=1&asc=0&incvi=1&incse=1&incnu=1&incla=1&all_ge=1' +
-               '&all_st=1&all_la=1&page=%d')
-    last_count = 999
-    page = 1
-    print("Parsing search result pages...", file=sys.stderr)
-    while last_count >= MIN_COMICS:
-        last_count = handle_url(baseUrl % page, session, res)
-        page += 1
-        print(last_count, file=sys.stderr, end=" ")
-    save_result(res, json_file)
-
-
-def find_dups(name):
-    """Check if comic name already exists."""
-    names = [
-        ("Creators/%s" % name).lower(),
-        ("DrunkDuck/%s" % name).lower(),
-        ("GoComics/%s" % name).lower(),
-        ("KeenSpot/%s" % name).lower(),
-        ("SmackJeeves/%s" % name).lower(),
-        ("Arcamax/%s" % name).lower(),
-    ]
-    for scraperclass in get_scraperclasses():
-        lname = scraperclass.getName().lower()
-        if lname in names:
-            return scraperclass.getName().lower()
-    return None
-
-
-def first_lower(x):
-    return x[0].lower()
-
-
-def print_results(args):
-    """Print all comics that have at least the given number of minimum
-    comic strips."""
-    min_comics, filename = args
-    min_comics = int(min_comics)
-    with codecs.open(filename, 'a', 'utf-8') as fp:
-        data = load_result(json_file)
-        for name, entry in sorted(data.items(), key=first_lower):
-            url, num, active, lang = entry
-            if name in exclude_comics:
-                fp.write(u"# %s is excluded\n" % name)
-                continue
-            if num < min_comics:
-                continue
-            dup = find_dups(name)
-            if dup is not None:
-                fp.write(u"# %s has a duplicate in %s\n" % (name, dup))
-            else:
-                fp.write(u"class CF%s(_ComicFury):\n    url = %r\n\n\n" % (
-                    truncate_name(name), str(url)))
+
+class ComicFuryUpdater(ComicListUpdater):
+    # Absolute minumum number of pages a comic may have (restrict search space)
+    MIN_COMICS = 90
+
+    dup_templates = ("Creators/%s", "DrunkDuck/%s", "GoComics/%s",
+                     "KeenSpot/%s", "SmackJeeves/%s", "Arcamax/%s")
+
+    # names of comics to exclude
+    excluded_comics = (
+        # unsuitable navigation
+        "AlfdisAndGunnora",
+        "AnAmericanNerdInAnimatedTokyo",
+        "AngryAlien",
+        "BoozerAndStoner",
+        "Bonejangles",
+        "ConradStory",
+        "Crossing",
+        "ChristianHumberReloaded",
+        "CorkAndBlotto",
+        "Democomix",
+        "ErraticBeatComics",
+        "EnergyWielders",
+        "EvilBearorg",
+        "Fiascos",
+        "FateOfTheBlueStar",
+        "FPK",
+        "Fanartgyle",
+        "FrigginRandom",
+        "GoodbyeKitty",
+        "HighlyExperiMental",
+        "IfAndCanBeFlowers",
+        "JournalismStory",
+        "JohnsonSuperior",
+        "Keel",
+        "JudgeDredBasset",
+        "LomeathAndHuilii",
+        "MNPB",
+        "LucidsDream",
+        "MadDog",
+        "Minebreakers",
+        "MoonlightValley",
+        "MyImmortalFool",
+        "NATO",
+        "NothingFits",
+        "OptimisticFishermenAndPessimisticFishermen",
+        "Old2G",
+        "NothingFitsArtBlog",
+        "OutToLunchTheStingRayWhoreStory",
+        "Pandemonium",
+        "Pewfell",
+        "ProjectX",
+        "Ratantia",
+        "RealLifeTrips",
+        "Sandgate",
+        "Secondpuberty",
+        "Seconds",
+        "SlightlyEccentricOrigins",
+        "StardustTheCat",
+        "StrangerThanFiction",
+        "TalamakGreatAdventure",
+        "TheBattalion",
+        "TheDailyProblem",
+        "TheMansionOfE",
+        "ThePainter",
+        "TheSeekers",
+        "TheTrialsOfKlahadOfTheAbyss",
+        "TheStickmen",
+        "ThornsInOurSide",
+        "TopHeavyVeryBustyPinUpsForAdults",
+        "USBUnlimitedSimulatedBody",
+        "TylerHumanRecycler",
+        "UAF",
+        "WhenPigsFly",
+        "YeOldeLegotimeTheatre",
+
+        # no content
+        "Angst",
+
+        # images gone
+        "BaseballCapsAndTiaras",
+        "CROSSWORLDSNEXUS",
+        "Fathead",
+        "KevinZombie",
+        "KindergardenCrisIs",
+        "NoSongsForTheDead",
+        "RequiemShadowbornPariah",
+        "TezzleAndZeek",
+
+        # broken HTML
+        "CrossingOver",
+
+        # unique html
+        "IKilledTheHero",
+        "PowerOfPower",
+        "Schizmatic",
+        "WakeTheSleepers",
+        "WeightOfEternity",
+    )
+
+    def handle_url(self, url):
+        """Parse one search result page."""
+        data = self.get_url(url)
+
+        count = 999
+        for comicdiv in data.cssselect('div.searchresult'):
+            comiclink = comicdiv.cssselect('h3 a')[0]
+            comicurl = comiclink.attrib['href']
+            name = comiclink.text
+
+            info = comicdiv.cssselect('span.comicinfo')
+            # find out how many images this comic has
+            count = int(info[1].text.strip())
+            # find activity
+            active = info[6].text.strip().lower() == "active"
+            lang = info[7].text.strip().lower()
+            self.add_comic(name, (comicurl, active, lang), count)
+        return count
+
+    def collect_results(self):
+        """Parse all search result pages."""
+        # Sort by page count, so we can abort when we get under some threshold.
+        baseUrl = ('http://comicfury.com/search.php?search=1&webcomics=1&' +
+                   'query=&worder=1&asc=0&incvi=1&incse=1&incnu=1&incla=1&' +
+                   'all_ge=1&all_st=1&all_la=1&page=%d')
+        last_count = 999
+        page = 1
+        print("Parsing search result pages...", file=sys.stderr)
+        while last_count >= self.MIN_COMICS:
+            last_count = self.handle_url(baseUrl % page)
+            page += 1
+            print(last_count, file=sys.stderr, end=" ")
+
+    def get_classdef(self, name, entry):
+        url, active, lang = entry
+        return u"class CF%s(_ComicFury):\n    url = %r" % (name, url)


 if __name__ == '__main__':
-    if len(sys.argv) > 1:
-        print_results(sys.argv[1:])
-    else:
-        get_results()
+    ComicFuryUpdater(__file__).run()
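ComicFury is the one updater here that passes a count to add_comic(); the base class stores it beside the data tuple so the output phase can drop comics below the requested minimum. One entry of the intermediate JSON file would then look roughly like this sketch (all values invented; json.dump() in the base class writes the Python True as JSON true):

# Shape of one entry in the comicfury .json file (values invented):
entry = {
    "SomeComic": {
        "count": 245,  # strip count parsed from span.comicinfo
        "data": ["http://somecomic.example.com/", True, "english"],
    },
}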

View file

@@ -18,7 +18,7 @@ import requests
 sys.path.append(os.path.join(os.path.dirname(__file__), ".."))  # noqa
 from dosagelib.util import get_page, tagre, check_robotstxt
-from dosagelib.scraper import get_scraperclasses
+from dosagelib.scraper import get_scrapers
 from scriptutil import (contains_case_insensitive, save_result, load_result,
                         truncate_name, format_name)

@@ -435,8 +435,8 @@ def has_comic(name):
         ("Creators/%s" % name).lower(),
         ("GoComics/%s" % name).lower(),
     ]
-    for scraperclass in get_scraperclasses():
-        lname = scraperclass.getName().lower()
+    for scraperobj in get_scrapers():
+        lname = scraperobj.name.lower()
         if lname in names:
             return True
     return False

View file

@@ -8,88 +8,37 @@ for further processing.
 """
 from __future__ import absolute_import, division, print_function

-import codecs
-import sys
-import os
-import requests
-from lxml import html
-sys.path.append(os.path.join(os.path.dirname(__file__), ".."))  # noqa
-from dosagelib.util import get_page
-from dosagelib.scraper import get_scraperclasses
-from scriptutil import (contains_case_insensitive, save_result, load_result,
-                        truncate_name, format_name)
+from scriptutil import ComicListUpdater

-json_file = __file__.replace(".py", ".json")
-
-# names of comics to exclude
-exclude_comics = [
-    'Doodles',  # no images
-]
-
-
-def handle_url(url, session, res):
-    """Parse one listing page."""
-    print("Parsing", url, file=sys.stderr)
-    try:
-        data = html.document_fromstring(get_page(url, session).text)
-        data.make_links_absolute(url)
-    except IOError as msg:
-        print("ERROR:", msg, file=sys.stderr)
-        return
-    for comicdiv in data.cssselect('ul.all-test li'):
-        comiclink = comicdiv.cssselect('a')[0]
-        comicurl = comiclink.attrib['href']
-        name = format_name(comicdiv.cssselect('p strong')[0].text)
-        if name in exclude_comics:
-            continue
-        if contains_case_insensitive(res, name):
-            # we cannot handle two comics that only differ in case
-            print("INFO: skipping possible duplicate", repr(name),
-                  file=sys.stderr)
-            continue
-        res[name] = comicurl.rsplit('/', 1)[1]
-
-
-def get_results():
-    """Parse all search result pages."""
-    # store info in a dictionary {name -> shortname}
-    res = {}
-    sess = requests.Session()
-    handle_url('https://www.creators.com/categories/comics/all', sess, res)
-    handle_url('https://www.creators.com/categories/cartoons/all', sess, res)
-    save_result(res, json_file)
-
-
-def has_gocomics_comic(name):
-    """Test if comic name already exists."""
-    cname = "Gocomics/%s" % name
-    for scraperclass in get_scraperclasses():
-        lname = scraperclass.getName().lower()
-        if lname == cname.lower():
-            return True
-    return False
-
-
-def print_results(args):
-    """Print comics."""
-    min_comics, filename = args
-    with codecs.open(filename, 'a', 'utf-8') as fp:
-        for name, path in sorted(load_result(json_file).items()):
-            lang = 'Es' if name.lower().endswith('spanish') else ''
-            if has_gocomics_comic(name):
-                fp.write(u'# %s has a duplicate in gocomics\n' %
-                         truncate_name(name))
-            else:
-                fp.write(u"class %s(_Creators%s):\n    path = %r\n\n\n" %
-                         (truncate_name(name), lang, path))
+
+class CreatorsUpdater(ComicListUpdater):
+    dup_templates = ('GoComics/%s',)
+
+    # names of comics to exclude
+    excluded_comics = (
+        # no images
+        'Doodles',
+    )
+
+    def handle_url(self, url):
+        """Parse one listing page."""
+        data = self.get_url(url)
+
+        for comicdiv in data.cssselect('ul.all-test li'):
+            comiclink = comicdiv.cssselect('a')[0]
+            comicurl = comiclink.attrib['href']
+            name = comicdiv.cssselect('p strong')[0].text
+
+            self.add_comic(name, comicurl.rsplit('/', 1)[1])
+
+    def collect_results(self):
+        """Parse all search result pages."""
+        self.handle_url('https://www.creators.com/categories/comics/all')
+        self.handle_url('https://www.creators.com/categories/cartoons/all')
+
+    def get_classdef(self, name, data):
+        lang = 'Es' if name.lower().endswith('spanish') else ''
+        return u"class %s(_Creators%s):\n    path = %r" % (name, lang, data)


 if __name__ == '__main__':
-    if len(sys.argv) > 1:
-        print_results(sys.argv[1:])
-    else:
-        get_results()
+    CreatorsUpdater(__file__).run()

View file

@@ -9,86 +9,45 @@ processing.
 """
 from __future__ import absolute_import, division, print_function

-import codecs
-import sys
-import os
-import requests
-from lxml import html
-sys.path.append(os.path.join(os.path.dirname(__file__), ".."))  # noqa
-from dosagelib.util import get_page
-from scriptutil import contains_case_insensitive, format_name, save_result, load_result, truncate_name
+from scriptutil import ComicListUpdater

-json_file = __file__.replace(".py", ".json")
-
-# names of comics to exclude
-exclude_comics = [
-    # "coming soon"
-    "Angryprogrammer",
-    "Guinness",
-    "Jabberwoncky",
-    "RandysRationale"
-    "SignsOfOurTimes",
-    "TheGagwriter",
-    "Yaoyao",
-    # duplicate
-    "SaturdayMorningBreakfastCereal",
-]
-
-
-def handle_url(url, session, res):
-    """Parse one search result page."""
-    print("Parsing", url, file=sys.stderr)
-    try:
-        data = html.document_fromstring(get_page(url, session).text)
-    except IOError as msg:
-        print("ERROR:", msg, file=sys.stderr)
-        return
-    for comiclink in data.cssselect('a.alpha_list'):
-        link = comiclink.attrib['href']
-        name = format_name(comiclink.text)
-        if contains_case_insensitive(res, name):
-            # we cannot handle two comics that only differ in case
-            print("INFO: skipping possible duplicate", repr(name), file=sys.stderr)
-            continue
-        res[name] = link
-
-
-def get_results():
-    """Parse all search result pages."""
-    # store info in a dictionary {name -> uri}
-    res = {}
-    session = requests.Session()
-    handle_url('http://www.gocomics.com/features', session, res)
-    handle_url('http://www.gocomics.com/explore/espanol', session, res)
-    handle_url('http://www.gocomics.com/explore/editorial_list', session, res)
-    handle_url('http://www.gocomics.com/explore/sherpa_list', session, res)
-    save_result(res, json_file)
-
-
-def first_lower(x):
-    return x[0].lower()
-
-
-def print_results(args):
-    """Print all comics that have at least the given number of minimum comic strips."""
-    min_comics, filename = args
-    with codecs.open(filename, 'a', 'utf-8') as fp:
-        data = load_result(json_file)
-        for name, uri in sorted(data.items(), key=first_lower):
-            if name in exclude_comics:
-                print("Excluded " + name)
-                continue
-            fp.write(u"\n\nclass GC%s(_GoComics%s):\n    path = %r\n" % (
-                truncate_name(name), 'Es' if 'espanol/' in uri else '',
-                uri[1:]))
+
+class GoComicsUpdater(ComicListUpdater):
+    # names of comics to exclude
+    excluded_comics = [
+        # "coming soon"
+        "Angryprogrammer",
+        "Guinness",
+        "Jabberwoncky",
+        "RandysRationale"
+        "SignsOfOurTimes",
+        "TheGagwriter",
+        "Yaoyao",
+
+        # duplicate
+        "SaturdayMorningBreakfastCereal",
+    ]
+
+    def handle_url(self, url):
+        """Parse one search result page."""
+        data = self.get_url(url)
+
+        for comiclink in data.cssselect('a.alpha_list'):
+            link = comiclink.attrib['href']
+            name = comiclink.text
+            self.add_comic(name, link)
+
+    def collect_results(self):
+        """Parse all listing pages."""
+        self.handle_url('http://www.gocomics.com/features')
+        self.handle_url('http://www.gocomics.com/explore/espanol')
+        self.handle_url('http://www.gocomics.com/explore/editorial_list')
+        self.handle_url('http://www.gocomics.com/explore/sherpa_list')
+
+    def get_classdef(self, name, url):
+        return u"class GC%s(_GoComics%s):\n    path = %r" % (
+            name, 'Es' if 'espanol/' in url else '', url[1:])


 if __name__ == '__main__':
-    if len(sys.argv) > 1:
-        print_results(sys.argv[1:])
-    else:
-        get_results()
+    GoComicsUpdater(__file__).run()

View file

@@ -18,7 +18,7 @@ import requests
 sys.path.append(os.path.join(os.path.dirname(__file__), ".."))  # noqa
 from dosagelib.util import get_page, tagre, check_robotstxt
-from dosagelib.scraper import get_scraperclasses
+from dosagelib.scraper import get_scrapers
 from scriptutil import (contains_case_insensitive, save_result, load_result,
                         truncate_name, format_name)

@@ -108,8 +108,8 @@ def has_comic(name):
         ("GoComics/%s" % name).lower(),
         ("ComicGenesis/%s" % name).lower(),
     ]
-    for scraperclass in get_scraperclasses():
-        lname = scraperclass.getName().lower()
+    for scraperobj in get_scrapers():
+        lname = scraperobj.name.lower()
         if lname in names:
             return True
     return False

View file

@@ -5,11 +5,114 @@
 from __future__ import absolute_import, division, print_function

+import os
 import re
+import sys
 import json
 import codecs
-from dosagelib.util import unescape
+
+import requests
+from lxml import html
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))  # noqa
+from dosagelib.util import unescape, get_page
+from dosagelib import scraper
+
+
+def first_lower(x):
+    return x[0].lower()
+
+
+class ComicListUpdater(object):
+    dup_templates = ()
+    excluded_comics = ()
+
+    def __init__(self, name):
+        self.json = name.replace(".py", ".json")
+        self.session = requests.Session()
+
+    def get_url(self, url):
+        """Get an HTML page and parse it with LXML."""
+        print("Parsing", url, file=sys.stderr)
+        try:
+            data = html.document_fromstring(get_page(url, self.session).text)
+            data.make_links_absolute(url)
+            return data
+        except IOError as msg:
+            print("ERROR:", msg, file=sys.stderr)
+            raise
+
+    def should_skip(self, name):
+        if name in self.excluded_comics:
+            return True
+        if contains_case_insensitive(self.res, name):
+            # we cannot handle two comics that only differ in case
+            print("INFO: skipping possible duplicate", repr(name),
+                  file=sys.stderr)
+            return True
+        return False
+
+    def get_results(self):
+        """Collect comics and save dictionary in JSON file."""
+        self.res = {}
+        self.collect_results()
+        if not self.res:
+            print("ERROR:", "did not match any comics", file=sys.stderr)
+            return
+        with codecs.open(self.json, 'wb', 'utf-8') as f:
+            json.dump(self.res, f, sort_keys=True, indent=2,
+                      separators=(',', ': '))
+
+    def add_comic(self, name, data, count=None):
+        """Add a collected comic with a specific number of comics."""
+        name = format_name(name)
+        if not self.should_skip(name):
+            self.res[name] = {'count': count, 'data': data}
+
+    def collect_results(self):
+        raise NotImplementedError
+
+    def print_results(self, args):
+        """Print all comics that have at least the given number of minimum
+        comic strips."""
+        min_comics, filename = args
+        min_comics = int(min_comics)
+        with codecs.open(filename, 'a', 'utf-8') as fp:
+            with codecs.open(self.json, 'rb', 'utf-8') as f:
+                data = json.load(f)
+            for name, entry in sorted(data.items(), key=first_lower):
+                count = entry['count']
+                if count and count < min_comics:
+                    continue
+                dup = self.find_dups(name)
+                if dup is not None:
+                    fp.write(u"# %s has a duplicate in %s\n" % (name, dup))
+                else:
+                    fp.write(u"\n\n%s\n" %
+                             self.get_classdef(truncate_name(name),
+                                               entry['data']))
+
+    def find_dups(self, name):
+        """Check if comic name already exists."""
+        names = [(tmpl % name).lower() for tmpl in self.dup_templates]
+        if names:
+            for scraperobj in scraper.get_scrapers():
+                lname = scraperobj.name.lower()
+                if lname in names or lname == name.lower():
+                    return scraperobj.name
+        return None
+
+    def get_classdef(self, name, data):
+        raise NotImplementedError
+
+    def run(self):
+        if len(sys.argv) > 1:
+            self.print_results(sys.argv[1:])
+        else:
+            self.get_results()
+
+
 def contains_case_insensitive(adict, akey):
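The new base class reduces each site script to three hooks: collect_results() walks the site and feeds add_comic(), get_classdef() renders one plugin class, and the inherited run() switches between the scrape phase (no arguments, writes the JSON file) and the output phase (minimum strip count plus target file). A minimal sketch of a subclass, only to show the contract; the site URL, CSS selector, and class names are invented:

from scriptutil import ComicListUpdater


class ExampleUpdater(ComicListUpdater):
    # scraper names to check when looking for duplicates
    dup_templates = ('GoComics/%s',)

    def collect_results(self):
        # scrape one (invented) listing page and record each comic
        data = self.get_url('http://comics.example.com/all')
        for link in data.cssselect('a.comic'):
            self.add_comic(link.text, link.attrib['href'])

    def get_classdef(self, name, data):
        return u"class %s(_Example):\n    path = %r" % (name, data)


if __name__ == '__main__':
    ExampleUpdater(__file__).run()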

View file

@@ -22,7 +22,7 @@ import requests
 sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))  # noqa
 from dosagelib.util import get_page, tagre
-from dosagelib.scraper import get_scraperclasses
+from dosagelib.scraper import get_scrapers
 from scriptutil import contains_case_insensitive, save_result, load_result, truncate_name, format_name

@@ -328,8 +328,8 @@ def get_results():
 def has_comic(name):
     """Check if comic name already exists."""
     cname = name.lower()
-    for scraperclass in get_scraperclasses():
-        lname = scraperclass.getName().lower()
+    for scraperobj in get_scrapers():
+        lname = scraperobj.name.lower()
         if lname == cname:
             return True
     return False

View file

@@ -9,77 +9,40 @@ further processing.
 """
 from __future__ import absolute_import, division, print_function

-import codecs
-import sys
-import os
-import requests
-from lxml import html
-sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))  # noqa
-from dosagelib.util import get_page
-from scriptutil import (save_result, load_result, truncate_name, format_name)
+from scriptutil import ComicListUpdater

-json_file = __file__.replace(".py", ".json")
-
-
-def find_first(session, url):
-    print("Parsing", url, file=sys.stderr)
-    try:
-        data = html.document_fromstring(get_page(url, session).text)
-        data.make_links_absolute(url)
-    except IOError as msg:
-        print("ERROR:", msg, file=sys.stderr)
-        return url
-    firstlinks = data.cssselect('a.comic-nav-first')
-    if not firstlinks:
-        print("INFO No first link on »%s«, already first page?" % (url))
-        return url
-    return firstlinks[0].attrib['href']
-
-
-def get_results():
-    """Parse start page for supported comics."""
-    res = {}
-    url = 'http://www.thewebcomicfactory.com/'
-    session = requests.Session()
-    print("Parsing", url, file=sys.stderr)
-    try:
-        data = html.document_fromstring(get_page(url, session).text)
-        data.make_links_absolute(url)
-    except IOError as msg:
-        print("ERROR:", msg, file=sys.stderr)
-        return {}
-    for comicdiv in data.cssselect('div.ceo_thumbnail_widget'):
-        comicname = comicdiv.cssselect('h2')[0]
-        comiclink = comicdiv.cssselect('a')[0]
-        comicurl = comiclink.attrib['href']
-        name = format_name(comicname.text)
-        if 'comic-color-key' in comicurl:
-            continue
-        comicurl = find_first(session, comicurl)
-        res[name] = comicurl
-    save_result(res, json_file)
-
-
-def first_lower(x):
-    return x[0].lower()
-
-
-def print_results(args):
-    """Print all comics."""
-    min_comics, filename = args
-    with codecs.open(filename, 'a', 'utf-8') as fp:
-        data = load_result(json_file)
-        for name, url in sorted(data.items(), key=first_lower):
-            fp.write(u"\n\nclass %s(_WebcomicFactory):\n    url = %r\n" % (
-                truncate_name(name), str(url)))
-            fp.write(u"    firstStripUrl = url\n")
+
+class WebComicFactoryUpdater(ComicListUpdater):
+
+    def find_first(self, url):
+        data = self.get_url(url)
+
+        firstlinks = data.cssselect('a.comic-nav-first')
+        if not firstlinks:
+            print("INFO:", "No first link on »%s«, already first page?" %
+                  (url))
+            return url
+        return firstlinks[0].attrib['href']
+
+    def collect_results(self):
+        """Parse start page for supported comics."""
+        url = 'http://www.thewebcomicfactory.com/'
+        data = self.get_url(url)
+
+        for comicdiv in data.cssselect('div.ceo_thumbnail_widget'):
+            comicname = comicdiv.cssselect('h2')[0]
+            comiclink = comicdiv.cssselect('a')[0]
+            comicurl = comiclink.attrib['href']
+            name = comicname.text
+            if 'comic-color-key' in comicurl:
+                continue
+            comicurl = self.find_first(comicurl)
+            self.add_comic(name, comicurl)
+
+    def get_classdef(self, name, url):
+        return (u"class %s(_WebcomicFactory):\n    url = %r\n" % (name, url) +
+                u"    firstStripUrl = url")


 if __name__ == '__main__':
-    if len(sys.argv) > 1:
-        print_results(sys.argv[1:])
-    else:
-        get_results()
+    WebComicFactoryUpdater(__file__).run()
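As in the other scripts, the old two-phase command-line behaviour survives through run(). Driven from Python instead of the shell, the flow would look roughly like this sketch (the import and file names are illustrative assumptions, not part of this commit):

# Hypothetical driver for the two phases run() dispatches between:
from webcomicfactory import WebComicFactoryUpdater  # assumes script is on sys.path

up = WebComicFactoryUpdater('webcomicfactory.py')
up.get_results()                   # phase 1: scrape site -> webcomicfactory.json
up.print_results(['1', 'out.py'])  # phase 2: append class stubs to out.py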