Update SmackJeeves update helper.

Don't use it right now, it adds a HUGE amount of comics.
Tobias Gruetzmacher 2016-04-20 23:48:29 +02:00
parent fe51a449df
commit 1d2e1f2dd1

@@ -9,354 +9,171 @@ for further processing.
 """
 from __future__ import absolute_import, division, print_function

-import codecs
-import re
 import sys
-import os

 try:
-    from urllib.parse import urljoin
+    from urllib.parse import urlsplit
 except ImportError:
-    from urlparse import urljoin
+    from urlparse import urlsplit

-import requests
-
-sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))  # noqa
-
-from dosagelib.util import get_page, tagre
-from dosagelib.scraper import get_scrapers
-from scriptutil import contains_case_insensitive, save_result, load_result, truncate_name, format_name
-
-json_file = __file__.replace(".py", ".json")
-
-# names of comics to exclude
-exclude_comics = [
-    "4plyKamalsHead",  # does not follow standard layout
-    "9Lives",  # missing images
-    "ADifferentPerspective",  # does not follow standard layout
-    "AFairlyTwistedRealitySuper",  # does not follow standard layout
-    "Ahoge",  # does not follow standard layout
-    "AngelJunkPileFelix",  # images are 403 forbidden
-    "AntavioussGenLab",  # images are 403 forbidden
-    "AreyougayJohnny",  # does not follow standard layout
-    "Authorbattlesthevideogame",  # missing images
-    "BambooArmonicKnightsGuild",  # missing previous link
-    "BassLegends",  # does not follow standard layout
-    "BreIshurnasspritesandstuff",  # comic moved
-    "CatboyattheCon",  # missing images
-    "Comatose",  # does not follow standard layout
-    "ContraandtheSpamDump",  # missing images
-    "ClubLove",  # does not follow standard layout
-    "Darkkyosshorts",  # missing images
-    "DeSTRESS",  # does not follow standard layout
-    "DollarStoreCaviar",  # broken images
-    "DreamCatcher",  # does not follow standard layout
-    "EdgeofDecember",  # missing images
-    "FroakieShocaiz",  # unsuitable navigation
-    "Fumiko",  # does not follow standard layout
-    "FurryExperience",  # timeout
-    "GART",  # does not follow standard layout
-    "GarytheAlchemist",  # does not follow standard layout
-    "GBAsCrib",  # timeout
-    "HAndJ",  # missing images
-    "HEARD",  # missing images
-    "Indigo",  # broken domain name
-    "IwillbenapoSpamDump",  # missing images
-    "ItoshiisCrazyNuzlockeAdventures",  # does not follow standard layout
-    "JennyHaniver",  # does not follow standard layout
-    "KiLAiLO",  # does not follow standard layout
-    "KirbysoftheAlternateDimension",  # missing images
-    "Letsreviewshallwe",  # missing images
-    "LoudEra",  # does not follow standard layout
-    "LunarHill",  # does not follow standard layout
-    "Mafiagame",  # does not follow standard layout
-    "MegaManSpriteExpo",  # missing images
-    "MyLifewithFelENESPANOL",  # does not follow standard layout
-    "MylifewithFel",  # does not follow standard layout
-    "NegativeZen",  # does not follow standard layout
-    "Nemutionpobae",  # does not follow standard layout
-    "NightShot",  # does not follow standard layout
-    "NormalIsBoring",  # does not follow standard layout
-    "Okamirai",  # images are 403 forbidden
-    "OmnisSpriteShowcase",  # missing images
-    "OpticalDisarray",  # does not follow standard layout
-    "PicturesofYou",  # does not follow standard layout
-    "PiecesofBrokenGlass",  # broken images
-    "PlatonicManagementDilemma",  # missing images
-    "Pornjunkiesstrip",  # does not follow standard layout
-    "PrettyUgly",  # does not follow standard layout
-    "Project217",  # does not follow standard layout
-    "RemmyzRandomz",  # does not follow standard layout
-    "Ribon",  # does not follow standard layout
-    "RubysWorld",  # does not follow standard layout
-    "SecretSanta2011",  # missing images
-    "ShinkaTheLastEevee",  # does not follow standard layout
-    "SimplePixel",  # does not follow standard layout
-    "SJArtCollab",  # missing images
-    "SladesMansionofawesomeness",  # does not follow standard layout
-    "SlightlyDifferent",  # missing images
-    "SpaceSchool",  # does not follow standard layout
-    "SushiGummy",  # does not follow standard layout
-    "TheAfterSubtract",  # does not follow standard layout
-    "ThePokemonArtBox",  # does not follow standard layout
-    "THEVOIDWEBCOMIC",  # does not follow standard layout
-    "TC2KsPokemobians",  # does not follow standard layout
-    "ThreadCrashers",  # has no previous comic link
-    "ToDefeatThemAll",  # does not follow standard layout
-    "TotallyKotor",  # missing images
-    "TwoKeys",  # does not follow standard layout
-    "Vbcomics",  # does not follow standard layout
-    "WerewolfRichard",  # does not follow standard layout
-    "WinterMelody",  # missing images
-]
-
-# the latest URL of some comics repeats the previous URL
-# flag this so the bounceStart uses the correct URL
-repeat_comics = [
-    "1009sSpritersVacation",
-    "22Special22Care",
-    "2Kingdoms",
-    "2Masters",
-    "AbbimaysRandomness",
-    "AdaLeeComesOn",
-    "AdventuresofMitch",
-    "AkumaKisei",
-    "ALaMode",
-    "AnimalLoversYuriCollab",
-    "Area9",
-    "AStrangeTypeofLove",
-    "Autophobia",
-    "BearlyAbel",
-    "BeCarefreeWithMeSoon",
-    "BlindandBlue",
-    "BlueStreak",
-    "BlueWell",
-    "BlueYonder",
-    "Border",
-    "BoyLessons",
-    "Boywithasecret",
-    "BreakFreemagazine",
-    "BrightStars",
-    "ByTheBook",
-    "ClairetheFlare",
-    "CloeRemembrance",
-    "ComicFullofSprites",
-    "CrappilyDrawnMinicomics",
-    "CupidsaMoron",
-    "D00R",
-    "DeathNoteIridescent",
-    "DemonEater",
-    "DenizensAttention",
-    "DevilsCake",
-    "Dreamcatchers",
-    "EmeraldNuzlocke",
-    "EonsAgo",
-    "ERRORERROR",
-    "EvilPlan",
-    "FailureConfetti",
-    "FlyorFail",
-    "ForestHill",
-    "FrobertTheDemon",
-    "GarytheAlchemist",
-    "GhostsTaleACrossover",
-    "Glasshearts",
-    "GoldenSunGenerationsAftermathVolume1",
-    "GoldenSunGenerationsColossoVolume6",
-    "GuardiansoftheGalaxialSpaceways",
-    "HatShop",
-    "HDMTHCOMICS",
-    "Helix",
-    "Hephaestus",
-    "HolyBlasphemy",
-    "HopeForABreeze",
-    "Hotarugari",
-    "InsideOuTAYuriTale",
-    "Insomanywords",
-    "INUSITADOONLINE",
-    "ItsCharacterDevelopment",
-    "JosephAndYusra",
-    "JustAnotherDay",
-    "KasaKeira",
-    "KirbyAdventure",
-    "KirbyandtheDarkKnight",
-    "KirbyFunfestTheOriginals",
-    "KirbysofTHEVOID",
-    "KuroiHitsuji",
-    "KuroShouri",
-    "LandoftheSky",
-    "LeCirquedObscure",
-    "LethalDose",
-    "LOGOS",
-    "LostLove",
-    "LsEmpire",
-    "MariovsSonicvsMegaMan",
-    "Mega",
-    "MementoMori",
-    "Mokepon",
-    "MrGrimmsCircusofHorrors",
-    "MyFakeHeart",
-    "MyFriendScotty",
-    "MYth",
-    "NemesisKatharsis",
-    "NiceKitty",
-    "Nutshel",
-    "OptimalClutter",
-    "Panacea",
-    "PhilosophicalPenisJokes",
-    "PrettyUgly",
-    "PSY",
-    "PTO",
-    "RainLGBT",
-    "ReidyandFriendsShowcase",
-    "RubysWorld",
-    "SallySprocketAndPistonPete",
-    "SimonSues",
-    "SimpleBear",
-    "SmallPressAdventures",
-    "SonicWorldAdventure",
-    "SoulGuardian",
-    "SPOON",
-    "STASonictheAdventure",
-    "Stay",
-    "StellaInChrome",
-    "StrangersandFriends",
-    "SunmeetsMoon",
-    "TAG",
-    "TaikiTheWebcomic",
-    "TechnicolorLondon",
-    "TEN",
-    "ThatWasntThereYesterday",
-    "TheAntihero",
-    "TheBrideoftheShark",
-    "TheCafedAlizee",
-    "TheEssyaneWarriors",
-    "ThehumanBEing",
-    "TheKwiddexProtocol",
-    "TheLegendofZeldaMaidenoftheMoon",
-    "ThePirateBalthasar",
-    "TheRandomObscureFairyTaleNoOnesEverReallyHeardOf",
-    "TheReborn",
-    "TheTytonNuzlockeChallengeEmeraldEdition",
-    "ToD",
-    "TPTruePower",
-    "TwoKeys",
-    "UndertheSkin",
-    "WelcometoFreakshow",
-    "Whenweweresilent",
-    "WhiteHeart",
-    "Yaoishereforareason",
-    "Zodiac",
-]
-
-# links to last valid strips
-url_overrides = {
-}
-
-# HTML content matcher
-page_matcher = re.compile(tagre("a", "href", r'(comicprofile\.php\?id=\d+)',
-                                after="site_banner") +
-                          tagre("img", "title", r'([^"]+)'))
-url_matcher = re.compile(tagre("a", "href", r'(http://[^"]+/comics/)') + "Latest Comic")
-num_matcher = re.compile(r'50%">\s+(\d+)\s+')
-adult_matcher = re.compile(tagre("img", "src", r'http://www\.smackjeeves\.com/images/mature_content\.png'))
-
-
-def handle_url(url, session, res):
-    """Parse one search result page."""
-    print("Parsing", url, file=sys.stderr)
-    try:
-        data = get_page(url, session).text
-    except IOError as msg:
-        print("ERROR:", msg, file=sys.stderr)
-        return
-    for match in page_matcher.finditer(data):
-        page_url = match.group(1)
-        page_url = urljoin(url, page_url)
-        name = format_name(match.group(2))
-        if name in exclude_comics:
-            continue
-        if contains_case_insensitive(res, name):
-            # we cannot handle two comics that only differ in case
-            print("INFO: skipping possible duplicate", repr(name), file=sys.stderr)
-            continue
-        # find out how many images this comic has
-        end = match.end()
-        mo = num_matcher.search(data[end:])
-        if not mo:
-            print("ERROR matching number:", repr(data[end:end + 300]),
-                  file=sys.stderr)
-            continue
-        num = int(mo.group(1))
-        # search for url in extra page
-        print("Getting", page_url)
-        try:
-            data2 = get_page(page_url, session).text
-        except IOError as msg:
-            print("ERROR:", msg, file=sys.stderr)
-            return
-        mo = url_matcher.search(data2)
-        if not mo:
-            print("ERROR matching comic URL:", repr(data2[:300]), file=sys.stderr)
-            continue
-        comic_url = mo.group(1)
-        # search for adult flag
-        adult = adult_matcher.search(data2[end:])
-        bounce = name not in repeat_comics
-        res[name] = [
-            url_overrides.get(name, comic_url), num, bool(adult), bounce
-        ]
-
-
-def get_results():
-    """Parse all search result pages."""
-    base = "http://www.smackjeeves.com/search.php?submit=Search+for+Webcomics&search_mode=webcomics&comic_title=&special=all&last_update=3&style_all=on&genre_all=on&format_all=on&sort_by=2&start=%d"
-    session = requests.Session()
-    # store info in a dictionary {name -> url, number of comics, adult flag, bounce flag}
-    res = {}
-    # a search for an empty string returned 286 result pages
-    result_pages = 286
-    print("Parsing", result_pages, "search result pages...", file=sys.stderr)
-    for i in range(0, result_pages):
-        print(i + 1, file=sys.stderr, end=" ")
-        handle_url(base % (i * 12), session, res)
-    save_result(res, json_file)
-
-
-def has_comic(name):
-    """Check if comic name already exists."""
-    cname = name.lower()
-    for scraperobj in get_scrapers():
-        lname = scraperobj.name.lower()
-        if lname == cname:
-            return True
-    return False
-
-
-def print_results(args):
-    """Print all comics that have at least the given number of minimum comic strips."""
-    min_comics, filename = args
-    min_comics = int(min_comics)
-    with codecs.open(filename, 'a', 'utf-8') as fp:
-        for name, entry in sorted(load_result(json_file).items()):
-            if name in exclude_comics:
-                continue
-            url, num, adult, bounce = entry
-            if num < min_comics:
-                continue
-            if has_comic(name):
-                prefix = u'#'
-            else:
-                prefix = u''
-            fp.write(u"%sadd(%r, %r, %s, %s)\n" % (
-                prefix, str(truncate_name(name)), str(url), adult, bounce
-            ))
+from scriptutil import ComicListUpdater
+
+
+class SmackJeevesUpdater(ComicListUpdater):
+    # Absolute minumum number of pages a comic may have (restrict search space)
+    MIN_COMICS = 90
+
+    ADULT_IMG = 'http://www.smackjeeves.com/images/mature_content.png'
+
+    # names of comics to exclude
+    excluded_comics = (
+        # comic moved/we have a better module
+        "Amya",
+        "Carciphona",
+        "Footloose",
+        "TitleUnrelated",
+
+        # does not follow standard layout
+        "300DaysOfSyao",
+        "ADifferentPerspective",
+        "Captor",
+        "ClubLove",
+        "Comatose",
+        "DeSTRESS",
+        "DreamCatcher",
+        "Fumiko",
+        "GART",
+        "GarytheAlchemist",
+        "ItoshiisCrazyNuzlockeAdventures",
+        "JennyHaniver",
+        "KiLAiLO",
+        "LoudEra",
+        "LunarHill",
+        "Mafiagame",
+        "MylifewithFel",
+        "MyLifewithFelENESPANOL",
+        "NegativeZen",
+        "Nemutionpobae",
+        "NightShot",
+        "NormalIsBoring",
+        "OpticalDisarray",
+        "PicturesofYou",
+        "Pornjunkiesstrip",
+        "PrettyUgly",
+        "Project217",
+        "RemmyzRandomz",
+        "Ribon",
+        "RubysWorld",
+        "ShinkaTheLastEevee",
+        "SimplePixel",
+        "SladesMansionofawesomeness",
+        "SpaceSchool",
+        "SushiGummy",
+        "TC2KsPokemobians",
+        "TheAfterSubtract",
+        "ThePokemonArtBox",
+        "THEVOIDWEBCOMIC",
+        "ToDefeatThemAll",
+        "TwoKeys",
+        "Vbcomics",
+        "WerewolfRichard",
+
+        # has no previous comic link
+        "ThreadCrashers",
+        "AchievementStuck",
+
+        # images are 403 forbidden
+        "AngelJunkPileFelix",
+        "AntavioussGenLab",
+        "Okamirai",
+
+        # missing images
+        "CatboyattheCon",
+        "ContraandtheSpamDump",
+        "Darkkyosshorts",
+        "DollarStoreCaviar",
+        "EdgeofDecember",
+        "HAndJ",
+        "HEARD",
+        "IwillbenapoSpamDump",
+        "KirbysoftheAlternateDimension",
+        "Letsreviewshallwe",
+        "MegaManSpriteExpo",
+        "OmnisSpriteShowcase",
+        "PiecesofBrokenGlass",
+        "PlatonicManagementDilemma",
+        "SecretSanta2011",
+        "SerendipityAnEquestrianTale",
+        "SJArtCollab",
+        "SlightlyDifferent",
+        "TheAttackoftheRecoloursSeason1",
+        "TotallyKotor",
+        "WinterMelody",
+        "ZonowTheHedgehog",
+
+        # missing previous link
+        "BambooArmonicKnightsGuild",
+
+        # broken host name
+        "Razor",
+    )
+
+    def handle_url(self, url):
+        """Parse one search result page."""
+        data = self.get_url(url)
+
+        num = 999
+        for comicdiv in data.cssselect(
+                'div#webcomic_search_results div.full_banner_div'):
+            page_url = comicdiv.cssselect('a:first-child')[0].attrib['href']
+            name = comicdiv.cssselect('img.banny')
+            if name:
+                name = name[0].attrib['title']
+            else:
+                name = comicdiv.cssselect('h2')[0].text
+            # find out how many images this comic has
+            mo = comicdiv.cssselect('span.small-meter')
+            if not mo:
+                print("ERROR matching number of comics", file=sys.stderr)
+                continue
+            num = int(mo[0].text.strip())
+            # search for url in extra page
+            data2 = self.get_url(page_url)
+            mo = data2.cssselect('div#quick_reading_links a:last-child')
+            if not mo:
+                print("ERROR matching comic URL", file=sys.stderr)
+                continue
+            comic_url = mo[0].attrib['href']
+            # search for adult flag
+            adult = data2.xpath('//img[@src="' + self.ADULT_IMG + '"]')
+            self.add_comic(name, (comic_url, bool(adult)), num)
+
+        next_url = data.cssselect(
+            "div.search_nav td:last-child a")[0].attrib['href']
+        return (next_url, num)
+
+    def collect_results(self):
+        """Parse all search result pages."""
+        # Sort by number of comics, so we can abort when we get under some
+        # threshold.
+        next_url = (
+            "http://www.smackjeeves.com/search.php?submit=1" +
+            "&search_mode=webcomics&comic_title=&sort_by=4&special=all" +
+            "&last_update=6&style_all=on&genre_all=on&format_all=on")
+        last_count = 999
+        while last_count >= self.MIN_COMICS:
+            print(last_count, file=sys.stderr, end=" ")
+            next_url, last_count = self.handle_url(next_url)
+
+    def get_classdef(self, name, data):
+        sub, top = urlsplit(data[0]).hostname.split('.', 1)
+        cl = u"class SJ%s(_SmackJeeves):" % name
+        if top.lower() == "smackjeeves.com":
+            cl += "\n    sub = '%s'" % sub
+        else:
+            cl += "\n    host = '%s.%s'" % (sub, top)
+        if data[1]:
+            cl += "\n    adult = True"
+        return cl


 if __name__ == '__main__':
-    if len(sys.argv) > 1:
-        print_results(sys.argv[1:])
-    else:
-        get_results()
+    SmackJeevesUpdater(__file__).run()
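
For context on what this updater emits: get_classdef() turns each collected (comic_url, adult) tuple into a _SmackJeeves subclass definition, choosing a sub attribute for comics hosted on a smackjeeves.com subdomain and a host attribute for custom domains. Below is a minimal standalone sketch of that logic; the comic names "Example"/"Other" and both URLs are hypothetical, and _SmackJeeves itself lives in dosagelib, so it only appears inside the emitted string.

# Standalone sketch of the get_classdef() logic from the diff above;
# the comic names and URLs here are made up for illustration.
try:
    from urllib.parse import urlsplit
except ImportError:
    from urlparse import urlsplit


def get_classdef(name, data):
    # data is the (comic_url, adult) tuple stored via add_comic()
    sub, top = urlsplit(data[0]).hostname.split('.', 1)
    cl = u"class SJ%s(_SmackJeeves):" % name
    if top.lower() == "smackjeeves.com":
        # comic hosted on a smackjeeves.com subdomain
        cl += "\n    sub = '%s'" % sub
    else:
        # comic hosted on a custom domain
        cl += "\n    host = '%s.%s'" % (sub, top)
    if data[1]:
        cl += "\n    adult = True"
    return cl


print(get_classdef("Example", ("http://example.smackjeeves.com/comics/", True)))
# class SJExample(_SmackJeeves):
#     sub = 'example'
#     adult = True

print(get_classdef("Other", ("http://www.othercomic.com/comics/", False)))
# class SJOther(_SmackJeeves):
#     host = 'www.othercomic.com'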