Update SmackJeeves update helper.

Don't use it right now, it adds a HUGE amount of comics.
Tobias Gruetzmacher 2016-04-20 23:48:29 +02:00
parent fe51a449df
commit 1d2e1f2dd1


@@ -9,354 +9,171 @@ for further processing.
"""
from __future__ import absolute_import, division, print_function
import codecs
import re
import sys
import os
try:
from urllib.parse import urljoin
from urllib.parse import urlsplit
except ImportError:
from urlparse import urljoin
from urlparse import urlsplit
import requests
sys.path.insert(0, os.path.dirname(os.path.dirname(__file__))) # noqa
from dosagelib.util import get_page, tagre
from dosagelib.scraper import get_scrapers
from scriptutil import contains_case_insensitive, save_result, load_result, truncate_name, format_name
from scriptutil import ComicListUpdater
json_file = __file__.replace(".py", ".json")
class SmackJeevesUpdater(ComicListUpdater):
+    # Absolute minimum number of pages a comic may have (restrict search space)
+    MIN_COMICS = 90
+    ADULT_IMG = 'http://www.smackjeeves.com/images/mature_content.png'
-# names of comics to exclude
-exclude_comics = [
-    "4plyKamalsHead",  # does not follow standard layout
-    "9Lives",  # missing images
-    "ADifferentPerspective",  # does not follow standard layout
-    "AFairlyTwistedRealitySuper",  # does not follow standard layout
-    "Ahoge",  # does not follow standard layout
-    "AngelJunkPileFelix",  # images are 403 forbidden
-    "AntavioussGenLab",  # images are 403 forbidden
-    "AreyougayJohnny",  # does not follow standard layout
-    "Authorbattlesthevideogame",  # missing images
-    "BambooArmonicKnightsGuild",  # missing previous link
-    "BassLegends",  # does not follow standard layout
-    "BreIshurnasspritesandstuff",  # comic moved
-    "CatboyattheCon",  # missing images
-    "Comatose",  # does not follow standard layout
-    "ContraandtheSpamDump",  # missing images
-    "ClubLove",  # does not follow standard layout
-    "Darkkyosshorts",  # missing images
-    "DeSTRESS",  # does not follow standard layout
-    "DollarStoreCaviar",  # broken images
-    "DreamCatcher",  # does not follow standard layout
-    "EdgeofDecember",  # missing images
-    "FroakieShocaiz",  # unsuitable navigation
-    "Fumiko",  # does not follow standard layout
-    "FurryExperience",  # timeout
-    "GART",  # does not follow standard layout
-    "GarytheAlchemist",  # does not follow standard layout
-    "GBAsCrib",  # timeout
-    "HAndJ",  # missing images
-    "HEARD",  # missing images
-    "Indigo",  # broken domain name
-    "IwillbenapoSpamDump",  # missing images
-    "ItoshiisCrazyNuzlockeAdventures",  # does not follow standard layout
-    "JennyHaniver",  # does not follow standard layout
-    "KiLAiLO",  # does not follow standard layout
-    "KirbysoftheAlternateDimension",  # missing images
-    "Letsreviewshallwe",  # missing images
-    "LoudEra",  # does not follow standard layout
-    "LunarHill",  # does not follow standard layout
-    "Mafiagame",  # does not follow standard layout
-    "MegaManSpriteExpo",  # missing images
-    "MyLifewithFelENESPANOL",  # does not follow standard layout
-    "MylifewithFel",  # does not follow standard layout
-    "NegativeZen",  # does not follow standard layout
-    "Nemutionpobae",  # does not follow standard layout
-    "NightShot",  # does not follow standard layout
-    "NormalIsBoring",  # does not follow standard layout
-    "Okamirai",  # images are 403 forbidden
-    "OmnisSpriteShowcase",  # missing images
-    "OpticalDisarray",  # does not follow standard layout
-    "PicturesofYou",  # does not follow standard layout
-    "PiecesofBrokenGlass",  # broken images
-    "PlatonicManagementDilemma",  # missing images
-    "Pornjunkiesstrip",  # does not follow standard layout
-    "PrettyUgly",  # does not follow standard layout
-    "Project217",  # does not follow standard layout
-    "RemmyzRandomz",  # does not follow standard layout
-    "Ribon",  # does not follow standard layout
-    "RubysWorld",  # does not follow standard layout
-    "SecretSanta2011",  # missing images
-    "ShinkaTheLastEevee",  # does not follow standard layout
-    "SimplePixel",  # does not follow standard layout
-    "SJArtCollab",  # missing images
-    "SladesMansionofawesomeness",  # does not follow standard layout
-    "SlightlyDifferent",  # missing images
-    "SpaceSchool",  # does not follow standard layout
-    "SushiGummy",  # does not follow standard layout
-    "TheAfterSubtract",  # does not follow standard layout
-    "ThePokemonArtBox",  # does not follow standard layout
-    "THEVOIDWEBCOMIC",  # does not follow standard layout
-    "TC2KsPokemobians",  # does not follow standard layout
-    "ThreadCrashers",  # has no previous comic link
-    "ToDefeatThemAll",  # does not follow standard layout
-    "TotallyKotor",  # missing images
-    "TwoKeys",  # does not follow standard layout
-    "Vbcomics",  # does not follow standard layout
-    "WerewolfRichard",  # does not follow standard layout
-    "WinterMelody",  # missing images
-]
+    # names of comics to exclude
+    excluded_comics = (
+        # comic moved/we have a better module
+        "Amya",
+        "Carciphona",
+        "Footloose",
+        "TitleUnrelated",
+        # does not follow standard layout
+        "300DaysOfSyao",
+        "ADifferentPerspective",
+        "Captor",
+        "ClubLove",
+        "Comatose",
+        "DeSTRESS",
+        "DreamCatcher",
+        "Fumiko",
+        "GART",
+        "GarytheAlchemist",
+        "ItoshiisCrazyNuzlockeAdventures",
+        "JennyHaniver",
+        "KiLAiLO",
+        "LoudEra",
+        "LunarHill",
+        "Mafiagame",
+        "MylifewithFel",
+        "MyLifewithFelENESPANOL",
+        "NegativeZen",
+        "Nemutionpobae",
+        "NightShot",
+        "NormalIsBoring",
+        "OpticalDisarray",
+        "PicturesofYou",
+        "Pornjunkiesstrip",
+        "PrettyUgly",
+        "Project217",
+        "RemmyzRandomz",
+        "Ribon",
+        "RubysWorld",
+        "ShinkaTheLastEevee",
+        "SimplePixel",
+        "SladesMansionofawesomeness",
+        "SpaceSchool",
+        "SushiGummy",
+        "TC2KsPokemobians",
+        "TheAfterSubtract",
+        "ThePokemonArtBox",
+        "THEVOIDWEBCOMIC",
+        "ToDefeatThemAll",
+        "TwoKeys",
+        "Vbcomics",
+        "WerewolfRichard",
-# the latest URL of some comics repeats the previous URL
-# flag this so the bounceStart uses the correct URL
-repeat_comics = [
-    "1009sSpritersVacation",
-    "22Special22Care",
-    "2Kingdoms",
-    "2Masters",
-    "AbbimaysRandomness",
-    "AdaLeeComesOn",
-    "AdventuresofMitch",
-    "AkumaKisei",
-    "ALaMode",
-    "AnimalLoversYuriCollab",
-    "Area9",
-    "AStrangeTypeofLove",
-    "Autophobia",
-    "BearlyAbel",
-    "BeCarefreeWithMeSoon",
-    "BlindandBlue",
-    "BlueStreak",
-    "BlueWell",
-    "BlueYonder",
-    "Border",
-    "BoyLessons",
-    "Boywithasecret",
-    "BreakFreemagazine",
-    "BrightStars",
-    "ByTheBook",
-    "ClairetheFlare",
-    "CloeRemembrance",
-    "ComicFullofSprites",
-    "CrappilyDrawnMinicomics",
-    "CupidsaMoron",
-    "D00R",
-    "DeathNoteIridescent",
-    "DemonEater",
-    "DenizensAttention",
-    "DevilsCake",
-    "Dreamcatchers",
-    "EmeraldNuzlocke",
-    "EonsAgo",
-    "ERRORERROR",
-    "EvilPlan",
-    "FailureConfetti",
-    "FlyorFail",
-    "ForestHill",
-    "FrobertTheDemon",
-    "GarytheAlchemist",
-    "GhostsTaleACrossover",
-    "Glasshearts",
-    "GoldenSunGenerationsAftermathVolume1",
-    "GoldenSunGenerationsColossoVolume6",
-    "GuardiansoftheGalaxialSpaceways",
-    "HatShop",
-    "HDMTHCOMICS",
-    "Helix",
-    "Hephaestus",
-    "HolyBlasphemy",
-    "HopeForABreeze",
-    "Hotarugari",
-    "InsideOuTAYuriTale",
-    "Insomanywords",
-    "INUSITADOONLINE",
-    "ItsCharacterDevelopment",
-    "JosephAndYusra",
-    "JustAnotherDay",
-    "KasaKeira",
-    "KirbyAdventure",
-    "KirbyandtheDarkKnight",
-    "KirbyFunfestTheOriginals",
-    "KirbysofTHEVOID",
-    "KuroiHitsuji",
-    "KuroShouri",
-    "LandoftheSky",
-    "LeCirquedObscure",
-    "LethalDose",
-    "LOGOS",
-    "LostLove",
-    "LsEmpire",
-    "MariovsSonicvsMegaMan",
-    "Mega",
-    "MementoMori",
-    "Mokepon",
-    "MrGrimmsCircusofHorrors",
-    "MyFakeHeart",
-    "MyFriendScotty",
-    "MYth",
-    "NemesisKatharsis",
-    "NiceKitty",
-    "Nutshel",
-    "OptimalClutter",
-    "Panacea",
-    "PhilosophicalPenisJokes",
-    "PrettyUgly",
-    "PSY",
-    "PTO",
-    "RainLGBT",
-    "ReidyandFriendsShowcase",
-    "RubysWorld",
-    "SallySprocketAndPistonPete",
-    "SimonSues",
-    "SimpleBear",
-    "SmallPressAdventures",
-    "SonicWorldAdventure",
-    "SoulGuardian",
-    "SPOON",
-    "STASonictheAdventure",
-    "Stay",
-    "StellaInChrome",
-    "StrangersandFriends",
-    "SunmeetsMoon",
-    "TAG",
-    "TaikiTheWebcomic",
-    "TechnicolorLondon",
-    "TEN",
-    "ThatWasntThereYesterday",
-    "TheAntihero",
-    "TheBrideoftheShark",
-    "TheCafedAlizee",
-    "TheEssyaneWarriors",
-    "ThehumanBEing",
-    "TheKwiddexProtocol",
-    "TheLegendofZeldaMaidenoftheMoon",
-    "ThePirateBalthasar",
-    "TheRandomObscureFairyTaleNoOnesEverReallyHeardOf",
-    "TheReborn",
-    "TheTytonNuzlockeChallengeEmeraldEdition",
-    "ToD",
-    "TPTruePower",
-    "TwoKeys",
-    "UndertheSkin",
-    "WelcometoFreakshow",
-    "Whenweweresilent",
-    "WhiteHeart",
-    "Yaoishereforareason",
-    "Zodiac",
-]
+        # has no previous comic link
+        "ThreadCrashers",
+        "AchievementStuck",
+        # images are 403 forbidden
+        "AngelJunkPileFelix",
+        "AntavioussGenLab",
+        "Okamirai",
-# links to last valid strips
-url_overrides = {
-}
+        # missing images
+        "CatboyattheCon",
+        "ContraandtheSpamDump",
+        "Darkkyosshorts",
+        "DollarStoreCaviar",
+        "EdgeofDecember",
+        "HAndJ",
+        "HEARD",
+        "IwillbenapoSpamDump",
+        "KirbysoftheAlternateDimension",
+        "Letsreviewshallwe",
+        "MegaManSpriteExpo",
+        "OmnisSpriteShowcase",
+        "PiecesofBrokenGlass",
+        "PlatonicManagementDilemma",
+        "SecretSanta2011",
+        "SerendipityAnEquestrianTale",
+        "SJArtCollab",
+        "SlightlyDifferent",
+        "TheAttackoftheRecoloursSeason1",
+        "TotallyKotor",
+        "WinterMelody",
+        "ZonowTheHedgehog",
+        # missing previous link
+        "BambooArmonicKnightsGuild",
-# HTML content matcher
-page_matcher = re.compile(tagre("a", "href", r'(comicprofile\.php\?id=\d+)',
-                                after="site_banner") +
-                          tagre("img", "title", r'([^"]+)'))
-url_matcher = re.compile(tagre("a", "href", r'(http://[^"]+/comics/)') + "Latest Comic")
-num_matcher = re.compile(r'50%">\s+(\d+)\s+')
-adult_matcher = re.compile(tagre("img", "src", r'http://www\.smackjeeves\.com/images/mature_content\.png'))
+        # broken host name
+        "Razor",
+    )
+    def handle_url(self, url):
+        """Parse one search result page."""
+        data = self.get_url(url)
-def handle_url(url, session, res):
-    """Parse one search result page."""
-    print("Parsing", url, file=sys.stderr)
-    try:
-        data = get_page(url, session).text
-    except IOError as msg:
-        print("ERROR:", msg, file=sys.stderr)
-        return
-    for match in page_matcher.finditer(data):
-        page_url = match.group(1)
-        page_url = urljoin(url, page_url)
-        name = format_name(match.group(2))
-        if name in exclude_comics:
-            continue
-        if contains_case_insensitive(res, name):
-            # we cannot handle two comics that only differ in case
-            print("INFO: skipping possible duplicate", repr(name), file=sys.stderr)
-            continue
-        # find out how many images this comic has
-        end = match.end()
-        mo = num_matcher.search(data[end:])
-        if not mo:
-            print("ERROR matching number:", repr(data[end:end + 300]),
-                  file=sys.stderr)
-            continue
-        num = int(mo.group(1))
-        # search for url in extra page
-        print("Getting", page_url)
-        try:
-            data2 = get_page(page_url, session).text
-        except IOError as msg:
-            print("ERROR:", msg, file=sys.stderr)
-            return
-        mo = url_matcher.search(data2)
-        if not mo:
-            print("ERROR matching comic URL:", repr(data2[:300]), file=sys.stderr)
-            continue
-        comic_url = mo.group(1)
-        # search for adult flag
-        adult = adult_matcher.search(data2[end:])
-        bounce = name not in repeat_comics
-        res[name] = [
-            url_overrides.get(name, comic_url), num, bool(adult), bounce
-        ]
-def get_results():
-    """Parse all search result pages."""
-    base = "http://www.smackjeeves.com/search.php?submit=Search+for+Webcomics&search_mode=webcomics&comic_title=&special=all&last_update=3&style_all=on&genre_all=on&format_all=on&sort_by=2&start=%d"
-    session = requests.Session()
-    # store info in a dictionary {name -> url, number of comics, adult flag, bounce flag}
-    res = {}
-    # a search for an empty string returned 286 result pages
-    result_pages = 286
-    print("Parsing", result_pages, "search result pages...", file=sys.stderr)
-    for i in range(0, result_pages):
-        print(i + 1, file=sys.stderr, end=" ")
-        handle_url(base % (i * 12), session, res)
-    save_result(res, json_file)
-def has_comic(name):
-    """Check if comic name already exists."""
-    cname = name.lower()
-    for scraperobj in get_scrapers():
-        lname = scraperobj.name.lower()
-        if lname == cname:
-            return True
-    return False
-def print_results(args):
-    """Print all comics that have at least the given number of minimum comic strips."""
-    min_comics, filename = args
-    min_comics = int(min_comics)
-    with codecs.open(filename, 'a', 'utf-8') as fp:
-        for name, entry in sorted(load_result(json_file).items()):
-            if name in exclude_comics:
-                continue
-            url, num, adult, bounce = entry
-            if num < min_comics:
-                continue
-            if has_comic(name):
-                prefix = u'#'
+        num = 999
+        for comicdiv in data.cssselect(
+                'div#webcomic_search_results div.full_banner_div'):
+            page_url = comicdiv.cssselect('a:first-child')[0].attrib['href']
+            name = comicdiv.cssselect('img.banny')
+            if name:
+                name = name[0].attrib['title']
             else:
-                prefix = u''
-            fp.write(u"%sadd(%r, %r, %s, %s)\n" % (
-                prefix, str(truncate_name(name)), str(url), adult, bounce
-            ))
+                name = comicdiv.cssselect('h2')[0].text
+            # find out how many images this comic has
+            mo = comicdiv.cssselect('span.small-meter')
+            if not mo:
+                print("ERROR matching number of comics", file=sys.stderr)
+                continue
+            num = int(mo[0].text.strip())
+            # search for url in extra page
+            data2 = self.get_url(page_url)
+            mo = data2.cssselect('div#quick_reading_links a:last-child')
+            if not mo:
+                print("ERROR matching comic URL", file=sys.stderr)
+                continue
+            comic_url = mo[0].attrib['href']
+            # search for adult flag
+            adult = data2.xpath('//img[@src="' + self.ADULT_IMG + '"]')
+            self.add_comic(name, (comic_url, bool(adult)), num)
+        next_url = data.cssselect(
+            "div.search_nav td:last-child a")[0].attrib['href']
+        return (next_url, num)
+    def collect_results(self):
+        """Parse all search result pages."""
+        # Sort by number of comics, so we can abort when we get under some
+        # threshold.
+        next_url = (
+            "http://www.smackjeeves.com/search.php?submit=1" +
+            "&search_mode=webcomics&comic_title=&sort_by=4&special=all" +
+            "&last_update=6&style_all=on&genre_all=on&format_all=on")
+        last_count = 999
+        while last_count >= self.MIN_COMICS:
+            print(last_count, file=sys.stderr, end=" ")
+            next_url, last_count = self.handle_url(next_url)
+    def get_classdef(self, name, data):
+        sub, top = urlsplit(data[0]).hostname.split('.', 1)
+        cl = u"class SJ%s(_SmackJeeves):" % name
+        if top.lower() == "smackjeeves.com":
+            cl += "\n    sub = '%s'" % sub
+        else:
+            cl += "\n    host = '%s.%s'" % (sub, top)
+        if data[1]:
+            cl += "\n    adult = True"
+        return cl
 if __name__ == '__main__':
-    if len(sys.argv) > 1:
-        print_results(sys.argv[1:])
-    else:
-        get_results()
+    SmackJeevesUpdater(__file__).run()
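
For reference, a minimal standalone sketch of the class definitions the new get_classdef() method emits, assuming a hypothetical comic entry (the comic names and URLs below are illustrative, not taken from the search results):

# Mirrors the logic of SmackJeevesUpdater.get_classdef() above;
# "Example" and "Other" are made-up comics for illustration.
try:
    from urllib.parse import urlsplit
except ImportError:
    from urlparse import urlsplit


def classdef(name, data):
    # data is the (comic_url, adult) tuple stored by handle_url()
    sub, top = urlsplit(data[0]).hostname.split('.', 1)
    cl = u"class SJ%s(_SmackJeeves):" % name
    if top.lower() == "smackjeeves.com":
        cl += "\n    sub = '%s'" % sub
    else:
        cl += "\n    host = '%s.%s'" % (sub, top)
    if data[1]:
        cl += "\n    adult = True"
    return cl


# A comic hosted on a smackjeeves.com subdomain, flagged adult:
print(classdef("Example", ("http://example.smackjeeves.com/comics/", True)))
# class SJExample(_SmackJeeves):
#     sub = 'example'
#     adult = True

# A comic hosted on a custom domain:
print(classdef("Other", ("http://www.other-comic.com/comics/", False)))
# class SJOther(_SmackJeeves):
#     host = 'www.other-comic.com'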
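
The rewritten search URL sorts results by number of comics (sort_by=4), which is what makes the early abort in collect_results() sound: once the last comic on a page falls below MIN_COMICS, no later page can hold a qualifying comic, so the updater no longer walks every result page the way the old get_results() did. A runnable toy model of that loop, with invented page data and a stubbed handle_url():

# handle_url() returns (next_url, count of the last comic on the page);
# the page table here is invented to show how the loop terminates.
MIN_COMICS = 90

PAGES = {
    "page1": ("page2", 500),
    "page2": ("page3", 120),
    "page3": ("page4", 80),  # first count below MIN_COMICS ends the walk
}


def handle_url(url):
    # the real method also scrapes and registers every comic on the page
    return PAGES[url]


next_url, last_count = "page1", 999
while last_count >= MIN_COMICS:
    next_url, last_count = handle_url(next_url)
print(next_url, last_count)  # -> page4 80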