Update SmackJeeves update helper.

Don't use it right now; it adds a HUGE amount of comics.
Tobias Gruetzmacher 2016-04-20 23:48:29 +02:00
parent fe51a449df
commit 1d2e1f2dd1

@@ -9,354 +9,171 @@ for further processing.
 """
 from __future__ import absolute_import, division, print_function
 
-import codecs
-import re
 import sys
-import os
+
 try:
-    from urllib.parse import urljoin
+    from urllib.parse import urlsplit
 except ImportError:
-    from urlparse import urljoin
-import requests
+    from urlparse import urlsplit
 
-sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))  # noqa
-from dosagelib.util import get_page, tagre
-from dosagelib.scraper import get_scrapers
-from scriptutil import contains_case_insensitive, save_result, load_result, truncate_name, format_name
+from scriptutil import ComicListUpdater
 
-json_file = __file__.replace(".py", ".json")
-
-# names of comics to exclude
-exclude_comics = [
-    "4plyKamalsHead",  # does not follow standard layout
-    "9Lives",  # missing images
-    "ADifferentPerspective",  # does not follow standard layout
-    "AFairlyTwistedRealitySuper",  # does not follow standard layout
-    "Ahoge",  # does not follow standard layout
-    "AngelJunkPileFelix",  # images are 403 forbidden
-    "AntavioussGenLab",  # images are 403 forbidden
-    "AreyougayJohnny",  # does not follow standard layout
-    "Authorbattlesthevideogame",  # missing images
-    "BambooArmonicKnightsGuild",  # missing previous link
-    "BassLegends",  # does not follow standard layout
-    "BreIshurnasspritesandstuff",  # comic moved
-    "CatboyattheCon",  # missing images
-    "Comatose",  # does not follow standard layout
-    "ContraandtheSpamDump",  # missing images
-    "ClubLove",  # does not follow standard layout
-    "Darkkyosshorts",  # missing images
-    "DeSTRESS",  # does not follow standard layout
-    "DollarStoreCaviar",  # broken images
-    "DreamCatcher",  # does not follow standard layout
-    "EdgeofDecember",  # missing images
-    "FroakieShocaiz",  # unsuitable navigation
-    "Fumiko",  # does not follow standard layout
-    "FurryExperience",  # timeout
-    "GART",  # does not follow standard layout
-    "GarytheAlchemist",  # does not follow standard layout
-    "GBAsCrib",  # timeout
-    "HAndJ",  # missing images
-    "HEARD",  # missing images
-    "Indigo",  # broken domain name
-    "IwillbenapoSpamDump",  # missing images
-    "ItoshiisCrazyNuzlockeAdventures",  # does not follow standard layout
-    "JennyHaniver",  # does not follow standard layout
-    "KiLAiLO",  # does not follow standard layout
-    "KirbysoftheAlternateDimension",  # missing images
-    "Letsreviewshallwe",  # missing images
-    "LoudEra",  # does not follow standard layout
-    "LunarHill",  # does not follow standard layout
-    "Mafiagame",  # does not follow standard layout
-    "MegaManSpriteExpo",  # missing images
-    "MyLifewithFelENESPANOL",  # does not follow standard layout
-    "MylifewithFel",  # does not follow standard layout
-    "NegativeZen",  # does not follow standard layout
-    "Nemutionpobae",  # does not follow standard layout
-    "NightShot",  # does not follow standard layout
-    "NormalIsBoring",  # does not follow standard layout
-    "Okamirai",  # images are 403 forbidden
-    "OmnisSpriteShowcase",  # missing images
-    "OpticalDisarray",  # does not follow standard layout
-    "PicturesofYou",  # does not follow standard layout
-    "PiecesofBrokenGlass",  # broken images
-    "PlatonicManagementDilemma",  # missing images
-    "Pornjunkiesstrip",  # does not follow standard layout
-    "PrettyUgly",  # does not follow standard layout
-    "Project217",  # does not follow standard layout
-    "RemmyzRandomz",  # does not follow standard layout
-    "Ribon",  # does not follow standard layout
-    "RubysWorld",  # does not follow standard layout
-    "SecretSanta2011",  # missing images
-    "ShinkaTheLastEevee",  # does not follow standard layout
-    "SimplePixel",  # does not follow standard layout
-    "SJArtCollab",  # missing images
-    "SladesMansionofawesomeness",  # does not follow standard layout
-    "SlightlyDifferent",  # missing images
-    "SpaceSchool",  # does not follow standard layout
-    "SushiGummy",  # does not follow standard layout
-    "TheAfterSubtract",  # does not follow standard layout
-    "ThePokemonArtBox",  # does not follow standard layout
-    "THEVOIDWEBCOMIC",  # does not follow standard layout
-    "TC2KsPokemobians",  # does not follow standard layout
-    "ThreadCrashers",  # has no previous comic link
-    "ToDefeatThemAll",  # does not follow standard layout
-    "TotallyKotor",  # missing images
-    "TwoKeys",  # does not follow standard layout
-    "Vbcomics",  # does not follow standard layout
-    "WerewolfRichard",  # does not follow standard layout
-    "WinterMelody",  # missing images
-]
-
-# the latest URL of some comics repeats the previous URL
-# flag this so the bounceStart uses the correct URL
-repeat_comics = [
-    "1009sSpritersVacation",
-    "22Special22Care",
-    "2Kingdoms",
-    "2Masters",
-    "AbbimaysRandomness",
-    "AdaLeeComesOn",
-    "AdventuresofMitch",
-    "AkumaKisei",
-    "ALaMode",
-    "AnimalLoversYuriCollab",
-    "Area9",
-    "AStrangeTypeofLove",
-    "Autophobia",
-    "BearlyAbel",
-    "BeCarefreeWithMeSoon",
-    "BlindandBlue",
-    "BlueStreak",
-    "BlueWell",
-    "BlueYonder",
-    "Border",
-    "BoyLessons",
-    "Boywithasecret",
-    "BreakFreemagazine",
-    "BrightStars",
-    "ByTheBook",
-    "ClairetheFlare",
-    "CloeRemembrance",
-    "ComicFullofSprites",
-    "CrappilyDrawnMinicomics",
-    "CupidsaMoron",
-    "D00R",
-    "DeathNoteIridescent",
-    "DemonEater",
-    "DenizensAttention",
-    "DevilsCake",
-    "Dreamcatchers",
-    "EmeraldNuzlocke",
-    "EonsAgo",
-    "ERRORERROR",
-    "EvilPlan",
-    "FailureConfetti",
-    "FlyorFail",
-    "ForestHill",
-    "FrobertTheDemon",
-    "GarytheAlchemist",
-    "GhostsTaleACrossover",
-    "Glasshearts",
-    "GoldenSunGenerationsAftermathVolume1",
-    "GoldenSunGenerationsColossoVolume6",
-    "GuardiansoftheGalaxialSpaceways",
-    "HatShop",
-    "HDMTHCOMICS",
-    "Helix",
-    "Hephaestus",
-    "HolyBlasphemy",
-    "HopeForABreeze",
-    "Hotarugari",
-    "InsideOuTAYuriTale",
-    "Insomanywords",
-    "INUSITADOONLINE",
-    "ItsCharacterDevelopment",
-    "JosephAndYusra",
-    "JustAnotherDay",
-    "KasaKeira",
-    "KirbyAdventure",
-    "KirbyandtheDarkKnight",
-    "KirbyFunfestTheOriginals",
-    "KirbysofTHEVOID",
-    "KuroiHitsuji",
-    "KuroShouri",
-    "LandoftheSky",
-    "LeCirquedObscure",
-    "LethalDose",
-    "LOGOS",
-    "LostLove",
-    "LsEmpire",
-    "MariovsSonicvsMegaMan",
-    "Mega",
-    "MementoMori",
-    "Mokepon",
-    "MrGrimmsCircusofHorrors",
-    "MyFakeHeart",
-    "MyFriendScotty",
-    "MYth",
-    "NemesisKatharsis",
-    "NiceKitty",
-    "Nutshel",
-    "OptimalClutter",
-    "Panacea",
-    "PhilosophicalPenisJokes",
-    "PrettyUgly",
-    "PSY",
-    "PTO",
-    "RainLGBT",
-    "ReidyandFriendsShowcase",
-    "RubysWorld",
-    "SallySprocketAndPistonPete",
-    "SimonSues",
-    "SimpleBear",
-    "SmallPressAdventures",
-    "SonicWorldAdventure",
-    "SoulGuardian",
-    "SPOON",
-    "STASonictheAdventure",
-    "Stay",
-    "StellaInChrome",
-    "StrangersandFriends",
-    "SunmeetsMoon",
-    "TAG",
-    "TaikiTheWebcomic",
-    "TechnicolorLondon",
-    "TEN",
-    "ThatWasntThereYesterday",
-    "TheAntihero",
-    "TheBrideoftheShark",
-    "TheCafedAlizee",
-    "TheEssyaneWarriors",
-    "ThehumanBEing",
-    "TheKwiddexProtocol",
-    "TheLegendofZeldaMaidenoftheMoon",
-    "ThePirateBalthasar",
-    "TheRandomObscureFairyTaleNoOnesEverReallyHeardOf",
-    "TheReborn",
-    "TheTytonNuzlockeChallengeEmeraldEdition",
-    "ToD",
-    "TPTruePower",
-    "TwoKeys",
-    "UndertheSkin",
-    "WelcometoFreakshow",
-    "Whenweweresilent",
-    "WhiteHeart",
-    "Yaoishereforareason",
-    "Zodiac",
-]
-
-# links to last valid strips
-url_overrides = {
-}
-
-# HTML content matcher
-page_matcher = re.compile(tagre("a", "href", r'(comicprofile\.php\?id=\d+)',
-                                after="site_banner") +
-                          tagre("img", "title", r'([^"]+)'))
-url_matcher = re.compile(tagre("a", "href", r'(http://[^"]+/comics/)') + "Latest Comic")
-num_matcher = re.compile(r'50%">\s+(\d+)\s+')
-adult_matcher = re.compile(tagre("img", "src", r'http://www\.smackjeeves\.com/images/mature_content\.png'))
-
-
-def handle_url(url, session, res):
-    """Parse one search result page."""
-    print("Parsing", url, file=sys.stderr)
-    try:
-        data = get_page(url, session).text
-    except IOError as msg:
-        print("ERROR:", msg, file=sys.stderr)
-        return
-    for match in page_matcher.finditer(data):
-        page_url = match.group(1)
-        page_url = urljoin(url, page_url)
-        name = format_name(match.group(2))
-        if name in exclude_comics:
-            continue
-        if contains_case_insensitive(res, name):
-            # we cannot handle two comics that only differ in case
-            print("INFO: skipping possible duplicate", repr(name), file=sys.stderr)
-            continue
-        # find out how many images this comic has
-        end = match.end()
-        mo = num_matcher.search(data[end:])
-        if not mo:
-            print("ERROR matching number:", repr(data[end:end + 300]),
-                  file=sys.stderr)
-            continue
-        num = int(mo.group(1))
-        # search for url in extra page
-        print("Getting", page_url)
-        try:
-            data2 = get_page(page_url, session).text
-        except IOError as msg:
-            print("ERROR:", msg, file=sys.stderr)
-            return
-        mo = url_matcher.search(data2)
-        if not mo:
-            print("ERROR matching comic URL:", repr(data2[:300]), file=sys.stderr)
-            continue
-        comic_url = mo.group(1)
-        # search for adult flag
-        adult = adult_matcher.search(data2[end:])
-        bounce = name not in repeat_comics
-        res[name] = [
-            url_overrides.get(name, comic_url), num, bool(adult), bounce
-        ]
-
-
-def get_results():
-    """Parse all search result pages."""
-    base = "http://www.smackjeeves.com/search.php?submit=Search+for+Webcomics&search_mode=webcomics&comic_title=&special=all&last_update=3&style_all=on&genre_all=on&format_all=on&sort_by=2&start=%d"
-    session = requests.Session()
-    # store info in a dictionary {name -> url, number of comics, adult flag, bounce flag}
-    res = {}
-    # a search for an empty string returned 286 result pages
-    result_pages = 286
-    print("Parsing", result_pages, "search result pages...", file=sys.stderr)
-    for i in range(0, result_pages):
-        print(i + 1, file=sys.stderr, end=" ")
-        handle_url(base % (i * 12), session, res)
-    save_result(res, json_file)
-
-
-def has_comic(name):
-    """Check if comic name already exists."""
-    cname = name.lower()
-    for scraperobj in get_scrapers():
-        lname = scraperobj.name.lower()
-        if lname == cname:
-            return True
-    return False
-
-
-def print_results(args):
-    """Print all comics that have at least the given number of minimum comic strips."""
-    min_comics, filename = args
-    min_comics = int(min_comics)
-    with codecs.open(filename, 'a', 'utf-8') as fp:
-        for name, entry in sorted(load_result(json_file).items()):
-            if name in exclude_comics:
-                continue
-            url, num, adult, bounce = entry
-            if num < min_comics:
-                continue
-            if has_comic(name):
-                prefix = u'#'
-            else:
-                prefix = u''
-            fp.write(u"%sadd(%r, %r, %s, %s)\n" % (
-                prefix, str(truncate_name(name)), str(url), adult, bounce
-            ))
+
+class SmackJeevesUpdater(ComicListUpdater):
+    # Absolute minimum number of pages a comic may have (restrict search space)
+    MIN_COMICS = 90
+
+    ADULT_IMG = 'http://www.smackjeeves.com/images/mature_content.png'
+
+    # names of comics to exclude
+    excluded_comics = (
+        # comic moved/we have a better module
+        "Amya",
+        "Carciphona",
+        "Footloose",
+        "TitleUnrelated",
+
+        # does not follow standard layout
+        "300DaysOfSyao",
+        "ADifferentPerspective",
+        "Captor",
+        "ClubLove",
+        "Comatose",
+        "DeSTRESS",
+        "DreamCatcher",
+        "Fumiko",
+        "GART",
+        "GarytheAlchemist",
+        "ItoshiisCrazyNuzlockeAdventures",
+        "JennyHaniver",
+        "KiLAiLO",
+        "LoudEra",
+        "LunarHill",
+        "Mafiagame",
+        "MylifewithFel",
+        "MyLifewithFelENESPANOL",
+        "NegativeZen",
+        "Nemutionpobae",
+        "NightShot",
+        "NormalIsBoring",
+        "OpticalDisarray",
+        "PicturesofYou",
+        "Pornjunkiesstrip",
+        "PrettyUgly",
+        "Project217",
+        "RemmyzRandomz",
+        "Ribon",
+        "RubysWorld",
+        "ShinkaTheLastEevee",
+        "SimplePixel",
+        "SladesMansionofawesomeness",
+        "SpaceSchool",
+        "SushiGummy",
+        "TC2KsPokemobians",
+        "TheAfterSubtract",
+        "ThePokemonArtBox",
+        "THEVOIDWEBCOMIC",
+        "ToDefeatThemAll",
+        "TwoKeys",
+        "Vbcomics",
+        "WerewolfRichard",
+
+        # has no previous comic link
+        "ThreadCrashers",
+        "AchievementStuck",
+
+        # images are 403 forbidden
+        "AngelJunkPileFelix",
+        "AntavioussGenLab",
+        "Okamirai",
+
+        # missing images
+        "CatboyattheCon",
+        "ContraandtheSpamDump",
+        "Darkkyosshorts",
+        "DollarStoreCaviar",
+        "EdgeofDecember",
+        "HAndJ",
+        "HEARD",
+        "IwillbenapoSpamDump",
+        "KirbysoftheAlternateDimension",
+        "Letsreviewshallwe",
+        "MegaManSpriteExpo",
+        "OmnisSpriteShowcase",
+        "PiecesofBrokenGlass",
+        "PlatonicManagementDilemma",
+        "SecretSanta2011",
+        "SerendipityAnEquestrianTale",
+        "SJArtCollab",
+        "SlightlyDifferent",
+        "TheAttackoftheRecoloursSeason1",
+        "TotallyKotor",
+        "WinterMelody",
+        "ZonowTheHedgehog",
+
+        # missing previous link
+        "BambooArmonicKnightsGuild",
+
+        # broken host name
+        "Razor",
+    )
+
+    def handle_url(self, url):
+        """Parse one search result page."""
+        data = self.get_url(url)
+
+        num = 999
+        for comicdiv in data.cssselect(
+                'div#webcomic_search_results div.full_banner_div'):
+            page_url = comicdiv.cssselect('a:first-child')[0].attrib['href']
+            name = comicdiv.cssselect('img.banny')
+            if name:
+                name = name[0].attrib['title']
+            else:
+                name = comicdiv.cssselect('h2')[0].text
+            # find out how many images this comic has
+            mo = comicdiv.cssselect('span.small-meter')
+            if not mo:
+                print("ERROR matching number of comics", file=sys.stderr)
+                continue
+            num = int(mo[0].text.strip())
+            # search for url in extra page
+            data2 = self.get_url(page_url)
+            mo = data2.cssselect('div#quick_reading_links a:last-child')
+            if not mo:
+                print("ERROR matching comic URL", file=sys.stderr)
+                continue
+            comic_url = mo[0].attrib['href']
+            # search for adult flag
+            adult = data2.xpath('//img[@src="' + self.ADULT_IMG + '"]')
+            self.add_comic(name, (comic_url, bool(adult)), num)
+
+        next_url = data.cssselect(
+            "div.search_nav td:last-child a")[0].attrib['href']
+        return (next_url, num)
+
+    def collect_results(self):
+        """Parse all search result pages."""
+        # Sort by number of comics, so we can abort when we get under some
+        # threshold.
+        next_url = (
+            "http://www.smackjeeves.com/search.php?submit=1" +
+            "&search_mode=webcomics&comic_title=&sort_by=4&special=all" +
+            "&last_update=6&style_all=on&genre_all=on&format_all=on")
+        last_count = 999
+        while last_count >= self.MIN_COMICS:
+            print(last_count, file=sys.stderr, end=" ")
+            next_url, last_count = self.handle_url(next_url)
+
+    def get_classdef(self, name, data):
+        sub, top = urlsplit(data[0]).hostname.split('.', 1)
+        cl = u"class SJ%s(_SmackJeeves):" % name
+        if top.lower() == "smackjeeves.com":
+            cl += "\n    sub = '%s'" % sub
+        else:
+            cl += "\n    host = '%s.%s'" % (sub, top)
+        if data[1]:
+            cl += "\n    adult = True"
+        return cl
 
 
 if __name__ == '__main__':
-    if len(sys.argv) > 1:
-        print_results(sys.argv[1:])
-    else:
-        get_results()
+    SmackJeevesUpdater(__file__).run()
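
Note: the rewritten handle_url() walks the search results with CSS selectors instead of the old regular-expression matchers. The sketch below replays the same selector logic against a minimal, invented HTML fragment using lxml (with the cssselect package) directly; it assumes ComicListUpdater.get_url() returns a parsed lxml document of this kind.

from __future__ import print_function
import lxml.html

# Invented markup approximating one search result entry; real pages
# are more complex, this only mirrors the selectors used above.
doc = lxml.html.fromstring("""
<div id="webcomic_search_results">
  <div class="full_banner_div">
    <a href="http://www.smackjeeves.com/comicprofile.php?id=42"><img
       class="banny" title="Example Comic" src="banner.png"/></a>
    <span class="small-meter"> 123 </span>
  </div>
</div>
""")

for comicdiv in doc.cssselect('div#webcomic_search_results div.full_banner_div'):
    page_url = comicdiv.cssselect('a:first-child')[0].attrib['href']
    banner = comicdiv.cssselect('img.banny')
    # fall back to the <h2> heading when a comic has no banner image
    name = banner[0].attrib['title'] if banner else comicdiv.cssselect('h2')[0].text
    num = int(comicdiv.cssselect('span.small-meter')[0].text.strip())
    print(name, num, page_url)  # Example Comic 123 http://www.smackjeeves.com/...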
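
Note: get_classdef() is what turns each collected (comic_url, adult) tuple into a dosage module class, emitting either a sub attribute (comic hosted on a smackjeeves.com subdomain) or a host attribute (custom domain). A minimal sketch of its output, assuming the updater can be constructed as in __main__; the comic names and URLs are made up for illustration.

# Illustration only: the class definitions get_classdef() would emit.
updater = SmackJeevesUpdater(__file__)  # as constructed in __main__

print(updater.get_classdef(
    "Example", ("http://example.smackjeeves.com/comics/", False)))
# class SJExample(_SmackJeeves):
#     sub = 'example'

print(updater.get_classdef(
    "Elsewhere", ("http://www.example.org/comics/", True)))
# class SJElsewhere(_SmackJeeves):
#     host = 'www.example.org'
#     adult = True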