#!/usr/bin/env python
# Copyright (C) 2012-2013 Bastian Kleineidam
"""
Script to get a list of smackjeeves.com comics and save the info in a JSON file
for further processing.
"""
from __future__ import print_function
import re
import sys
import os
import urlparse

sys.path.append(os.path.join(os.path.dirname(__file__), ".."))

from dosagelib.util import getPageContent, asciify, unescape, tagre, unquote
from dosagelib.scraper import get_scrapers
from scriptutil import (contains_case_insensitive, remove_html_tags, capfirst,
    compact_whitespace, save_result, load_result, truncate_name)

json_file = __file__.replace(".py", ".json")
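# Note: json_file resolves to this script's filename with a ".json" suffix
# (e.g. smackjeeves.json next to the script); get_results() writes it and
# print_results() reads it back.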

# names of comics to exclude
exclude_comics = [
    "ADifferentPerspective", # does not follow standard layout
    "Ahoge", # does not follow standard layout
    "AngelJunkPileFelix", # images are 403 forbidden
    "Authorbattlesthevideogame", # missing images
    "BambooArmonicKnightsGuild", # missing previous link
    "ClubLove", # does not follow standard layout
    "DeSTRESS", # does not follow standard layout
    "DollarStoreCaviar", # broken images
    "DreamCatcher", # does not follow standard layout
    "EdgeofDecember", # missing images
    "Fumiko", # does not follow standard layout
    "FurryExperience", # timeout
    "GBAsCrib", # timeout
    "HEARD", # missing images
    "JennyHaniver", # does not follow standard layout
    "KiLAiLO", # does not follow standard layout
    "LoudEra", # does not follow standard layout
    "LunarHill", # does not follow standard layout
    "MyLifewithFelENESPANOL", # does not follow standard layout
    "MylifewithFel", # does not follow standard layout
    "NegativeZen", # does not follow standard layout
    "NightShot", # does not follow standard layout
    "NormalIsBoring", # does not follow standard layout
    "Okamirai", # images are 403 forbidden
    "OmnisSpriteShowcase", # missing images
    "OpticalDisarray", # does not follow standard layout
    "PicturesofYou", # does not follow standard layout
    "PlatonicManagementDilemma", # missing images
    "Pornjunkiesstrip", # does not follow standard layout
    "Project217", # does not follow standard layout
    "Ribon", # does not follow standard layout
    "SecretSanta2011", # missing images
    "ShinkaTheLastEevee", # does not follow standard layout
    "SimplePixel", # does not follow standard layout
    "SJArtCollab", # missing images
    "SlightlyDifferent", # missing images
    "TheAfterSubtract", # does not follow standard layout
    "THEVOIDWEBCOMIC", # does not follow standard layout
    "ThreadCrashers", # has no previous comic link
    "TotallyKotor", # missing images
    "Vbcomics", # does not follow standard layout
    "WerewolfRichard", # does not follow standard layout
    "WinterMelody", # missing images
]

# the latest URL of some comics repeats the previous URL;
# flag these comics so that bounceStart uses the correct URL
repeat_comics = [
    "1009sSpritersVacation",
    "22Special22Care",
    "2Kingdoms",
    "2Masters",
    "AbbimaysRandomness",
    "AdaLeeComesOn",
    "AdventuresofMitch",
    "AkumaKisei",
    "ALaMode",
    "AnimalLoversYuriCollab",
    "Area9",
    "AStrangeTypeofLove",
    "Autophobia",
    "BearlyAbel",
    "BeCarefreeWithMeSoon",
    "BlindandBlue",
    "BlueStreak",
    "BlueWell",
    "BlueYonder",
    "Border",
    "BoyLessons",
    "Boywithasecret",
    "BreakFreemagazine",
    "BrightStars",
    "ByTheBook",
    "ClairetheFlare",
    "CloeRemembrance",
    "ComicFullofSprites",
    "CrappilyDrawnMinicomics",
    "CupidsaMoron",
    "D00R",
    "DeathNoteIridescent",
    "DemonEater",
    "DenizensAttention",
    "DevilsCake",
    "Dreamcatchers",
    "EmeraldNuzlocke",
    "EonsAgo",
    "ERRORERROR",
    "EvilPlan",
    "FailureConfetti",
    "FlyorFail",
    "ForestHill",
    "FrobertTheDemon",
    "GarytheAlchemist",
    "GhostsTaleACrossover",
    "Glasshearts",
    "GoldenSunGenerationsAftermathVolume1",
    "GoldenSunGenerationsColossoVolume6",
    "GuardiansoftheGalaxialSpaceways",
    "HatShop",
    "HDMTHCOMICS",
    "Helix",
    "Hephaestus",
    "HolyBlasphemy",
    "HopeForABreeze",
    "Hotarugari",
    "InsideOuTAYuriTale",
    "Insomanywords",
    "INUSITADOONLINE",
    "ItsCharacterDevelopment",
    "JosephAndYusra",
    "JustAnotherDay",
    "KasaKeira",
    "KirbyAdventure",
    "KirbyandtheDarkKnight",
    "KirbyFunfestTheOriginals",
    "KirbysofTHEVOID",
    "KuroiHitsuji",
    "KuroShouri",
    "LandoftheSky",
    "LeCirquedObscure",
    "LethalDose",
    "LOGOS",
    "LostLove",
    "LsEmpire",
    "MariovsSonicvsMegaMan",
    "Mega",
    "MementoMori",
    "Mokepon",
    "MrGrimmsCircusofHorrors",
    "MyFakeHeart",
    "MyFriendScotty",
    "MYth",
    "NemesisKatharsis",
    "NiceKitty",
    "Nutshel",
    "OptimalClutter",
    "Panacea",
    "PhilosophicalPenisJokes",
    "PrettyUgly",
    "PSY",
    "PTO",
    "RainLGBT",
    "ReidyandFriendsShowcase",
    "RubysWorld",
    "SallySprocketAndPistonPete",
    "SimonSues",
    "SimpleBear",
    "SmallPressAdventures",
    "SonicWorldAdventure",
    "SoulGuardian",
    "SPOON",
    "STASonictheAdventure",
    "Stay",
    "StellaInChrome",
    "StrangersandFriends",
    "SunmeetsMoon",
    "TAG",
    "TaikiTheWebcomic",
    "TechnicolorLondon",
    "TEN",
    "ThatWasntThereYesterday",
    "TheAntihero",
    "TheBrideoftheShark",
    "TheCafedAlizee",
    "TheEssyaneWarriors",
    "ThehumanBEing",
    "TheKwiddexProtocol",
    "TheLegendofZeldaMaidenoftheMoon",
    "ThePirateBalthasar",
    "TheRandomObscureFairyTaleNoOnesEverReallyHeardOf",
    "TheReborn",
    "TheTytonNuzlockeChallengeEmeraldEdition",
    "ToD",
    "TPTruePower",
    "TwoKeys",
    "UndertheSkin",
    "WelcometoFreakshow",
    "Whenweweresilent",
    "WhiteHeart",
    "Yaoishereforareason",
    "Zodiac",
]

# links to last valid strips
url_overrides = {
}
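# An entry here overrides the comic URL that handle_url() extracts from the
# profile page.  Hypothetical example, shown only to illustrate the format:
#   "SomeComic": "http://somecomic.smackjeeves.com/comics/",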

# HTML content matchers
page_matcher = re.compile(tagre("a", "href", r'(comicprofile\.php\?id=\d+)', after="site_banner") +
    tagre("img", "title", r'([^"]+)'))
url_matcher = re.compile(tagre("a", "href", r'(http://[^"]+/comics/)') + "Latest Comic")
num_matcher = re.compile(r'50%">\s+(\d+)\s+')
desc_matcher = re.compile(r"</div>(.+?)</div>", re.DOTALL)
adult_matcher = re.compile(tagre("img", "src", r'http://www\.smackjeeves\.com/images/mature_content\.png'))
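# Note: tagre() from dosagelib.util builds a regex for one HTML tag whose given
# attribute matches the supplied pattern.  page_matcher is therefore expected to
# match markup roughly like (an assumption inferred from the patterns above,
# not copied from a captured page):
#   <a href="comicprofile.php?id=12345" ... site_banner ...><img title="Comic name" ...>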


def handle_url(url, res):
    """Parse one search result page."""
    print("Parsing", url, file=sys.stderr)
    try:
        data, baseUrl = getPageContent(url)
    except IOError as msg:
        print("ERROR:", msg, file=sys.stderr)
        return
    for match in page_matcher.finditer(data):
        page_url = match.group(1)
        page_url = urlparse.urljoin(url, page_url)
        name = unescape(match.group(2))
        name = asciify(name.replace('&', 'And').replace('@', 'At'))
        name = capfirst(name)
        if name in exclude_comics:
            continue
        if contains_case_insensitive(res, name):
            # we cannot handle two comics that only differ in case
            print("INFO: skipping possible duplicate", name, file=sys.stderr)
            continue
        # find out how many images this comic has
        end = match.end()
        mo = num_matcher.search(data[end:])
        if not mo:
            print("ERROR matching number:", repr(data[end:end+300]), file=sys.stderr)
            continue
        num = int(mo.group(1))
        # search for url in extra page
        print("Getting", page_url)
        try:
            data2, baseUrl2 = getPageContent(page_url)
        except IOError as msg:
            print("ERROR:", msg, file=sys.stderr)
            return
        mo = url_matcher.search(data2)
        if not mo:
            print("ERROR matching comic URL:", repr(data2[:300]), file=sys.stderr)
            continue
        comic_url = mo.group(1)
        # search for description
        end = mo.end()
        mo = desc_matcher.search(data2[end:])
        if not mo:
            print("ERROR matching comic description:", repr(data2[end:end+300]), file=sys.stderr)
            continue
        desc = remove_html_tags(mo.group(1))
        desc = unescape(desc)
        desc = unquote(desc)
        desc = compact_whitespace(desc).strip()
        # search for adult flag
        adult = adult_matcher.search(data2[end:])
        bounce = name not in repeat_comics
        res[name] = [
            url_overrides.get(name, comic_url), num, desc, bool(adult), bounce
        ]
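        # Each stored entry has the form
        #   [comic URL, number of strips, description, adult flag, bounce flag]
        # which is the order print_results() unpacks below.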


def get_results():
    """Parse all search result pages."""
    base = "http://www.smackjeeves.com/search.php?submit=Search+for+Webcomics&search_mode=webcomics&comic_title=&special=all&last_update=3&style_all=on&genre_all=on&format_all=on&sort_by=2&start=%d"
    # store info in a dictionary {name -> url, number of comics, description, adult flag, bounce flag}
    res = {}
    # a search for an empty string returned 286 result pages
    result_pages = 286
    print("Parsing", result_pages, "search result pages...", file=sys.stderr)
    for i in range(0, result_pages):
        print(i+1, file=sys.stderr, end=" ")
        handle_url(base % (i*12), res)
    save_result(res, json_file)
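
# Note: get_results() advances the start= offset by 12 per request, so each
# search result page is assumed to list 12 comics; the 286-page total was
# observed once and will drift as smackjeeves.com adds or removes comics.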


def has_comic(name):
    """Check whether dosage already provides a scraper with the given name."""
    cname = name.lower()
    for scraperclass in get_scrapers():
        lname = scraperclass.get_name().lower()
        if lname == cname:
            return True
    return False
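
# has_comic() lets print_results() prefix already-supported comics with '#',
# so the generated add() lines do not introduce duplicates.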


def print_results(args):
    """Print all comics that have at least the given minimum number of comic strips."""
    min_comics = int(args[0])
    for name, entry in sorted(load_result(json_file).items()):
        if name in exclude_comics:
            continue
        url, num, desc, adult, bounce = entry
        if num < min_comics:
            continue
        if has_comic(name):
            prefix = '#'
        else:
            prefix = ''
        print("%sadd(%r, %r, %r, %s, %s)" % (
            prefix, str(truncate_name(name)), str(url), desc, adult, bounce
        ))
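
# Example of an emitted line (illustrative values only, not real data):
#   add('SomeComic', 'http://somecomic.smackjeeves.com/comics/', 'A description.', False, True)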


if __name__ == '__main__':
    if len(sys.argv) > 1:
        print_results(sys.argv[1:])
    else:
        get_results()