#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright (C) 2004-2005 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2016 Tobias Gruetzmacher
"""
Script to get a list of smackjeeves.com comics and save the info in a JSON file
for further processing.
"""
from __future__ import absolute_import, division, print_function

import codecs
import os
import re
import sys

try:
    from urllib.parse import urljoin  # Python 3
except ImportError:
    from urlparse import urljoin  # Python 2

import requests

# add the parent directory to sys.path so that dosagelib is importable
sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))  # noqa

from dosagelib.util import get_page, tagre
from dosagelib.scraper import get_scrapers
from scriptutil import (contains_case_insensitive, save_result, load_result,
                        truncate_name, format_name)

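# the crawl results are cached in a JSON file next to this script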
json_file = __file__.replace(".py", ".json")

# names of comics to exclude
exclude_comics = [
    "4plyKamalsHead",  # does not follow standard layout
    "9Lives",  # missing images
    "ADifferentPerspective",  # does not follow standard layout
    "AFairlyTwistedRealitySuper",  # does not follow standard layout
    "Ahoge",  # does not follow standard layout
    "AngelJunkPileFelix",  # images are 403 forbidden
    "AntavioussGenLab",  # images are 403 forbidden
    "AreyougayJohnny",  # does not follow standard layout
    "Authorbattlesthevideogame",  # missing images
    "BambooArmonicKnightsGuild",  # missing previous link
    "BassLegends",  # does not follow standard layout
    "BreIshurnasspritesandstuff",  # comic moved
    "CatboyattheCon",  # missing images
    "Comatose",  # does not follow standard layout
    "ContraandtheSpamDump",  # missing images
    "ClubLove",  # does not follow standard layout
    "Darkkyosshorts",  # missing images
    "DeSTRESS",  # does not follow standard layout
    "DollarStoreCaviar",  # broken images
    "DreamCatcher",  # does not follow standard layout
    "EdgeofDecember",  # missing images
    "FroakieShocaiz",  # unsuitable navigation
    "Fumiko",  # does not follow standard layout
    "FurryExperience",  # timeout
    "GART",  # does not follow standard layout
    "GarytheAlchemist",  # does not follow standard layout
    "GBAsCrib",  # timeout
    "HAndJ",  # missing images
    "HEARD",  # missing images
    "Indigo",  # broken domain name
    "IwillbenapoSpamDump",  # missing images
    "ItoshiisCrazyNuzlockeAdventures",  # does not follow standard layout
    "JennyHaniver",  # does not follow standard layout
    "KiLAiLO",  # does not follow standard layout
    "KirbysoftheAlternateDimension",  # missing images
    "Letsreviewshallwe",  # missing images
    "LoudEra",  # does not follow standard layout
    "LunarHill",  # does not follow standard layout
    "Mafiagame",  # does not follow standard layout
    "MegaManSpriteExpo",  # missing images
    "MyLifewithFelENESPANOL",  # does not follow standard layout
    "MylifewithFel",  # does not follow standard layout
    "NegativeZen",  # does not follow standard layout
    "Nemutionpobae",  # does not follow standard layout
    "NightShot",  # does not follow standard layout
    "NormalIsBoring",  # does not follow standard layout
    "Okamirai",  # images are 403 forbidden
    "OmnisSpriteShowcase",  # missing images
    "OpticalDisarray",  # does not follow standard layout
    "PicturesofYou",  # does not follow standard layout
    "PiecesofBrokenGlass",  # broken images
    "PlatonicManagementDilemma",  # missing images
    "Pornjunkiesstrip",  # does not follow standard layout
    "PrettyUgly",  # does not follow standard layout
    "Project217",  # does not follow standard layout
    "RemmyzRandomz",  # does not follow standard layout
    "Ribon",  # does not follow standard layout
    "RubysWorld",  # does not follow standard layout
    "SecretSanta2011",  # missing images
    "ShinkaTheLastEevee",  # does not follow standard layout
    "SimplePixel",  # does not follow standard layout
    "SJArtCollab",  # missing images
    "SladesMansionofawesomeness",  # does not follow standard layout
    "SlightlyDifferent",  # missing images
    "SpaceSchool",  # does not follow standard layout
    "SushiGummy",  # does not follow standard layout
    "TheAfterSubtract",  # does not follow standard layout
    "ThePokemonArtBox",  # does not follow standard layout
    "THEVOIDWEBCOMIC",  # does not follow standard layout
    "TC2KsPokemobians",  # does not follow standard layout
    "ThreadCrashers",  # has no previous comic link
    "ToDefeatThemAll",  # does not follow standard layout
    "TotallyKotor",  # missing images
    "TwoKeys",  # does not follow standard layout
    "Vbcomics",  # does not follow standard layout
    "WerewolfRichard",  # does not follow standard layout
    "WinterMelody",  # missing images
]

# the latest URL of some comics repeats the previous URL;
# flag this so that bounceStart uses the correct URL
repeat_comics = [
    "1009sSpritersVacation",
    "22Special22Care",
    "2Kingdoms",
    "2Masters",
    "AbbimaysRandomness",
    "AdaLeeComesOn",
    "AdventuresofMitch",
    "AkumaKisei",
    "ALaMode",
    "AnimalLoversYuriCollab",
    "Area9",
    "AStrangeTypeofLove",
    "Autophobia",
    "BearlyAbel",
    "BeCarefreeWithMeSoon",
    "BlindandBlue",
    "BlueStreak",
    "BlueWell",
    "BlueYonder",
    "Border",
    "BoyLessons",
    "Boywithasecret",
    "BreakFreemagazine",
    "BrightStars",
    "ByTheBook",
    "ClairetheFlare",
    "CloeRemembrance",
    "ComicFullofSprites",
    "CrappilyDrawnMinicomics",
    "CupidsaMoron",
    "D00R",
    "DeathNoteIridescent",
    "DemonEater",
    "DenizensAttention",
    "DevilsCake",
    "Dreamcatchers",
    "EmeraldNuzlocke",
    "EonsAgo",
    "ERRORERROR",
    "EvilPlan",
    "FailureConfetti",
    "FlyorFail",
    "ForestHill",
    "FrobertTheDemon",
    "GarytheAlchemist",
    "GhostsTaleACrossover",
    "Glasshearts",
    "GoldenSunGenerationsAftermathVolume1",
    "GoldenSunGenerationsColossoVolume6",
    "GuardiansoftheGalaxialSpaceways",
    "HatShop",
    "HDMTHCOMICS",
    "Helix",
    "Hephaestus",
    "HolyBlasphemy",
    "HopeForABreeze",
    "Hotarugari",
    "InsideOuTAYuriTale",
    "Insomanywords",
    "INUSITADOONLINE",
    "ItsCharacterDevelopment",
    "JosephAndYusra",
    "JustAnotherDay",
    "KasaKeira",
    "KirbyAdventure",
    "KirbyandtheDarkKnight",
    "KirbyFunfestTheOriginals",
    "KirbysofTHEVOID",
    "KuroiHitsuji",
    "KuroShouri",
    "LandoftheSky",
    "LeCirquedObscure",
    "LethalDose",
    "LOGOS",
    "LostLove",
    "LsEmpire",
    "MariovsSonicvsMegaMan",
    "Mega",
    "MementoMori",
    "Mokepon",
    "MrGrimmsCircusofHorrors",
    "MyFakeHeart",
    "MyFriendScotty",
    "MYth",
    "NemesisKatharsis",
    "NiceKitty",
    "Nutshel",
    "OptimalClutter",
    "Panacea",
    "PhilosophicalPenisJokes",
    "PrettyUgly",
    "PSY",
    "PTO",
    "RainLGBT",
    "ReidyandFriendsShowcase",
    "RubysWorld",
    "SallySprocketAndPistonPete",
    "SimonSues",
    "SimpleBear",
    "SmallPressAdventures",
    "SonicWorldAdventure",
    "SoulGuardian",
    "SPOON",
    "STASonictheAdventure",
    "Stay",
    "StellaInChrome",
    "StrangersandFriends",
    "SunmeetsMoon",
    "TAG",
    "TaikiTheWebcomic",
    "TechnicolorLondon",
    "TEN",
    "ThatWasntThereYesterday",
    "TheAntihero",
    "TheBrideoftheShark",
    "TheCafedAlizee",
    "TheEssyaneWarriors",
    "ThehumanBEing",
    "TheKwiddexProtocol",
    "TheLegendofZeldaMaidenoftheMoon",
    "ThePirateBalthasar",
    "TheRandomObscureFairyTaleNoOnesEverReallyHeardOf",
    "TheReborn",
    "TheTytonNuzlockeChallengeEmeraldEdition",
    "ToD",
    "TPTruePower",
    "TwoKeys",
    "UndertheSkin",
    "WelcometoFreakshow",
    "Whenweweresilent",
    "WhiteHeart",
    "Yaoishereforareason",
    "Zodiac",
]

# links to last valid strips
url_overrides = {
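    # hypothetical example entry:
    # "SomeComic": "http://somecomic.smackjeeves.com/comics/123456/",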
}

# page_matcher finds the link to a comic's profile page together with the
# banner image whose title attribute holds the comic name
page_matcher = re.compile(tagre("a", "href", r'(comicprofile\.php\?id=\d+)',
                                after="site_banner") +
                          tagre("img", "title", r'([^"]+)'))
# url_matcher extracts the "Latest Comic" link from a comic's profile page
url_matcher = re.compile(tagre("a", "href", r'(http://[^"]+/comics/)') +
                         "Latest Comic")
# num_matcher picks the number of published strips from the table cell
# following the profile link on a search result page
num_matcher = re.compile(r'50%">\s+(\d+)\s+')
# adult_matcher detects the mature content marker image
adult_matcher = re.compile(tagre(
    "img", "src", r'http://www\.smackjeeves\.com/images/mature_content\.png'))


def handle_url(url, session, res):
    """Parse one search result page."""
    print("Parsing", url, file=sys.stderr)
    try:
        data = get_page(url, session).text
    except IOError as msg:
        print("ERROR:", msg, file=sys.stderr)
        return
    for match in page_matcher.finditer(data):
        page_url = urljoin(url, match.group(1))
        name = format_name(match.group(2))
        if name in exclude_comics:
            continue
        if contains_case_insensitive(res, name):
            # we cannot handle two comics that only differ in case
            print("INFO: skipping possible duplicate", repr(name),
                  file=sys.stderr)
            continue
        # find out how many images this comic has
        end = match.end()
        mo = num_matcher.search(data[end:])
        if not mo:
            print("ERROR matching number:", repr(data[end:end + 300]),
                  file=sys.stderr)
            continue
        num = int(mo.group(1))
        # search for the comic URL on the profile page
        print("Getting", page_url, file=sys.stderr)
        try:
            data2 = get_page(page_url, session).text
        except IOError as msg:
            print("ERROR:", msg, file=sys.stderr)
            return
        mo = url_matcher.search(data2)
        if not mo:
            print("ERROR matching comic URL:", repr(data2[:300]),
                  file=sys.stderr)
            continue
        comic_url = mo.group(1)
        # search the whole profile page for the adult content flag
        adult = adult_matcher.search(data2)
        bounce = name not in repeat_comics
        res[name] = [
            url_overrides.get(name, comic_url), num, bool(adult), bounce
        ]


def get_results():
    """Parse all search result pages."""
    base = ("http://www.smackjeeves.com/search.php?submit=Search+for+"
            "Webcomics&search_mode=webcomics&comic_title=&special=all&"
            "last_update=3&style_all=on&genre_all=on&format_all=on&"
            "sort_by=2&start=%d")
    session = requests.Session()
    # store info in a dictionary {name -> [url, number of strips, adult flag,
    # bounce flag]}
    res = {}
    # a search for an empty string returned 286 result pages
    result_pages = 286
    print("Parsing", result_pages, "search result pages...", file=sys.stderr)
    for i in range(result_pages):
        print(i + 1, file=sys.stderr, end=" ")
        # the "start" parameter advances by twelve entries per result page
        handle_url(base % (i * 12), session, res)
    save_result(res, json_file)


def has_comic(name):
    """Check if a comic scraper with the given name already exists."""
    cname = name.lower()
    for scraperobj in get_scrapers():
        lname = scraperobj.name.lower()
        if lname == cname:
            return True
    return False


def print_results(args):
    """Print all comics that have at least the given minimum number of strips."""
    min_comics, filename = args
    min_comics = int(min_comics)
    with codecs.open(filename, 'a', 'utf-8') as fp:
        for name, entry in sorted(load_result(json_file).items()):
            if name in exclude_comics:
                continue
            url, num, adult, bounce = entry
            if num < min_comics:
                continue
            # comment out comics that already have a scraper
            if has_comic(name):
                prefix = u'#'
            else:
                prefix = u''
            fp.write(u"%sadd(%r, %r, %s, %s)\n" % (
                prefix, str(truncate_name(name)), str(url), adult, bounce
            ))
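

# Run without arguments to crawl all search result pages and save the data
# to the JSON file. Run with <min_comics> <filename> to append an add(...)
# line to <filename> for every crawled comic with at least <min_comics>
# strips.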
if __name__ == '__main__':
    if len(sys.argv) > 1:
        print_results(sys.argv[1:])
    else:
        get_results()