Update SmackJeeves update helper.
Don't use it right now: it adds a HUGE number of comics.
This commit is contained in:
parent
fe51a449df
commit
1d2e1f2dd1
1 changed files with 148 additions and 331 deletions
|
@ -9,354 +9,171 @@ for further processing.
|
||||||
"""
|
"""
|
||||||
from __future__ import absolute_import, division, print_function
|
from __future__ import absolute_import, division, print_function
|
||||||
|
|
||||||
import codecs
|
|
||||||
import re
|
|
||||||
import sys
|
import sys
|
||||||
import os
|
|
||||||
try:
|
try:
|
||||||
from urllib.parse import urljoin
|
from urllib.parse import urlsplit
|
||||||
except ImportError:
|
except ImportError:
|
||||||
from urlparse import urljoin
|
from urlparse import urlsplit
|
||||||
|
|
||||||
import requests
|
from scriptutil import ComicListUpdater
|
||||||
|
|
||||||
sys.path.insert(0, os.path.dirname(os.path.dirname(__file__))) # noqa
|
|
||||||
from dosagelib.util import get_page, tagre
|
|
||||||
from dosagelib.scraper import get_scrapers
|
|
||||||
from scriptutil import contains_case_insensitive, save_result, load_result, truncate_name, format_name
|
|
||||||
|
|
||||||
|
|
||||||
json_file = __file__.replace(".py", ".json")
|
class SmackJeevesUpdater(ComicListUpdater):
|
||||||
|
# Absolute minimum number of pages a comic may have (restricts the search space)
|
||||||
|
MIN_COMICS = 90
|
||||||
|
|
||||||
|
ADULT_IMG = 'http://www.smackjeeves.com/images/mature_content.png'
|
||||||
|
|
||||||
# names of comics to exclude
|
# names of comics to exclude
|
||||||
exclude_comics = [
|
excluded_comics = (
|
||||||
"4plyKamalsHead", # does not follow standard layout
|
# comic moved/we have a better module
|
||||||
"9Lives", # missing images
|
"Amya",
|
||||||
"ADifferentPerspective", # does not follow standard layout
|
"Carciphona",
|
||||||
"AFairlyTwistedRealitySuper", # does not follow standard layout
|
"Footloose",
|
||||||
"Ahoge", # does not follow standard layout
|
"TitleUnrelated",
|
||||||
"AngelJunkPileFelix", # images are 403 forbidden
|
|
||||||
"AntavioussGenLab", # images are 403 forbidden
|
|
||||||
"AreyougayJohnny", # does not follow standard layout
|
|
||||||
"Authorbattlesthevideogame", # missing images
|
|
||||||
"BambooArmonicKnightsGuild", # missing previous link
|
|
||||||
"BassLegends", # does not follow standard layout
|
|
||||||
"BreIshurnasspritesandstuff", # comic moved
|
|
||||||
"CatboyattheCon", # missing images
|
|
||||||
"Comatose", # does not follow standard layout
|
|
||||||
"ContraandtheSpamDump", # missing images
|
|
||||||
"ClubLove", # does not follow standard layout
|
|
||||||
"Darkkyosshorts", # missing images
|
|
||||||
"DeSTRESS", # does not follow standard layout
|
|
||||||
"DollarStoreCaviar", # broken images
|
|
||||||
"DreamCatcher", # does not follow standard layout
|
|
||||||
"EdgeofDecember", # missing images
|
|
||||||
"FroakieShocaiz", # unsuitable navigation
|
|
||||||
"Fumiko", # does not follow standard layout
|
|
||||||
"FurryExperience", # timeout
|
|
||||||
"GART", # does not follow standard layout
|
|
||||||
"GarytheAlchemist", # does not follow standard layout
|
|
||||||
"GBAsCrib", # timeout
|
|
||||||
"HAndJ", # missing images
|
|
||||||
"HEARD", # missing images
|
|
||||||
"Indigo", # broken domain name
|
|
||||||
"IwillbenapoSpamDump", # missing images
|
|
||||||
"ItoshiisCrazyNuzlockeAdventures", # does not follow standard layout
|
|
||||||
"JennyHaniver", # does not follow standard layout
|
|
||||||
"KiLAiLO", # does not follow standard layout
|
|
||||||
"KirbysoftheAlternateDimension", # missing images
|
|
||||||
"Letsreviewshallwe", # missing images
|
|
||||||
"LoudEra", # does not follow standard layout
|
|
||||||
"LunarHill", # does not follow standard layout
|
|
||||||
"Mafiagame", # does not follow standard layout
|
|
||||||
"MegaManSpriteExpo", # missing images
|
|
||||||
"MyLifewithFelENESPANOL", # does not follow standard layout
|
|
||||||
"MylifewithFel", # does not follow standard layout
|
|
||||||
"NegativeZen", # does not follow standard layout
|
|
||||||
"Nemutionpobae", # does not follow standard layout
|
|
||||||
"NightShot", # does not follow standard layout
|
|
||||||
"NormalIsBoring", # does not follow standard layout
|
|
||||||
"Okamirai", # images are 403 forbidden
|
|
||||||
"OmnisSpriteShowcase", # missing images
|
|
||||||
"OpticalDisarray", # does not follow standard layout
|
|
||||||
"PicturesofYou", # does not follow standard layout
|
|
||||||
"PiecesofBrokenGlass", # broken images
|
|
||||||
"PlatonicManagementDilemma", # missing images
|
|
||||||
"Pornjunkiesstrip", # does not follow standard layout
|
|
||||||
"PrettyUgly", # does not follow standard layout
|
|
||||||
"Project217", # does not follow standard layout
|
|
||||||
"RemmyzRandomz", # does not follow standard layout
|
|
||||||
"Ribon", # does not follow standard layout
|
|
||||||
"RubysWorld", # does not follow standard layout
|
|
||||||
"SecretSanta2011", # missing images
|
|
||||||
"ShinkaTheLastEevee", # does not follow standard layout
|
|
||||||
"SimplePixel", # does not follow standard layout
|
|
||||||
"SJArtCollab", # missing images
|
|
||||||
"SladesMansionofawesomeness", # does not follow standard layout
|
|
||||||
"SlightlyDifferent", # missing images
|
|
||||||
"SpaceSchool", # does not follow standard layout
|
|
||||||
"SushiGummy", # does not follow standard layout
|
|
||||||
"TheAfterSubtract", # does not follow standard layout
|
|
||||||
"ThePokemonArtBox", # does not follow standard layout
|
|
||||||
"THEVOIDWEBCOMIC", # does not follow standard layout
|
|
||||||
"TC2KsPokemobians", # does not follow standard layout
|
|
||||||
"ThreadCrashers", # has no previous comic link
|
|
||||||
"ToDefeatThemAll", # does not follow standard layout
|
|
||||||
"TotallyKotor", # missing images
|
|
||||||
"TwoKeys", # does not follow standard layout
|
|
||||||
"Vbcomics", # does not follow standard layout
|
|
||||||
"WerewolfRichard", # does not follow standard layout
|
|
||||||
"WinterMelody", # missing images
|
|
||||||
]
|
|
||||||
|
|
||||||
|
# does not follow standard layout
|
||||||
# the latest URL of some comics repeats the previous URL
|
"300DaysOfSyao",
|
||||||
# flag this so the bounceStart uses the correct URL
|
"ADifferentPerspective",
|
||||||
repeat_comics = [
|
"Captor",
|
||||||
"1009sSpritersVacation",
|
"ClubLove",
|
||||||
"22Special22Care",
|
"Comatose",
|
||||||
"2Kingdoms",
|
"DeSTRESS",
|
||||||
"2Masters",
|
"DreamCatcher",
|
||||||
"AbbimaysRandomness",
|
"Fumiko",
|
||||||
"AdaLeeComesOn",
|
"GART",
|
||||||
"AdventuresofMitch",
|
|
||||||
"AkumaKisei",
|
|
||||||
"ALaMode",
|
|
||||||
"AnimalLoversYuriCollab",
|
|
||||||
"Area9",
|
|
||||||
"AStrangeTypeofLove",
|
|
||||||
"Autophobia",
|
|
||||||
"BearlyAbel",
|
|
||||||
"BeCarefreeWithMeSoon",
|
|
||||||
"BlindandBlue",
|
|
||||||
"BlueStreak",
|
|
||||||
"BlueWell",
|
|
||||||
"BlueYonder",
|
|
||||||
"Border",
|
|
||||||
"BoyLessons",
|
|
||||||
"Boywithasecret",
|
|
||||||
"BreakFreemagazine",
|
|
||||||
"BrightStars",
|
|
||||||
"ByTheBook",
|
|
||||||
"ClairetheFlare",
|
|
||||||
"CloeRemembrance",
|
|
||||||
"ComicFullofSprites",
|
|
||||||
"CrappilyDrawnMinicomics",
|
|
||||||
"CupidsaMoron",
|
|
||||||
"D00R",
|
|
||||||
"DeathNoteIridescent",
|
|
||||||
"DemonEater",
|
|
||||||
"DenizensAttention",
|
|
||||||
"DevilsCake",
|
|
||||||
"Dreamcatchers",
|
|
||||||
"EmeraldNuzlocke",
|
|
||||||
"EonsAgo",
|
|
||||||
"ERRORERROR",
|
|
||||||
"EvilPlan",
|
|
||||||
"FailureConfetti",
|
|
||||||
"FlyorFail",
|
|
||||||
"ForestHill",
|
|
||||||
"FrobertTheDemon",
|
|
||||||
"GarytheAlchemist",
|
"GarytheAlchemist",
|
||||||
"GhostsTaleACrossover",
|
"ItoshiisCrazyNuzlockeAdventures",
|
||||||
"Glasshearts",
|
"JennyHaniver",
|
||||||
"GoldenSunGenerationsAftermathVolume1",
|
"KiLAiLO",
|
||||||
"GoldenSunGenerationsColossoVolume6",
|
"LoudEra",
|
||||||
"GuardiansoftheGalaxialSpaceways",
|
"LunarHill",
|
||||||
"HatShop",
|
"Mafiagame",
|
||||||
"HDMTHCOMICS",
|
"MylifewithFel",
|
||||||
"Helix",
|
"MyLifewithFelENESPANOL",
|
||||||
"Hephaestus",
|
"NegativeZen",
|
||||||
"HolyBlasphemy",
|
"Nemutionpobae",
|
||||||
"HopeForABreeze",
|
"NightShot",
|
||||||
"Hotarugari",
|
"NormalIsBoring",
|
||||||
"InsideOuTAYuriTale",
|
"OpticalDisarray",
|
||||||
"Insomanywords",
|
"PicturesofYou",
|
||||||
"INUSITADOONLINE",
|
"Pornjunkiesstrip",
|
||||||
"ItsCharacterDevelopment",
|
|
||||||
"JosephAndYusra",
|
|
||||||
"JustAnotherDay",
|
|
||||||
"KasaKeira",
|
|
||||||
"KirbyAdventure",
|
|
||||||
"KirbyandtheDarkKnight",
|
|
||||||
"KirbyFunfestTheOriginals",
|
|
||||||
"KirbysofTHEVOID",
|
|
||||||
"KuroiHitsuji",
|
|
||||||
"KuroShouri",
|
|
||||||
"LandoftheSky",
|
|
||||||
"LeCirquedObscure",
|
|
||||||
"LethalDose",
|
|
||||||
"LOGOS",
|
|
||||||
"LostLove",
|
|
||||||
"LsEmpire",
|
|
||||||
"MariovsSonicvsMegaMan",
|
|
||||||
"Mega",
|
|
||||||
"MementoMori",
|
|
||||||
"Mokepon",
|
|
||||||
"MrGrimmsCircusofHorrors",
|
|
||||||
"MyFakeHeart",
|
|
||||||
"MyFriendScotty",
|
|
||||||
"MYth",
|
|
||||||
"NemesisKatharsis",
|
|
||||||
"NiceKitty",
|
|
||||||
"Nutshel",
|
|
||||||
"OptimalClutter",
|
|
||||||
"Panacea",
|
|
||||||
"PhilosophicalPenisJokes",
|
|
||||||
"PrettyUgly",
|
"PrettyUgly",
|
||||||
"PSY",
|
"Project217",
|
||||||
"PTO",
|
"RemmyzRandomz",
|
||||||
"RainLGBT",
|
"Ribon",
|
||||||
"ReidyandFriendsShowcase",
|
|
||||||
"RubysWorld",
|
"RubysWorld",
|
||||||
"SallySprocketAndPistonPete",
|
"ShinkaTheLastEevee",
|
||||||
"SimonSues",
|
"SimplePixel",
|
||||||
"SimpleBear",
|
"SladesMansionofawesomeness",
|
||||||
"SmallPressAdventures",
|
"SpaceSchool",
|
||||||
"SonicWorldAdventure",
|
"SushiGummy",
|
||||||
"SoulGuardian",
|
"TC2KsPokemobians",
|
||||||
"SPOON",
|
"TheAfterSubtract",
|
||||||
"STASonictheAdventure",
|
"ThePokemonArtBox",
|
||||||
"Stay",
|
"THEVOIDWEBCOMIC",
|
||||||
"StellaInChrome",
|
"ToDefeatThemAll",
|
||||||
"StrangersandFriends",
|
|
||||||
"SunmeetsMoon",
|
|
||||||
"TAG",
|
|
||||||
"TaikiTheWebcomic",
|
|
||||||
"TechnicolorLondon",
|
|
||||||
"TEN",
|
|
||||||
"ThatWasntThereYesterday",
|
|
||||||
"TheAntihero",
|
|
||||||
"TheBrideoftheShark",
|
|
||||||
"TheCafedAlizee",
|
|
||||||
"TheEssyaneWarriors",
|
|
||||||
"ThehumanBEing",
|
|
||||||
"TheKwiddexProtocol",
|
|
||||||
"TheLegendofZeldaMaidenoftheMoon",
|
|
||||||
"ThePirateBalthasar",
|
|
||||||
"TheRandomObscureFairyTaleNoOnesEverReallyHeardOf",
|
|
||||||
"TheReborn",
|
|
||||||
"TheTytonNuzlockeChallengeEmeraldEdition",
|
|
||||||
"ToD",
|
|
||||||
"TPTruePower",
|
|
||||||
"TwoKeys",
|
"TwoKeys",
|
||||||
"UndertheSkin",
|
"Vbcomics",
|
||||||
"WelcometoFreakshow",
|
"WerewolfRichard",
|
||||||
"Whenweweresilent",
|
|
||||||
"WhiteHeart",
|
|
||||||
"Yaoishereforareason",
|
|
||||||
"Zodiac",
|
|
||||||
]
|
|
||||||
|
|
||||||
|
# has no previous comic link
|
||||||
|
"ThreadCrashers",
|
||||||
|
"AchievementStuck",
|
||||||
|
|
||||||
# links to last valid strips
|
# images are 403 forbidden
|
||||||
url_overrides = {
|
"AngelJunkPileFelix",
|
||||||
}
|
"AntavioussGenLab",
|
||||||
|
"Okamirai",
|
||||||
|
|
||||||
|
# missing images
|
||||||
|
"CatboyattheCon",
|
||||||
|
"ContraandtheSpamDump",
|
||||||
|
"Darkkyosshorts",
|
||||||
|
"DollarStoreCaviar",
|
||||||
|
"EdgeofDecember",
|
||||||
|
"HAndJ",
|
||||||
|
"HEARD",
|
||||||
|
"IwillbenapoSpamDump",
|
||||||
|
"KirbysoftheAlternateDimension",
|
||||||
|
"Letsreviewshallwe",
|
||||||
|
"MegaManSpriteExpo",
|
||||||
|
"OmnisSpriteShowcase",
|
||||||
|
"PiecesofBrokenGlass",
|
||||||
|
"PlatonicManagementDilemma",
|
||||||
|
"SecretSanta2011",
|
||||||
|
"SerendipityAnEquestrianTale",
|
||||||
|
"SJArtCollab",
|
||||||
|
"SlightlyDifferent",
|
||||||
|
"TheAttackoftheRecoloursSeason1",
|
||||||
|
"TotallyKotor",
|
||||||
|
"WinterMelody",
|
||||||
|
"ZonowTheHedgehog",
|
||||||
|
|
||||||
# HTML content matcher
|
# missing previous link
|
||||||
page_matcher = re.compile(tagre("a", "href", r'(comicprofile\.php\?id=\d+)',
|
"BambooArmonicKnightsGuild",
|
||||||
after="site_banner") +
|
|
||||||
tagre("img", "title", r'([^"]+)'))
|
|
||||||
url_matcher = re.compile(tagre("a", "href", r'(http://[^"]+/comics/)') + "Latest Comic")
|
|
||||||
num_matcher = re.compile(r'50%">\s+(\d+)\s+')
|
|
||||||
adult_matcher = re.compile(tagre("img", "src", r'http://www\.smackjeeves\.com/images/mature_content\.png'))
|
|
||||||
|
|
||||||
|
# broken host name
|
||||||
|
"Razor",
|
||||||
|
)
|
||||||
|
|
||||||
def handle_url(url, session, res):
|
def handle_url(self, url):
|
||||||
"""Parse one search result page."""
|
"""Parse one search result page."""
|
||||||
print("Parsing", url, file=sys.stderr)
|
data = self.get_url(url)
|
||||||
try:
|
|
||||||
data = get_page(url, session).text
|
|
||||||
except IOError as msg:
|
|
||||||
print("ERROR:", msg, file=sys.stderr)
|
|
||||||
return
|
|
||||||
for match in page_matcher.finditer(data):
|
|
||||||
page_url = match.group(1)
|
|
||||||
page_url = urljoin(url, page_url)
|
|
||||||
name = format_name(match.group(2))
|
|
||||||
if name in exclude_comics:
|
|
||||||
continue
|
|
||||||
if contains_case_insensitive(res, name):
|
|
||||||
# we cannot handle two comics that only differ in case
|
|
||||||
print("INFO: skipping possible duplicate", repr(name), file=sys.stderr)
|
|
||||||
continue
|
|
||||||
# find out how many images this comic has
|
|
||||||
end = match.end()
|
|
||||||
mo = num_matcher.search(data[end:])
|
|
||||||
if not mo:
|
|
||||||
print("ERROR matching number:", repr(data[end:end + 300]),
|
|
||||||
file=sys.stderr)
|
|
||||||
continue
|
|
||||||
num = int(mo.group(1))
|
|
||||||
# search for url in extra page
|
|
||||||
print("Getting", page_url)
|
|
||||||
try:
|
|
||||||
data2 = get_page(page_url, session).text
|
|
||||||
except IOError as msg:
|
|
||||||
print("ERROR:", msg, file=sys.stderr)
|
|
||||||
return
|
|
||||||
mo = url_matcher.search(data2)
|
|
||||||
if not mo:
|
|
||||||
print("ERROR matching comic URL:", repr(data2[:300]), file=sys.stderr)
|
|
||||||
continue
|
|
||||||
comic_url = mo.group(1)
|
|
||||||
# search for adult flag
|
|
||||||
adult = adult_matcher.search(data2[end:])
|
|
||||||
bounce = name not in repeat_comics
|
|
||||||
res[name] = [
|
|
||||||
url_overrides.get(name, comic_url), num, bool(adult), bounce
|
|
||||||
]
|
|
||||||
|
|
||||||
|
num = 999
|
||||||
def get_results():
|
for comicdiv in data.cssselect(
|
||||||
"""Parse all search result pages."""
|
'div#webcomic_search_results div.full_banner_div'):
|
||||||
base = "http://www.smackjeeves.com/search.php?submit=Search+for+Webcomics&search_mode=webcomics&comic_title=&special=all&last_update=3&style_all=on&genre_all=on&format_all=on&sort_by=2&start=%d"
|
page_url = comicdiv.cssselect('a:first-child')[0].attrib['href']
|
||||||
session = requests.Session()
|
name = comicdiv.cssselect('img.banny')
|
||||||
# store info in a dictionary {name -> url, number of comics, adult flag, bounce flag}
|
if name:
|
||||||
res = {}
|
name = name[0].attrib['title']
|
||||||
# a search for an empty string returned 286 result pages
|
|
||||||
result_pages = 286
|
|
||||||
print("Parsing", result_pages, "search result pages...", file=sys.stderr)
|
|
||||||
for i in range(0, result_pages):
|
|
||||||
print(i + 1, file=sys.stderr, end=" ")
|
|
||||||
handle_url(base % (i * 12), session, res)
|
|
||||||
save_result(res, json_file)
|
|
||||||
|
|
||||||
|
|
||||||
def has_comic(name):
    """Return True if a known scraper has this name, ignoring case.

    The given name is lower-cased once and compared against the lower-cased
    ``name`` attribute of every object yielded by ``get_scrapers()``.
    """
    wanted = name.lower()
    # any() stops at the first match, just like an early ``return True``.
    return any(
        scraper.name.lower() == wanted for scraper in get_scrapers()
    )
|
|
||||||
|
|
||||||
|
|
||||||
def print_results(args):
|
|
||||||
"""Print all comics that have at least the given number of minimum comic strips."""
|
|
||||||
min_comics, filename = args
|
|
||||||
min_comics = int(min_comics)
|
|
||||||
with codecs.open(filename, 'a', 'utf-8') as fp:
|
|
||||||
for name, entry in sorted(load_result(json_file).items()):
|
|
||||||
if name in exclude_comics:
|
|
||||||
continue
|
|
||||||
url, num, adult, bounce = entry
|
|
||||||
if num < min_comics:
|
|
||||||
continue
|
|
||||||
if has_comic(name):
|
|
||||||
prefix = u'#'
|
|
||||||
else:
|
else:
|
||||||
prefix = u''
|
name = comicdiv.cssselect('h2')[0].text
|
||||||
fp.write(u"%sadd(%r, %r, %s, %s)\n" % (
|
# find out how many images this comic has
|
||||||
prefix, str(truncate_name(name)), str(url), adult, bounce
|
mo = comicdiv.cssselect('span.small-meter')
|
||||||
))
|
if not mo:
|
||||||
|
print("ERROR matching number of comics", file=sys.stderr)
|
||||||
|
continue
|
||||||
|
num = int(mo[0].text.strip())
|
||||||
|
# search for url in extra page
|
||||||
|
data2 = self.get_url(page_url)
|
||||||
|
mo = data2.cssselect('div#quick_reading_links a:last-child')
|
||||||
|
if not mo:
|
||||||
|
print("ERROR matching comic URL", file=sys.stderr)
|
||||||
|
continue
|
||||||
|
comic_url = mo[0].attrib['href']
|
||||||
|
# search for adult flag
|
||||||
|
adult = data2.xpath('//img[@src="' + self.ADULT_IMG + '"]')
|
||||||
|
self.add_comic(name, (comic_url, bool(adult)), num)
|
||||||
|
|
||||||
|
next_url = data.cssselect(
|
||||||
|
"div.search_nav td:last-child a")[0].attrib['href']
|
||||||
|
return (next_url, num)
|
||||||
|
|
||||||
|
def collect_results(self):
    """Walk the search result pages until the comic count gets too low.

    Starts at the first search page and keeps following the URL returned
    by ``self.handle_url``. Stops as soon as the count returned for a page
    falls below ``self.MIN_COMICS``.
    """
    # The query asks for results sorted by number of comics, so we can
    # abort once the count drops under the threshold.
    query = "&".join((
        "submit=1",
        "search_mode=webcomics",
        "comic_title=",
        "sort_by=4",
        "special=all",
        "last_update=6",
        "style_all=on",
        "genre_all=on",
        "format_all=on",
    ))
    page_url = "http://www.smackjeeves.com/search.php?" + query
    # Start above any plausible threshold so the first page is always fetched.
    count = 999
    while True:
        if count < self.MIN_COMICS:
            break
        # Progress indicator: the count seen on the previous page.
        print(count, file=sys.stderr, end=" ")
        page_url, count = self.handle_url(page_url)
|
||||||
|
|
||||||
|
def get_classdef(self, name, data):
    """Build the source text of a ``_SmackJeeves`` subclass for one comic.

    ``data[0]`` is parsed as a URL and ``data[1]`` is used as a truth value
    for the adult flag (presumably the tuple passed to ``add_comic`` --
    verify against the caller). Hosts directly below ``smackjeeves.com``
    are emitted as a ``sub`` attribute, all others as a full ``host``.
    """
    # NOTE(review): whitespace inside the emitted strings is reproduced
    # exactly as it appears in this view -- confirm the intended indent
    # against the upstream file.
    hostname = urlsplit(data[0]).hostname
    # Split off the first label only: "foo.smackjeeves.com" -> "foo", rest.
    sub, top = hostname.split('.', 1)
    pieces = [u"class SJ%s(_SmackJeeves):" % name]
    if top.lower() != "smackjeeves.com":
        pieces.append("\n host = '%s.%s'" % (sub, top))
    else:
        pieces.append("\n sub = '%s'" % sub)
    if data[1]:
        pieces.append("\n adult = True")
    return "".join(pieces)
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
if len(sys.argv) > 1:
|
SmackJeevesUpdater(__file__).run()
|
||||||
print_results(sys.argv[1:])
|
|
||||||
else:
|
|
||||||
get_results()
|
|
||||||
|
|
Loading…
Reference in a new issue