Adapt SmackJeeves to new site design (fixes #144)

Some things got lost on the way:
- Since there is no comprehensive comic directory anymore, removed
  automatic update script. New comics need to be added manually.
- Some authors used the opportunity to move from SmackJeeves somewhere
  else - some of those got new modules (either standalone or ComicFury)
- A bunch of comics just disappeared...
This commit is contained in:
Tobias Gruetzmacher 2019-12-26 22:03:18 +01:00
parent 02c0da24fa
commit 963db7f448
11 changed files with 696 additions and 898 deletions

View file

@ -634,6 +634,7 @@ class ComicFury(_ParserScraper):
cls('MadGirl', 'madgirl'),
cls('MagicElDesencuentro', 'magiceldesencuentro', 'es'),
cls('MagicTheScattering', 'magicthescattering'),
cls('Magience', 'magience'),
cls('MAGISAPARASAYOupdatesMonFri', 'mag-isa'),
cls('MagnaComica', 'magnacomica'),
cls('Maluk', 'maluk'),

View file

@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2018 Tobias Gruetzmacher
# Copyright (C) 2015-2019 Tobias Gruetzmacher
from __future__ import absolute_import, division, print_function
@ -214,6 +214,12 @@ class DorkTower(_ParserScraper):
prevSearch = '//a[%s][text()="Previous"]' % xpath_class('btn')
# Scraper definition for the comic at doomsdaymydear.com.
class DoomsdayMyDear(_ParserScraper):
# Start URL of the comic.
url = 'http://doomsdaymydear.com/'
# XPath selecting <img> elements, filtered by the predicate that
# xpath_class('attachment-full') produces (presumably a CSS-class match --
# confirm against the xpath_class helper).
imageSearch = '//img[{}]'.format(xpath_class('attachment-full'))
# XPath selecting <a> elements, filtered by the predicate that
# xpath_class('previous-webcomic-link') produces.
prevSearch = '//a[{}]'.format(xpath_class('previous-webcomic-link'))
class Dracula(_BasicScraper):
url = 'http://draculacomic.net/'
stripUrl = url + 'comic.php?comicID=%s'

View file

@ -109,6 +109,10 @@ class FonFlatter(_ParserScraper):
)
# Scraper definition for the comic at foresthillcomic.org; everything except
# the start URL is inherited from _WordPressScraper.
class ForestHill(_WordPressScraper):
url = 'https://www.foresthillcomic.org/'
class ForLackOfABetterComic(_BasicScraper):
url = 'http://forlackofabettercomic.com/'
rurl = r'http://(?:www\.)?forlackofabettercomic\.com/'

View file

@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2017 Tobias Gruetzmacher
# Copyright (C) 2015-2019 Tobias Gruetzmacher
from __future__ import absolute_import, division, print_function
@ -411,21 +411,27 @@ class Removed(Scraper):
cls('SmackJeeves/AchievementStuck'),
cls('SmackJeeves/AGirlAndHerShadow'),
cls('SmackJeeves/Allthatglitters'),
cls('SmackJeeves/AloversRule'),
cls('SmackJeeves/Anathemacomics'),
cls('SmackJeeves/AngelBeast'),
cls('SmackJeeves/ArchportCityChronicles'),
cls('SmackJeeves/AwesomeSauce'),
cls('SmackJeeves/BetweenLightandDark'),
cls('SmackJeeves/BetweenWorlds'),
cls('SmackJeeves/BeyondTemptation'),
cls('SmackJeeves/BLDShortComics'),
cls('SmackJeeves/Bloodyfairytale'),
cls('SmackJeeves/BLOT'),
cls('SmackJeeves/BlueWell'),
cls('SmackJeeves/BreakfastonaCliff'),
cls('SmackJeeves/CafeAmargo'),
cls('SmackJeeves/Captor'),
cls('SmackJeeves/ChaosTheory2005'),
cls('SmackJeeves/CleanCure'),
cls('SmackJeeves/DaddysGirl'),
cls('SmackJeeves/Debtsettlement'),
cls('SmackJeeves/DebtSettlement2OperationExtinction'),
cls('SmackJeeves/DefyingGravityTheFourGreatGuardians'),
cls('SmackJeeves/Destinationunknown'),
cls('SmackJeeves/DevilTrainee'),
cls('SmackJeeves/DevilTraineeSpanish'),
@ -436,7 +442,9 @@ class Removed(Scraper):
cls('SmackJeeves/EternalKnights'),
cls('SmackJeeves/EvD'),
cls('SmackJeeves/FeathersPI'),
cls('SmackJeeves/FemmeSchism'),
cls('SmackJeeves/FireWire'),
cls('SmackJeeves/FrenzyRedux'),
cls('SmackJeeves/FrogKing'),
cls('SmackJeeves/FuckMyLife'),
cls('SmackJeeves/FurtherDowntheRabbitHole'),
@ -445,21 +453,33 @@ class Removed(Scraper):
cls('SmackJeeves/GraveImpressions'),
cls('SmackJeeves/GreenKirbyandabunchofotherpeopledoinstuff'),
cls('SmackJeeves/Harfang'),
cls('SmackJeeves/HIPS'),
cls('SmackJeeves/HotChocolate'),
cls('SmackJeeves/Hybristorific'),
cls('SmackJeeves/Ianua'),
cls('SmackJeeves/ImminentMoose'),
cls('SmackJeeves/InthePride'),
cls('SmackJeeves/Intoxicated'),
cls('SmackJeeves/Jantarpol'),
cls('SmackJeeves/Knife'),
cls('SmackJeeves/Kranburn'),
cls('SmackJeeves/KuroNeko'),
cls('SmackJeeves/LastLivingSouls'),
cls('SmackJeeves/LatchkeyKingdom'),
cls('SmackJeeves/LegendsofMobiusBookOne'),
cls('SmackJeeves/LiliBleu'),
cls('SmackJeeves/LoveTwister'),
cls('SmackJeeves/MagicalGirlAlice'),
cls('SmackJeeves/MasqueradeWTTM'),
cls('SmackJeeves/MegaManBattleNetwork7'),
cls('SmackJeeves/MegaManiacs'),
cls('SmackJeeves/MerirosvotSeikkailumerella'),
cls('SmackJeeves/MewsDynasty'),
cls('SmackJeeves/MixupofallMixups'),
cls('SmackJeeves/MomthegamestorerippedusoffAGAIN'),
cls('SmackJeeves/MoonlitDawnAMythicalTale'),
cls('SmackJeeves/MyBoyfriendisaMobBoss'),
cls('SmackJeeves/MyTrollLife'),
cls('SmackJeeves/NihilWandasJourney'),
cls('SmackJeeves/OddContact'),
cls('SmackJeeves/OneFrameGags'),
@ -472,24 +492,37 @@ class Removed(Scraper):
cls('SmackJeeves/PumpkinFlower'),
cls('SmackJeeves/Razor'),
cls('SmackJeeves/SAKANA'),
cls('SmackJeeves/SenoireDelirium'),
cls('SmackJeeves/SerendipityAnEquestrianTale'),
cls('SmackJeeves/ShacklesInstallment02'),
cls('SmackJeeves/SimonSues'),
cls('SmackJeeves/SonicUniverseAsk'),
cls('SmackJeeves/SoulGuardian'),
cls('SmackJeeves/Spidersilk', 'mov'),
cls('SmackJeeves/Symbios'),
cls('SmackJeeves/TechnicolorLondon'),
cls('SmackJeeves/TeKscloset'),
cls('SmackJeeves/TheAttackoftheRecoloursSeason1'),
cls('SmackJeeves/TheCurtandTonyShow'),
cls('SmackJeeves/TheDarkAgeofMobius'),
cls('SmackJeeves/TheHobbitbic'),
cls('SmackJeeves/ThehumanBEing'),
cls('SmackJeeves/TheKeyToReality'),
cls('SmackJeeves/TheLostland'),
cls('SmackJeeves/TheMewExperiment'),
cls('SmackJeeves/TheRandomObscureFairyTaleNoOnesEverReallyHeardOf'),
cls('SmackJeeves/TheSomewhereOther'),
cls('SmackJeeves/TheWastelands', 'mis'),
cls('SmackJeeves/ThinkBeforeYouThink', 'mov'),
cls('SmackJeeves/ThroughTheWonkyEye'),
cls('SmackJeeves/TitleUnrelated'),
cls('SmackJeeves/TotalPokemonIsland'),
cls('SmackJeeves/TrillyAndSilly'),
cls('SmackJeeves/TRIPP'),
cls('SmackJeeves/VampireFetish'),
cls('SmackJeeves/WolfWolf'),
cls('SmackJeeves/WonderTheatre'),
cls('SmackJeeves/YouAreTheReasonForTheEndOfTheWorld'),
cls('SnowFlakes'),
cls('StrawberryDeathCake'),
cls('Stubble'),
@ -629,6 +662,11 @@ class Renamed(Scraper):
cls('PetiteSymphony/Djandora', 'ComicsBreak/Djandora'),
cls('PetiteSymphony/Generation17', 'ComicsBreak/Generation17'),
cls('SmackJeeves/CityFolk', 'ComicFury/CityFolk'),
cls('SmackJeeves/DoomsdayMyDear', 'DoomsdayMyDear'),
cls('SmackJeeves/ForestHill', 'ForestHill'),
cls('SmackJeeves/Magience', 'ComicFury/Magience'),
cls('SmackJeeves/RiversideExtras', 'RiversideExtras'),
cls('SmackJeeves/StarTrip', 'StarTrip'),
cls('TracyAndTristan', 'ComicFury/TracyAndTristan'),
cls('Wulffmorgenthaler', 'WuMo'),
cls('ZebraGirl', 'ComicFury/ZebraGirl'),

View file

@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2017 Tobias Gruetzmacher
# Copyright (C) 2015-2019 Tobias Gruetzmacher
from __future__ import absolute_import, division, print_function
@ -108,6 +108,12 @@ class Replay(_ParserScraper):
return name
# Scraper definition for the comic at riversidecomics.com.
class RiversideExtras(_ParserScraper):
# Start URL of the comic.
url = 'https://riversidecomics.com/'
# XPath selecting every <img> below a <div> that satisfies the predicate
# produced by xpath_class('webcomic-image') (presumably a CSS-class match --
# confirm against the xpath_class helper).
imageSearch = '//div[{}]//img'.format(xpath_class('webcomic-image'))
# XPath selecting <a> elements, filtered by the predicate that
# xpath_class('previous-webcomic-link') produces.
prevSearch = '//a[{}]'.format(xpath_class('previous-webcomic-link'))
class RomanticallyApocalyptic(_ParserScraper):
url = 'http://romanticallyapocalyptic.com/'
stripUrl = url + '%s'

View file

@ -507,6 +507,10 @@ class StarfireAgency(_WordPressScraper):
return chapter + '_' + filename
# Scraper definition for the comic at startripcomic.com; everything except
# the start URL is inherited from _ComicControlScraper.
class StarTrip(_ComicControlScraper):
url = 'https://www.startripcomic.com/'
class StationV3(_ParserScraper):
url = 'http://www.stationv3.com/'
stripUrl = url + 'd3/%s.html'

File diff suppressed because it is too large Load diff

View file

@ -2,7 +2,7 @@
# -*- coding: utf-8 -*-
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2017 Tobias Gruetzmacher
# Copyright (C) 2015-2019 Tobias Gruetzmacher
"""
Script to get ComicFury comics and save the info in a JSON file for further
processing.
@ -20,7 +20,7 @@ class ComicFuryUpdater(ComicListUpdater):
MIN_COMICS = 90
dup_templates = ('ComicSherpa/%s', 'Creators/%s', 'GoComics/%s',
'KeenSpot/%s', 'SmackJeeves/%s', 'Arcamax/%s')
'KeenSpot/%s', 'Arcamax/%s')
langmap = {
'german': 'de',
@ -158,8 +158,8 @@ class ComicFuryUpdater(ComicListUpdater):
def collect_results(self):
"""Parse all search result pages."""
# Sort by page count, so we can abort when we get under some threshold.
baseUrl = ('http://comicfury.com/search.php?search=1&webcomics=1&' +
'query=&worder=1&asc=0&incvi=1&incse=1&incnu=1&incla=1&' +
baseUrl = ('https://comicfury.com/search.php?search=1&webcomics=1&' +
'query=&worder=1&asc=0&incvi=2&incnu=2&incla=2&incse=2&' +
'all_ge=1&all_st=1&all_la=1&page=%d')
last_count = 999
page = 1

View file

@ -8,7 +8,7 @@ d=$(dirname $0)
if [ $# -ge 1 ]; then
list="$*"
else
list="arcamax comicfury comicgenesis creators gocomics keenspot smackjeeves webcomicfactory comicskingdom"
list="arcamax comicfury comicgenesis creators gocomics keenspot webcomicfactory comicskingdom"
fi
for script in $list; do
echo "Executing ${script}.py"

View file

@ -1,172 +0,0 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2017 Tobias Gruetzmacher
"""
Script to get a list of smackjeeves.com comics and save the info in a JSON file
for further processing.
"""
from __future__ import absolute_import, division, print_function
import sys
from six.moves.urllib.parse import urlsplit
from scriptutil import ComicListUpdater
class SmackJeevesUpdater(ComicListUpdater):
    """Collect smackjeeves.com comics from the site's search result pages.

    Every comic card found on a result page is registered through
    ``add_comic`` together with its URL, an adult flag and its update
    status; ``get_entry`` renders one registered comic as a ``cls(...)``
    source line.
    """

    # Absolute minimum number of pages a comic may have (restrict search space)
    MIN_COMICS = 90

    # names of comics to exclude
    excluded_comics = (
        # comic moved/we have a better module
        "Amya",
        "Footloose",
        "TitleUnrelated",

        # does not follow standard layout
        "300DaysOfSyao",
        "ADifferentPerspective",
        "Captor",
        "ClubLove",
        "Comatose",
        "DeSTRESS",
        "DreamCatcher",
        "Fumiko",
        "GART",
        "GarytheAlchemist",
        "ItoshiisCrazyNuzlockeAdventures",
        "JennyHaniver",
        "KiLAiLO",
        "LoudEra",
        "LunarHill",
        "Mafiagame",
        "MylifewithFel",
        "MyLifewithFelENESPANOL",
        "NegativeZen",
        "Nemutionpobae",
        "NightShot",
        "NormalIsBoring",
        "OpticalDisarray",
        "PicturesofYou",
        "Pornjunkiesstrip",
        "PrettyUgly",
        "Project217",
        "RemmyzRandomz",
        "Ribon",
        "RubysWorld",
        "ShinkaTheLastEevee",
        "SimplePixel",
        "SladesMansionofawesomeness",
        "SpaceSchool",
        "SushiGummy",
        "TC2KsPokemobians",
        "TheAfterSubtract",
        "ThePokemonArtBox",
        "THEVOIDWEBCOMIC",
        "ToDefeatThemAll",
        "TwoKeys",
        "Vbcomics",
        "WerewolfRichard",

        # has no previous comic link
        "ThreadCrashers",
        "AchievementStuck",

        # images are 403 forbidden
        "AngelJunkPileFelix",
        "AntavioussGenLab",
        "Harfang",
        "Okamirai",

        # missing images
        "AGirlAndHerShadow",
        "Carciphona",
        "CatboyattheCon",
        "ContraandtheSpamDump",
        "Darkkyosshorts",
        "DollarStoreCaviar",
        "EdgeofDecember",
        "EvD",
        "HAndJ",
        "HEARD",
        "IwillbenapoSpamDump",
        "KirbysoftheAlternateDimension",
        "Letsreviewshallwe",
        "MegaManSpriteExpo",
        "OmnisSpriteShowcase",
        "PiecesofBrokenGlass",
        "PlatonicManagementDilemma",
        "SecretSanta2011",
        "SerendipityAnEquestrianTale",
        "SJArtCollab",
        "SlightlyDifferent",
        "TheAttackoftheRecoloursSeason1",
        "ThroughTheWonkyEye",
        "TotallyKotor",
        "WinterMelody",
        "ZonowTheHedgehog",

        # missing previous link
        "BambooArmonicKnightsGuild",

        # broken host name
        "Razor",
    )

    def __init__(self, name):
        """Initialise the updater and override the base class ``sleep`` value."""
        super(SmackJeevesUpdater, self).__init__(name)
        # NOTE(review): presumably the delay in seconds between requests --
        # confirm against ComicListUpdater.
        self.sleep = 1.5

    def handle_url(self, url):
        """Parse one search result page.

        Registers every comic card found on the page via ``add_comic``.

        Returns a ``(next_url, num)`` tuple. ``next_url`` is the href of the
        page's ``a.next`` link, or ``None`` when the page has no such link.
        ``num`` is the page count of the last comic parsed on this page, or
        999 when the page listed no comics at all.
        """
        data = self.get_url(url)
        num = 999
        for comictag in data.cssselect('a.card'):
            page_url = comictag.attrib['href']
            name = comictag.cssselect('div.title')[0].text
            # search for url in extra page
            data2 = self.get_url(page_url)
            # find out how many images this comic has
            mo = data2.cssselect('div.num-pages div.value')
            num = int(mo[0].text.strip().replace(',', ''))
            mo = data2.cssselect('div.buttons a:last-child')
            comic_url = mo[0].attrib['href']
            # search for adult flag
            adult = data2.cssselect('div.mature')
            updates = data2.cssselect('div.updates div.value')[0].text_content()
            self.add_comic(name, (comic_url, len(adult) > 0, updates), num)

        # The last result page has no "next" link; report that as None
        # instead of failing with an IndexError.
        next_links = data.cssselect("a.next")
        next_url = next_links[0].attrib['href'] if next_links else None
        return (next_url, num)

    def collect_results(self):
        """Parse all search result pages."""
        # Sort by number of comics, so we can abort when we get under some
        # threshold.
        next_url = "http://www.smackjeeves.com/search.php?last_update=6&sort_by=5"
        last_count = 999
        # Stop below the page-count threshold or when the pages run out.
        while next_url is not None and last_count >= self.MIN_COMICS:
            print(last_count, file=sys.stderr)
            next_url, last_count = self.handle_url(next_url)

    def get_entry(self, name, data):
        """Render one comic as a ``cls(...)`` source line.

        ``data`` is indexed as ``(comic_url, adult, updates)``, which matches
        the tuple ``handle_url`` passes to ``add_comic``. Comics hosted under
        smackjeeves.com are emitted with ``sub=``, all others with ``host=``.
        """
        # Split the host name at the first dot: subdomain / remaining domain.
        sub, top = urlsplit(data[0]).hostname.split('.', 1)
        if top.lower() == "smackjeeves.com":
            opt = "sub='%s'" % sub
        else:
            opt = "host='%s.%s'" % (sub, top)
        if data[1]:
            opt += ", adult=True"
        if data[2] == 'Completed':
            opt += ", endOfLife=True"
        return u"cls('%s', %s)," % (name, opt)
# Script entry point: build the updater with this file's path as its name and
# run it.
if __name__ == '__main__':
SmackJeevesUpdater(__file__).run()

View file

@ -10,7 +10,7 @@ d=$(dirname $0)
if [ $# -ge 1 ]; then
list="$*"
else
list="arcamax comicfury comicgenesis creators gocomics keenspot smackjeeves webcomicfactory comicskingdom"
list="arcamax comicfury comicgenesis creators gocomics keenspot webcomicfactory comicskingdom"
fi
for script in $list; do
target="${d}/../dosagelib/plugins/${script}.py"