Check robots.txt, update excluded comic names.

This commit is contained in:
Bastian Kleineidam 2013-03-12 20:46:48 +01:00
parent c3ad143f7e
commit 681ff68132
2 changed files with 22 additions and 12 deletions

File diff suppressed because one or more lines are too long

View file

@@ -10,7 +10,7 @@ import sys
 import os
 import requests
 sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
-from dosagelib.util import getPageContent, asciify, unescape, tagre
+from dosagelib.util import getPageContent, asciify, unescape, tagre, check_robotstxt
 from dosagelib.scraper import get_scraperclasses
 from scriptutil import contains_case_insensitive, capfirst, save_result, load_result, truncate_name
@@ -93,7 +93,7 @@ exclude_comics = [
 "EarthRiser", # redirects to a new page
 "EdgetheDevilhunter", # page is gone
 "EdibleDirt", # page moved
-"Einstien27sDesk", # page is gone
+"EinstiensDesk", # page is gone
 "ElfOnlyInn", # page moved
 "Ensuing", # broken links
 "etch", # broken images
@@ -104,7 +104,7 @@ exclude_comics = [
 "FaerieTales", # page does not follow standard layout
 "FairestandFallen", # page does not follow standard layout
 "FairyTaleNewVillage", # missing images
-"Fate27sTear", # page moved
+"FatesTear", # page moved
 "FaultyLogic", # page does not follow standard layout
 "FireontheMountain", # page does not follow standard layout
 "FiveBucksanHour", # page is gone
@@ -234,14 +234,14 @@ exclude_comics = [
 "Riboflavin", # page does not follow standard layout
 "RitualsandOfferings", # page is gone
 "RiverCityHigh", # page is gone
-"RM27sothercomics", # page does not follow standard layout
+"RMsothercomics", # page does not follow standard layout
 "RogerAndDominic", # page does not follow standard layout
 "RoleoftheDie", # page is gone
 "RonnieRaccoon", # page moved
 "RosalarianAndapossRandomCreepyTales", # page is gone
 "RulesofMakeBelieve", # page is gone
 "Rveillerie", # page has 403 forbidden
-"SaintPeter27sCross", # page does not follow standard layout
+"SaintPetersCross", # page does not follow standard layout
 "Saturnalia", # page moved
 "SavageIslands", # page has 403 forbidden
 "SaveMeGebus", # page does not follow standard layout
@@ -259,7 +259,7 @@ exclude_comics = [
 "SLAGIT", # missing images
 "SmithStone", # page has 403 forbidden
 "SnowflakeStudios", # page is gone
-"Sock27d", # page is gone
+"Sockd", # page is gone
 "Soks", # page is gone
 "SoManyLevels", # page moved
 "SomethingSoft", # page is gone
@@ -279,7 +279,7 @@ exclude_comics = [
 "ThatWasMcPherson", # page moved
 "The6GUYSInMyHead", # page has 403 forbidden
 "TheAdventuresofCaptainMooki", # page moved
-"TheAdventuresofLi27lDenverPastrami", # page is gone
+"TheAdventuresofLilDenverPastrami", # page is gone
 "TheAdventuresofPeppyThePipingPirate", # page is gone
 "TheAmoeba", # page is gone
 "TheAvatar", # page does not follow standard layout
@@ -287,7 +287,7 @@ exclude_comics = [
 "TheBestandtheBrightest", # page moved
 "TheDevilsPanties", # page moved
 "TheDoctorPepperShow", # page has 403 forbidden
-"TheGods27Pack", # page has 403 forbidden
+"TheGodsPack", # page has 403 forbidden
 "TheMadBrothers", # page does not follow standard layout
 "TheMediocres", # missing images
 "TheNamelessStory", # page has 403 forbidden
@@ -299,7 +299,7 @@ exclude_comics = [
 "TheWotch", # page does not follow standard layout
 "ThunderandLightning", # page moved
 "TinysWorld", # page does not follow standard layout
-"ToonPimp27sPalace", # page moved
+"ToonPimpsPalace", # page moved
 "Tossers", # page moved
 "Towner", # page does not follow standard layout
 "Townies", # page is gone
@@ -380,7 +380,7 @@ def handle_url(url, session, res):
 continue
 if contains_case_insensitive(res, name):
 # we cannot handle two comics that only differ in case
-print("INFO: skipping possible duplicate", name, file=sys.stderr)
+print("INFO: skipping possible duplicate", repr(name), file=sys.stderr)
 continue
 # find out how many images this comic has
 end = match.end()
@@ -389,7 +389,17 @@ def handle_url(url, session, res):
 print("ERROR:", repr(data[end:end+300]), file=sys.stderr)
 continue
 num = int(mo.group(1))
-res[name] = (url_overrides.get(name, url), num)
+url = url_overrides.get(name, url)
+try:
+if "/d/" not in url:
+check_robotstxt(url+"d/", session)
+else:
+check_robotstxt(url, session)
+except IOError:
+print("INFO: robots.txt denied for", repr(name))
+continue
+else:
+res[name] = (url, num)
 def get_results():