Check robots.txt, update excluded comic names.

Author: Bastian Kleineidam, 2013-03-12 20:46:57 +01:00
Parent: 681ff68132
Commit: d1f0f23b4c

2 changed files with 30 additions and 3 deletions

File diff suppressed because one or more lines are too long


@@ -10,7 +10,7 @@ import sys
 import os
 import requests
 sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
-from dosagelib.util import getPageContent, asciify, unescape, tagre
+from dosagelib.util import getPageContent, asciify, unescape, tagre, check_robotstxt
 from dosagelib.scraper import get_scraperclasses
 from scriptutil import contains_case_insensitive, capfirst, save_result, load_result, truncate_name, format_description
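The new import pulls in check_robotstxt from dosagelib.util. Its implementation is not visible in this diff, but the hunk further down wraps the call in try/except IOError, so the helper evidently signals a robots.txt denial by raising. A minimal sketch of such a helper, assuming a requests session and the standard library robot parser (Python 3 spelling; names and error handling here are assumptions, not the dosagelib code):

```
# Hypothetical sketch only -- the real helper lives in dosagelib.util.
import urllib.parse
import urllib.robotparser


def check_robotstxt(url, session):
    """Raise IOError if the site's robots.txt forbids fetching url."""
    parts = urllib.parse.urlsplit(url)
    roboturl = "%s://%s/robots.txt" % (parts.scheme, parts.netloc)
    response = session.get(roboturl)
    if response.status_code in (401, 403):
        # robots.txt itself is access-restricted: assume crawling is unwanted.
        raise IOError("robots.txt denied: %s" % url)
    if response.status_code >= 400:
        # No robots.txt at all: crawling is allowed.
        return
    parser = urllib.robotparser.RobotFileParser()
    parser.parse(response.text.splitlines())
    if not parser.can_fetch("*", url):
        raise IOError("robots.txt denied: %s" % url)
```

Raising instead of returning a boolean keeps the calling loop simple: a denial is handled like any other fetch error.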
@@ -26,6 +26,25 @@ desc_matcher = re.compile(r'</font><br>(.+)(?:</b>)?</td></tr>', re.DOTALL)
 # names of comics to exclude
 exclude_comics = [
+    "BrawlintheFamily", # non-standard navigation
+    "CrowScare", # non-standard navigation
+    "Dreamless", # non-standard navigation
+    "EV", # non-standard navigation
+    "Exposure", # non-standard navigation
+    "Flipside", # non-standard navigation
+    "HerobyNight", # non-standard navigation
+    "LastBlood", # non-standard navigation
+    "MysticRevolution", # non-standard navigation
+    "NoRoomForMagic", # non-standard navigation
+    "PunchanPie", # non-standard navigation
+    "RoadWaffles", # non-standard navigation
+    "Shadowbinders", # non-standard navigation
+    "ShockwaveDarkside", # non-standard navigation
+    "Supernovas", # non-standard navigation
+    "Twokinds", # non-standard navigation
+    "WisdomofMoo", # non-standard navigation
+    "Yirmumah", # non-standard navigation
+    "YouDamnKid", # non-standard navigation
 ]
 # links to last valid strips
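All of the new entries are excluded for the same reason, non-standard navigation. The list is presumably consulted while the search results are parsed, somewhere above the hunk shown next; a filter along these lines (an assumption, since the actual check is outside the visible diff):

```
# Assumed usage of the exclusion list; the real check sits outside the shown hunks.
if name in exclude_comics:
    continue
```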
@@ -51,7 +70,15 @@ def handle_url(url, session, res):
             continue
         if contains_case_insensitive(res, name):
             # we cannot handle two comics that only differ in case
-            print("INFO: skipping possible duplicate", name, file=sys.stderr)
+            print("INFO: skipping possible duplicate", repr(name), file=sys.stderr)
             continue
+        try:
+            if "/d/" not in comicurl:
+                check_robotstxt(comicurl+"d/", session)
+            else:
+                check_robotstxt(comicurl, session)
+        except IOError:
+            print("INFO: robots.txt denied for", repr(name))
+            continue
         res[name] = (comicurl, desc)
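The check probes the comic's strip directory rather than only the front page: the strip pages apparently live under a /d/ path, hence the appended suffix when the collected URL does not already contain it. A short illustration with made-up URLs, assuming the check_robotstxt import from the first hunk:

```
# Made-up example URLs, just to show which URL the robots.txt check sees.
import requests
from dosagelib.util import check_robotstxt

session = requests.Session()

comicurl = "http://somecomic.example.com/"
# No "/d/" in the URL, so the strip directory is probed:
check_robotstxt(comicurl + "d/", session)   # -> http://somecomic.example.com/d/

comicurl = "http://host.example.com/somecomic/d/"
# The URL already points below "/d/", so it is checked unchanged:
check_robotstxt(comicurl, session)
```

Either call raises IOError on a denial, and the loop then skips the comic with an informational message instead of recording it in res.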