Check robots.txt, update excluded comic names.
parent 681ff68132
commit d1f0f23b4c
2 changed files with 30 additions and 3 deletions
File diff suppressed because one or more lines are too long
@@ -10,7 +10,7 @@ import sys
 import os
 import requests
 sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
-from dosagelib.util import getPageContent, asciify, unescape, tagre
+from dosagelib.util import getPageContent, asciify, unescape, tagre, check_robotstxt
 from dosagelib.scraper import get_scraperclasses
 from scriptutil import contains_case_insensitive, capfirst, save_result, load_result, truncate_name, format_description
 
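check_robotstxt is imported from dosagelib.util, but its body is not part of this commit. For orientation only, here is a minimal Python 3 sketch of such a helper, assuming (as the except IOError handler further down implies) that it raises IOError when robots.txt disallows a URL. The stdlib urllib.robotparser stand-in ignores the session argument, which the real helper presumably uses for the HTTP fetch:

    # Hypothetical sketch; the real check_robotstxt lives in dosagelib.util
    # and is not shown in this diff.
    import urllib.robotparser
    from urllib.parse import urlsplit, urlunsplit

    def check_robotstxt(url, session=None):
        """Raise IOError if the host's robots.txt disallows fetching url."""
        parts = urlsplit(url)
        robots_url = urlunsplit((parts.scheme, parts.netloc, "/robots.txt", "", ""))
        parser = urllib.robotparser.RobotFileParser()
        parser.set_url(robots_url)
        parser.read()  # fetch and parse robots.txt (stdlib fetch, not the session)
        if not parser.can_fetch("*", url):
            raise IOError("%s disallowed by %s" % (url, robots_url))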
@@ -26,6 +26,25 @@ desc_matcher = re.compile(r'</font><br>(.+)(?:</b>)?</td></tr>', re.DOTALL)
 
 # names of comics to exclude
 exclude_comics = [
+    "BrawlintheFamily", # non-standard navigation
+    "CrowScare", # non-standard navigation
+    "Dreamless", # non-standard navigation
+    "EV", # non-standard navigation
+    "Exposure", # non-standard navigation
+    "Flipside", # non-standard navigation
+    "HerobyNight", # non-standard navigation
+    "LastBlood", # non-standard navigation
+    "MysticRevolution", # non-standard navigation
+    "NoRoomForMagic", # non-standard navigation
+    "PunchanPie", # non-standard navigation
+    "RoadWaffles", # non-standard navigation
+    "Shadowbinders", # non-standard navigation
+    "ShockwaveDarkside", # non-standard navigation
+    "Supernovas", # non-standard navigation
+    "Twokinds", # non-standard navigation
+    "WisdomofMoo", # non-standard navigation
+    "Yirmumah", # non-standard navigation
+    "YouDamnKid", # non-standard navigation
 ]
 
 # links to last valid strips
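The list itself is pure data; the diff does not show where it is consulted, but the leading continue context in the next hunk suggests a plain membership test earlier in handle_url. A hypothetical sketch of that check (names abridged for the example):

    exclude_comics = ["BrawlintheFamily", "CrowScare"]  # abridged

    for name in ["BrawlintheFamily", "GeneCatlow"]:
        if name in exclude_comics:
            continue  # comics with non-standard navigation are skipped
        print("would scrape", name)  # prints: would scrape GeneCatlow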
@@ -51,7 +70,15 @@ def handle_url(url, session, res):
             continue
         if contains_case_insensitive(res, name):
             # we cannot handle two comics that only differ in case
-            print("INFO: skipping possible duplicate", name, file=sys.stderr)
+            print("INFO: skipping possible duplicate", repr(name), file=sys.stderr)
+            continue
+        try:
+            if "/d/" not in comicurl:
+                check_robotstxt(comicurl+"d/", session)
+            else:
+                check_robotstxt(comicurl, session)
+        except IOError:
+            print("INFO: robots.txt denied for", repr(name))
             continue
         res[name] = (comicurl, desc)
 
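The branch on "/d/" reflects KeenSpot's URL layout: daily strip pages live under a d/ subdirectory, so a bare homepage URL is probed at comicurl + "d/" before the comic is accepted; if robots.txt denies that URL, the comic is skipped rather than scraped. A small illustrative helper (robots_check_url is invented here, not part of the codebase; the URLs are examples) showing which URL ends up being tested:

    def robots_check_url(comicurl):
        # Hypothetical helper mirroring the diff's logic: pick the URL
        # that gets tested against robots.txt.
        if "/d/" not in comicurl:
            return comicurl + "d/"
        return comicurl

    print(robots_check_url("http://example.keenspot.com/"))
    # http://example.keenspot.com/d/
    print(robots_check_url("http://www.example.com/d/20110721.html"))
    # http://www.example.com/d/20110721.html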