From 46f75f7b34325e79d9ac321a7e5ba2e4f32afd44 Mon Sep 17 00:00:00 2001 From: Shish Date: Thu, 30 Mar 2023 20:36:58 +0100 Subject: [PATCH] [static] allow extensions to customise robots.txt --- ext/comment/main.php | 7 +++++++ ext/rule34/main.php | 8 ++++++++ ext/static_files/main.php | 22 ++++++++++++++++++++++ ext/static_files/static/robots.txt | 11 ----------- ext/view/main.php | 8 ++++++++ 5 files changed, 45 insertions(+), 11 deletions(-) delete mode 100644 ext/static_files/static/robots.txt diff --git a/ext/comment/main.php b/ext/comment/main.php index b6da4eed..bf588318 100644 --- a/ext/comment/main.php +++ b/ext/comment/main.php @@ -216,6 +216,13 @@ class CommentList extends Extension } } + public function onRobotsBuilding(RobotsBuildingEvent $event) + { + // comment lists change all the time, crawlers should + // index individual image's comments + $event->add_disallow("comment"); + } + private function onPageRequest_add() { global $user, $page; diff --git a/ext/rule34/main.php b/ext/rule34/main.php index 9e3758d7..22000a0a 100644 --- a/ext/rule34/main.php +++ b/ext/rule34/main.php @@ -101,6 +101,14 @@ class Rule34 extends Extension } } + public function onRobotsBuilding(RobotsBuildingEvent $event) + { + // robots should only check the canonical site, not mirrors + if ($_SERVER['HTTP_HOST'] != "rule34.paheal.net") { + $event->add_disallow(""); + } + } + public function onPageRequest(PageRequestEvent $event) { global $database, $page, $user; diff --git a/ext/static_files/main.php b/ext/static_files/main.php index 8a8693f0..c2b4aef7 100644 --- a/ext/static_files/main.php +++ b/ext/static_files/main.php @@ -4,11 +4,33 @@ declare(strict_types=1); namespace Shimmie2; +class RobotsBuildingEvent extends Event +{ + public array $parts = [ + "User-agent: *", + // Site is rate limited to 1 request / sec, + // returns 503 for more than that + "Crawl-delay: 3", + ]; + + public function add_disallow(string $path): void + { + $this->parts[] = "Disallow: /$path"; + } +} + class StaticFiles extends Extension { public function onPageRequest(PageRequestEvent $event) { global $config, $page; + + if ($event->page_matches("robots.txt")) { + $rbe = send_event(new RobotsBuildingEvent()); + $page->set_mode(PageMode::DATA); + $page->set_data(join("\n", $rbe->parts)); + } + // hax. if ($page->mode == PageMode::PAGE && (!isset($page->blocks) || $this->count_main($page->blocks) == 0)) { $h_pagename = html_escape(implode('/', $event->args)); diff --git a/ext/static_files/static/robots.txt b/ext/static_files/static/robots.txt deleted file mode 100644 index e9ae5fde..00000000 --- a/ext/static_files/static/robots.txt +++ /dev/null @@ -1,11 +0,0 @@ -User-agent: * -# comment lists change all the time, crawlers should -# index individual image's comments -Disallow: /comment/ -# next and prev are just CPU-heavier ways of getting -# to the same images that the index shows -Disallow: /post/next/ -Disallow: /post/prev/ -# Site is rate limited to 1 request / sec, -# returns 503 for more than that -Crawl-delay: 3 diff --git a/ext/view/main.php b/ext/view/main.php index ceb2c83d..24700253 100644 --- a/ext/view/main.php +++ b/ext/view/main.php @@ -94,6 +94,14 @@ class ViewImage extends Extension } } + public function onRobotsBuilding(RobotsBuildingEvent $event) + { + // next and prev are just CPU-heavier ways of getting + // to the same images that the index shows + $event->add_disallow("post/next"); + $event->add_disallow("post/prev"); + } + public function onDisplayingImage(DisplayingImageEvent $event) { global $page, $user;