From 916b153fc2ee09075b5520a799394caa2c6b538b Mon Sep 17 00:00:00 2001
From: Jon Stovell
Date: Sat, 28 Dec 2024 20:04:43 -0700
Subject: [PATCH] Automatically populate robots.txt

Signed-off-by: Jon Stovell
---
 Languages/en_US/Help.php                |   1 +
 Languages/en_US/Search.php              |   5 +
 Sources/Actions/Admin/SearchEngines.php | 194 ++++++++++++++++++++++++
 3 files changed, 200 insertions(+)

diff --git a/Languages/en_US/Help.php b/Languages/en_US/Help.php
index b47100aa5d..2d4429efdf 100644
--- a/Languages/en_US/Help.php
+++ b/Languages/en_US/Help.php
@@ -570,6 +570,7 @@
 As above except only Administrators can see spider status - to all other users spiders appear as guests.
 ';
+$helptxt['robots_txt'] = 'The robots.txt file is used to implement the Robots Exclusion Protocol, a standard used by websites to indicate to search engine spiders and other web robots which portions of the website they are allowed to visit. This file is typically located in your website’s root directory.<br><br>SMF adds some rules to this file in order to guide spiders away from URLs that they should not bother to crawl. This improves efficiency and reduces server load when a spider is crawling your forum.';
 $helptxt['birthday_email'] = 'Choose the index of the birthday email message to use. A preview will be shown in the Email Subject and Email Body fields.<br>Note: Selecting this setting does not automatically enable birthday emails. To enable birthday emails use the Scheduled Tasks page and enable the birthday email task.';
 $helptxt['pm_bcc'] = 'When sending a personal message you can choose to add a recipient as BCC (Blind Carbon Copy). BCC recipients do not have their identities revealed to the other recipients of the message.';
diff --git a/Languages/en_US/Search.php b/Languages/en_US/Search.php
index 35b141602b..87494d9459 100644
--- a/Languages/en_US/Search.php
+++ b/Languages/en_US/Search.php
@@ -180,4 +180,9 @@
 $txt['spider_stats_page_hits'] = 'Page Hits';
 $txt['spider_stats_no_entries'] = 'There are currently no spider statistics available.';
 
+$txt['robots_txt'] = 'Add SMF rules to robots.txt';
+$txt['robots_txt_info'] = 'Enter the path to your robots.txt file so that SMF can append rules to it.';
+$txt['robots_txt_auto'] = 'Detect path';
+$txt['robots_txt_not_writable'] = 'The robots.txt file is not writable.';
+
 ?>
\ No newline at end of file
diff --git a/Sources/Actions/Admin/SearchEngines.php b/Sources/Actions/Admin/SearchEngines.php
index bfc9d00da5..154a235a99 100644
--- a/Sources/Actions/Admin/SearchEngines.php
+++ b/Sources/Actions/Admin/SearchEngines.php
@@ -30,6 +30,7 @@
 use SMF\SecurityToken;
 use SMF\Theme;
 use SMF\Time;
+use SMF\Url;
 use SMF\User;
 use SMF\Utils;
 
@@ -637,6 +638,7 @@ public function settings(): void
 			ACP::saveDBSettings($config_vars);
 			self::recacheSpiderNames();
+			self::addRobotsTxtRules();
 
 			$_SESSION['adm-save'] = true;
 			Utils::redirectexit('action=admin;area=sengines;sa=settings');
 
@@ -794,6 +796,29 @@ function disableFields()
 	}
 	disableFields();';
 
+		// Now the setting for robots.txt.
+		$config_vars[] = '';
+
+		if (empty(Config::$modSettings['robots_txt'])) {
+			$detected_path = self::detectRobotsTxt();
+
+			if ($detected_path !== (Config::$modSettings['robots_txt'] ?? '')) {
+				$post_input = '<br><input type="button" value="' . Lang::$txt['robots_txt_auto'] . '" data-path="' . Utils::htmlspecialchars($detected_path) . '" onclick="document.getElementById(\'robots_txt\').value = this.dataset.path;" class="button">';
+			}
+		} elseif (!is_writable(Config::$modSettings['robots_txt'])) {
+			$invalid = true;
+			$post_input = '<br><span class="error">' . Lang::$txt['robots_txt_not_writable'] . '</span>';
+		}
+
+		$config_vars[] = [
+			'text',
+			'robots_txt',
+			'subtext' => Lang::$txt['robots_txt_info'],
+			'size' => 45,
+			'invalid' => $invalid ?? false,
+			'postinput' => $post_input ?? '',
+		];
+
 		IntegrationHook::call('integrate_modify_search_engine_settings', [&$config_vars]);
 
 		return $config_vars;
 
@@ -1100,6 +1125,175 @@ protected function __construct()
 		Utils::$context['sub_action'] = &$this->subaction;
 	}
 
+	/**
+	 * Finds and returns the file path to robots.txt, or else the file path
+	 * where it should be created if it doesn't already exist.
+	 *
+	 * @return string The path to robots.txt.
+	 */
+	protected static function detectRobotsTxt(): string
+	{
+		// First try $_SERVER['CONTEXT_DOCUMENT_ROOT'], then try $_SERVER['DOCUMENT_ROOT'].
+		foreach (['CONTEXT_DOCUMENT_ROOT', 'DOCUMENT_ROOT'] as $var) {
+			if (
+				isset($_SERVER[$var])
+				&& str_starts_with(
+					strtr(Config::$boarddir, ['/' => DIRECTORY_SEPARATOR]),
+					strtr($_SERVER[$var], ['/' => DIRECTORY_SEPARATOR]),
+				)
+			) {
+				return rtrim(strtr($_SERVER[$var], ['/' => DIRECTORY_SEPARATOR]), DIRECTORY_SEPARATOR) . DIRECTORY_SEPARATOR . 'robots.txt';
+			}
+		}
+
+		// If the server has an odd configuration, try to figure out the path ourselves.
+		$path_from_boarddir = strtr(Config::$boarddir, ['/' => DIRECTORY_SEPARATOR]);
+		$path_from_boardurl = strtr(Url::create(Config::$boardurl)->path, ['/' => DIRECTORY_SEPARATOR]);
+
+		// Walk up the path until we find the document root.
+		while (
+			// Stop if we find robots.txt.
+			!file_exists($path_from_boarddir . DIRECTORY_SEPARATOR . 'robots.txt')
+			// Stop if the URL path and the filesystem path diverge.
+			&& basename($path_from_boarddir) === basename($path_from_boardurl)
+			// Stop if we get to the root of the path according to the URL.
+			&& dirname($path_from_boardurl) !== $path_from_boardurl
+		) {
+			$path_from_boarddir = dirname($path_from_boarddir);
+			$path_from_boardurl = dirname($path_from_boardurl);
+		}
+
+		return $path_from_boarddir . DIRECTORY_SEPARATOR . 'robots.txt';
+	}
+
+	/**
+	 * Checks whether robots.txt is writable and, if so, adds some rules to it
+	 * for SMF purposes.
+	 */
+	protected static function addRobotsTxtRules(): void
+	{
+		// Can we write to the file?
+		if (
+			(Config::$modSettings['robots_txt'] ?? '') === ''
+			|| (
+				is_file(Config::$modSettings['robots_txt'])
+				&& !Utils::makeWritable(Config::$modSettings['robots_txt'])
+			)
+			|| (
+				!file_exists(Config::$modSettings['robots_txt'])
+				&& !Utils::makeWritable(dirname(Config::$modSettings['robots_txt']))
+			)
+		) {
+			return;
+		}
+
+		// Define the rules we want to include.
+		$rules = [
+			'*' => [
+				'allow' => [],
+				'disallow' => [
+					Url::create(Config::$scripturl)->path . '?msg=*',
+				],
+			],
+		];
+
+		IntegrationHook::call('integrate_robots_txt_rules', [&$rules]);
+
+		// Build the new file content.
+		$new_content = [];
+
+		if (is_file(Config::$modSettings['robots_txt'])) {
+			$hash = md5_file(Config::$modSettings['robots_txt']);
+
+			$current_user_agent = '';
+
+			// Keep all existing content and filter out anything in $rules that already exists.
+			foreach (file(Config::$modSettings['robots_txt']) as $line) {
+				// Found a new user agent line.
+				if (preg_match('/^user-agent:\h*([^\n]+)/i', $line, $matches)) {
+					$prev_user_agent = $current_user_agent;
+					$current_user_agent = $matches[1];
+
+					// Append any new rules for the previous user agent.
+					if (isset($rules[$prev_user_agent])) {
+						foreach ($rules[$prev_user_agent] as $type => $patterns) {
+							foreach ($patterns as $pattern) {
+								$new_content[] = ucfirst($type) . ': ' . $pattern . "\n";
+							}
+						}
+					}
+
+					// Don't do the same rules twice.
+					unset($rules[$prev_user_agent]);
+				}
+
+				// Append this line.
+				$new_content[] = $line;
+
+				// Filter out anything in $rules that already exists.
+				if (preg_match('/^((?:dis)?allow):\h*([^\n]+)/i', $line, $matches)) {
+					$type = strtolower($matches[1]);
+					$pattern = $matches[2];
+
+					$rules[$current_user_agent][$type] = array_diff(
+						$rules[$current_user_agent][$type] ?? [],
+						[$pattern],
+					);
+				}
+			}
+		}
+
+		// Filter out empty $rules.
+		foreach ($rules as $user_agent => $rule_parts) {
+			foreach ($rule_parts as $type => $patterns) {
+				if ($rules[$user_agent][$type] === []) {
+					unset($rules[$user_agent][$type]);
+				}
+			}
+
+			if ($rules[$user_agent] === []) {
+				unset($rules[$user_agent]);
+			}
+		}
+
+		// Append the new rules.
+		foreach ($rules as $user_agent => $rule_parts) {
+			$new_content[] = "\n";
+			$new_content[] = 'User-agent: ' . $user_agent . "\n";
+
+			foreach ($rule_parts as $type => $patterns) {
+				foreach ($patterns as $pattern) {
+					$new_content[] = ucfirst($type) . ': ' . $pattern . "\n";
+				}
+			}
+		}
+
+		// Finalize the content.
+		$new_content = trim(implode('', $new_content)) . "\n";
+
+		// If nothing changed, bail out.
+		if (isset($hash) && md5($new_content) === $hash) {
+			return;
+		}
+
+		// Where should we save the backup file?
+		if (Utils::makeWritable(dirname(Config::$modSettings['robots_txt']))) {
+			$backup_file = preg_replace('/\.txt$/', '.' . (date_create('now UTC')->format('Ymd\THis\Z')) . '.txt', Config::$modSettings['robots_txt']);
+		} elseif (Utils::makeWritable(Config::$boarddir)) {
+			$backup_file = Config::$boarddir . DIRECTORY_SEPARATOR . 'robots.' . (date_create('now UTC')->format('Ymd\THis\Z')) . '.txt';
+		} else {
+			$backup_file = null;
+		}
+
+		// Write the new content to disk.
+		Config::safeFileWrite(
+			file: Config::$modSettings['robots_txt'],
+			data: $new_content,
+			backup_file: $backup_file,
+		);
+	}
 }
 
 ?>
\ No newline at end of file
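A note on the resulting output: with the default rules, the only pattern SMF adds is the board's script URL path plus '?msg=*'. For a forum at https://example.com/forum (a hypothetical address, so Url::create(Config::$scripturl)->path is /forum/index.php), saving the search engine settings with a valid robots.txt path would leave an empty or missing robots.txt containing:

    User-agent: *
    Disallow: /forum/index.php?msg=*

If robots.txt already lists that pattern under User-agent: *, the rebuilt content matches the stored hash and the file is left untouched.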
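As a sketch of the fallback detection in detectRobotsTxt(), assume a hypothetical host where neither CONTEXT_DOCUMENT_ROOT nor DOCUMENT_ROOT contains Config::$boarddir, with Config::$boarddir set to /var/www/httpdocs/forum and Config::$boardurl set to https://example.com/forum. The walk starts from /var/www/httpdocs/forum and /forum; their basenames both equal 'forum' and no robots.txt is found, so both paths climb one level to /var/www/httpdocs and /. On the next pass the basenames no longer match, the loop stops, and the method returns /var/www/httpdocs/robots.txt, the inferred document root location.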
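Mods can contribute their own patterns through the integrate_robots_txt_rules hook, which receives the $rules array by reference. A minimal sketch of a callback follows; the function name and the excluded action are hypothetical, not part of this patch:

    use SMF\Config;
    use SMF\Url;

    /**
     * Hypothetical callback for the integrate_robots_txt_rules hook.
     *
     * @param array &$rules Patterns grouped by user agent, then by 'allow' or 'disallow'.
     */
    function mymod_robots_txt_rules(array &$rules): void
    {
        // Keep all spiders away from printer-friendly copies of topics.
        $rules['*']['disallow'][] = Url::create(Config::$scripturl)->path . '?action=printpage*';
    }

Because addRobotsTxtRules() filters out patterns that already appear in the file, a callback like this can append its pattern unconditionally without creating duplicates.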