Skip to content

Commit

Permalink
Automatically populate robots.txt
Browse files Browse the repository at this point in the history
Signed-off-by: Jon Stovell <[email protected]>
  • Loading branch information
Sesquipedalian committed Dec 30, 2024
1 parent 5a0150e commit 0d9163c
Show file tree
Hide file tree
Showing 3 changed files with 185 additions and 0 deletions.
1 change: 1 addition & 0 deletions Languages/en_US/Help.php
Original file line number Diff line number Diff line change
Expand Up @@ -570,6 +570,7 @@
As above except only Administrators can see spider status - to all other users spiders appear as guests.
</li>
</ul>';
// Help-popup text for the robots.txt setting in Admin > Search Engines; explains RFC 9309 and why SMF appends rules.
$helptxt['robots_txt'] = 'The robots.txt file is used to implement the <a href="https://www.rfc-editor.org/rfc/rfc9309.html" class="bbc_link">Robots Exclusion Protocol</a>, a standard used by websites to indicate to search engine spiders and other web robots which portions of the website they are allowed to visit. This file is typically located in your website’s root directory.<br><br>SMF adds some rules to this file in order to guide spiders away from URLs that they should not bother to crawl. This improves efficiency and reduces server load when a spider is crawling your forum.';

$helptxt['birthday_email'] = 'Choose the index of the birthday email message to use. A preview will be shown in the Email Subject and Email Body fields.<br><strong>Note:</strong> Selecting this setting does not automatically enable birthday emails. To enable birthday emails use the <a href="{scripturl}?action=admin;area=scheduledtasks;{session_var}={session_id}" target="_blank" rel="noopener">Scheduled Tasks</a> page and enable the birthday email task.';
$helptxt['pm_bcc'] = 'When sending a personal message you can choose to add a recipient as BCC (Blind Carbon Copy). BCC recipients do not have their identities revealed to the other recipients of the message.';
Expand Down
5 changes: 5 additions & 0 deletions Languages/en_US/Search.php
Original file line number Diff line number Diff line change
Expand Up @@ -180,4 +180,9 @@
$txt['spider_stats_page_hits'] = 'Page Hits';
$txt['spider_stats_no_entries'] = 'There are currently no spider statistics available.';

// UI strings for the robots.txt path option on the Admin > Search Engines settings page.
$txt['robots_txt'] = 'Add SMF rules to robots.txt';
// Subtext shown under the text input for the robots.txt file path.
$txt['robots_txt_info'] = 'Enter the path to your robots.txt file so that SMF can append rules to it.';
// Label for the button that auto-fills the detected robots.txt path.
$txt['robots_txt_auto'] = 'Detect path';
// Error shown when the configured robots.txt path exists but cannot be written.
$txt['robots_txt_not_writable'] = 'The robots.txt file is not writable.';

?>
179 changes: 179 additions & 0 deletions Sources/Actions/Admin/SearchEngines.php
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
use SMF\SecurityToken;
use SMF\Theme;
use SMF\Time;
use SMF\Url;
use SMF\User;
use SMF\Utils;

Expand Down Expand Up @@ -637,6 +638,7 @@ public function settings(): void
ACP::saveDBSettings($config_vars);

self::recacheSpiderNames();
self::addRobotsTxtRules();

$_SESSION['adm-save'] = true;
Utils::redirectexit('action=admin;area=sengines;sa=settings');
Expand Down Expand Up @@ -794,6 +796,33 @@ function disableFields()
}
disableFields();';

// Now the setting for robots.txt.
$config_vars[] = '';

if (empty(Config::$modSettings['robots_txt'])) {
$detected_path = self::detectRobotsTxt();

if (
$detected_path !== ''
&& $detected_path !== (Config::$modSettings['robots_txt'] ?? '')
&& is_writable($detected_path)
) {
$post_input = '<button class="button floatnone" onclick="document.getElementById(\'robots_txt\').value = ' . Utils::escapeJavaScript($detected_path) . '; return false;">' . Lang::getTxt('robots_txt_auto') . '</button>';
}
} elseif (!is_writable(Config::$modSettings['robots_txt'])) {
$invalid = true;
$post_input = '<br><span class="error">' . Lang::$txt['robots_txt_not_writable'] . '</span>';
}

$config_vars[] = [
'text',
'robots_txt',
'subtext' => Lang::$txt['robots_txt_info'],
'size' => 45,
'invalid' => $invalid ?? false,
'postinput' => $post_input ?? '',
];

IntegrationHook::call('integrate_modify_search_engine_settings', [&$config_vars]);

return $config_vars;
Expand Down Expand Up @@ -1100,6 +1129,156 @@ protected function __construct()

Utils::$context['sub_action'] = &$this->subaction;
}

/**
* Finds and returns the file path to robots.txt, or else the file path
* where it should be created if it doesn't already exist.
*
* @return string The path to robots.txt.
*/
protected static function detectRobotsTxt(): string
{
$path_from_boarddir = strtr(Config::$boarddir, ['/' => DIRECTORY_SEPARATOR]);
$path_from_boardurl = strtr(Url::create(Config::$boardurl)->path, ['/' => DIRECTORY_SEPARATOR]);

while (
!file_exists($path_from_boarddir . DIRECTORY_SEPARATOR . 'robots.txt')
&& basename($path_from_boarddir) === basename($path_from_boardurl)
&& dirname($path_from_boardurl) !== $path_from_boardurl
) {
$path_from_boarddir = dirname($path_from_boarddir);
$path_from_boardurl = dirname($path_from_boardurl);
}

return $path_from_boarddir . DIRECTORY_SEPARATOR . 'robots.txt';
}

/**
* Checks whether robots.txt is writable and, if so, adds some rules to it
* for SMF purposes.
*/
protected static function addRobotsTxtRules(): void
{
// Can we write to the file?
if (
(Config::$modSettings['robots_txt'] ?? '') === ''
|| (
is_file(Config::$modSettings['robots_txt'])
&& !Utils::makeWritable(Config::$modSettings['robots_txt'])
)
|| (
!file_exists(Config::$modSettings['robots_txt'])
&& !Utils::makeWritable(dirname(Config::$modSettings['robots_txt']))
)
) {
return;
}

// Define the rules we want to include.
$rules = [
'*' => [
'allow' => [],
'disallow' => [
Url::create(Config::$scripturl)->path . '?msg=*',
],
],
];

IntegrationHook::call('integrate_robots_txt_rules', [&$rules]);

// Build the new file content.
$new_content = [];

if (is_file(Config::$modSettings['robots_txt'])) {
$hash = md5_file(Config::$modSettings['robots_txt']);

$current_user_agent = '';

// Keep all existing content and filter out anything in $rules that already exists.
foreach (file(Config::$modSettings['robots_txt']) as $line) {
// Found a new user agent line.
if (preg_match('/^user-agent:\h*([^\n]+)/i', $line, $matches)) {
$prev_user_agent = $current_user_agent;
$current_user_agent = $matches[1];

// Append any new rules for the previous user agent.
if (isset($rules[$prev_user_agent])) {
foreach ($rules[$prev_user_agent] as $type => $patterns) {
foreach ($patterns as $pattern) {
$new_content[] = ucfirst($type) . ': ' . $pattern . "\n";
}
}
}

// Don't do the same rules twice.
unset($rules[$prev_user_agent]);
}

// Append this line.
$new_content[] = $line;

// Filter out anything in $rules that already exists.
if (preg_match('/^((?:dis)?allow):\h*([^\n]+)/i', $line, $matches)) {
$type = strtolower($matches[1]);
$pattern = $matches[2];

$rules[$current_user_agent][$type] = array_diff(
$rules[$current_user_agent][$type],
[$pattern],
);
}
}
}

// Filter out empty $rules.
foreach ($rules as $user_agent => $rule_parts) {
foreach ($rule_parts as $type => $patterns) {
if ($rules[$user_agent][$type] === []) {
unset($rules[$user_agent][$type]);
}
}

if ($rules[$user_agent] === []) {
unset($rules[$user_agent]);
}
}

// Append the new rules.
foreach ($rules as $user_agent => $rule_parts) {
$new_content[] = "\n";
$new_content[] = 'User-agent: ' . $user_agent . "\n";

foreach ($rule_parts as $type => $patterns) {
foreach ($patterns as $pattern) {
$new_content[] = ucfirst($type) . ': ' . $pattern . "\n";
}
}
}

// Finalize the content.
$new_content = trim(implode('', $new_content)) . "\n";

// If nothing changed, bail out.
if (isset($hash) && md5($new_content) === $hash) {
return;
}

// Where should we save the backup file?
if (Utils::makeWritable(dirname(Config::$modSettings['robots_txt']))) {
$backup_file = preg_replace('/\.txt$/', '.' . (date_create('now UTC')->format('Ymd\THis\Z')) . '.txt', Config::$modSettings['robots_txt']);
} elseif (Utils::makeWritable(Config::$boarddir)) {
$backup_file = Config::$boarddir . DIRECTORY_SEPARATOR . 'robots.' . (date_create('now UTC')->format('Ymd\THis\Z')) . '.txt';
} else {
$backup_file = null;
}

// Write the new content to disk.
Config::safeFileWrite(
file: Config::$modSettings['robots_txt'],
data: $new_content,
backup_file: $backup_file,
);
}
}

?>

0 comments on commit 0d9163c

Please sign in to comment.