diff --git a/src/Umbraco.Infrastructure/PropertyEditors/RichTextPropertyIndexValueFactory.cs b/src/Umbraco.Infrastructure/PropertyEditors/RichTextPropertyIndexValueFactory.cs
index 0eb1ee257ad1..6013e6c4c2f1 100644
--- a/src/Umbraco.Infrastructure/PropertyEditors/RichTextPropertyIndexValueFactory.cs
+++ b/src/Umbraco.Infrastructure/PropertyEditors/RichTextPropertyIndexValueFactory.cs
@@ -1,5 +1,6 @@
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
+using System.Text.RegularExpressions;
using Umbraco.Cms.Core.Configuration.Models;
using Umbraco.Cms.Core.Models;
using Umbraco.Cms.Core.Serialization;
@@ -50,7 +51,7 @@ public override IEnumerable GetIndexValues(
};
// the actual content (RTE content without markup, i.e. the actual words) must be indexed under the property alias
- var richTextWithoutMarkup = richTextEditorValue.Markup.StripHtml();
+ var richTextWithoutMarkup = StripHtmlForIndexing(richTextEditorValue.Markup);
if (richTextEditorValue.Blocks?.ContentData.Any() is not true)
{
// no blocks; index the content for the culture and be done with it
@@ -132,4 +133,27 @@ public override IEnumerable GetIndexValues(
protected override IEnumerable GetDataItems(RichTextEditorValue input, bool published)
=> GetDataItems(input.Blocks?.ContentData ?? [], input.Blocks?.Expose ?? [], published);
+
+    /// <summary>
+    /// Strips HTML tags from content while preserving whitespace from line breaks.
+    /// This addresses the issue where &lt;br&gt; tags don't create word boundaries when HTML is stripped.
+    /// </summary>
+    /// <param name="html">The HTML content to strip</param>
+    /// <returns>Plain text with proper word boundaries</returns>
+    private static string StripHtmlForIndexing(string html)
+    {
+        if (string.IsNullOrWhiteSpace(html))
+        {
+            return string.Empty;
+        }
+
+        // Replace <br> and <br/> tags (with any amount of whitespace and attributes) with spaces,
+        // so that words on either side of a line break stay separate once the markup is stripped.
+        // The \b after "br" keeps look-alike tags (e.g. <break>) out of the match, [^>]* allows
+        // attributes or a self-closing slash, and the trailing \s* swallows whitespace after the tag.
+        html = Regex.Replace(html, @"<br\b[^>]*/?>\s*", " ", RegexOptions.IgnoreCase);
+
+        // Use the existing StripHtml extension for everything else
+        return html.StripHtml();
+    }
}
diff --git a/tests/Umbraco.Tests.UnitTests/Umbraco.Core/PropertyEditors/RichTextPropertyIndexValueFactoryTests.cs b/tests/Umbraco.Tests.UnitTests/Umbraco.Core/PropertyEditors/RichTextPropertyIndexValueFactoryTests.cs
new file mode 100644
index 000000000000..826395a64a4c
--- /dev/null
+++ b/tests/Umbraco.Tests.UnitTests/Umbraco.Core/PropertyEditors/RichTextPropertyIndexValueFactoryTests.cs
@@ -0,0 +1,78 @@
+using Microsoft.Extensions.Logging;
+using Microsoft.Extensions.Options;
+using Moq;
+using NUnit.Framework;
+using Umbraco.Cms.Core.Configuration.Models;
+using Umbraco.Cms.Core.Models;
+using Umbraco.Cms.Core.PropertyEditors;
+using Umbraco.Cms.Core.Serialization;
+
+namespace Umbraco.Cms.Tests.UnitTests.Umbraco.Core.PropertyEditors;
+
+/// <summary>
+/// Tests for <see cref="RichTextPropertyIndexValueFactory"/> to ensure it correctly creates index values from rich text properties.
+/// </summary>
+public class RichTextPropertyIndexValueFactoryTests
+{
+    /// <summary>
+    /// Tests that the factory can create index values from a rich text property with valid content
+    /// </summary>
+    /// <param name="testContent">The rich text markup stored on the property</param>
+    /// <param name="expected">The plain text expected in the index, ignoring trailing whitespace</param>
+    [TestCase("<p>Sample text</p>", "Sample text")]
+    [TestCase("<p>John Smith<br>Company ABC<br>London</p>", "John Smith Company ABC London")]
+    [TestCase("<p>John Smith<break>Company ABC<break>London</p>", "John SmithCompany ABCLondon")]
+    [TestCase("<p>John Smith<br>Company ABC<branything>London</p>", "John Smith Company ABCLondon")]
+    [TestCase("<p>Another sample text with <strong>bold</strong> content</p>", "Another sample text with bold content")]
+    [TestCase("<p>Text with <a href=\"https://example.com\">link</a></p>", "Text with link")]
+    [TestCase("<p>Text with <img src=\"image.jpg\" alt=\"image\" /></p>", "Text with")]
+    [TestCase("<p>Text with <span style=\"color: red;\">styled text</span></p>", "Text with styled text")]
+    [TestCase("<p>Text with <em>emphasized</em> content</p>", "Text with emphasized content")]
+    [TestCase("<p>Text with <u>underlined</u> content</p>", "Text with underlined content")]
+    [TestCase("<p>Text with <code>inline code</code></p>", "Text with inline code")]
+    [TestCase("<p>Text with <pre>code block</pre></p>", "Text with code block")]
+    [TestCase("<p>Text with <blockquote>quoted text</blockquote></p>", "Text with quoted text")]
+    [TestCase("<p>Text with <ul><li>list item 1</li><li>list item 2</li></ul></p>",
+        "Text with list item 1list item 2")]
+    [TestCase("<p>Text with <ol><li>ordered item 1</li><li>ordered item 2</li></ol></p>",
+        "Text with ordered item 1ordered item 2")]
+    [TestCase("<p>Text with <div>div content</div></p>", "Text with div content")]
+    [TestCase("<p>Text with <span>span content</span></p>", "Text with span content")]
+    [TestCase("<p>Text with <strong>bold</strong> and <em>italic</em> content</p>",
+        "Text with bold and italic content")]
+    [TestCase("<p>Text with <a href=\"https://example.com\" target=\"_blank\">external link</a></p>",
+        "Text with external link")]
+    [TestCase("<p>John Smith<br/>Company ABC<br/>London</p>", "John Smith Company ABC London")]
+    [TestCase("<p>John Smith<br />Company ABC<br />London</p>", "John Smith Company ABC London")]
+    public void Can_Create_Index_Values_From_RichText_Property(string testContent, string expected)
+    {
+        var propertyEditorCollection = new PropertyEditorCollection(new DataEditorCollection(() => null));
+        var jsonSerializer = Mock.Of<IJsonSerializer>();
+        var indexingSettings = Mock.Of<IOptionsMonitor<IndexingSettings>>();
+        Mock.Get(indexingSettings).Setup(x => x.CurrentValue).Returns(new IndexingSettings { });
+        var logger = Mock.Of<ILogger<RichTextPropertyIndexValueFactory>>();
+        string alias = "richText";
+
+        var factory = new RichTextPropertyIndexValueFactory(
+            propertyEditorCollection,
+            jsonSerializer,
+            indexingSettings,
+            logger);
+
+        // create a mock property whose value is the rich text markup, whatever culture/segment/published state is asked for
+        var property = Mock.Of<IProperty>(p => p.Alias == alias
+                                               && (string)p.GetValue(It.IsAny<string>(), It.IsAny<string>(),
+                                                   It.IsAny<bool>()) == testContent);
+
+        // get the index value for the property
+        var indexValue = factory
+            .GetIndexValues(property, null, null, true, [], new Dictionary<Guid, IContentType>())
+            .FirstOrDefault(kvp => kvp.FieldName == alias);
+        Assert.IsNotNull(indexValue);
+
+        // assert that the index value is created correctly (it might contain a trailing whitespace, but that's OK)
+        var actualIndexValue = indexValue.Values.SingleOrDefault() as string;
+        Assert.IsNotNull(actualIndexValue);
+        Assert.AreEqual(expected, actualIndexValue.TrimEnd());
+    }
+}