diff --git a/src/Umbraco.Infrastructure/PropertyEditors/RichTextPropertyIndexValueFactory.cs b/src/Umbraco.Infrastructure/PropertyEditors/RichTextPropertyIndexValueFactory.cs index 0eb1ee257ad1..6013e6c4c2f1 100644 --- a/src/Umbraco.Infrastructure/PropertyEditors/RichTextPropertyIndexValueFactory.cs +++ b/src/Umbraco.Infrastructure/PropertyEditors/RichTextPropertyIndexValueFactory.cs @@ -1,5 +1,6 @@ using Microsoft.Extensions.Logging; using Microsoft.Extensions.Options; +using System.Text.RegularExpressions; using Umbraco.Cms.Core.Configuration.Models; using Umbraco.Cms.Core.Models; using Umbraco.Cms.Core.Serialization; @@ -50,7 +51,7 @@ public override IEnumerable GetIndexValues( }; // the actual content (RTE content without markup, i.e. the actual words) must be indexed under the property alias - var richTextWithoutMarkup = richTextEditorValue.Markup.StripHtml(); + var richTextWithoutMarkup = StripHtmlForIndexing(richTextEditorValue.Markup); if (richTextEditorValue.Blocks?.ContentData.Any() is not true) { // no blocks; index the content for the culture and be done with it @@ -132,4 +133,27 @@ public override IEnumerable GetIndexValues( protected override IEnumerable GetDataItems(RichTextEditorValue input, bool published) => GetDataItems(input.Blocks?.ContentData ?? [], input.Blocks?.Expose ?? [], published); + + /// + /// Strips HTML tags from content while preserving whitespace from line breaks. + /// This addresses the issue where <br> tags don't create word boundaries when HTML is stripped. + /// + /// The HTML content to strip + /// Plain text with proper word boundaries + private static string StripHtmlForIndexing(string html) + { + if (string.IsNullOrWhiteSpace(html)) + { + return string.Empty; + } + + // Replace
<br> and <br/> tags (with any amount of whitespace and attributes) with spaces + // This regex matches: + // - <br> (with / without spaces or attributes) + // - <br/>
(with / without spaces or attributes) + html = Regex.Replace(html, @"]*/?>\s*", " ", RegexOptions.IgnoreCase); + + // Use the existing Microsoft StripHtml function for everything else + return html.StripHtml(); + } } diff --git a/tests/Umbraco.Tests.UnitTests/Umbraco.Core/PropertyEditors/RichTextPropertyIndexValueFactoryTests.cs b/tests/Umbraco.Tests.UnitTests/Umbraco.Core/PropertyEditors/RichTextPropertyIndexValueFactoryTests.cs new file mode 100644 index 000000000000..826395a64a4c --- /dev/null +++ b/tests/Umbraco.Tests.UnitTests/Umbraco.Core/PropertyEditors/RichTextPropertyIndexValueFactoryTests.cs @@ -0,0 +1,78 @@ +using Microsoft.Extensions.Logging; +using Microsoft.Extensions.Options; +using Moq; +using NUnit.Framework; +using Umbraco.Cms.Core.Configuration.Models; +using Umbraco.Cms.Core.Models; +using Umbraco.Cms.Core.PropertyEditors; +using Umbraco.Cms.Core.Serialization; + +namespace Umbraco.Cms.Tests.UnitTests.Umbraco.Core.PropertyEditors; + +/// +/// Tests for to ensure it correctly creates index values from rich text properties. +/// +public class RichTextPropertyIndexValueFactoryTests +{ + /// + /// Tests that the factory can create index values from a rich text property with valid content + /// + /// + /// + [TestCase("

Sample text

", "Sample text")] + [TestCase("

John Smith
Company ABC
London

", "John Smith Company ABC London")] + [TestCase("

John SmithCompany ABCLondon

", "John SmithCompany ABCLondon")] + [TestCase("

John Smith
Company ABCLondon

", "John Smith Company ABCLondon")] + [TestCase("

Another sample text with bold content

", "Another sample text with bold content")] + [TestCase("

Text with link

", "Text with link")] + [TestCase("

Text with \"image\"

", "Text with")] + [TestCase("

Text with styled text

", "Text with styled text")] + [TestCase("

Text with emphasized content

", "Text with emphasized content")] + [TestCase("

Text with underlined content

", "Text with underlined content")] + [TestCase("

Text with inline code

", "Text with inline code")] + [TestCase("

Text with

code block

", "Text with code block")] + [TestCase("

Text with

quoted text

", "Text with quoted text")] + [TestCase("

Text with

  • list item 1
  • list item 2

", + "Text with list item 1list item 2")] + [TestCase("

Text with

  1. ordered item 1
  2. ordered item 2

", + "Text with ordered item 1ordered item 2")] + [TestCase("

Text with

div content

", "Text with div content")] + [TestCase("

Text with span content

", "Text with span content")] + [TestCase("

Text with bold and italic content

", + "Text with bold and italic content")] + [TestCase("

Text with external link

", + "Text with external link")] + [TestCase("

John Smith
Company ABC
London

", "John Smith Company ABC London")] + [TestCase("

John Smith
Company ABC
London

", "John Smith Company ABC London")] + public void Can_Create_Index_Values_From_RichText_Property(string testContent, string expected) + { + var propertyEditorCollection = new PropertyEditorCollection(new DataEditorCollection(() => null)); + var jsonSerializer = Mock.Of(); + var indexingSettings = Mock.Of>(); + Mock.Get(indexingSettings).Setup(x => x.CurrentValue).Returns(new IndexingSettings { }); + var logger = Mock.Of>(); + string alias = "richText"; + + var factory = new RichTextPropertyIndexValueFactory( + propertyEditorCollection, + jsonSerializer, + indexingSettings, + logger); + + // create a mock property with the rich text value + var property = Mock.Of(p => p.Alias == alias + && (string)p.GetValue(It.IsAny(), It.IsAny(), + It.IsAny()) == testContent); + + // get the index value for the property + var indexValue = factory + .GetIndexValues(property, null, null, true, [], new Dictionary()) + .FirstOrDefault(kvp => kvp.FieldName == alias); + Assert.IsNotNull(indexValue); + + // assert that index the value is created correctly (it might contain a trailing whitespace, but that's OK) + var expectedIndexValue = indexValue.Values.SingleOrDefault() as string; + Assert.IsNotNull(expectedIndexValue); + Assert.AreEqual(expected, expectedIndexValue.TrimEnd()); + } +}