Skip to content

Cherry picked #19540 to V16 (and fixed changed signatures) #19592

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using System.Text.RegularExpressions;
using Umbraco.Cms.Core.Configuration.Models;
using Umbraco.Cms.Core.Models;
using Umbraco.Cms.Core.Serialization;
Expand Down Expand Up @@ -50,7 +51,7 @@ public override IEnumerable<IndexValue> GetIndexValues(
};

// the actual content (RTE content without markup, i.e. the actual words) must be indexed under the property alias
var richTextWithoutMarkup = richTextEditorValue.Markup.StripHtml();
var richTextWithoutMarkup = StripHtmlForIndexing(richTextEditorValue.Markup);
if (richTextEditorValue.Blocks?.ContentData.Any() is not true)
{
// no blocks; index the content for the culture and be done with it
Expand Down Expand Up @@ -132,4 +133,27 @@ public override IEnumerable<IndexValue> GetIndexValues(

protected override IEnumerable<RawDataItem> GetDataItems(RichTextEditorValue input, bool published)
=> GetDataItems(input.Blocks?.ContentData ?? [], input.Blocks?.Expose ?? [], published);

/// <summary>
/// Strips HTML tags from content while preserving whitespace from line breaks.
/// This addresses the issue where &lt;br&gt; tags don't create word boundaries when HTML is stripped.
/// </summary>
/// <param name="html">The HTML content to strip</param>
/// <returns>Plain text with proper word boundaries</returns>
private static string StripHtmlForIndexing(string html)
{
if (string.IsNullOrWhiteSpace(html))
{
return string.Empty;
}

// Replace <br> and <br/> tags (with any amount of whitespace and attributes) with spaces
// This regex matches:
// - <br> (with / without spaces or attributes)
// - <br /> (with / without spaces or attributes)
html = Regex.Replace(html, @"<br\b[^>]*/?>\s*", " ", RegexOptions.IgnoreCase);

// Use the existing Microsoft StripHtml function for everything else
return html.StripHtml();
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using Moq;
using NUnit.Framework;
using Umbraco.Cms.Core.Configuration.Models;
using Umbraco.Cms.Core.Models;
using Umbraco.Cms.Core.PropertyEditors;
using Umbraco.Cms.Core.Serialization;

namespace Umbraco.Cms.Tests.UnitTests.Umbraco.Core.PropertyEditors;

/// <summary>
/// Tests for <see cref="RichTextPropertyIndexValueFactory"/> to ensure it correctly creates index values from rich text properties.
/// </summary>
public class RichTextPropertyIndexValueFactoryTests
{
/// <summary>
/// Tests that the factory can create index values from a rich text property with valid content
/// </summary>
/// <param name="testContent"></param>
/// <param name="expected"></param>
[TestCase("<p>Sample text</p>", "Sample text")]
[TestCase("<p>John Smith<br>Company ABC<br>London</p>", "John Smith Company ABC London")]
[TestCase("<p>John Smith<break>Company ABC<break>London</p>", "John SmithCompany ABCLondon")]
[TestCase("<p>John Smith<br>Company ABC<branything>London</p>", "John Smith Company ABCLondon")]
[TestCase("<p>Another sample text with <strong>bold</strong> content</p>", "Another sample text with bold content")]
[TestCase("<p>Text with <a href=\"https://example.com\">link</a></p>", "Text with link")]
[TestCase("<p>Text with <img src=\"image.jpg\" alt=\"image\" /></p>", "Text with")]
[TestCase("<p>Text with <span style=\"color: red;\">styled text</span></p>", "Text with styled text")]
[TestCase("<p>Text with <em>emphasized</em> content</p>", "Text with emphasized content")]
[TestCase("<p>Text with <u>underlined</u> content</p>", "Text with underlined content")]
[TestCase("<p>Text with <code>inline code</code></p>", "Text with inline code")]
[TestCase("<p>Text with <pre><code>code block</code></pre></p>", "Text with code block")]
[TestCase("<p>Text with <blockquote>quoted text</blockquote></p>", "Text with quoted text")]
[TestCase("<p>Text with <ul><li>list item 1</li><li>list item 2</li></ul></p>",
"Text with list item 1list item 2")]
[TestCase("<p>Text with <ol><li>ordered item 1</li><li>ordered item 2</li></ol></p>",
"Text with ordered item 1ordered item 2")]
[TestCase("<p>Text with <div class=\"class-name\">div content</div></p>", "Text with div content")]
[TestCase("<p>Text with <span class=\"class-name\">span content</span></p>", "Text with span content")]
[TestCase("<p>Text with <strong>bold</strong> and <em>italic</em> content</p>",
"Text with bold and italic content")]
[TestCase("<p>Text with <a href=\"https://example.com\" target=\"_blank\">external link</a></p>",
"Text with external link")]
[TestCase("<p>John Smith<br class=\"test\">Company ABC<br>London</p>", "John Smith Company ABC London")]
[TestCase("<p>John Smith<br \r\n />Company ABC<br>London</p>", "John Smith Company ABC London")]
public void Can_Create_Index_Values_From_RichText_Property(string testContent, string expected)
{
var propertyEditorCollection = new PropertyEditorCollection(new DataEditorCollection(() => null));
var jsonSerializer = Mock.Of<IJsonSerializer>();
var indexingSettings = Mock.Of<IOptionsMonitor<IndexingSettings>>();
Mock.Get(indexingSettings).Setup(x => x.CurrentValue).Returns(new IndexingSettings { });
var logger = Mock.Of<ILogger<RichTextPropertyIndexValueFactory>>();
string alias = "richText";

var factory = new RichTextPropertyIndexValueFactory(
propertyEditorCollection,
jsonSerializer,
indexingSettings,
logger);

// create a mock property with the rich text value
var property = Mock.Of<IProperty>(p => p.Alias == alias
&& (string)p.GetValue(It.IsAny<string>(), It.IsAny<string>(),
It.IsAny<bool>()) == testContent);

// get the index value for the property
var indexValue = factory
.GetIndexValues(property, null, null, true, [], new Dictionary<Guid, IContentType>())
.FirstOrDefault(kvp => kvp.FieldName == alias);
Assert.IsNotNull(indexValue);

// assert that index the value is created correctly (it might contain a trailing whitespace, but that's OK)
var expectedIndexValue = indexValue.Values.SingleOrDefault() as string;
Assert.IsNotNull(expectedIndexValue);
Assert.AreEqual(expected, expectedIndexValue.TrimEnd());
}
}
Loading