Skip to content

Commit

Permalink
upgraded and modified htmlsanitizer
Browse files Browse the repository at this point in the history
- html sanitizer now keeps child nodes to avoid removing potentially relevant markup
- added new test for html converter
  • Loading branch information
michael-mason committed May 11, 2021
1 parent 70947bb commit 4a2b233
Show file tree
Hide file tree
Showing 4 changed files with 43 additions and 6 deletions.
22 changes: 20 additions & 2 deletions src/CollectionsOnline.Core/CollectionsOnline.Core.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -33,16 +33,26 @@
<WarningLevel>4</WarningLevel>
</PropertyGroup>
<ItemGroup>
<Reference Include="AngleSharp, Version=0.15.0.0, Culture=neutral, PublicKeyToken=e83494dcdc6d31ea">
<HintPath>..\packages\AngleSharp.0.15.0\lib\net472\AngleSharp.dll</HintPath>
<Private>True</Private>
</Reference>
<Reference Include="AngleSharp.Css, Version=0.15.0.0, Culture=neutral, PublicKeyToken=e83494dcdc6d31ea">
<HintPath>..\packages\AngleSharp.Css.0.15.0\lib\net472\AngleSharp.Css.dll</HintPath>
<Private>True</Private>
</Reference>
<Reference Include="CsQuery">
<HintPath>..\packages\CsQuery.1.3.4\lib\net40\CsQuery.dll</HintPath>
</Reference>
<Reference Include="HtmlAgilityPack">
<HintPath>..\packages\HtmlAgilityPack.1.4.9\lib\Net45\HtmlAgilityPack.dll</HintPath>
</Reference>
<Reference Include="HtmlSanitizer, Version=2.0.5735.24296, Culture=neutral, processorArchitecture=MSIL">
<HintPath>..\packages\HtmlSanitizer.2.0.5735.24296\lib\net40\HtmlSanitizer.dll</HintPath>
<Reference Include="HtmlSanitizer, Version=5.0.0.0, Culture=neutral, PublicKeyToken=61c49a1a9e79cc28">
<HintPath>..\packages\HtmlSanitizer.5.0.404\lib\net46\HtmlSanitizer.dll</HintPath>
<Private>True</Private>
</Reference>
<Reference Include="Microsoft.CSharp" />
<Reference Include="mscorlib" />
<Reference Include="Ninject, Version=3.2.0.0, Culture=neutral, PublicKeyToken=c7192dc5380945e7, processorArchitecture=MSIL">
<SpecificVersion>False</SpecificVersion>
<HintPath>..\packages\Ninject.3.2.2.0\lib\net45-full\Ninject.dll</HintPath>
Expand Down Expand Up @@ -75,6 +85,14 @@
<Reference Include="System.ComponentModel.Composition" />
<Reference Include="System.Configuration" />
<Reference Include="System.Core" />
<Reference Include="System.Runtime.CompilerServices.Unsafe, Version=5.0.0.0, Culture=neutral, PublicKeyToken=b03f5f7f11d50a3a">
<HintPath>..\packages\System.Runtime.CompilerServices.Unsafe.5.0.0\lib\net45\System.Runtime.CompilerServices.Unsafe.dll</HintPath>
<Private>True</Private>
</Reference>
<Reference Include="System.Text.Encoding.CodePages, Version=5.0.0.0, Culture=neutral, PublicKeyToken=b03f5f7f11d50a3a">
<HintPath>..\packages\System.Text.Encoding.CodePages.5.0.0\lib\net461\System.Text.Encoding.CodePages.dll</HintPath>
<Private>True</Private>
</Reference>
<Reference Include="System.XML" />
</ItemGroup>
<ItemGroup>
Expand Down
8 changes: 5 additions & 3 deletions src/CollectionsOnline.Core/Utilities/HtmlConverter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,13 @@ public static HtmlSanitizerResult HtmlSanitizer(string html)
{
var sanitizer = new HtmlSanitizer(DefaultAllowedTags, allowedAttributes:DefaultAllowedAttributes);

sanitizer.KeepChildNodes = true;

var result = new HtmlSanitizerResult();

sanitizer.RemovingTag += ((s, e) => { result.HasRemovedTag = true; });
sanitizer.RemovingStyle += ((s, e) => { result.HasRemovedStyle = true; });
sanitizer.RemovingAttribute += ((s, e) => { result.HasRemovedAttribute = true; });
sanitizer.RemovingTag += (s, e) => { result.HasRemovedTag = true; };
sanitizer.RemovingStyle += (s, e) => { result.HasRemovedStyle = true; };
sanitizer.RemovingAttribute += (s, e) => { result.HasRemovedAttribute = true; };

result.Html = sanitizer.Sanitize(html);

Expand Down
6 changes: 5 additions & 1 deletion src/CollectionsOnline.Core/packages.config
Original file line number Diff line number Diff line change
@@ -1,11 +1,15 @@
<?xml version="1.0" encoding="utf-8"?>
<packages>
<package id="HtmlSanitizer" version="5.0.404" targetFramework="net472" />
<package id="AngleSharp" version="0.15.0" targetFramework="net472" />
<package id="AngleSharp.Css" version="0.15.0" targetFramework="net472" />
<package id="CsQuery" version="1.3.4" targetFramework="net451" />
<package id="HtmlAgilityPack" version="1.4.9" targetFramework="net451" />
<package id="HtmlSanitizer" version="2.0.5735.24296" targetFramework="net451" />
<package id="Ninject" version="3.2.2.0" targetFramework="net45" />
<package id="RavenDB.Client" version="3.5.1" targetFramework="net451" />
<package id="Serilog" version="1.5.14" targetFramework="net451" />
<package id="Serilog.Sinks.Seq" version="1.5.27" targetFramework="net451" />
<package id="SerilogMetrics" version="1.0.33" targetFramework="net451" />
<package id="System.Runtime.CompilerServices.Unsafe" version="5.0.0" targetFramework="net472" />
<package id="System.Text.Encoding.CodePages" version="5.0.0" targetFramework="net472" />
</packages>
13 changes: 13 additions & 0 deletions src/CollectionsOnline.Tests/Core/Utilities/HtmlConverterTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -19,5 +19,18 @@ public void HtmlToText_WithScientificName_ReturnsOnlyText()
// Then
result.ShouldBe("df Austriella corrugata (Deshayes, 1843)");
}

[Fact]
public void HtmlSanitizer_WithMalformedHTML_ReturnsSanitizedHTML()
{
    // Given: two headings whose text is wrapped in <font> elements
    const string markup = @"<h3><font size=""3"">Peter Hunter OAM, Kodak Australasia Pty Ltd: Public Relations, 1961 - 1995</font></h3><h3><font size=""3"">Early career in England</font></h3>";
    const string expected = "<h3>Peter Hunter OAM, Kodak Australasia Pty Ltd: Public Relations, 1961 - 1995</h3><h3>Early career in England</h3>";

    // When
    var sanitized = HtmlConverter.HtmlSanitizer(markup);

    // Then: the <font> wrappers are gone but the text they contained remains inside each <h3>
    sanitized.Html.ShouldBe(expected);
}
}
}

0 comments on commit 4a2b233

Please sign in to comment.