Add fix for missing white-space between nodes

Refs #12 Refs #14
rrrene · Apr 30, 2017 · c4bbac6
1 parent 823de7c
commit c4bbac6
Show file tree

Hide file tree

Showing 5 changed files with 17 additions and 7 deletions.
diff --git a/lib/html_sanitize_ex/parser.ex b/lib/html_sanitize_ex/parser.ex
@@ -10,23 +10,33 @@ defmodule HtmlSanitizeEx.Parser do  @doc """
   @type html_tree :: tuple | list
 
   @my_root_node "html_sanitize_ex"
+  @linebreak [239, 188, 191]
 
   @spec parse(binary) :: html_tree
 
   def parse(html) do
-    html = "<#{@my_root_node}>#{html}</#{@my_root_node}>"
+    html = "<#{@my_root_node}>#{before_parse(html)}</#{@my_root_node}>"
     {@my_root_node, [], parsed} = :mochiweb_html.parse(html)
 
     if length(parsed) == 1, do: hd(parsed), else: parsed
   end
 
+  defp before_parse(html) do
+    String.replace(html, ~r/(>)(\r?\n)/, "\\1 #{@linebreak} \\2")
+  end
+
   def to_html(tokens) do
     {@my_root_node, [], ensure_list(tokens)}
     |> :mochiweb_html.to_html
     |> Enum.join
     |> String.replace(~r/^<#{@my_root_node}>/, "")
     |> String.replace(~r/<\/#{@my_root_node}>$/, "")
     |> String.replace("&lt;/html_sanitize_ex&gt;", "")
+    |> after_to_html()
+  end
+
+  defp after_to_html(html) do
+    String.replace(html, ~r/(\ ?#{@linebreak} )(\r?\n)/, "\\2")
   end
 
   defp ensure_list(list) do

diff --git a/test/basic_html_test.exs b/test/basic_html_test.exs
@@ -54,7 +54,7 @@ defmodule HtmlSanitizeExScrubberBasicHTMLTest do
 
   test "strips certain tags in multi line strings" do
     input = "<title>This is <b>a <a href=\"\" target=\"_blank\">test</a></b>.</title>\n\n<!-- it has a comment -->\n\n<p>It no <b>longer <strong>contains <em>any <strike>HTML</strike></em>.</strong></b></p>\n"
-    expected = "This is <b>a <a href=\"\">test</a></b>.<p>It no <b>longer <strong>contains <em>any HTML</em>.</strong></b></p>"
+    expected = "This is <b>a <a href=\"\">test</a></b>.\n\n\n\n<p>It no <b>longer <strong>contains <em>any HTML</em>.</strong></b></p>\n"
     assert expected == basic_html_sanitize(input)
   end
 
@@ -110,7 +110,7 @@ defmodule HtmlSanitizeExScrubberBasicHTMLTest do
 
   @tag href_scrubbing: true
   test "test_strip_links_leaves_nonlink_tags" do
-    assert "<a href=\"almost\">My mind</a><a href=\"almost\">all <b>day</b> long</a>" == basic_html_sanitize("<a href='almost'>My mind</a>\n<A href='almost'>all <b>day</b> long</A>")
+    assert "<a href=\"almost\">My mind</a>\n<a href=\"almost\">all <b>day</b> long</a>" == basic_html_sanitize("<a href='almost'>My mind</a>\n<A href='almost'>all <b>day</b> long</A>")
   end
 
   @tag href_scrubbing: true

diff --git a/test/markdown_html_test.exs b/test/markdown_html_test.exs
@@ -83,7 +83,7 @@ defmodule HtmlSanitizeExScrubberMarkdownHTMLTest do
 
   test "strips certain tags in multi line strings" do
     input = "<title>This is <b>a <a href=\"\" target=\"_top\">test</a></b>.</title>\n\n<!-- it has a comment -->\n\n<p>It no <b>longer <strong>contains <em>any <strike>HTML</strike></em>.</strong></b></p>\n"
-    expected = "This is <b>a <a href=\"\">test</a></b>.<p>It no <b>longer <strong>contains <em>any HTML</em>.</strong></b></p>"
+    expected = "This is <b>a <a href=\"\">test</a></b>.\n\n\n\n<p>It no <b>longer <strong>contains <em>any HTML</em>.</strong></b></p>\n"
     assert expected == sanitize(input)
   end
 
@@ -139,7 +139,7 @@ defmodule HtmlSanitizeExScrubberMarkdownHTMLTest do
 
   @tag href_scrubbing: true
   test "test_strip_links_leaves_nonlink_tags" do
-    assert "<a href=\"almost\">My mind</a><a href=\"almost\">all <b>day</b> long</a>" == sanitize("<a href='almost'>My mind</a>\n<A href='almost'>all <b>day</b> long</A>")
+    assert "<a href=\"almost\">My mind</a>\n<a href=\"almost\">all <b>day</b> long</a>" == sanitize("<a href='almost'>My mind</a>\n<A href='almost'>all <b>day</b> long</A>")
   end
 
   @tag href_scrubbing: true

diff --git a/test/strip_tags_test.exs b/test/strip_tags_test.exs
@@ -37,7 +37,7 @@ defmodule HtmlSanitizeExScrubberStripTagsTest do
 
   test "strips tags in multi line strings" do
     input = "<title>This is <b>a <a href=\"\" target=\"_blank\">test</a></b>.</title>\n\n<!-- it has a comment -->\n\n<p>It no <b>longer <strong>contains <em>any <strike>HTML</strike></em>.</strong></b></p>\n"
-    expected = "This is a test.It no longer contains any HTML."
+    expected = "This is a test.\n\n\n\nIt no longer contains any HTML.\n"
     assert expected == strip_tags(input)
   end
 

diff --git a/test/traverser_test.exs b/test/traverser_test.exs
@@ -27,7 +27,7 @@ defmodule HtmlSanitizeExTraverserTest do
 
   test "should return expected tree 2" do
     input = "<title>This is <b>the <a href=\"http://[email protected]\" target=\"_blank\">test</a></b>.</title>\n\n\n\n<p>It no <b>longer <strong>contains <em>any <strike>HTML</strike></em>.</strong></b></p>\n"
-    expected = ["This is ", {"b", [], ["the ", "test"]}, ".", "It no ", {"b", [], ["longer ", "contains ", "any ", "HTML", "."]}]
+    expected = ["This is ", {"b", [], ["the ", "test"]}, ".", " ï¼¿ \n\n\n\n", "It no ", {"b", [], ["longer ", "contains ", "any ", "HTML", "."]}, " ï¼¿ \n"]
     assert expected == parse_to_tree(input)
   end