diff --git a/lib/html_sanitize_ex/parser.ex b/lib/html_sanitize_ex/parser.ex index 4426dc5..4e17037 100644 --- a/lib/html_sanitize_ex/parser.ex +++ b/lib/html_sanitize_ex/parser.ex @@ -10,16 +10,21 @@ defmodule HtmlSanitizeEx.Parser do @doc """ @type html_tree :: tuple | list @my_root_node "html_sanitize_ex" + @linebreak [239, 188, 191] @spec parse(binary) :: html_tree def parse(html) do - html = "<#{@my_root_node}>#{html}" + html = "<#{@my_root_node}>#{before_parse(html)}" {@my_root_node, [], parsed} = :mochiweb_html.parse(html) if length(parsed) == 1, do: hd(parsed), else: parsed end + defp before_parse(html) do + String.replace(html, ~r/(>)(\r?\n)/, "\\1 #{@linebreak} \\2") + end + def to_html(tokens) do {@my_root_node, [], ensure_list(tokens)} |> :mochiweb_html.to_html @@ -27,6 +32,11 @@ defmodule HtmlSanitizeEx.Parser do @doc """ |> String.replace(~r/^<#{@my_root_node}>/, "") |> String.replace(~r/<\/#{@my_root_node}>$/, "") |> String.replace("</html_sanitize_ex>", "") + |> after_to_html() + end + + defp after_to_html(html) do + String.replace(html, ~r/(\ ?#{@linebreak} )(\r?\n)/, "\\2") end defp ensure_list(list) do diff --git a/test/basic_html_test.exs b/test/basic_html_test.exs index ed474ee..4f90535 100644 --- a/test/basic_html_test.exs +++ b/test/basic_html_test.exs @@ -54,7 +54,7 @@ defmodule HtmlSanitizeExScrubberBasicHTMLTest do test "strips certain tags in multi line strings" do input = "This is <b>a <a href=\"\" target=\"_blank\">test</a></b>.\n\n\n\n

It no longer contains any HTML.

\n" - expected = "This is a test.

It no longer contains any HTML.

" + expected = "This is a test.\n\n\n\n

It no longer contains any HTML.

\n" assert expected == basic_html_sanitize(input) end @@ -110,7 +110,7 @@ defmodule HtmlSanitizeExScrubberBasicHTMLTest do @tag href_scrubbing: true test "test_strip_links_leaves_nonlink_tags" do - assert "My mindall day long" == basic_html_sanitize("My mind\nall day long") + assert "My mind\nall day long" == basic_html_sanitize("My mind\nall day long") end @tag href_scrubbing: true diff --git a/test/markdown_html_test.exs b/test/markdown_html_test.exs index bd94fa3..5fdf632 100644 --- a/test/markdown_html_test.exs +++ b/test/markdown_html_test.exs @@ -83,7 +83,7 @@ defmodule HtmlSanitizeExScrubberMarkdownHTMLTest do test "strips certain tags in multi line strings" do input = "This is <b>a <a href=\"\" target=\"_top\">test</a></b>.\n\n\n\n

It no longer contains any HTML.

\n" - expected = "This is a test.

It no longer contains any HTML.

" + expected = "This is a test.\n\n\n\n

It no longer contains any HTML.

\n" assert expected == sanitize(input) end @@ -139,7 +139,7 @@ defmodule HtmlSanitizeExScrubberMarkdownHTMLTest do @tag href_scrubbing: true test "test_strip_links_leaves_nonlink_tags" do - assert "My mindall day long" == sanitize("My mind\nall day long") + assert "My mind\nall day long" == sanitize("My mind\nall day long") end @tag href_scrubbing: true diff --git a/test/strip_tags_test.exs b/test/strip_tags_test.exs index 3ed2dad..d491834 100644 --- a/test/strip_tags_test.exs +++ b/test/strip_tags_test.exs @@ -37,7 +37,7 @@ defmodule HtmlSanitizeExScrubberStripTagsTest do test "strips tags in multi line strings" do input = "This is <b>a <a href=\"\" target=\"_blank\">test</a></b>.\n\n\n\n

It no longer contains any HTML.

\n" - expected = "This is a test.It no longer contains any HTML." + expected = "This is a test.\n\n\n\nIt no longer contains any HTML.\n" assert expected == strip_tags(input) end diff --git a/test/traverser_test.exs b/test/traverser_test.exs index 92dbd38..75fac92 100644 --- a/test/traverser_test.exs +++ b/test/traverser_test.exs @@ -27,7 +27,7 @@ defmodule HtmlSanitizeExTraverserTest do test "should return expected tree 2" do input = "This is <b>the <a href=\"http://me@example.com\" target=\"_blank\">test</a></b>.\n\n\n\n

It no longer contains any HTML.

\n" - expected = ["This is ", {"b", [], ["the ", "test"]}, ".", "It no ", {"b", [], ["longer ", "contains ", "any ", "HTML", "."]}] + expected = ["This is ", {"b", [], ["the ", "test"]}, ".", " _ \n\n\n\n", "It no ", {"b", [], ["longer ", "contains ", "any ", "HTML", "."]}, " _ \n"] assert expected == parse_to_tree(input) end