It won't hurt 💉

APB9785 · Jul 1, 2015 · 69ea11d · 69ea11d
commit 69ea11d
Show file tree

Hide file tree

Showing 18 changed files with 850 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,6 @@
+/_build
+/deps
+/docs/all.json
+test.json
+erl_crash.dump
+*.ez
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,20 @@
+Copyright (c) 2014 René Föhring
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/README.md b/README.md
@@ -0,0 +1,4 @@
+HtmlSanitizeEx
+==============
+
+** TODO: Add description **
diff --git a/config/config.exs b/config/config.exs
@@ -0,0 +1,24 @@
+# This file is responsible for configuring your application
+# and its dependencies with the aid of the Mix.Config module.
+use Mix.Config
+
+# This configuration is loaded before any dependency and is restricted
+# to this project. If another project depends on this project, this
+# file won't be loaded nor affect the parent project. For this reason,
+# if you want to provide default values for your application for
+# third-party users, it should be done in your mix.exs file.
+#
+# NOTE: no `config` call is active in this file; everything below is
+# commented-out sample material.
+
+# Sample configuration:
+#
+#     config :logger, :console,
+#       level: :info,
+#       format: "$date $time [$level] $metadata$message\n",
+#       metadata: [:user_id]
+
+# It is also possible to import configuration files, relative to this
+# directory. For example, you can emulate configuration per environment
+# by uncommenting the line below and defining dev.exs, test.exs and such.
+# Configuration from the imported file will override the ones defined
+# here (which is why it is important to import them last).
+#
+#     import_config "#{Mix.env}.exs"
diff --git a/lib/html_sanitize_ex.ex b/lib/html_sanitize_ex.ex
@@ -0,0 +1,19 @@
+defmodule HtmlSanitizeEx do
+  @moduledoc """
+  Convenience entry points. Each function hands its argument to
+  `HtmlSanitizeEx.Scrubber.scrub/2` together with one of the bundled
+  scrubber modules.
+  """
+
+  alias HtmlSanitizeEx.Scrubber
+
+  @doc "Runs `html` through the `Scrubber.NoScrub` scrubber."
+  def noscrub(html), do: Scrubber.scrub(html, Scrubber.NoScrub)
+
+  @doc "Runs `html` through the `Scrubber.BasicHTML` scrubber."
+  def basic_html(html), do: Scrubber.scrub(html, Scrubber.BasicHTML)
+
+  @doc "Currently the same as `basic_html/1`: it also uses `Scrubber.BasicHTML`."
+  def markdown(html), do: Scrubber.scrub(html, Scrubber.BasicHTML)
+
+  @doc "Runs `html` through the `Scrubber.StripTags` scrubber."
+  def strip_tags(html), do: Scrubber.scrub(html, Scrubber.StripTags)
+end
diff --git a/lib/html_sanitize_ex/parser.ex b/lib/html_sanitize_ex/parser.ex
@@ -0,0 +1,38 @@
+defmodule HtmlSanitizeEx.Parser do
+  @moduledoc """
+  Thin wrapper around `:mochiweb_html` that parses an HTML string into a tree
+  and renders a tree back into a string.
+
+  The input is wrapped in an artificial `<html_sanitize_ex>` root element
+  before parsing; that wrapper is stripped again when rendering.
+  """
+
+  @type html_tree :: tuple | list
+
+  @my_root_node "html_sanitize_ex"
+
+  @doc """
+  Parses a HTML string.
+
+  Returns the single top-level node when there is exactly one, otherwise the
+  list of top-level nodes.
+
+  ## Examples
+
+      iex> HtmlSanitizeEx.Parser.parse("<div class=js-action>hello world</div>")
+      {"div", [{"class", "js-action"}], ["hello world"]}
+
+      iex> HtmlSanitizeEx.Parser.parse("<div>first</div><div>second</div>")
+      [{"div", [], ["first"]}, {"div", [], ["second"]}]
+  """
+  @spec parse(binary) :: html_tree
+  def parse(html) do
+    wrapped = "<#{@my_root_node}>#{html}</#{@my_root_node}>"
+
+    # Assertive match: the parser is expected to hand back our own root node.
+    {@my_root_node, [], parsed} = :mochiweb_html.parse(wrapped)
+
+    case parsed do
+      [single] -> single
+      nodes -> nodes
+    end
+  end
+
+  @doc """
+  Renders `tokens` (a single node or a list of nodes) into an HTML string.
+
+  The nodes are placed under the artificial root element, rendered with
+  `:mochiweb_html.to_html/1`, and the root's opening and closing tags are then
+  removed from the resulting string.
+  """
+  @spec to_html(html_tree | binary) :: binary
+  def to_html(tokens) do
+    {@my_root_node, [], ensure_list(tokens)}
+      |> :mochiweb_html.to_html
+      |> Enum.join
+      |> String.replace(~r/^<#{@my_root_node}>/, "")
+      |> String.replace(~r/<\/#{@my_root_node}>$/, "")
+      # Also drops any HTML-escaped copy of the root's closing tag.
+      |> String.replace("&lt;/html_sanitize_ex&gt;", "")
+  end
+
+  # Wraps a single node in a list; any list (including an empty one) is
+  # passed through as-is.
+  defp ensure_list(list) when is_list(list), do: list
+  defp ensure_list(node), do: [node]
+end
diff --git a/lib/html_sanitize_ex/scrubber.ex b/lib/html_sanitize_ex/scrubber.ex
@@ -0,0 +1,17 @@
+defmodule HtmlSanitizeEx.Scrubber do
+  @moduledoc """
+  Drives a scrub run: pre-processes the input with the scrubber module, parses
+  it, traverses the tree with the scrubber module and renders the result.
+  """
+
+  alias HtmlSanitizeEx.Parser
+  alias HtmlSanitizeEx.Traverser
+
+  @doc """
+  Scrubs `html` using `scrubber_module`.
+
+  Returns `""` for `nil` or an empty string without calling the scrubber.
+  """
+  def scrub(html, _scrubber_module) when html in ["", nil], do: ""
+
+  def scrub(html, scrubber_module) do
+    prepared = scrubber_module.before_scrub(html)
+    tree = Parser.parse(prepared)
+    scrubbed = Traverser.traverse(tree, scrubber_module)
+    Parser.to_html(scrubbed)
+  end
+end
diff --git a/lib/html_sanitize_ex/scrubber/basic_html.ex b/lib/html_sanitize_ex/scrubber/basic_html.ex
@@ -0,0 +1,72 @@
+defmodule HtmlSanitizeEx.Scrubber.BasicHTML do
+  @moduledoc """
+  Scrubber that keeps a fixed set of basic HTML tags and a small set of
+  attributes on them.
+
+  Tags outside the allow list are replaced by their children, comments are
+  replaced by an empty string, and every attribute without a matching
+  `scrub_attribute/2` clause is dropped.
+  """
+
+  require HtmlSanitizeEx.Scrubber.Meta
+  alias HtmlSanitizeEx.Scrubber.Meta
+
+  @valid_schemes ["http://", "https://"]
+
+  @doc """
+  Pre-processes the raw input by delegating to
+  `HtmlSanitizeEx.Scrubber.StripTags.before_scrub/1`.
+  """
+  def before_scrub(text) do
+    HtmlSanitizeEx.Scrubber.StripTags.before_scrub(text)
+  end
+
+  @doc """
+  Scrubs a single node.
+
+    * allowed tags are kept and their attributes go through `scrub_attributes/2`
+    * any other `{tag, attributes, children}` node is replaced by `children`
+    * `{:comment, _}` becomes `""`
+    * any other `{token, children}` node is replaced by `children`
+    * everything else is treated as text and passed to `scrub_text/1`
+  """
+  Meta.allow_tags_and_scrub_its_attributes ["h1", "h2", "h3", "h4", "h5",
+                  "a", "b", "blockquote", "br", "code", "del", "em", "hr", "i",
+                  "img", "li", "ol", "ul", "p", "pre", "span", "strong", "u",
+                  "table", "tbody", "td", "th", "thead", "tr"]
+
+  # If we haven't covered the tag until here, we scrub it and keep only its
+  # children.
+  def scrub({_tag, _attributes, children}), do: children
+
+  def scrub({:comment, _children}), do: ""
+  def scrub({_token, children}), do: children
+
+  # Anything that is not a tuple node is handled as a text node.
+  def scrub(text), do: scrub_text(text)
+
+  @doc false
+  def scrub_attributes(tag, attributes) do
+    attributes
+      |> Enum.map(fn(attr) -> scrub_attribute(tag, attr) end)
+      |> Enum.reject(&is_nil/1)
+  end
+
+  @doc """
+  Scrubs a single attribute of `tag`.
+
+  Returns the `{name, value}` pair when the attribute is allowed, `nil` when it
+  has to be dropped.
+  """
+  Meta.allow_tag_with_these_attributes "a", ["name", "title"]
+
+  # An href whose value starts with "&" is always dropped.
+  def scrub_attribute("a", {"href", "&" <> _}), do: nil
+
+  def scrub_attribute("a", {"href", href}) do
+    if allowed_uri?(href), do: {"href", href}, else: nil
+  end
+
+  Meta.allow_tag_with_these_attributes "img", ["width", "height", "title", "alt"]
+
+  # The source is validated exactly like a link target and returned unchanged,
+  # so the scheme is preserved in the output.
+  def scrub_attribute("img", {"src", src}) do
+    if allowed_uri?(src), do: {"src", src}, else: nil
+  end
+
+  # If we haven't covered the attribute until here, we scrub it.
+  def scrub_attribute(_tag, _attribute), do: nil
+
+  @doc "Scrubs the content of a text node. Text is returned unchanged."
+  def scrub_text(text) do
+    text
+  end
+
+  # A URI is accepted when it carries no scheme at all or one of @valid_schemes.
+  defp allowed_uri?(uri) do
+    no_scheme?(uri) || valid_scheme?(uri)
+  end
+
+  # Any ":" in the value is taken as the sign of a scheme.
+  defp no_scheme?(uri) do
+    not String.contains?(uri, ":")
+  end
+
+  defp valid_scheme?(uri) do
+    String.starts_with?(uri, @valid_schemes)
+  end
+end
diff --git a/lib/html_sanitize_ex/scrubber/meta.ex b/lib/html_sanitize_ex/scrubber/meta.ex
@@ -0,0 +1,40 @@
+defmodule HtmlSanitizeEx.Scrubber.Meta do
+  @moduledoc """
+  Macros that generate `scrub/1` and `scrub_attribute/2` clauses inside the
+  module that invokes them.
+  """
+
+  @doc """
+  Allows the given tags. Their attributes are passed through the invoking
+  module's `scrub_attributes/2`.
+  """
+  defmacro allow_tags_and_scrub_its_attributes(list) do
+    for tag <- list, do: tag_clause_scrubbing_attributes(tag)
+  end
+
+  @doc "For `tag`, allows every attribute named in `list`, whatever its value."
+  defmacro allow_tag_with_these_attributes(tag, list) do
+    for attr_name <- list, do: attribute_clause(tag, attr_name)
+  end
+
+  @doc "Allows the given tags only when they carry no attributes."
+  defmacro allow_these_tags_without_attributes(list) do
+    for tag <- list, do: tag_clause_without_attributes(tag)
+  end
+
+  # Builds a scrub/1 clause that keeps the tag and scrubs its attributes.
+  defp tag_clause_scrubbing_attributes(tag) do
+    quote do
+      def scrub({unquote(tag), attrs, kids}) do
+        {unquote(tag), scrub_attributes(unquote(tag), attrs), kids}
+      end
+    end
+  end
+
+  # Builds a scrub_attribute/2 clause that keeps the named attribute as is.
+  defp attribute_clause(tag, attr_name) do
+    quote do
+      def scrub_attribute(unquote(tag), {unquote(attr_name), val}) do
+        {unquote(attr_name), val}
+      end
+    end
+  end
+
+  # Builds a scrub/1 clause that matches the tag only with an empty
+  # attribute list.
+  defp tag_clause_without_attributes(tag) do
+    quote do
+      def scrub({unquote(tag), [], kids}) do
+        {unquote(tag), [], kids}
+      end
+    end
+  end
+end
diff --git a/lib/html_sanitize_ex/scrubber/no_scrub.ex b/lib/html_sanitize_ex/scrubber/no_scrub.ex
@@ -0,0 +1,59 @@
+defmodule HtmlSanitizeEx.Scrubber.NoScrub do
+  @moduledoc """
+  A pass-through scrubber: tags and their attributes are returned unchanged.
+
+  Intended for tests and as a starting point for writing your own scrubber.
+  """
+
+  @doc "Returns the raw input untouched."
+  def before_scrub(text), do: text
+
+  @doc """
+    Scrubs one node.
+
+    A `{tag, attributes, children}` node is kept; its attributes go through
+    `scrub_attribute/2` one by one. A `{token, children}` node (comments,
+    doctypes) is replaced by its `children`. Anything else is treated as a
+    text node and returned as is.
+  """
+  def scrub({tag, attributes, children}), do: {tag, scrub_attributes(tag, attributes), children}
+  def scrub({_token, children}), do: children
+  def scrub(text), do: text
+
+  @doc false
+  def scrub_attributes(tag, attributes) do
+    # Only a nil result removes an attribute; every other value is kept.
+    Enum.flat_map(attributes, fn(attr) ->
+      case scrub_attribute(tag, attr) do
+        nil -> []
+        kept -> [kept]
+      end
+    end)
+  end
+
+  @doc """
+    Scrubs a single attribute for a given tag. Here, every attribute is kept.
+
+    In your own scrubber you can add clauses that match specific attributes of
+    specific tags. Return the `{name, value}` pair to keep an attribute and
+    `nil` to drop it.
+
+    For example, to allow the href attribute only with the "http" and "https"
+    protocols:
+
+      def scrub_attribute("a", {"href", "http" <> target}) do
+        {"href", "http" <> target}
+      end
+
+      def scrub_attribute("a", {"href", _}) do
+        nil
+      end
+  """
+  def scrub_attribute(_tag, attribute), do: attribute
+end
diff --git a/lib/html_sanitize_ex/scrubber/strip_tags.ex b/lib/html_sanitize_ex/scrubber/strip_tags.ex
@@ -0,0 +1,14 @@
+defmodule HtmlSanitizeEx.Scrubber.StripTags do
+  @moduledoc """
+  Scrubber that removes every tag and keeps only what the tags contained.
+  """
+
+  @cdata_open "<![CDATA["
+
+  @doc "Deletes every `<![CDATA[` opening marker from the raw input."
+  def before_scrub(text) do
+    String.replace(text, @cdata_open, "")
+  end
+
+  @doc """
+  Scrubs one node: tags and other tokens are replaced by their children,
+  comments by an empty string, and anything else is returned unchanged.
+  """
+  def scrub(node) do
+    case node do
+      {_tag, _attributes, children} -> children
+      {:comment, _text} -> ""
+      {_token, children} -> children
+      text -> text
+    end
+  end
+end
diff --git a/lib/html_sanitize_ex/traverser.ex b/lib/html_sanitize_ex/traverser.ex
@@ -0,0 +1,58 @@
+defmodule HtmlSanitizeEx.Traverser do
+  @moduledoc """
+  Walks a parsed HTML tree depth-first and hands every node to the `scrub/1`
+  function of a scrubber module.
+  """
+
+  @doc """
+    Traverses an html_tree.
+
+    Children are traversed before their parent, so the scrubber sees each
+    node with its children already scrubbed.
+
+      * a list is traversed element by element and flattened into one list
+      * `{tag, attributes, children}` and `{token, children}` nodes are passed
+        to `scrubber_module.scrub/1` after their children were traversed
+      * binaries (text nodes) are passed to `scrubber_module.scrub/1` directly
+      * anything else is returned unchanged
+  """
+  def traverse([], _scrubber_module), do: []
+
+  def traverse([head | tail], scrubber_module) do
+    scrubbed_head = traverse(head, scrubber_module)
+    scrubbed_tail = traverse(tail, scrubber_module)
+
+    # Scrubbing a node may yield a list (e.g. the children of a removed tag),
+    # so the combined result is flattened into a single list of nodes.
+    List.flatten([scrubbed_head | scrubbed_tail])
+  end
+
+  def traverse({tag, attributes, children}, scrubber_module) do
+    scrubbed_children = traverse(children, scrubber_module)
+    scrubber_module.scrub({tag, attributes, scrubbed_children})
+  end
+
+  def traverse(text, scrubber_module) when is_binary(text) do
+    scrubber_module.scrub(text)
+  end
+
+  # Matches things like {:comment, "this is a comment"} or {:doctype, "..."}.
+  def traverse({token, children}, scrubber_module) do
+    scrubbed_children =
+      children
+        |> traverse(scrubber_module)
+        |> collapse_list
+
+    scrubber_module.scrub({token, scrubbed_children})
+  end
+
+  # Anything that is neither a list, a 2- or 3-element tuple nor a binary is
+  # handed back untouched, without writing anything to stdout.
+  def traverse(other, _scrubber_module), do: other
+
+  # Unwraps a list holding exactly one element; everything else is returned
+  # as is.
+  defp collapse_list([single]), do: single
+  defp collapse_list(other), do: other
+end