From 3f66bd202e293153d32c5643cbdcd6412c5a7476 Mon Sep 17 00:00:00 2001
From: Anthony Khong
Date: Sun, 2 Jul 2023 15:26:38 +0700
Subject: [PATCH] Add DataFrame.frequencies/2 (#637)

---
 lib/explorer/data_frame.ex        | 31 +++++++++++++++++++++++++++++++
 test/explorer/data_frame_test.exs | 27 +++++++++++++++++++++++++++
 2 files changed, 58 insertions(+)

diff --git a/lib/explorer/data_frame.ex b/lib/explorer/data_frame.ex
index 8f56f00dc..b5864306a 100644
--- a/lib/explorer/data_frame.ex
+++ b/lib/explorer/data_frame.ex
@@ -5030,6 +5030,37 @@ defmodule Explorer.DataFrame do
   @spec nil_count(df :: DataFrame.t()) :: DataFrame.t()
   def nil_count(df), do: Shared.apply_impl(df, :nil_count)
 
+  @doc """
+  Creates a new dataframe with unique rows and the frequencies of each.
+
+  The dataframe is grouped by `columns` and each group is counted through the
+  first of those columns. The result is stored in a new `counts` column and
+  the rows are sorted by `counts` in descending order.
+
+  Raises `ArgumentError` if `columns` is empty.
+
+  ## Examples
+
+      iex> df = Explorer.DataFrame.new(a: ["a", "a", "b"], b: [1, 1, nil])
+      iex> Explorer.DataFrame.frequencies(df, [:a, :b])
+      #Explorer.DataFrame<
+        Polars[2 x 3]
+        a string ["a", "b"]
+        b integer [1, nil]
+        counts integer [2, 1]
+      >
+  """
+  @doc type: :single
+  @spec frequencies(df :: DataFrame.t(), columns :: column_names()) :: DataFrame.t()
+  def frequencies(%DataFrame{} = df, [col | _] = columns) do
+    df
+    |> group_by(columns)
+    |> summarise_with(&[counts: Series.count(&1[col])])
+    |> arrange_with(&[desc: &1[:counts]])
+  end
+
+  def frequencies(_df, []), do: raise(ArgumentError, "columns cannot be empty")
+
   # Helpers
 
   defp backend_from_options!(opts) do
diff --git a/test/explorer/data_frame_test.exs b/test/explorer/data_frame_test.exs
index 7b91f2ddb..f13ba00e9 100644
--- a/test/explorer/data_frame_test.exs
+++ b/test/explorer/data_frame_test.exs
@@ -2973,6 +2973,33 @@ defmodule Explorer.DataFrameTest do
     end
   end
 
+  describe "frequencies/2" do
+    test "multiple columns with and without nils" do
+      df =
+        DF.new(
+          a: [1, 1, 1, 2, 3, 3],
+          b: [true, true, true, false, false, false],
+          c: ["a", nil, "a", nil, "a", nil]
+        )
+
+      assert DF.frequencies(df, [:a, :b]) |> DF.to_columns(atom_keys: true) ==
+               %{a: [1, 3, 2], b: [true, false, false], counts: [3, 2, 1]}
+
+      assert DF.frequencies(df, [:a, :c]) |> DF.to_columns(atom_keys: true) ==
+               %{a: [1, 1, 2, 3, 3], c: ["a", nil, nil, "a", nil], counts: [2, 1, 1, 1, 1]}
+    end
+
+    test "invalid columns args" do
+      assert_raise ArgumentError,
+                   ~r/cannot be empty/,
+                   fn -> DF.new(a: [1]) |> DF.frequencies([]) end
+
+      assert_raise ArgumentError,
+                   ~r/could not find column name/,
+                   fn -> DF.new(a: [1]) |> DF.frequencies([:x]) end
+    end
+  end
+
   describe "nil_count/1" do
     test "various dtypes" do
       require Explorer.DataFrame, as: DF