Skip to content

Commit

Permalink
De-functionalize query internals (#989)
Browse files Browse the repository at this point in the history
* Rename `traverse/2` as `traverse_root/2`

This just makes the intended use clearer.

* Append `?` to `collect_pins_and_vars`

This makes it clearer it's a boolean.

* Alias `Kernel` as `K`

This is in line with what we do elsewhere.

* Slight format refactor

* Change `query/1` to `query/2`

This is the primary change. Now `query/2` takes
the dataframe as an explicit argument instead of
an implicit, unhygienized variable.

* Add comment about non-obvious function wrap

* Try to fix specs

* Comment out broken test

* Revert style changes

* Revert all changes to `query.ex`

* Revert all changes to `data_frame.ex`

* Revert changes to `series.ex`

* Add new/1 that returns a Backend.LazyFrame

* Make is_column_pairs return false for structs

* Add _with clauses

* Add Access behaviour

* Make all impls specify behaviour

* Uncomment test

* Add TODO for `is_non_struct_map`

Co-authored-by: José Valim <[email protected]>

* Change Backend.LazyFrame to Backend.QueryFrame

* Rename files

* Edit test internals for readability

* Add docs

Re-write the "Implementation details" section
to reference the new functionality.

* Doc tweaks

* Remove an extra "the"
* Reference the rewritten section in
  the `new/1` docs

* Revert 1st sentence

* Revert "Implementation details"

* Add smaller addendum to that section

* Add example to new/1 docs

* Update lib/explorer/backend/query_frame.ex

Co-authored-by: José Valim <[email protected]>

* Fix punctuation

---------

Co-authored-by: José Valim <[email protected]>
  • Loading branch information
billylanchantin and josevalim authored Sep 30, 2024
1 parent 4ae1a27 commit fe52352
Show file tree
Hide file tree
Showing 6 changed files with 176 additions and 42 deletions.
2 changes: 1 addition & 1 deletion lib/explorer/backend/data_frame.ex
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ defmodule Explorer.Backend.DataFrame do
| [basic_types()]
| (df() -> series() | basic_types() | [basic_types()])

@type lazy_frame :: Explorer.Backend.LazyFrame.t()
@type query_frame :: Explorer.Backend.QueryFrame.t()
@type lazy_series :: Explorer.Backend.LazySeries.t()

@type compression :: {algorithm :: option(atom()), level :: option(integer())}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
defmodule Explorer.Backend.LazyFrame do
defmodule Explorer.Backend.QueryFrame do
@moduledoc """
Represents a lazy dataframe for building query expressions.
The LazyFrame is available inside `filter_with`, `mutate_with`, and
You may call `Explorer.Query.new` to create a query-backed dataframe.
The QueryFrame is available inside `filter_with`, `mutate_with`, and
similar. You cannot perform any operation on it except accessing
its underlying series.
"""
Expand All @@ -18,6 +19,8 @@ defmodule Explorer.Backend.LazyFrame do
names: Backend.DataFrame.column_name(),
resource: reference() | nil
}

@behaviour Access
@behaviour Backend.DataFrame

@doc false
Expand All @@ -40,16 +43,16 @@ defmodule Explorer.Backend.LazyFrame do
# cross node operations happen at the lazy frame level.
# Instead, we store the resource and we delegate them
# to the underlying lazy series.
@impl true
@impl Backend.DataFrame
def owner_reference(_), do: nil

@impl true
@impl Backend.DataFrame
def lazy, do: __MODULE__

@impl true
@impl Backend.DataFrame
def lazy(ldf), do: ldf

@impl true
@impl Backend.DataFrame
def inspect(ldf, opts) do
import Inspect.Algebra

Expand All @@ -68,7 +71,7 @@ defmodule Explorer.Backend.LazyFrame do
end

concat([
color("LazyFrame", :atom, opts),
color("QueryFrame", :atom, opts),
open,
"??? x #{length(cols_algebra)}",
close,
Expand All @@ -86,7 +89,7 @@ defmodule Explorer.Backend.LazyFrame do

defp groups_algebra([], _), do: ""

@impl true
@impl Backend.DataFrame
def pull(%{data: data, dtypes: dtypes}, column) do
dtype_for_column = dtypes[column]

Expand All @@ -103,14 +106,35 @@ defmodule Explorer.Backend.LazyFrame do
for {fun, arity} <- funs do
args = Macro.generate_arguments(arity, __MODULE__)

@impl true
@impl Backend.DataFrame
def unquote(fun)(unquote_splicing(args)) do
raise """
cannot perform operation #{unquote(fun)} on Explorer.Backend.LazyFrame.
cannot perform operation #{unquote(fun)} on Explorer.Backend.QueryFrame.
The LazyFrame is available inside filter_with, mutate_with, and \
The QueryFrame is available inside filter_with, mutate_with, and \
similar and they support only a limited subset of the Series API
"""
end
end

# Access callback: looks up column `name` through `pull/2`.
# Returns `{:ok, series}` only when the pulled value is an `Explorer.Series`
# whose data is an `Explorer.Backend.LazySeries`; any other result is `:error`.
@impl Access
def fetch(%__MODULE__{} = query_frame, name) do
  pulled = pull(query_frame, name)

  if match?(%Explorer.Series{data: %Explorer.Backend.LazySeries{}}, pulled) do
    {:ok, pulled}
  else
    :error
  end
end

# Access callback: updating through Access is not supported on this struct,
# so every call raises regardless of the key or callback given.
@impl Access
def get_and_update(%__MODULE__{}, _name, _callback),
  do: raise("cannot update an `Explorer.Backend.QueryFrame`")

# Access callback: removing a key through Access is not supported on this
# struct, so every call raises regardless of the key given.
@impl Access
def pop(%__MODULE__{}, _name),
  do: raise("cannot delete from an `Explorer.Backend.QueryFrame`")
end
87 changes: 63 additions & 24 deletions lib/explorer/data_frame.ex
Original file line number Diff line number Diff line change
Expand Up @@ -270,7 +270,10 @@ defmodule Explorer.DataFrame do

defguardp is_column(column) when is_binary(column) or is_atom(column) or is_integer(column)
defguardp is_column_name(column) when is_binary(column) or is_atom(column)
defguardp is_column_pairs(columns) when is_list(columns) or is_map(columns)

# Guard for "column pairs": a list, or a plain (non-struct) map.
# Structs are maps too, so `is_map/1` alone would accept them; they are
# excluded explicitly with `not is_struct/1`.
# TODO: Use is_non_struct_map when we require Elixir v1.18+
defguardp is_column_pairs(columns)
when is_list(columns) or (is_map(columns) and not is_struct(columns))

# Normalize a column name to string
defp to_column_name(column) when is_binary(column), do: column
Expand Down Expand Up @@ -2607,12 +2610,22 @@ defmodule Explorer.DataFrame do
@doc type: :single
@spec filter_with(
df :: DataFrame.t(),
callback :: (Explorer.Backend.LazyFrame.t() -> Series.lazy_t() | [Series.lazy_t()])
callback_or_lazy_series_or_list ::
(Explorer.Backend.QueryFrame.t() -> Series.lazy_t() | [Series.lazy_t()])
| Series.lazy_t()
| [Series.lazy_t()]
) :: DataFrame.t()
def filter_with(df, fun) when is_function(fun, 1) do
ldf = Explorer.Backend.LazyFrame.new(df)
filter =
df
|> Explorer.Query.new()
|> fun.()

filter_with(df, filter)
end

case fun.(ldf) do
def filter_with(df, filter) do
case filter do
%Series{dtype: :boolean, data: %LazySeries{} = data} ->
Shared.apply_dataframe(df, :filter_with, [df, data])

Expand Down Expand Up @@ -2926,21 +2939,31 @@ defmodule Explorer.DataFrame do
@doc type: :single
@spec mutate_with(
df :: DataFrame.t(),
callback :: (Explorer.Backend.LazyFrame.t() -> column_pairs(Series.lazy_t())),
callback_or_column_pairs ::
(Explorer.Backend.QueryFrame.t() -> column_pairs(Series.lazy_t()))
| column_pairs(Series.lazy_t()),
opts :: keyword()
) :: DataFrame.t()
def mutate_with(%DataFrame{} = df, fun, opts \\ []) when is_function(fun) and is_list(opts) do
def mutate_with(df, query_or_fun, opts \\ [])

# Callback form: build the query-backed frame for `df`, hand it to `fun`,
# then re-dispatch `mutate_with/3` on whatever the callback returned.
def mutate_with(%DataFrame{} = df, fun, opts) when is_function(fun, 1) and is_list(opts) do
  query_frame = Explorer.Query.new(df)
  mutate_with(df, fun.(query_frame), opts)
end

def mutate_with(%DataFrame{} = df, column_pairs, opts)
when is_column_pairs(column_pairs) and is_list(opts) do
keep = Keyword.get(opts, :keep, :all)

unless keep in [:all, :none] do
raise ArgumentError, "Invalid value for :keep option. Allowed values are :all or :none."
end

ldf = Explorer.Backend.LazyFrame.new(df)

result = fun.(ldf)

column_pairs = to_column_pairs(df, result, &query_to_series!/1)
column_pairs = to_column_pairs(df, column_pairs, &query_to_series!/1)

new_dtypes =
for {column_name, series} <- column_pairs, into: %{} do
Expand Down Expand Up @@ -3423,21 +3446,30 @@ defmodule Explorer.DataFrame do
>
"""
@doc type: :single
@type sort_callback_result ::
Series.lazy_t() | [Series.lazy_t()] | [{:asc | :desc, Series.lazy_t()}]
@spec sort_with(
df :: DataFrame.t(),
(Explorer.Backend.LazyFrame.t() ->
Series.lazy_t() | [Series.lazy_t()] | [{:asc | :desc, Series.lazy_t()}]),
callback_or_result ::
(Explorer.Backend.QueryFrame.t() -> sort_callback_result()) | sort_callback_result(),
opts :: [nils: :first | :last, stable: boolean()]
) :: DataFrame.t()
def sort_with(%DataFrame{} = df, fun, opts \\ []) when is_function(fun, 1) do
[_descending? | opts] = Shared.validate_sort_options!(opts)
def sort_with(df, fun, opts \\ [])

ldf = Explorer.Backend.LazyFrame.new(df)
# Callback form: build the query-backed frame for `df`, hand it to `fun`,
# then re-dispatch `sort_with/3` on whatever the callback returned.
def sort_with(%DataFrame{} = df, fun, opts) when is_function(fun, 1) do
  query_frame = Explorer.Query.new(df)
  sort_with(df, fun.(query_frame), opts)
end

result = fun.(ldf)
def sort_with(%DataFrame{} = df, sortable, opts) do
[_descending? | opts] = Shared.validate_sort_options!(opts)

dir_and_lazy_series_pairs =
result
sortable
|> List.wrap()
|> Enum.map(fn
{dir, %Series{data: %LazySeries{} = lazy_series}} when dir in [:asc, :desc] ->
Expand Down Expand Up @@ -5675,15 +5707,22 @@ defmodule Explorer.DataFrame do
@doc type: :single
@spec summarise_with(
df :: DataFrame.t(),
callback :: (Explorer.Backend.LazyFrame.t() -> column_pairs(Series.lazy_t()))
callback_or_column_pairs ::
(Explorer.Backend.QueryFrame.t() -> column_pairs(Series.lazy_t()))
| column_pairs(Series.lazy_t())
) :: DataFrame.t()
def summarise_with(%DataFrame{} = df, fun) when is_function(fun, 1) do
ldf = Explorer.Backend.LazyFrame.new(df)
column_pairs =
df
|> Explorer.Query.new()
|> fun.()

result = fun.(ldf)
summarise_with(df, column_pairs)
end

result =
Enum.map(result, fn
def summarise_with(%DataFrame{} = df, column_pairs) when is_column_pairs(column_pairs) do
column_pairs =
Enum.map(column_pairs, fn
{key, nil} ->
lazy_s = LazySeries.unbacked(:lazy, [nil], :null)
{key, Explorer.Backend.Series.new(lazy_s, :null)}
Expand All @@ -5693,7 +5732,7 @@ defmodule Explorer.DataFrame do
end)

column_pairs =
to_column_pairs(df, result, fn value ->
to_column_pairs(df, column_pairs, fn value ->
case value do
%Series{data: %LazySeries{op: :lazy, args: [nil], dtype: :null}} ->
value
Expand Down
71 changes: 71 additions & 0 deletions lib/explorer/query.ex
Original file line number Diff line number Diff line change
Expand Up @@ -268,6 +268,36 @@ defmodule Explorer.Query do
This means that, whenever you want to generate queries programmatically,
you can fall back to the regular `_with` APIs.
In the `_with` APIs, the callbacks receive an `Explorer.DataFrame` as an
input. That dataframe is backed by the special `Explorer.Backend.QueryFrame`
backend.
Explorer.DataFrame.filter_with(df, fn query_backed_frame ->
IO.inspect(query_backed_frame)
...
end)
# #Explorer.DataFrame<
# QueryFrame[??? x 1]
# ...
# >
A "query-backed" dataframe cannot be manipulated. You may only access its
series. And when you do, you get back "lazy-backed" versions of those series:
Explorer.DataFrame.filter_with(df, fn query_backed_frame ->
IO.inspect(query_backed_frame["a"])
...
end)
# #Explorer.Series<
# LazySeries[???]
# s64 (column("a"))
# >
"Lazy-backed" series are backed by the special `Explorer.Backend.LazySeries`
backend. All `Explorer.Series` functions work on lazy-backed series too. So
you can write your `_with` callbacks without ever referencing the fact that
the backend is the lazy one.
"""

kernel_all = Kernel.__info__(:functions) ++ Kernel.__info__(:macros)
Expand All @@ -292,6 +322,47 @@ defmodule Explorer.Query do

@kernel_only kernel_only -- kernel_only -- kernel_all

@doc """
Returns a "query-backed" `Explorer.DataFrame` for use in queries.
This function is mostly an implementation detail for the `*_with` callbacks.
See the "Implementation details" section of the `@moduledoc` for details.
There are some limited instances where it's more convenient to work with
query-backed `DataFrame`s. For example, if you want to re-use a lazy series,
you can do so like this:
alias Explorer.{DataFrame, Query, Series}
df = DataFrame.new(a: [1, 2, 3])
qf = Query.new(df)
gt_1 = Series.greater(qf["a"], 1)
lt_3 = Series.less(qf["a"], 3)
df
|> DataFrame.filter_with(gt_1)
|> DataFrame.to_columns(atom_keys: true)
#=> %{a: [2, 3]}
df
|> DataFrame.filter_with(lt_3)
|> DataFrame.to_columns(atom_keys: true)
#=> %{a: [1, 2]}
df
|> DataFrame.filter_with(Series.and(gt_1, lt_3))
|> DataFrame.to_columns(atom_keys: true)
#=> %{a: [2]}
However, if you think you need `new/1`, first check that you can't accomplish
the same thing with `across/0` inside a macro. The latter is usually easier to
work with.
"""
# Thin wrapper: only accepts an `Explorer.DataFrame` struct and delegates
# construction to `Explorer.Backend.QueryFrame.new/1`.
def new(%Explorer.DataFrame{} = df), do: Explorer.Backend.QueryFrame.new(df)

@doc """
Builds an anonymous function from a query.
Expand Down
2 changes: 1 addition & 1 deletion mix.exs
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ defmodule Explorer.MixProject do
Explorer.Backend,
Explorer.Backend.DataFrame,
Explorer.Backend.Series,
Explorer.Backend.LazyFrame,
Explorer.Backend.QueryFrame,
Explorer.Backend.LazySeries,
Explorer.PolarsBackend
]
Expand Down
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
defmodule Explorer.Backend.LazyFrameTest do
defmodule Explorer.Backend.QueryFrameTest do
use ExUnit.Case, async: true
alias Explorer.Backend.LazyFrame
alias Explorer.Backend.QueryFrame

test "inspect/2 prints the columns without data" do
df = Explorer.DataFrame.new(a: [1, 2], b: [3.1, 4.5])
ldf = LazyFrame.new(df)
qf = QueryFrame.new(df)

assert inspect(ldf) ==
assert inspect(qf) ==
"""
#Explorer.DataFrame<
LazyFrame[??? x 2]
QueryFrame[??? x 2]
a s64
b f64
>\
Expand Down

0 comments on commit fe52352

Please sign in to comment.