-
Notifications
You must be signed in to change notification settings - Fork 89
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: provide a way for `Form.select_columns` to distinguish structural dots from dots in the names of record fields (#3222)

* Allowing list/tuple in specifier of Form.select_columns. Actually, allowing a list of lists of str in case a literal "." appears as a record name.
* Adding docstring to Form.select_columns
* Moving unit tests to test_3088_... Added a new test for from_parquet. Added some documentation to from_parquet.
* style: pre-commit fixes
* select_columns uses isinstance Iterable. Also correcting some documentation and exception text.

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Jim Pivarski <[email protected]>
- Loading branch information
1 parent
c30c14e
commit 09351fc
Showing
3 changed files
with
120 additions
and
13 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,76 @@ | ||
# BSD 3-Clause License; see https://github.com/scikit-hep/awkward/blob/main/LICENSE | ||
|
||
from __future__ import annotations | ||
|
||
import os | ||
|
||
import pytest | ||
|
||
import awkward as ak | ||
|
||
|
||
def array_with_dotted_fields():
    """Return a one-entry array whose innermost record has a field named "w.1".

    The single entry has a field ``"x"`` holding a list with one record; that
    record has a field ``"y"`` whose value is a record with two fields:
    ``"z"`` (a list of three integers) and ``"w.1"`` (an integer). The dot in
    ``"w.1"`` is part of the field name, not a separator between levels.
    """
    # Field order matters for form equality checks: "z" first, then "w.1".
    innermost = {
        "z": [1, 2, 3],
        "w.1": 4,
    }
    entry = {"x": [{"y": innermost}]}
    return ak.Array([entry])
|
||
|
||
def test_alternative_specifiers():
    """Check that string and sequence column specifiers agree with each other."""
    form = array_with_dotted_fields().layout.form

    # The bare wildcard keeps the whole form.
    assert form.select_columns("*") == form

    # A tuple of path components names the "w.1" field literally; the string
    # specifier has to fall back on a wildcard to reach the same column.
    by_components = form.select_columns([("x", "y", "w.1")])
    assert by_components == form.select_columns("x.y.w*")

    # A component list and a dotted string can be mixed in one specifier;
    # together these two select every column of the form.
    mixed = form.select_columns([["x", "y", "w.1"], "x.y.z"])
    assert mixed == form
|
||
|
||
def test_columns_with_dots_from_parquet(tmp_path):
    """Select columns whose names contain a literal dot when reading Parquet.

    Two arrays are written to Parquet files under ``tmp_path`` and read back
    with ``columns`` given as tuples of path components:

    1. An array with a nested field named ``"w.1"``: selecting
       ``("x", "y", "w.1")`` must return that field and exclude its sibling
       ``"z"``.
    2. An array that has both a nested ``crazy -> dot`` field and a top-level
       field literally named ``"crazy.dot"``: selecting ``("crazy.dot",)``
       currently returns the whole array (see the note before the final
       assertion).

    The test is skipped when ``pyarrow.parquet`` cannot be imported.
    """
    # Only the skip side effect is needed; the imported module is never used,
    # so it is not bound to a local name (and no lint suppression is required).
    pytest.importorskip("pyarrow.parquet")

    array = array_with_dotted_fields()
    parquet_file = os.path.join(tmp_path, "test_3088_array1.parquet")
    ak.to_parquet(array, parquet_file)
    array_selected = ak.from_parquet(parquet_file, columns=[("x", "y", "w.1")])
    assert array_selected.to_list() == [
        {
            "x": [
                {
                    "y": {
                        # "z": [1, 2, 3],  Excluded
                        "w.1": 4,  # Selected
                    }
                }
            ]
        }
    ]

    ambig_array = ak.Array(
        [
            {
                "crazy": {
                    "dot": [11, 12, 13],
                },
                "crazy.dot": [21, 22, 23],
            }
        ]
    )
    parquet_file_ambig = os.path.join(tmp_path, "test_3088_array_ambig.parquet")
    ak.to_parquet(ambig_array, parquet_file_ambig)
    ambig_selected = ak.from_parquet(parquet_file_ambig, columns=[("crazy.dot",)])
    # Note: Currently, pyarrow.parquet cannot distinguish dots as separators
    # from dots as field names. It builds a dict of all possible indices,
    # and returns those. Even so, we still need the ability within Awkward to
    # disambiguate these two, which we now have. We would need further
    # feature work to create column name substitutions to work around this
    # pyarrow limitation should this be justified.
    assert ak.array_equal(ambig_selected, ambig_array)  # Slurped everything.