Skip to content

Commit

Permalink
feat: provide a way for Form.select_columns to distinguish structural dots from dots in the names of record fields (#3222)
Browse files Browse the repository at this point in the history

* Allowing list/tuple in specifier of Form.select_columns

Actually, allowing a list of lists of str in case a literal "." appears as a record name.

* Adding docstring to Form.select_columns

* Moving unit tests to test_3088_...

Added a new test for from_parquet.
Added some documentation to from_parquet.

* style: pre-commit fixes

* select_columns uses isinstance Iterable

Also correcting some documentation and exception text.

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Jim Pivarski <[email protected]>
  • Loading branch information
3 people authored Aug 29, 2024
1 parent c30c14e commit 09351fc
Show file tree
Hide file tree
Showing 3 changed files with 120 additions and 13 deletions.
52 changes: 41 additions & 11 deletions src/awkward/forms/form.py
Original file line number Diff line number Diff line change
Expand Up @@ -333,7 +333,7 @@ def __call__(self, field: str, *, next_match_if_empty: bool = False) -> Self | N
has_matched = True
next_specifiers.extend(self._match_to_next_specifiers[field])

# Fixed-strings are an O(n) lookup
# Patterns are an O(n) lookup
for pattern in self._patterns:
if fnmatchcase(field, pattern):
has_matched = True
Expand Down Expand Up @@ -437,29 +437,59 @@ def columns(self, list_indicator=None, column_prefix=()):
def select_columns(
self, specifier, expand_braces=True, *, prune_unions_and_records: bool = True
):
"""
select_columns returns a new Form with only columns and sub-columns selected.
Returns an empty Form if no columns matched the specifier(s).
`specifier` can be a `str | Iterable[str | Iterable[str]]`.
Strings may include shell-globbing-style wildcards "*" and "?".
If `expand_braces` is `True` (the default), strings may include alternatives in braces.
For example, `["a.{b,c}.d"]` is equivalent to `["a.b.d", "a.c.d"]`.
Glob-style matching would also suit this single-character instance: `"a.[bc].d"`.
If specifier is a list which contains a list/tuple, that inner list will be interpreted as
column and subcolumn specifiers. They *may* contain wildcards, but "." will not be
interpreted as a `<field>.<subfield>` pattern.
"""
if isinstance(specifier, str):
specifier = {specifier}

# Only take unique specifiers
for item in specifier:
if not isinstance(item, str):
if isinstance(item, str):
if item == "":
raise ValueError(
"a column-selection specifier cannot be an empty string"
)
elif isinstance(item, Iterable):
for field in item:
if not isinstance(field, str):
raise ValueError("a sub-column specifier must be a string")
else:
raise TypeError(
"a column-selection specifier must be a list of non-empty strings"
)
if not item:
raise ValueError(
"a column-selection specifier must be a list of non-empty strings"
"a column specifier must be a string or an iterable of strings"
)

if expand_braces:
next_specifier = []
for item in specifier:
for result in _expand_braces(item):
next_specifier.append(result)
if isinstance(item, str):
for result in _expand_braces(item):
next_specifier.append(result)
else:
next_specifier.append(item)
specifier = next_specifier

specifier = [[] if item == "" else item.split(".") for item in set(specifier)]
match_specifier = _SpecifierMatcher(specifier, match_if_empty=False)
# specifier = set(specifier)
specifier_lists: list[list[str]] = []
for item in specifier:
if isinstance(item, str):
if item == "":
specifier_lists.append([])
else:
specifier_lists.append(item.split("."))
else:
specifier_lists.append(item)
match_specifier = _SpecifierMatcher(specifier_lists, match_if_empty=False)
selection = self._select_columns(match_specifier)
assert selection is not None, "top-level selections always return a Form"

Expand Down
5 changes: 3 additions & 2 deletions src/awkward/operations/ak_from_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,10 +32,11 @@ def from_parquet(
Args:
path (str): Local filename or remote URL, passed to fsspec for resolution.
May contain glob patterns.
columns (None, str, or list of str): Glob pattern(s) with bash-like curly
columns (None, str, or iterable of (str or iterable of str)): Glob pattern(s) including bash-like curly
brackets for matching column names. Nested records are separated by dots.
If a list of patterns, the logical-or is matched. If None, all columns
are read.
are read. A list of lists can be provided to select columns with literal dots
in their names -- The inner list provides column names or patterns.
row_groups (None or set of int): Row groups to read; must be non-negative.
Order is ignored: the output array is presented in the order specified by
Parquet metadata. If None, all row groups/all rows are read.
Expand Down
76 changes: 76 additions & 0 deletions tests/test_3088_select_columns_supports_literal_dots.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
# BSD 3-Clause License; see https://github.com/scikit-hep/awkward/blob/main/LICENSE

from __future__ import annotations

import os

import pytest

import awkward as ak


def array_with_dotted_fields():
    """Return a one-record array whose innermost record has a field named "w.1".

    The nesting is ``x`` (a list of records) -> ``y`` (a record) -> ``z`` and
    ``w.1``, so the dot in ``"w.1"`` belongs to the field name itself.
    """
    innermost = {
        "z": [1, 2, 3],
        "w.1": 4,
    }
    record = {"x": [{"y": innermost}]}
    return ak.Array([record])


def test_alternative_specifiers():
    """Exercise string, tuple and list specifiers against a dotted field name."""
    form = array_with_dotted_fields().layout.form

    # A bare wildcard selects every column, giving back an equal form.
    assert form.select_columns("*") == form

    # A tuple supplies one entry per nesting level, so "w.1" is a single field
    # name; the glob "x.y.w*" is a plain-string way to reach the same column.
    by_literal_path = form.select_columns([("x", "y", "w.1")])
    by_glob = form.select_columns("x.y.w*")
    assert by_literal_path == by_glob

    # An inner list and a dotted string may be mixed in one specifier; together
    # they name both leaf columns, so the selection equals the full form.
    both_leaves = form.select_columns([["x", "y", "w.1"], "x.y.z"])
    assert both_leaves == form


def test_columns_with_dots_from_parquet(tmp_path):
    """Write arrays with dotted field names to Parquet and read selected columns back."""
    # ruff: noqa: F841
    _pq = pytest.importorskip("pyarrow.parquet")

    # Part 1: a dotted field name nested below ordinary records.
    dotted = array_with_dotted_fields()
    dotted_path = os.path.join(tmp_path, "test_3088_array1.parquet")
    ak.to_parquet(dotted, dotted_path)
    dotted_selected = ak.from_parquet(dotted_path, columns=[("x", "y", "w.1")])
    # Only "w.1" was requested, so its sibling "z": [1, 2, 3] must be absent.
    expected_selection = [{"x": [{"y": {"w.1": 4}}]}]
    assert dotted_selected.to_list() == expected_selection

    # Part 2: a field literally named "crazy.dot" next to a record "crazy"
    # that has a field "dot", so the dotted string form is ambiguous.
    ambiguous = ak.Array(
        [
            {
                "crazy": {"dot": [11, 12, 13]},
                "crazy.dot": [21, 22, 23],
            }
        ]
    )
    ambiguous_path = os.path.join(tmp_path, "test_3088_array_ambig.parquet")
    ak.to_parquet(ambiguous, ambiguous_path)
    ambiguous_selected = ak.from_parquet(ambiguous_path, columns=[("crazy.dot",)])
    # Note: pyarrow.parquet currently cannot tell a dot used as a separator from
    # a dot inside a field name. It builds a dict of all possible indices and
    # returns those, so both columns come back. Awkward itself still needs to be
    # able to disambiguate the two, which it now can. Working around the pyarrow
    # limitation would take further feature work (column name substitutions),
    # should that be justified.
    assert ak.array_equal(ambiguous_selected, ambiguous)  # Slurped everything.

0 comments on commit 09351fc

Please sign in to comment.