diff --git a/src/coffea/nanoevents/factory.py b/src/coffea/nanoevents/factory.py index 9ef58a409..5ff86c581 100644 --- a/src/coffea/nanoevents/factory.py +++ b/src/coffea/nanoevents/factory.py @@ -471,6 +471,12 @@ def from_parquet( io.IOBase, ) + if isinstance(file, dict): + file, filespec_treepath = next(iter(file.items())) + if filespec_treepath is not None: + warnings.warn( + f'For parquet file="{file}", treepath="{filespec_treepath}" is ignored when opening files' + ) if ( delayed and not isinstance(schemaclass, FunctionType) diff --git a/src/coffea/nanoevents/mapping/parquet.py b/src/coffea/nanoevents/mapping/parquet.py index ee95acc2e..61a2792d4 100644 --- a/src/coffea/nanoevents/mapping/parquet.py +++ b/src/coffea/nanoevents/mapping/parquet.py @@ -195,11 +195,35 @@ def preload_column_source(self, uuid, path_in_source, source): key = self.key_root() + tuple_to_key((uuid, path_in_source)) self._cache[key] = source - def get_column_handle(self, columnsource, name): + def get_column_handle(self, columnsource, name, allow_missing): + if allow_missing: + return ( + ParquetSourceMapping.UprootLikeShim(columnsource, name) + if name in columnsource.file.schema_arrow.names + else None + ) return ParquetSourceMapping.UprootLikeShim(columnsource, name) - def extract_column(self, columnhandle, start, stop, **kwargs): - return columnhandle.array(entry_start=start, entry_stop=stop) + def extract_column(self, columnhandle, start, stop, allow_missing, **kwargs): + if allow_missing and columnhandle is None: + return awkward.contents.IndexedOptionArray( + awkward.index.Index64(numpy.full(stop - start, -1, dtype=numpy.int64)), + awkward.contents.NumpyArray(numpy.array([], dtype=bool)), + ) + elif not allow_missing and columnhandle is None: + raise RuntimeError( + "Received columnhandle of None when missing column in file is not allowed!" + ) + + the_array = columnhandle.array(entry_start=start, entry_stop=stop) + + if allow_missing: + the_array = awkward.contents.IndexedOptionArray( + awkward.index.Index64(numpy.arange(stop - start, dtype=numpy.int64)), + awkward.contents.NumpyArray(the_array), + ) + + return the_array def __len__(self): return self._stop - self._start diff --git a/tests/samples/nano_dimuon.parquet b/tests/samples/nano_dimuon.parquet index 7ceb0e22d..86938471b 100644 Binary files a/tests/samples/nano_dimuon.parquet and b/tests/samples/nano_dimuon.parquet differ diff --git a/tests/samples/nano_dimuon_semver.parquet b/tests/samples/nano_dimuon_semver.parquet new file mode 100644 index 000000000..7ceb0e22d Binary files /dev/null and b/tests/samples/nano_dimuon_semver.parquet differ diff --git a/tests/samples/nano_dy.parquet b/tests/samples/nano_dy.parquet index 5de5990c8..40186c2a1 100644 Binary files a/tests/samples/nano_dy.parquet and b/tests/samples/nano_dy.parquet differ diff --git a/tests/samples/nano_dy_semver.parquet b/tests/samples/nano_dy_semver.parquet new file mode 100644 index 000000000..5de5990c8 Binary files /dev/null and b/tests/samples/nano_dy_semver.parquet differ diff --git a/tests/test_nanoevents.py b/tests/test_nanoevents.py index e099d9a25..c6055911f 100644 --- a/tests/test_nanoevents.py +++ b/tests/test_nanoevents.py @@ -63,7 +63,7 @@ def crossref(events): suffixes = [ "root", - # "parquet", + "parquet", ]