From 2d2095fef89f9f909d22e00c61919f577613e724 Mon Sep 17 00:00:00 2001 From: Joni Herttuainen Date: Fri, 24 Mar 2023 15:47:18 +0100 Subject: [PATCH] Remove network object level property_dtypes. * fix the functionality to fetch and merge different properties from multiple populations --- bluepysnap/circuit_ids.py | 5 - bluepysnap/network.py | 100 +++++++++++------- ...{test_dtype_mismatch.py => test_dtypes.py} | 58 +++++----- tests/test_edges/test_edges.py | 76 +------------ tests/test_nodes/test_nodes.py | 56 +--------- 5 files changed, 93 insertions(+), 202 deletions(-) rename tests/{test_dtype_mismatch.py => test_dtypes.py} (59%) diff --git a/bluepysnap/circuit_ids.py b/bluepysnap/circuit_ids.py index 7f6698c2..e512a6ca 100644 --- a/bluepysnap/circuit_ids.py +++ b/bluepysnap/circuit_ids.py @@ -55,11 +55,6 @@ def __init__(self, index, sort_index=True): index = index.sortlevel()[0] self.index = index - @property - def index_schema(self): - """Return an empty index with the same names of the wrapped index.""" - return pd.MultiIndex.from_tuples([], names=self.index.names) - @classmethod def _instance(cls, index, sort_index=True): """The instance returned by the functions.""" diff --git a/bluepysnap/network.py b/bluepysnap/network.py index 6e515464..70557443 100644 --- a/bluepysnap/network.py +++ b/bluepysnap/network.py @@ -25,6 +25,62 @@ from bluepysnap.exceptions import BluepySnapError +def _gather_properties(populations, ids, properties): + """Helper function to get data from populations. 
+ + Args: + populations (list): populations (NodePopulation, EdgePopulation) to consider + ids (CircuitNodeIds, CircuitEdgeIds): node/edge ids to retrieve + properties (list): properties to retrieve + + Returns: + pandas.DataFrame: dataframe containing the gathered data + """ + values = {} + indices = {} + + # gather properties for each of the population + for pop in populations: + global_pop_ids = ids.filter_population(pop.name) + + pop_ids = global_pop_ids.get_ids() + pop_properties = set(properties) & pop.property_names + + for prop in pop_properties: + data = pop.get(pop_ids, prop) + + if data.size > 0: + values[prop] = values.get(prop, []) + [data] + + # Is there a better way to merge multi-indices than append? + if prop in indices: + indices[prop] = indices[prop].append(global_pop_ids.index) + else: + indices[prop] = global_pop_ids.index + + def _serialize(property_name): + ids = indices[property_name] + val = values[property_name] + + # If any of the dtypes is a category, force it. Otherwise, let pandas handle it. + has_categoricals = any(map(pd.api.types.is_categorical_dtype, val)) + + return pd.Series( + np.concatenate(val), + index=ids, + name=property_name, + dtype="category" if has_categoricals else None, + ) + + series = [_serialize(prop) for prop in values] + + return pd.DataFrame( + pd.concat(series, axis=1) if series else None, + columns=properties, + index=ids.index, + ) + + class NetworkObject(abc.ABC): """Abstract class for the top level NetworkObjects accessor.""" @@ -48,22 +104,6 @@ def _populations(self): def population_names(self): """Should define all sorted NetworkObjects population names from the Circuit.""" - @cached_property - def property_dtypes(self): - """Returns all the NetworkObjects property dtypes for the Circuit.""" - - def _update(d, index, value): - if d.setdefault(index, value) != value: - raise BluepySnapError( - f"Same property with different dtype. 
{index}: {value}!= {d[index]}" - ) - - res = {} - for pop in self.values(): - for varname, dtype in pop.property_dtypes.items(): - _update(res, varname, dtype) - return pd.Series(res) - def keys(self): """Returns iterator on the NetworkObjectPopulation names. @@ -149,33 +189,15 @@ def get(self, group=None, properties=None): """Returns the properties of the NetworkObject.""" ids = self.ids(group) properties = utils.ensure_list(properties) - # We don t convert to set properties itself to keep the column order. - properties_set = set(properties) - unknown_props = properties_set - self.property_names + unknown_props = set(properties) - self.property_names if unknown_props: raise BluepySnapError(f"Unknown properties required: {unknown_props}") - # Retrieve the dtypes of the selected properties. - # However, the int dtype may not be preserved if some values are NaN. - dtypes = { - column: dtype - for column, dtype in self.property_dtypes.items() - if column in properties_set - } - dataframes = [pd.DataFrame(columns=properties, index=ids.index_schema).astype(dtypes)] - for name, pop in sorted(self.items()): - # since ids is sorted, global_pop_ids should be sorted as well - global_pop_ids = ids.filter_population(name) - pop_ids = global_pop_ids.get_ids() - if len(pop_ids) > 0: - pop_properties = properties_set & pop.property_names - # Since the columns are passed as Series, index cannot be specified directly. - # However, it's a bit more performant than converting the Series to numpy arrays. 
- pop_df = pd.DataFrame({prop: pop.get(pop_ids, prop) for prop in pop_properties}) - pop_df.index = global_pop_ids.index - dataframes.append(pop_df) - res = pd.concat(dataframes) + populations = self.values() + + res = _gather_properties(populations, ids, properties) + assert res.index.is_monotonic_increasing, "The index should be already sorted" return res diff --git a/tests/test_dtype_mismatch.py b/tests/test_dtypes.py similarity index 59% rename from tests/test_dtype_mismatch.py rename to tests/test_dtypes.py index 6416e723..c56954fa 100644 --- a/tests/test_dtype_mismatch.py +++ b/tests/test_dtypes.py @@ -17,15 +17,18 @@ _str_dtype = h5py.string_dtype(encoding="utf-8") MAP_DTYPE = { - "float": float, - "int": int, - "uint": np.uint64, "object": _str_dtype, "str": _str_dtype, "float32": np.float32, + "float": float, + "int8": np.int8, "int16": np.int16, "int32": np.int32, + "int": int, + "uint8": np.uint8, + "uint16": np.uint16, "uint32": np.uint32, + "uint": np.uint64, } @@ -34,7 +37,7 @@ def add_test_field(file_path, population_name, data, data_type): test_data_path = f"{pop_0_path}/{TEST_FIELD}" with h5py.File(file_path, "r+") as h5: - if data_type == "categorical": + if data_type == "category": categorical = pd.Categorical(data) categories = categorical.categories.values @@ -51,35 +54,34 @@ def add_test_field(file_path, population_name, data, data_type): @pytest.mark.parametrize( - "dtypes", + ("dtypes", "expected"), ( - ("categorical", "categorical"), - ("int", "float"), - ("int", "uint"), - ("int", "str"), - ("int", "int16"), - ("int", "int32"), - ("int16", "int32"), - ("uint32", "int32"), - ("uint", "float"), + (("category", "category"), "category"), + (("int8", "int8"), "int8"), + (("uint8", "uint8"), "uint8"), + (("object", "object"), "object"), + (("category", "str"), "category"), + (("category", "int"), "category"), + (("int", "float"), "float"), + (("int", "str"), "object"), + (("int", "int16"), "int"), + (("int", "int32"), "int"), + (("uint32", 
"int32"), "int"), + (("uint", "float"), "float"), + (("float32", "float"), "float"), + (("int8", "uint8"), "int16"), + (("int16", "uint8"), "int16"), + (("int16", "uint16"), "int32"), + (("int32", "uint32"), "int"), + (("int", "uint32"), "int"), + (("int", "uint"), "float"), ), ) -def test_mismatching_dtypes(dtypes): +def test_resulting_dtypes(dtypes, expected): with copy_test_data() as (test_dir, config_path): node_path = Path(test_dir) / "nodes.h5" for population, data, dtype in zip(TEST_POPULATIONS, TEST_DATA, dtypes): add_test_field(node_path, population, data, dtype) - with pytest.raises(BluepySnapError, match="Same property with different dtype."): - Circuit(config_path).nodes.property_dtypes - - -@pytest.mark.parametrize("dtype", list(MAP_DTYPE)) -def test_matching_dtypes(dtype): - with copy_test_data() as (test_dir, config_path): - node_path = Path(test_dir) / "nodes.h5" - for population, data in zip(TEST_POPULATIONS, TEST_DATA): - add_test_field(node_path, population, data, dtype) - - res = Circuit(config_path).nodes.property_dtypes - assert isinstance(res, pd.Series) + res = Circuit(config_path).nodes.get(properties=TEST_FIELD) + assert res[TEST_FIELD].dtype == expected diff --git a/tests/test_edges/test_edges.py b/tests/test_edges/test_edges.py index 11a05db4..89aeabd2 100644 --- a/tests/test_edges/test_edges.py +++ b/tests/test_edges/test_edges.py @@ -85,80 +85,6 @@ def test_property_names(self): "syn_weight", } - def test_property_dtypes(self): - expected = pd.Series( - data=[ - dtype("float32"), - dtype("float64"), - dtype("float64"), - dtype("float64"), - dtype("float32"), - dtype("float64"), - dtype("float32"), - dtype("float64"), - dtype("int64"), - dtype("int64"), - dtype("float64"), - dtype("float64"), - dtype("float64"), - dtype("float64"), - dtype("float64"), - dtype("float64"), - dtype("float32"), - dtype("float32"), - dtype("float64"), - dtype("float64"), - IDS_DTYPE, - IDS_DTYPE, - dtype("O"), - dtype("int32"), - ], - index=[ - "syn_weight", 
- "@dynamics:param1", - "afferent_surface_y", - "afferent_surface_z", - "conductance", - "efferent_center_x", - "delay", - "afferent_center_z", - "efferent_section_id", - "afferent_section_id", - "efferent_center_y", - "afferent_center_x", - "efferent_surface_z", - "afferent_center_y", - "afferent_surface_x", - "efferent_surface_x", - "afferent_section_pos", - "efferent_section_pos", - "efferent_surface_y", - "efferent_center_z", - "@source_node", - "@target_node", - "other1", - "other2", - ], - ).sort_index() - pdt.assert_series_equal(self.test_obj.property_dtypes.sort_index(), expected) - - def test_property_dtypes_fail(self): - a = pd.Series( - data=[dtype("int64"), dtype("float64")], index=["syn_weight", "efferent_surface_z"] - ).sort_index() - b = pd.Series( - data=[dtype("int32"), dtype("float64")], index=["syn_weight", "efferent_surface_z"] - ).sort_index() - - with patch( - "bluepysnap.edges.EdgePopulation.property_dtypes", new_callable=PropertyMock - ) as mock: - mock.side_effect = [a, b] - circuit = Circuit(str(TEST_DATA_DIR / "circuit_config.json")) - test_obj = test_module.Edges(circuit) - with pytest.raises(BluepySnapError): - test_obj.property_dtypes.sort_index() - def test_ids(self): np.random.seed(42) # single edge ID --> CircuitEdgeIds return populations with the 0 id @@ -340,7 +266,7 @@ def test_get(self): expected = pd.DataFrame( { "other2": np.array([np.NaN, np.NaN, np.NaN, np.NaN], dtype=float), - "other1": np.array([np.NaN, np.NaN, np.NaN, np.NaN], dtype=object), + "other1": np.array([np.NaN, np.NaN, np.NaN, np.NaN], dtype=float), "@source_node": np.array([2, 0, 0, 2], dtype=int), }, index=pd.MultiIndex.from_tuples( diff --git a/tests/test_nodes/test_nodes.py b/tests/test_nodes/test_nodes.py index e8013b75..b2169459 100644 --- a/tests/test_nodes/test_nodes.py +++ b/tests/test_nodes/test_nodes.py @@ -78,60 +78,6 @@ def test_property_value(self): assert self.test_obj.property_values("mtype") == {"L2_X", "L7_X", "L9_Z", "L8_Y", "L6_Y"} assert 
self.test_obj.property_values("other2") == {10, 11, 12, 13} - def test_property_dtypes(self): - expected = pd.Series( - data=[ - dtype("int64"), - dtype("O"), - dtype("O"), - dtype("O"), - dtype("O"), - dtype("float64"), - dtype("float64"), - dtype("float64"), - dtype("float64"), - dtype("float64"), - dtype("float64"), - dtype("float64"), - dtype("O"), - dtype("int64"), - ], - index=[ - "layer", - "model_template", - "model_type", - "morphology", - "mtype", - "rotation_angle_xaxis", - "rotation_angle_yaxis", - "rotation_angle_zaxis", - "x", - "y", - "z", - "@dynamics:holding_current", - "other1", - "other2", - ], - ).sort_index() - pdt.assert_series_equal(self.test_obj.property_dtypes.sort_index(), expected) - - def test_property_dtypes_fail(self): - a = pd.Series( - data=[dtype("int64"), dtype("O")], index=["layer", "model_template"] - ).sort_index() - b = pd.Series( - data=[dtype("int32"), dtype("O")], index=["layer", "model_template"] - ).sort_index() - - with patch( - "bluepysnap.nodes.NodePopulation.property_dtypes", new_callable=PropertyMock - ) as mock: - mock.side_effect = [a, b] - circuit = Circuit(str(TEST_DATA_DIR / "circuit_config.json")) - test_obj = test_module.Nodes(circuit) - with pytest.raises(BluepySnapError): - test_obj.property_dtypes.sort_index() - def test_ids(self): np.random.seed(42) @@ -352,7 +298,7 @@ def test_get(self): expected = pd.DataFrame( { "other2": np.array([np.NaN, np.NaN, np.NaN], dtype=float), - "other1": np.array([np.NaN, np.NaN, np.NaN], dtype=object), + "other1": np.array([np.NaN, np.NaN, np.NaN], dtype=float), "layer": np.array([2, 6, 6], dtype=int), }, index=pd.MultiIndex.from_tuples(