Fixing the issue with conflicting property dtypes #186

Closed · wants to merge 3 commits
5 changes: 0 additions & 5 deletions bluepysnap/circuit_ids.py
@@ -55,11 +55,6 @@ def __init__(self, index, sort_index=True):
index = index.sortlevel()[0]
self.index = index

@property
def index_schema(self):
"""Return an empty index with the same names of the wrapped index."""
return pd.MultiIndex.from_tuples([], names=self.index.names)

@classmethod
def _instance(cls, index, sort_index=True):
"""The instance returned by the functions."""
100 changes: 61 additions & 39 deletions bluepysnap/network.py
@@ -25,6 +25,62 @@
from bluepysnap.exceptions import BluepySnapError


def _gather_properties(populations, ids, properties):
"""Helper function to get data from populations.

Args:
populations (list): populations (NodePopulation, EdgePopulation) to consider
ids (CircuitNodeIds, CircuitEdgeIds): node/edge ids to retrieve
properties (list): properties to retrieve

Returns:
pandas.DataFrame: dataframe containing the gathered data
"""
values = {}
indices = {}

# gather properties for each of the populations
for pop in populations:
global_pop_ids = ids.filter_population(pop.name)

pop_ids = global_pop_ids.get_ids()
pop_properties = set(properties) & pop.property_names

for prop in pop_properties:
data = pop.get(pop_ids, prop)

if data.size > 0:
values[prop] = values.get(prop, []) + [data]

# Is there a better way to merge multi-indices than append?
if prop in indices:
indices[prop] = indices[prop].append(global_pop_ids.index)
else:
indices[prop] = global_pop_ids.index

def _serialize(property_name):
ids = indices[property_name]
val = values[property_name]

# If any of the dtypes is a category, force it. Otherwise, let pandas handle it.
has_categoricals = any(map(pd.api.types.is_categorical_dtype, val))

return pd.Series(
np.concatenate(val),
index=ids,
name=property_name,
dtype="category" if has_categoricals else None,
)

series = [_serialize(prop) for prop in values]

return pd.DataFrame(
pd.concat(series, axis=1) if series else None,
columns=properties,
index=ids.index,
)
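Note (not part of the diff): the dtype of each returned column is whatever NumPy/pandas promotion produces when the per-population values are concatenated, except that any categorical input forces a categorical result. A minimal standalone sketch of that behaviour, using hypothetical values:

import numpy as np
import pandas as pd

# int64 values from one population, float64 from another: NumPy promotes to float64
int_part = pd.Series([1, 2, 3], dtype="int64")
float_part = pd.Series([0.5, 1.5], dtype="float64")
print(np.concatenate([int_part, float_part]).dtype)  # float64

# if any input is categorical, _serialize forces dtype="category"
cat_part = pd.Series(["a", "b"], dtype="category")
str_part = pd.Series(["c"], dtype=object)
print(pd.Series(np.concatenate([cat_part, str_part]), dtype="category").dtype)  # category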


class NetworkObject(abc.ABC):
"""Abstract class for the top level NetworkObjects accessor."""

@@ -48,22 +104,6 @@ def _populations(self):
def population_names(self):
"""Should define all sorted NetworkObjects population names from the Circuit."""

@cached_property
def property_dtypes(self):
"""Returns all the NetworkObjects property dtypes for the Circuit."""

def _update(d, index, value):
if d.setdefault(index, value) != value:
raise BluepySnapError(
f"Same property with different dtype. {index}: {value}!= {d[index]}"
)

res = {}
for pop in self.values():
for varname, dtype in pop.property_dtypes.items():
_update(res, varname, dtype)
return pd.Series(res)
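Note (not part of the diff): this removed accessor is where the conflicting-dtype error came from; as soon as two populations exposed the same property with different dtypes, _update() raised. A standalone sketch of that failure mode, with hypothetical dtypes:

import numpy as np

dtypes_per_population = [
    {"other1": np.dtype("float32")},  # hypothetical dtype in one population
    {"other1": np.dtype("int64")},    # same property, different dtype in another
]

res = {}
for pop_dtypes in dtypes_per_population:
    for varname, dtype in pop_dtypes.items():
        if res.setdefault(varname, dtype) != dtype:
            # the removed code raised BluepySnapError at this point
            raise ValueError(f"Same property with different dtype. {varname}: {dtype} != {res[varname]}")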

def keys(self):
"""Returns iterator on the NetworkObjectPopulation names.

@@ -149,33 +189,15 @@ def get(self, group=None, properties=None):
"""Returns the properties of the NetworkObject."""
ids = self.ids(group)
properties = utils.ensure_list(properties)
# We don't convert properties itself to a set, to keep the column order.
properties_set = set(properties)

unknown_props = properties_set - self.property_names
unknown_props = set(properties) - self.property_names
if unknown_props:
raise BluepySnapError(f"Unknown properties required: {unknown_props}")

# Retrieve the dtypes of the selected properties.
# However, the int dtype may not be preserved if some values are NaN.
dtypes = {
column: dtype
for column, dtype in self.property_dtypes.items()
if column in properties_set
}
dataframes = [pd.DataFrame(columns=properties, index=ids.index_schema).astype(dtypes)]
for name, pop in sorted(self.items()):
# since ids is sorted, global_pop_ids should be sorted as well
global_pop_ids = ids.filter_population(name)
pop_ids = global_pop_ids.get_ids()
if len(pop_ids) > 0:
pop_properties = properties_set & pop.property_names
# Since the columns are passed as Series, index cannot be specified directly.
# However, it's a bit more performant than converting the Series to numpy arrays.
pop_df = pd.DataFrame({prop: pop.get(pop_ids, prop) for prop in pop_properties})
pop_df.index = global_pop_ids.index
dataframes.append(pop_df)
res = pd.concat(dataframes)
populations = self.values()

res = _gather_properties(populations, ids, properties)

assert res.index.is_monotonic_increasing, "The index should be already sorted"
return res
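Usage sketch (not part of the diff; the config path and property names are illustrative): with the new code, a property that exists only in some populations, or with different dtypes across populations, no longer raises. Missing entries come back as NaN and each column gets the promoted dtype.

from bluepysnap import Circuit

circuit = Circuit("circuit_config.json")
df = circuit.nodes.get(properties=["mtype", "x"])
# columns keep the requested order; rows from populations lacking a property are NaN,
# so e.g. an integer property with missing rows comes back as float
print(df.dtypes)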

87 changes: 87 additions & 0 deletions tests/test_dtypes.py
@@ -0,0 +1,87 @@
from pathlib import Path

import h5py
import numpy as np
import pandas as pd
import pytest

from bluepysnap import Circuit
from bluepysnap.exceptions import BluepySnapError

from utils import copy_test_data

TEST_FIELD = "test_field"
TEST_DATA = list("111"), list("2222")
TEST_POPULATIONS = "default", "default2"

_str_dtype = h5py.string_dtype(encoding="utf-8")

MAP_DTYPE = {
"object": _str_dtype,
"str": _str_dtype,
"float32": np.float32,
"float": float,
"int8": np.int8,
"int16": np.int16,
"int32": np.int32,
"int": int,
"uint8": np.uint8,
"uint16": np.uint16,
"uint32": np.uint32,
"uint": np.uint64,
}


def add_test_field(file_path, population_name, data, data_type):
pop_0_path = f"/nodes/{population_name}/0"
test_data_path = f"{pop_0_path}/{TEST_FIELD}"

with h5py.File(file_path, "r+") as h5:
if data_type == "category":
categorical = pd.Categorical(data)
categories = categorical.categories.values

lib_path = f"{pop_0_path}/@library/{TEST_FIELD}"

h5.create_dataset(lib_path, data=categories)

data = categorical.codes
dtype = data.dtype
else:
dtype = MAP_DTYPE[data_type]

h5.create_dataset(test_data_path, data=data, dtype=dtype)
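Illustrative usage of the helper above (a sketch, not part of the diff): for "category", the values are stored as integer codes at /nodes/<population>/0/test_field and the unique values under @library/test_field; all other dtypes are written directly. Mirroring one of the parametrized cases below:

# node_path points at the copied nodes.h5, as set up inside test_resulting_dtypes
for population, data, dtype in zip(TEST_POPULATIONS, TEST_DATA, ("category", "str")):
    add_test_field(node_path, population, data, dtype)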


@pytest.mark.parametrize(
("dtypes", "expected"),
(
(("category", "category"), "category"),
(("int8", "int8"), "int8"),
(("uint8", "uint8"), "uint8"),
(("object", "object"), "object"),
(("category", "str"), "category"),
(("category", "int"), "category"),
(("int", "float"), "float"),
(("int", "str"), "object"),
(("int", "int16"), "int"),
(("int", "int32"), "int"),
(("uint32", "int32"), "int"),
(("uint", "float"), "float"),
(("float32", "float"), "float"),
(("int8", "uint8"), "int16"),
(("int16", "uint8"), "int16"),
(("int16", "uint16"), "int32"),
(("int32", "uint32"), "int"),
(("int", "uint32"), "int"),
(("int", "uint"), "float"),
),
)
def test_resulting_dtypes(dtypes, expected):
with copy_test_data() as (test_dir, config_path):
node_path = Path(test_dir) / "nodes.h5"
for population, data, dtype in zip(TEST_POPULATIONS, TEST_DATA, dtypes):
add_test_field(node_path, population, data, dtype)

res = Circuit(config_path).nodes.get(properties=TEST_FIELD)
assert res[TEST_FIELD].dtype == expected
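The purely numeric expectations in the table above follow NumPy's type-promotion rules, which np.concatenate applies when _gather_properties merges the per-population values (categoricals are forced to "category", and mixing strings with numbers yields "object"). A quick standalone check, not part of the test suite:

import numpy as np

assert np.result_type(np.int8, np.uint8) == np.int16
assert np.result_type(np.int16, np.uint16) == np.int32
assert np.result_type(np.int32, np.uint32) == np.int64
assert np.result_type(np.int64, np.uint64) == np.float64  # mixing int64 and uint64 falls back to float
assert np.result_type(np.float32, np.float64) == np.float64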
77 changes: 1 addition & 76 deletions tests/test_edges/test_edges.py
@@ -85,80 +85,6 @@ def test_property_names(self):
"syn_weight",
}

def test_property_dtypes(self):
expected = pd.Series(
data=[
dtype("float32"),
dtype("float64"),
dtype("float64"),
dtype("float64"),
dtype("float32"),
dtype("float64"),
dtype("float32"),
dtype("float64"),
dtype("int64"),
dtype("int64"),
dtype("float64"),
dtype("float64"),
dtype("float64"),
dtype("float64"),
dtype("float64"),
dtype("float64"),
dtype("float32"),
dtype("float32"),
dtype("float64"),
dtype("float64"),
IDS_DTYPE,
IDS_DTYPE,
dtype("O"),
dtype("int32"),
],
index=[
"syn_weight",
"@dynamics:param1",
"afferent_surface_y",
"afferent_surface_z",
"conductance",
"efferent_center_x",
"delay",
"afferent_center_z",
"efferent_section_id",
"afferent_section_id",
"efferent_center_y",
"afferent_center_x",
"efferent_surface_z",
"afferent_center_y",
"afferent_surface_x",
"efferent_surface_x",
"afferent_section_pos",
"efferent_section_pos",
"efferent_surface_y",
"efferent_center_z",
"@source_node",
"@target_node",
"other1",
"other2",
],
).sort_index()
pdt.assert_series_equal(self.test_obj.property_dtypes.sort_index(), expected)

def test_property_dtypes_fail(self):
a = pd.Series(
data=[dtype("int64"), dtype("float64")], index=["syn_weight", "efferent_surface_z"]
).sort_index()
b = pd.Series(
data=[dtype("int32"), dtype("float64")], index=["syn_weight", "efferent_surface_z"]
).sort_index()

with patch(
"bluepysnap.edges.EdgePopulation.property_dtypes", new_callable=PropertyMock
) as mock:
mock.side_effect = [a, b]
circuit = Circuit(str(TEST_DATA_DIR / "circuit_config.json"))
test_obj = test_module.Edges(circuit)
with pytest.raises(BluepySnapError):
test_obj.property_dtypes.sort_index()

def test_ids(self):
np.random.seed(42)
# single edge ID --> CircuitEdgeIds return populations with the 0 id
@@ -340,7 +266,7 @@ def test_get(self):
expected = pd.DataFrame(
{
"other2": np.array([np.NaN, np.NaN, np.NaN, np.NaN], dtype=float),
"other1": np.array([np.NaN, np.NaN, np.NaN, np.NaN], dtype=object),
"other1": np.array([np.NaN, np.NaN, np.NaN, np.NaN], dtype=float),
"@source_node": np.array([2, 0, 0, 2], dtype=int),
},
index=pd.MultiIndex.from_tuples(
@@ -776,7 +702,6 @@ def test_pickle(self, tmp_path):
# trigger some cached properties, to make sure they aren't being pickled
self.test_obj.size
self.test_obj.property_names
self.test_obj.property_dtypes

with open(pickle_path, "wb") as fd:
pickle.dump(self.test_obj, fd)