Fixing the issue with conflicting property dtypes #186

Closed · wants to merge 3 commits
5 changes: 0 additions & 5 deletions bluepysnap/circuit_ids.py
@@ -55,11 +55,6 @@ def __init__(self, index, sort_index=True):
index = index.sortlevel()[0]
self.index = index

@property
def index_schema(self):
"""Return an empty index with the same names of the wrapped index."""
return pd.MultiIndex.from_tuples([], names=self.index.names)

@classmethod
def _instance(cls, index, sort_index=True):
"""The instance returned by the functions."""
100 changes: 61 additions & 39 deletions bluepysnap/network.py
@@ -25,6 +25,62 @@
from bluepysnap.exceptions import BluepySnapError


def _gather_properties(populations, ids, properties):
"""Helper function to get data from populations.

Args:
populations (list): populations (NodePopulation, EdgePopulation) to consider
ids (CircuitNodeIds, CircuitEdgeIds): node/edge ids to retrieve
properties (list): properties to retrieve

Returns:
pandas.DataFrame: dataframe containing the gathered data
"""
values = {}
indices = {}

# gather properties for each of the populations
for pop in populations:
global_pop_ids = ids.filter_population(pop.name)

pop_ids = global_pop_ids.get_ids()
pop_properties = set(properties) & pop.property_names

for prop in pop_properties:
data = pop.get(pop_ids, prop)

if data.size > 0:
values[prop] = values.get(prop, []) + [data]

# Is there a better way to merge multi-indices than append?
if prop in indices:
indices[prop] = indices[prop].append(global_pop_ids.index)
else:
indices[prop] = global_pop_ids.index

def _serialize(property_name):
ids = indices[property_name]
val = values[property_name]

# If any of the dtypes is a category, force it. Otherwise, let pandas handle it.
has_categoricals = any(map(pd.api.types.is_categorical_dtype, val))

return pd.Series(
np.concatenate(val),
index=ids,
name=property_name,
dtype="category" if has_categoricals else None,
)

series = [_serialize(prop) for prop in values]

return pd.DataFrame(
pd.concat(series, axis=1) if series else None,
columns=properties,
index=ids.index,
)
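Note (not part of the diff): the dtype of each returned column is whatever NumPy/pandas promotion produces when the per-population values are concatenated, except that any categorical input forces a categorical result. A minimal standalone sketch of that behaviour, using hypothetical values:

import numpy as np
import pandas as pd

# int64 values from one population, float64 from another: NumPy promotes to float64
int_part = pd.Series([1, 2, 3], dtype="int64")
float_part = pd.Series([0.5, 1.5], dtype="float64")
print(np.concatenate([int_part, float_part]).dtype)  # float64

# if any input is categorical, _serialize forces dtype="category"
cat_part = pd.Series(["a", "b"], dtype="category")
str_part = pd.Series(["c"], dtype=object)
print(pd.Series(np.concatenate([cat_part, str_part]), dtype="category").dtype)  # category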


class NetworkObject(abc.ABC):
"""Abstract class for the top level NetworkObjects accessor."""

@@ -48,22 +104,6 @@ def _populations(self):
def population_names(self):
"""Should define all sorted NetworkObjects population names from the Circuit."""

@cached_property
def property_dtypes(self):
"""Returns all the NetworkObjects property dtypes for the Circuit."""

def _update(d, index, value):
if d.setdefault(index, value) != value:
raise BluepySnapError(
f"Same property with different dtype. {index}: {value}!= {d[index]}"
)

res = {}
for pop in self.values():
for varname, dtype in pop.property_dtypes.items():
_update(res, varname, dtype)
return pd.Series(res)
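Note (not part of the diff): this removed accessor is where the conflicting-dtype error came from; as soon as two populations exposed the same property with different dtypes, _update() raised. A standalone sketch of that failure mode, with hypothetical dtypes:

import numpy as np

dtypes_per_population = [
    {"other1": np.dtype("float32")},  # hypothetical dtype in one population
    {"other1": np.dtype("int64")},    # same property, different dtype in another
]

res = {}
for pop_dtypes in dtypes_per_population:
    for varname, dtype in pop_dtypes.items():
        if res.setdefault(varname, dtype) != dtype:
            # the removed code raised BluepySnapError at this point
            raise ValueError(f"Same property with different dtype. {varname}: {dtype} != {res[varname]}")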

def keys(self):
"""Returns iterator on the NetworkObjectPopulation names.

@@ -149,33 +189,15 @@ def get(self, group=None, properties=None):
"""Returns the properties of the NetworkObject."""
ids = self.ids(group)
properties = utils.ensure_list(properties)
# We don't convert properties itself to a set, to keep the column order.
properties_set = set(properties)

unknown_props = properties_set - self.property_names
unknown_props = set(properties) - self.property_names
if unknown_props:
raise BluepySnapError(f"Unknown properties required: {unknown_props}")

# Retrieve the dtypes of the selected properties.
# However, the int dtype may not be preserved if some values are NaN.
dtypes = {
column: dtype
for column, dtype in self.property_dtypes.items()
if column in properties_set
}
dataframes = [pd.DataFrame(columns=properties, index=ids.index_schema).astype(dtypes)]
for name, pop in sorted(self.items()):
# since ids is sorted, global_pop_ids should be sorted as well
global_pop_ids = ids.filter_population(name)
pop_ids = global_pop_ids.get_ids()
if len(pop_ids) > 0:
pop_properties = properties_set & pop.property_names
# Since the columns are passed as Series, index cannot be specified directly.
# However, it's a bit more performant than converting the Series to numpy arrays.
pop_df = pd.DataFrame({prop: pop.get(pop_ids, prop) for prop in pop_properties})
pop_df.index = global_pop_ids.index
dataframes.append(pop_df)
res = pd.concat(dataframes)
populations = self.values()

res = _gather_properties(populations, ids, properties)

assert res.index.is_monotonic_increasing, "The index should be already sorted"
return res
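Usage sketch (not part of the diff; the config path and property names are illustrative): with the new code, a property that exists only in some populations, or with different dtypes across populations, no longer raises. Missing entries come back as NaN and each column gets the promoted dtype.

from bluepysnap import Circuit

circuit = Circuit("circuit_config.json")
df = circuit.nodes.get(properties=["mtype", "x"])
# columns keep the requested order; rows from populations lacking a property are NaN,
# so e.g. an integer property with missing rows comes back as float
print(df.dtypes)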

87 changes: 87 additions & 0 deletions tests/test_dtypes.py
@@ -0,0 +1,87 @@
from pathlib import Path

import h5py
import numpy as np
import pandas as pd
import pytest

from bluepysnap import Circuit
from bluepysnap.exceptions import BluepySnapError

from utils import copy_test_data

TEST_FIELD = "test_field"
TEST_DATA = list("111"), list("2222")
TEST_POPULATIONS = "default", "default2"

_str_dtype = h5py.string_dtype(encoding="utf-8")

MAP_DTYPE = {
"object": _str_dtype,
"str": _str_dtype,
"float32": np.float32,
"float": float,
"int8": np.int8,
"int16": np.int16,
"int32": np.int32,
"int": int,
"uint8": np.uint8,
"uint16": np.uint16,
"uint32": np.uint32,
"uint": np.uint64,
}


def add_test_field(file_path, population_name, data, data_type):
pop_0_path = f"/nodes/{population_name}/0"
test_data_path = f"{pop_0_path}/{TEST_FIELD}"

with h5py.File(file_path, "r+") as h5:
if data_type == "category":
categorical = pd.Categorical(data)
categories = categorical.categories.values

lib_path = f"{pop_0_path}/@library/{TEST_FIELD}"

h5.create_dataset(lib_path, data=categories)

data = categorical.codes
dtype = data.dtype
else:
dtype = MAP_DTYPE[data_type]

h5.create_dataset(test_data_path, data=data, dtype=dtype)
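Illustrative usage of the helper above (a sketch, not part of the diff): for "category", the values are stored as integer codes at /nodes/<population>/0/test_field and the unique values under @library/test_field; all other dtypes are written directly. Mirroring one of the parametrized cases below:

# node_path points at the copied nodes.h5, as set up inside test_resulting_dtypes
for population, data, dtype in zip(TEST_POPULATIONS, TEST_DATA, ("category", "str")):
    add_test_field(node_path, population, data, dtype)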


@pytest.mark.parametrize(
("dtypes", "expected"),
(
(("category", "category"), "category"),
(("int8", "int8"), "int8"),
(("uint8", "uint8"), "uint8"),
(("object", "object"), "object"),
(("category", "str"), "category"),
(("category", "int"), "category"),
(("int", "float"), "float"),
(("int", "str"), "object"),
(("int", "int16"), "int"),
(("int", "int32"), "int"),
(("uint32", "int32"), "int"),
(("uint", "float"), "float"),
(("float32", "float"), "float"),
(("int8", "uint8"), "int16"),
(("int16", "uint8"), "int16"),
(("int16", "uint16"), "int32"),
(("int32", "uint32"), "int"),
(("int", "uint32"), "int"),
(("int", "uint"), "float"),
),
)
def test_resulting_dtypes(dtypes, expected):
with copy_test_data() as (test_dir, config_path):
node_path = Path(test_dir) / "nodes.h5"
for population, data, dtype in zip(TEST_POPULATIONS, TEST_DATA, dtypes):
add_test_field(node_path, population, data, dtype)

res = Circuit(config_path).nodes.get(properties=TEST_FIELD)
assert res[TEST_FIELD].dtype == expected
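The purely numeric expectations in the table above follow NumPy's type-promotion rules, which np.concatenate applies when _gather_properties merges the per-population values (categoricals are forced to "category", and mixing strings with numbers yields "object"). A quick standalone check, not part of the test suite:

import numpy as np

assert np.result_type(np.int8, np.uint8) == np.int16
assert np.result_type(np.int16, np.uint16) == np.int32
assert np.result_type(np.int32, np.uint32) == np.int64
assert np.result_type(np.int64, np.uint64) == np.float64  # mixing int64 and uint64 falls back to float
assert np.result_type(np.float32, np.float64) == np.float64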
77 changes: 1 addition & 76 deletions tests/test_edges/test_edges.py
@@ -85,80 +85,6 @@ def test_property_names(self):
"syn_weight",
}

def test_property_dtypes(self):
expected = pd.Series(
data=[
dtype("float32"),
dtype("float64"),
dtype("float64"),
dtype("float64"),
dtype("float32"),
dtype("float64"),
dtype("float32"),
dtype("float64"),
dtype("int64"),
dtype("int64"),
dtype("float64"),
dtype("float64"),
dtype("float64"),
dtype("float64"),
dtype("float64"),
dtype("float64"),
dtype("float32"),
dtype("float32"),
dtype("float64"),
dtype("float64"),
IDS_DTYPE,
IDS_DTYPE,
dtype("O"),
dtype("int32"),
],
index=[
"syn_weight",
"@dynamics:param1",
"afferent_surface_y",
"afferent_surface_z",
"conductance",
"efferent_center_x",
"delay",
"afferent_center_z",
"efferent_section_id",
"afferent_section_id",
"efferent_center_y",
"afferent_center_x",
"efferent_surface_z",
"afferent_center_y",
"afferent_surface_x",
"efferent_surface_x",
"afferent_section_pos",
"efferent_section_pos",
"efferent_surface_y",
"efferent_center_z",
"@source_node",
"@target_node",
"other1",
"other2",
],
).sort_index()
pdt.assert_series_equal(self.test_obj.property_dtypes.sort_index(), expected)

def test_property_dtypes_fail(self):
a = pd.Series(
data=[dtype("int64"), dtype("float64")], index=["syn_weight", "efferent_surface_z"]
).sort_index()
b = pd.Series(
data=[dtype("int32"), dtype("float64")], index=["syn_weight", "efferent_surface_z"]
).sort_index()

with patch(
"bluepysnap.edges.EdgePopulation.property_dtypes", new_callable=PropertyMock
) as mock:
mock.side_effect = [a, b]
circuit = Circuit(str(TEST_DATA_DIR / "circuit_config.json"))
test_obj = test_module.Edges(circuit)
with pytest.raises(BluepySnapError):
test_obj.property_dtypes.sort_index()

def test_ids(self):
np.random.seed(42)
# single edge ID --> CircuitEdgeIds return populations with the 0 id
@@ -340,7 +266,7 @@ def test_get(self):
expected = pd.DataFrame(
{
"other2": np.array([np.NaN, np.NaN, np.NaN, np.NaN], dtype=float),
"other1": np.array([np.NaN, np.NaN, np.NaN, np.NaN], dtype=object),
"other1": np.array([np.NaN, np.NaN, np.NaN, np.NaN], dtype=float),
"@source_node": np.array([2, 0, 0, 2], dtype=int),
},
index=pd.MultiIndex.from_tuples(
@@ -776,7 +702,6 @@ def test_pickle(self, tmp_path):
# trigger some cached properties, to make sure they aren't being pickled
self.test_obj.size
self.test_obj.property_names
self.test_obj.property_dtypes

with open(pickle_path, "wb") as fd:
pickle.dump(self.test_obj, fd)