Skip to content

Commit

Permalink
Merge pull request #91 from lincc-frameworks/acessor-mapping
Browse files Browse the repository at this point in the history
Fixes for accessor mapping methods
  • Loading branch information
hombit authored May 29, 2024
2 parents 41cce93 + d2c4ebd commit 8564139
Show file tree
Hide file tree
Showing 5 changed files with 549 additions and 245 deletions.
198 changes: 43 additions & 155 deletions docs/tutorials/low_level.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,7 @@
"cell_type": "code",
"execution_count": null,
"id": "619f088e7ac0f327",
"metadata": {
"ExecuteTime": {
"end_time": "2024-05-09T12:43:47.641800Z",
"start_time": "2024-05-09T12:43:47.634903Z"
}
},
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
Expand All @@ -49,12 +44,7 @@
"cell_type": "code",
"execution_count": null,
"id": "f9dd16a4bb9aaa63",
"metadata": {
"ExecuteTime": {
"end_time": "2024-05-09T12:43:47.708715Z",
"start_time": "2024-05-09T12:43:47.700005Z"
}
},
"metadata": {},
"outputs": [],
"source": [
"nested_df = generate_data(4, 3, seed=42)\n",
Expand Down Expand Up @@ -83,20 +73,17 @@
"source": [
"### `.nest` object is a mapping\n",
"\n",
"`.nest` accessor provides an object implementing `Mapping` interface, so you can use it like a dictionary.\n",
"Keys of this mapping are the names of the nested columns (fields), and values are \"flat\" Series representing the nested data."
"`.nest` accessor provides an object implementing `Mapping` interface, so you can use it like an immutable dictionary.\n",
"Keys of this mapping are the names of the nested columns (fields), and values are \"flat\" Series representing the nested data.\n",
"\n",
"The only way to modify the nested data in-place with this interface is to re-assign the whole field with a new data of the same length and dtype, see the discussion about the mutability limitations in [this GitHub issue](https://github.com/lincc-frameworks/nested-pandas/issues/87)."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fb7beb750d3e2893",
"metadata": {
"ExecuteTime": {
"end_time": "2024-05-09T12:43:47.711893Z",
"start_time": "2024-05-09T12:43:47.709614Z"
}
},
"metadata": {},
"outputs": [],
"source": [
"list(nested_series.nest.keys())"
Expand All @@ -114,12 +101,7 @@
"cell_type": "code",
"execution_count": null,
"id": "56b0d9ffc5820d22",
"metadata": {
"ExecuteTime": {
"end_time": "2024-05-09T12:43:47.714235Z",
"start_time": "2024-05-09T12:43:47.712499Z"
}
},
"metadata": {},
"outputs": [],
"source": [
"nested_series.nest.fields"
Expand All @@ -137,12 +119,7 @@
"cell_type": "code",
"execution_count": null,
"id": "30ee9a430b6ff641",
"metadata": {
"ExecuteTime": {
"end_time": "2024-05-09T12:43:47.717863Z",
"start_time": "2024-05-09T12:43:47.715368Z"
}
},
"metadata": {},
"outputs": [],
"source": [
"nested_series.nest[\"t\"]"
Expand All @@ -160,12 +137,7 @@
"cell_type": "code",
"execution_count": null,
"id": "f0db15d31b289140",
"metadata": {
"ExecuteTime": {
"end_time": "2024-05-09T12:43:47.720405Z",
"start_time": "2024-05-09T12:43:47.718626Z"
}
},
"metadata": {},
"outputs": [],
"source": [
"nested_series.nest[[\"t\", \"flux\"]].dtype"
Expand All @@ -177,35 +149,38 @@
"metadata": {},
"source": [
"You can add new columns, drop existing ones, or modify the existing ones.\n",
"The modification is currently limited to the case when you replace the whole \"flat\" Series with a new one of the same length.\n",
"These operations would create new nested Series, however they would create shallow copies of the rest of the fields, so they are quite efficient.\n",
"\n",
"The in-place modification is currently limited to the case when you replace the whole \"flat\" Series with a new one of the same length and compatible dtype.\n",
"When modifying the nested data, only the column you are working with is changed, the rest of the data are not affected and not copied."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "66ae5cc26fa17458",
"metadata": {
"ExecuteTime": {
"end_time": "2024-05-09T12:43:47.726619Z",
"start_time": "2024-05-09T12:43:47.721070Z"
}
},
"metadata": {},
"outputs": [],
"source": [
"new_series = nested_series.copy()\n",
"\n",
"# Change the data in-place\n",
"new_series.nest[\"flux\"] = new_series.nest[\"flux\"] - new_series.nest[\"flux\"].mean()\n",
"\n",
"# Add new column\n",
"new_series.nest[\"lsst_band\"] = \"lsst_\" + new_series.nest[\"band\"]\n",
"# Create a new series with a new column\n",
"new_series = new_series.nest.with_field(\"lsst_band\", \"lsst_\" + new_series.nest[\"band\"])\n",
"\n",
"# Drop the column, .pop() method is also available\n",
"del new_series.nest[\"band\"]\n",
"# Create a new series with a column removed, you can also pass a list of columns to remove\n",
"new_series = new_series.nest.without_field(\"band\")\n",
"\n",
"# Add a new column with a python list instead of a Series\n",
"new_series.nest[\"new_column\"] = [1, 2] * (new_series.nest.flat_length // 2)\n",
"new_series = new_series.nest.with_field(\n",
" \"new_column\",\n",
" [1, 2] * (new_series.nest.flat_length // 2),\n",
")\n",
"\n",
"# Create a new series, with a column dtype changed\n",
"new_series = new_series.nest.with_field(\"t\", new_series.nest[\"t\"].astype(np.int8))\n",
"\n",
"new_series.nest.to_flat()"
]
Expand All @@ -228,12 +203,7 @@
"cell_type": "code",
"execution_count": null,
"id": "ce6d519d8d37ead3",
"metadata": {
"ExecuteTime": {
"end_time": "2024-05-09T12:43:47.768616Z",
"start_time": "2024-05-09T12:43:47.764343Z"
}
},
"metadata": {},
"outputs": [],
"source": [
"nested_series.nest.to_flat([\"flux\", \"t\"])"
Expand All @@ -243,12 +213,7 @@
"cell_type": "code",
"execution_count": null,
"id": "2421b91387487995",
"metadata": {
"ExecuteTime": {
"end_time": "2024-05-09T12:43:47.798697Z",
"start_time": "2024-05-09T12:43:47.795583Z"
}
},
"metadata": {},
"outputs": [],
"source": [
"lists_df = nested_series.nest.to_lists() # may also accept a list of fields (nested columns) to get\n",
Expand All @@ -267,19 +232,12 @@
"cell_type": "code",
"execution_count": null,
"id": "f2c205e95affb9ba",
"metadata": {
"ExecuteTime": {
"end_time": "2024-05-09T12:43:47.833034Z",
"start_time": "2024-05-09T12:43:47.827805Z"
}
},
"metadata": {},
"outputs": [],
"source": [
"new_series = nested_series.copy()\n",
"\n",
"# Adjust each time to be relative to the first observation\n",
"dt = new_series.nest.to_lists()[\"t\"].apply(lambda t: t - t.min())\n",
"new_series.nest.set_list_field(\"dt\", dt)\n",
"new_series = new_series.nest.with_list_field(\"dt\", dt)\n",
"new_series.nest.to_flat()"
]
},
Expand Down Expand Up @@ -313,12 +271,7 @@
"cell_type": "code",
"execution_count": null,
"id": "8ef96243c6d74aff",
"metadata": {
"ExecuteTime": {
"end_time": "2024-05-09T12:43:47.875752Z",
"start_time": "2024-05-09T12:43:47.872293Z"
}
},
"metadata": {},
"outputs": [],
"source": [
"struct_series = pd.Series(nested_series, dtype=nested_series.dtype.to_pandas_arrow_dtype())\n",
Expand All @@ -329,12 +282,7 @@
"cell_type": "code",
"execution_count": null,
"id": "422e719861ae40f6",
"metadata": {
"ExecuteTime": {
"end_time": "2024-05-09T12:43:47.925465Z",
"start_time": "2024-05-09T12:43:47.922965Z"
}
},
"metadata": {},
"outputs": [],
"source": [
"nested_series.equals(pd.Series(struct_series, dtype=NestedDtype.from_pandas_arrow_dtype(struct_series.dtype)))"
Expand Down Expand Up @@ -364,12 +312,7 @@
"cell_type": "code",
"execution_count": null,
"id": "926f2c9fcffc5f03",
"metadata": {
"ExecuteTime": {
"end_time": "2024-05-09T12:43:47.937490Z",
"start_time": "2024-05-09T12:43:47.933878Z"
}
},
"metadata": {},
"outputs": [],
"source": [
"new_series = pack(nested_series.nest.to_flat())\n",
Expand All @@ -380,12 +323,7 @@
"cell_type": "code",
"execution_count": null,
"id": "3a1d2025c232ac82",
"metadata": {
"ExecuteTime": {
"end_time": "2024-05-09T12:43:47.969831Z",
"start_time": "2024-05-09T12:43:47.964948Z"
}
},
"metadata": {},
"outputs": [],
"source": [
"series_from_flat = pack(\n",
Expand Down Expand Up @@ -422,12 +360,7 @@
"cell_type": "code",
"execution_count": null,
"id": "2de4619726ab3d5c",
"metadata": {
"ExecuteTime": {
"end_time": "2024-05-09T12:43:47.991261Z",
"start_time": "2024-05-09T12:43:47.986129Z"
}
},
"metadata": {},
"outputs": [],
"source": [
"series_from_pack = pack(\n",
Expand All @@ -454,12 +387,7 @@
"cell_type": "code",
"execution_count": null,
"id": "9c63ae45dd0b6a29",
"metadata": {
"ExecuteTime": {
"end_time": "2024-05-09T12:43:47.995869Z",
"start_time": "2024-05-09T12:43:47.992016Z"
}
},
"metadata": {},
"outputs": [],
"source": [
"series_from_pack = pack(\n",
Expand Down Expand Up @@ -500,12 +428,7 @@
"cell_type": "code",
"execution_count": null,
"id": "1284d9b536b9e784",
"metadata": {
"ExecuteTime": {
"end_time": "2024-05-09T12:43:48.000441Z",
"start_time": "2024-05-09T12:43:47.996620Z"
}
},
"metadata": {},
"outputs": [],
"source": [
"series_from_dtype = pd.Series(\n",
Expand All @@ -531,12 +454,7 @@
"cell_type": "code",
"execution_count": null,
"id": "b7c7fd878bc97f68",
"metadata": {
"ExecuteTime": {
"end_time": "2024-05-09T12:43:48.004677Z",
"start_time": "2024-05-09T12:43:48.001129Z"
}
},
"metadata": {},
"outputs": [],
"source": [
"series_pa_type = pa.struct({\"t\": pa.list_(pa.float64()), \"band\": pa.list_(pa.string())})\n",
Expand Down Expand Up @@ -568,12 +486,7 @@
"cell_type": "code",
"execution_count": null,
"id": "e837d25dcb0a2b4d",
"metadata": {
"ExecuteTime": {
"end_time": "2024-05-09T12:43:48.015257Z",
"start_time": "2024-05-09T12:43:48.013217Z"
}
},
"metadata": {},
"outputs": [],
"source": [
"pa_struct_array = pa.StructArray.from_arrays(\n",
Expand Down Expand Up @@ -611,12 +524,7 @@
"cell_type": "code",
"execution_count": null,
"id": "116c902ea8681c9e",
"metadata": {
"ExecuteTime": {
"end_time": "2024-05-09T12:43:48.040801Z",
"start_time": "2024-05-09T12:43:48.038106Z"
}
},
"metadata": {},
"outputs": [],
"source": [
"# Convert to pd.ArrowDtype Series of struct-arrays\n",
Expand All @@ -641,12 +549,7 @@
"cell_type": "code",
"execution_count": null,
"id": "30ea40dee30795d1",
"metadata": {
"ExecuteTime": {
"end_time": "2024-05-09T12:43:48.055678Z",
"start_time": "2024-05-09T12:43:48.050677Z"
}
},
"metadata": {},
"outputs": [],
"source": [
"for element in nested_series:\n",
Expand All @@ -665,12 +568,7 @@
"cell_type": "code",
"execution_count": null,
"id": "81f6c1f98dfc26a9",
"metadata": {
"ExecuteTime": {
"end_time": "2024-05-09T12:43:48.060166Z",
"start_time": "2024-05-09T12:43:48.056425Z"
}
},
"metadata": {},
"outputs": [],
"source": [
"nested_elements = list(nested_series)\n",
Expand All @@ -689,12 +587,7 @@
"cell_type": "code",
"execution_count": null,
"id": "69ed758c48c55015",
"metadata": {
"ExecuteTime": {
"end_time": "2024-05-09T12:43:48.063115Z",
"start_time": "2024-05-09T12:43:48.060863Z"
}
},
"metadata": {},
"outputs": [],
"source": [
"nested_series_with_na = pack([None, pd.NA, {\"t\": [1, 2], \"flux\": [0.1, None]}])\n",
Expand All @@ -707,12 +600,7 @@
"cell_type": "code",
"execution_count": null,
"id": "99ce9d18bc69ae49",
"metadata": {
"ExecuteTime": {
"end_time": "2024-05-09T12:43:48.088986Z",
"start_time": "2024-05-09T12:43:48.086255Z"
}
},
"metadata": {},
"outputs": [],
"source": [
"# Would have empty pd.DataFrame for top-level missed data\n",
Expand Down
Loading

0 comments on commit 8564139

Please sign in to comment.