Skip to content

Commit 8564139

Browse files
authored
Merge pull request #91 from lincc-frameworks/acessor-mapping
Fixes for accessor mapping methods
2 parents 41cce93 + d2c4ebd commit 8564139

File tree

5 files changed

+549
-245
lines changed

5 files changed

+549
-245
lines changed

docs/tutorials/low_level.ipynb

Lines changed: 43 additions & 155 deletions
Original file line numberDiff line numberDiff line change
@@ -17,12 +17,7 @@
1717
"cell_type": "code",
1818
"execution_count": null,
1919
"id": "619f088e7ac0f327",
20-
"metadata": {
21-
"ExecuteTime": {
22-
"end_time": "2024-05-09T12:43:47.641800Z",
23-
"start_time": "2024-05-09T12:43:47.634903Z"
24-
}
25-
},
20+
"metadata": {},
2621
"outputs": [],
2722
"source": [
2823
"import numpy as np\n",
@@ -49,12 +44,7 @@
4944
"cell_type": "code",
5045
"execution_count": null,
5146
"id": "f9dd16a4bb9aaa63",
52-
"metadata": {
53-
"ExecuteTime": {
54-
"end_time": "2024-05-09T12:43:47.708715Z",
55-
"start_time": "2024-05-09T12:43:47.700005Z"
56-
}
57-
},
47+
"metadata": {},
5848
"outputs": [],
5949
"source": [
6050
"nested_df = generate_data(4, 3, seed=42)\n",
@@ -83,20 +73,17 @@
8373
"source": [
8474
"### `.nest` object is a mapping\n",
8575
"\n",
86-
"`.nest` accessor provides an object implementing `Mapping` interface, so you can use it like a dictionary.\n",
87-
"Keys of this mapping are the names of the nested columns (fields), and values are \"flat\" Series representing the nested data."
76+
"The `.nest` accessor provides an object implementing the `Mapping` interface, so you can use it like an immutable dictionary.\n",
77+
"Keys of this mapping are the names of the nested columns (fields), and values are \"flat\" Series representing the nested data.\n",
78+
"\n",
79+
"The only way to modify the nested data in-place with this interface is to re-assign the whole field with new data of the same length and dtype; see the discussion about the mutability limitations in [this GitHub issue](https://github.com/lincc-frameworks/nested-pandas/issues/87)."
8880
]
8981
},
9082
{
9183
"cell_type": "code",
9284
"execution_count": null,
9385
"id": "fb7beb750d3e2893",
94-
"metadata": {
95-
"ExecuteTime": {
96-
"end_time": "2024-05-09T12:43:47.711893Z",
97-
"start_time": "2024-05-09T12:43:47.709614Z"
98-
}
99-
},
86+
"metadata": {},
10087
"outputs": [],
10188
"source": [
10289
"list(nested_series.nest.keys())"
@@ -114,12 +101,7 @@
114101
"cell_type": "code",
115102
"execution_count": null,
116103
"id": "56b0d9ffc5820d22",
117-
"metadata": {
118-
"ExecuteTime": {
119-
"end_time": "2024-05-09T12:43:47.714235Z",
120-
"start_time": "2024-05-09T12:43:47.712499Z"
121-
}
122-
},
104+
"metadata": {},
123105
"outputs": [],
124106
"source": [
125107
"nested_series.nest.fields"
@@ -137,12 +119,7 @@
137119
"cell_type": "code",
138120
"execution_count": null,
139121
"id": "30ee9a430b6ff641",
140-
"metadata": {
141-
"ExecuteTime": {
142-
"end_time": "2024-05-09T12:43:47.717863Z",
143-
"start_time": "2024-05-09T12:43:47.715368Z"
144-
}
145-
},
122+
"metadata": {},
146123
"outputs": [],
147124
"source": [
148125
"nested_series.nest[\"t\"]"
@@ -160,12 +137,7 @@
160137
"cell_type": "code",
161138
"execution_count": null,
162139
"id": "f0db15d31b289140",
163-
"metadata": {
164-
"ExecuteTime": {
165-
"end_time": "2024-05-09T12:43:47.720405Z",
166-
"start_time": "2024-05-09T12:43:47.718626Z"
167-
}
168-
},
140+
"metadata": {},
169141
"outputs": [],
170142
"source": [
171143
"nested_series.nest[[\"t\", \"flux\"]].dtype"
@@ -177,35 +149,38 @@
177149
"metadata": {},
178150
"source": [
179151
"You can add new columns, drop existing ones, or modify the existing ones.\n",
180-
"The modification is currently limited to the case when you replace the whole \"flat\" Series with a new one of the same length.\n",
152+
"These operations create a new nested Series; however, they make shallow copies of the rest of the fields, so they are quite efficient.\n",
153+
"\n",
154+
"The in-place modification is currently limited to the case when you replace the whole \"flat\" Series with a new one of the same length and compatible dtype.\n",
181155
"When modifying the nested data, only the column you are working with is changed, the rest of the data are not affected and not copied."
182156
]
183157
},
184158
{
185159
"cell_type": "code",
186160
"execution_count": null,
187161
"id": "66ae5cc26fa17458",
188-
"metadata": {
189-
"ExecuteTime": {
190-
"end_time": "2024-05-09T12:43:47.726619Z",
191-
"start_time": "2024-05-09T12:43:47.721070Z"
192-
}
193-
},
162+
"metadata": {},
194163
"outputs": [],
195164
"source": [
196165
"new_series = nested_series.copy()\n",
197166
"\n",
198167
"# Change the data in-place\n",
199168
"new_series.nest[\"flux\"] = new_series.nest[\"flux\"] - new_series.nest[\"flux\"].mean()\n",
200169
"\n",
201-
"# Add new column\n",
202-
"new_series.nest[\"lsst_band\"] = \"lsst_\" + new_series.nest[\"band\"]\n",
170+
"# Create a new series with a new column\n",
171+
"new_series = new_series.nest.with_field(\"lsst_band\", \"lsst_\" + new_series.nest[\"band\"])\n",
203172
"\n",
204-
"# Drop the column, .pop() method is also available\n",
205-
"del new_series.nest[\"band\"]\n",
173+
"# Create a new series with a column removed; you can also pass a list of columns to remove\n",
174+
"new_series = new_series.nest.without_field(\"band\")\n",
206175
"\n",
207176
"# Add a new column with a python list instead of a Series\n",
208-
"new_series.nest[\"new_column\"] = [1, 2] * (new_series.nest.flat_length // 2)\n",
177+
"new_series = new_series.nest.with_field(\n",
178+
" \"new_column\",\n",
179+
" [1, 2] * (new_series.nest.flat_length // 2),\n",
180+
")\n",
181+
"\n",
182+
"# Create a new series, with a column dtype changed\n",
183+
"new_series = new_series.nest.with_field(\"t\", new_series.nest[\"t\"].astype(np.int8))\n",
209184
"\n",
210185
"new_series.nest.to_flat()"
211186
]
@@ -228,12 +203,7 @@
228203
"cell_type": "code",
229204
"execution_count": null,
230205
"id": "ce6d519d8d37ead3",
231-
"metadata": {
232-
"ExecuteTime": {
233-
"end_time": "2024-05-09T12:43:47.768616Z",
234-
"start_time": "2024-05-09T12:43:47.764343Z"
235-
}
236-
},
206+
"metadata": {},
237207
"outputs": [],
238208
"source": [
239209
"nested_series.nest.to_flat([\"flux\", \"t\"])"
@@ -243,12 +213,7 @@
243213
"cell_type": "code",
244214
"execution_count": null,
245215
"id": "2421b91387487995",
246-
"metadata": {
247-
"ExecuteTime": {
248-
"end_time": "2024-05-09T12:43:47.798697Z",
249-
"start_time": "2024-05-09T12:43:47.795583Z"
250-
}
251-
},
216+
"metadata": {},
252217
"outputs": [],
253218
"source": [
254219
"lists_df = nested_series.nest.to_lists() # may also accept a list of fields (nested columns) to get\n",
@@ -267,19 +232,12 @@
267232
"cell_type": "code",
268233
"execution_count": null,
269234
"id": "f2c205e95affb9ba",
270-
"metadata": {
271-
"ExecuteTime": {
272-
"end_time": "2024-05-09T12:43:47.833034Z",
273-
"start_time": "2024-05-09T12:43:47.827805Z"
274-
}
275-
},
235+
"metadata": {},
276236
"outputs": [],
277237
"source": [
278-
"new_series = nested_series.copy()\n",
279-
"\n",
280238
"# Adjust each time to be relative to the first observation\n",
281239
"dt = new_series.nest.to_lists()[\"t\"].apply(lambda t: t - t.min())\n",
282-
"new_series.nest.set_list_field(\"dt\", dt)\n",
240+
"new_series = new_series.nest.with_list_field(\"dt\", dt)\n",
283241
"new_series.nest.to_flat()"
284242
]
285243
},
@@ -313,12 +271,7 @@
313271
"cell_type": "code",
314272
"execution_count": null,
315273
"id": "8ef96243c6d74aff",
316-
"metadata": {
317-
"ExecuteTime": {
318-
"end_time": "2024-05-09T12:43:47.875752Z",
319-
"start_time": "2024-05-09T12:43:47.872293Z"
320-
}
321-
},
274+
"metadata": {},
322275
"outputs": [],
323276
"source": [
324277
"struct_series = pd.Series(nested_series, dtype=nested_series.dtype.to_pandas_arrow_dtype())\n",
@@ -329,12 +282,7 @@
329282
"cell_type": "code",
330283
"execution_count": null,
331284
"id": "422e719861ae40f6",
332-
"metadata": {
333-
"ExecuteTime": {
334-
"end_time": "2024-05-09T12:43:47.925465Z",
335-
"start_time": "2024-05-09T12:43:47.922965Z"
336-
}
337-
},
285+
"metadata": {},
338286
"outputs": [],
339287
"source": [
340288
"nested_series.equals(pd.Series(struct_series, dtype=NestedDtype.from_pandas_arrow_dtype(struct_series.dtype)))"
@@ -364,12 +312,7 @@
364312
"cell_type": "code",
365313
"execution_count": null,
366314
"id": "926f2c9fcffc5f03",
367-
"metadata": {
368-
"ExecuteTime": {
369-
"end_time": "2024-05-09T12:43:47.937490Z",
370-
"start_time": "2024-05-09T12:43:47.933878Z"
371-
}
372-
},
315+
"metadata": {},
373316
"outputs": [],
374317
"source": [
375318
"new_series = pack(nested_series.nest.to_flat())\n",
@@ -380,12 +323,7 @@
380323
"cell_type": "code",
381324
"execution_count": null,
382325
"id": "3a1d2025c232ac82",
383-
"metadata": {
384-
"ExecuteTime": {
385-
"end_time": "2024-05-09T12:43:47.969831Z",
386-
"start_time": "2024-05-09T12:43:47.964948Z"
387-
}
388-
},
326+
"metadata": {},
389327
"outputs": [],
390328
"source": [
391329
"series_from_flat = pack(\n",
@@ -422,12 +360,7 @@
422360
"cell_type": "code",
423361
"execution_count": null,
424362
"id": "2de4619726ab3d5c",
425-
"metadata": {
426-
"ExecuteTime": {
427-
"end_time": "2024-05-09T12:43:47.991261Z",
428-
"start_time": "2024-05-09T12:43:47.986129Z"
429-
}
430-
},
363+
"metadata": {},
431364
"outputs": [],
432365
"source": [
433366
"series_from_pack = pack(\n",
@@ -454,12 +387,7 @@
454387
"cell_type": "code",
455388
"execution_count": null,
456389
"id": "9c63ae45dd0b6a29",
457-
"metadata": {
458-
"ExecuteTime": {
459-
"end_time": "2024-05-09T12:43:47.995869Z",
460-
"start_time": "2024-05-09T12:43:47.992016Z"
461-
}
462-
},
390+
"metadata": {},
463391
"outputs": [],
464392
"source": [
465393
"series_from_pack = pack(\n",
@@ -500,12 +428,7 @@
500428
"cell_type": "code",
501429
"execution_count": null,
502430
"id": "1284d9b536b9e784",
503-
"metadata": {
504-
"ExecuteTime": {
505-
"end_time": "2024-05-09T12:43:48.000441Z",
506-
"start_time": "2024-05-09T12:43:47.996620Z"
507-
}
508-
},
431+
"metadata": {},
509432
"outputs": [],
510433
"source": [
511434
"series_from_dtype = pd.Series(\n",
@@ -531,12 +454,7 @@
531454
"cell_type": "code",
532455
"execution_count": null,
533456
"id": "b7c7fd878bc97f68",
534-
"metadata": {
535-
"ExecuteTime": {
536-
"end_time": "2024-05-09T12:43:48.004677Z",
537-
"start_time": "2024-05-09T12:43:48.001129Z"
538-
}
539-
},
457+
"metadata": {},
540458
"outputs": [],
541459
"source": [
542460
"series_pa_type = pa.struct({\"t\": pa.list_(pa.float64()), \"band\": pa.list_(pa.string())})\n",
@@ -568,12 +486,7 @@
568486
"cell_type": "code",
569487
"execution_count": null,
570488
"id": "e837d25dcb0a2b4d",
571-
"metadata": {
572-
"ExecuteTime": {
573-
"end_time": "2024-05-09T12:43:48.015257Z",
574-
"start_time": "2024-05-09T12:43:48.013217Z"
575-
}
576-
},
489+
"metadata": {},
577490
"outputs": [],
578491
"source": [
579492
"pa_struct_array = pa.StructArray.from_arrays(\n",
@@ -611,12 +524,7 @@
611524
"cell_type": "code",
612525
"execution_count": null,
613526
"id": "116c902ea8681c9e",
614-
"metadata": {
615-
"ExecuteTime": {
616-
"end_time": "2024-05-09T12:43:48.040801Z",
617-
"start_time": "2024-05-09T12:43:48.038106Z"
618-
}
619-
},
527+
"metadata": {},
620528
"outputs": [],
621529
"source": [
622530
"# Convert to pd.ArrowDtype Series of struct-arrays\n",
@@ -641,12 +549,7 @@
641549
"cell_type": "code",
642550
"execution_count": null,
643551
"id": "30ea40dee30795d1",
644-
"metadata": {
645-
"ExecuteTime": {
646-
"end_time": "2024-05-09T12:43:48.055678Z",
647-
"start_time": "2024-05-09T12:43:48.050677Z"
648-
}
649-
},
552+
"metadata": {},
650553
"outputs": [],
651554
"source": [
652555
"for element in nested_series:\n",
@@ -665,12 +568,7 @@
665568
"cell_type": "code",
666569
"execution_count": null,
667570
"id": "81f6c1f98dfc26a9",
668-
"metadata": {
669-
"ExecuteTime": {
670-
"end_time": "2024-05-09T12:43:48.060166Z",
671-
"start_time": "2024-05-09T12:43:48.056425Z"
672-
}
673-
},
571+
"metadata": {},
674572
"outputs": [],
675573
"source": [
676574
"nested_elements = list(nested_series)\n",
@@ -689,12 +587,7 @@
689587
"cell_type": "code",
690588
"execution_count": null,
691589
"id": "69ed758c48c55015",
692-
"metadata": {
693-
"ExecuteTime": {
694-
"end_time": "2024-05-09T12:43:48.063115Z",
695-
"start_time": "2024-05-09T12:43:48.060863Z"
696-
}
697-
},
590+
"metadata": {},
698591
"outputs": [],
699592
"source": [
700593
"nested_series_with_na = pack([None, pd.NA, {\"t\": [1, 2], \"flux\": [0.1, None]}])\n",
@@ -707,12 +600,7 @@
707600
"cell_type": "code",
708601
"execution_count": null,
709602
"id": "99ce9d18bc69ae49",
710-
"metadata": {
711-
"ExecuteTime": {
712-
"end_time": "2024-05-09T12:43:48.088986Z",
713-
"start_time": "2024-05-09T12:43:48.086255Z"
714-
}
715-
},
603+
"metadata": {},
716604
"outputs": [],
717605
"source": [
718606
"# Would have empty pd.DataFrame for top-level missing data\n",

0 commit comments

Comments
 (0)