|
17 | 17 | "cell_type": "code",
|
18 | 18 | "execution_count": null,
|
19 | 19 | "id": "619f088e7ac0f327",
|
20 |
| - "metadata": { |
21 |
| - "ExecuteTime": { |
22 |
| - "end_time": "2024-05-09T12:43:47.641800Z", |
23 |
| - "start_time": "2024-05-09T12:43:47.634903Z" |
24 |
| - } |
25 |
| - }, |
| 20 | + "metadata": {}, |
26 | 21 | "outputs": [],
|
27 | 22 | "source": [
|
28 | 23 | "import numpy as np\n",
|
|
49 | 44 | "cell_type": "code",
|
50 | 45 | "execution_count": null,
|
51 | 46 | "id": "f9dd16a4bb9aaa63",
|
52 |
| - "metadata": { |
53 |
| - "ExecuteTime": { |
54 |
| - "end_time": "2024-05-09T12:43:47.708715Z", |
55 |
| - "start_time": "2024-05-09T12:43:47.700005Z" |
56 |
| - } |
57 |
| - }, |
| 47 | + "metadata": {}, |
58 | 48 | "outputs": [],
|
59 | 49 | "source": [
|
60 | 50 | "nested_df = generate_data(4, 3, seed=42)\n",
|
|
83 | 73 | "source": [
|
84 | 74 | "### `.nest` object is a mapping\n",
|
85 | 75 | "\n",
|
86 |
| - "`.nest` accessor provides an object implementing `Mapping` interface, so you can use it like a dictionary.\n", |
87 |
| - "Keys of this mapping are the names of the nested columns (fields), and values are \"flat\" Series representing the nested data." |
| 76 | + "The `.nest` accessor provides an object implementing the `Mapping` interface, so you can use it like an immutable dictionary.\n", |
| 77 | + "Keys of this mapping are the names of the nested columns (fields), and values are \"flat\" Series representing the nested data.\n", |
| 78 | + "\n", |
| 79 | + "The only way to modify the nested data in-place with this interface is to re-assign the whole field with new data of the same length and dtype; see the discussion of the mutability limitations in [this GitHub issue](https://github.com/lincc-frameworks/nested-pandas/issues/87)." |
88 | 80 | ]
|
89 | 81 | },
|
90 | 82 | {
|
91 | 83 | "cell_type": "code",
|
92 | 84 | "execution_count": null,
|
93 | 85 | "id": "fb7beb750d3e2893",
|
94 |
| - "metadata": { |
95 |
| - "ExecuteTime": { |
96 |
| - "end_time": "2024-05-09T12:43:47.711893Z", |
97 |
| - "start_time": "2024-05-09T12:43:47.709614Z" |
98 |
| - } |
99 |
| - }, |
| 86 | + "metadata": {}, |
100 | 87 | "outputs": [],
|
101 | 88 | "source": [
|
102 | 89 | "list(nested_series.nest.keys())"
|
|
114 | 101 | "cell_type": "code",
|
115 | 102 | "execution_count": null,
|
116 | 103 | "id": "56b0d9ffc5820d22",
|
117 |
| - "metadata": { |
118 |
| - "ExecuteTime": { |
119 |
| - "end_time": "2024-05-09T12:43:47.714235Z", |
120 |
| - "start_time": "2024-05-09T12:43:47.712499Z" |
121 |
| - } |
122 |
| - }, |
| 104 | + "metadata": {}, |
123 | 105 | "outputs": [],
|
124 | 106 | "source": [
|
125 | 107 | "nested_series.nest.fields"
|
|
137 | 119 | "cell_type": "code",
|
138 | 120 | "execution_count": null,
|
139 | 121 | "id": "30ee9a430b6ff641",
|
140 |
| - "metadata": { |
141 |
| - "ExecuteTime": { |
142 |
| - "end_time": "2024-05-09T12:43:47.717863Z", |
143 |
| - "start_time": "2024-05-09T12:43:47.715368Z" |
144 |
| - } |
145 |
| - }, |
| 122 | + "metadata": {}, |
146 | 123 | "outputs": [],
|
147 | 124 | "source": [
|
148 | 125 | "nested_series.nest[\"t\"]"
|
|
160 | 137 | "cell_type": "code",
|
161 | 138 | "execution_count": null,
|
162 | 139 | "id": "f0db15d31b289140",
|
163 |
| - "metadata": { |
164 |
| - "ExecuteTime": { |
165 |
| - "end_time": "2024-05-09T12:43:47.720405Z", |
166 |
| - "start_time": "2024-05-09T12:43:47.718626Z" |
167 |
| - } |
168 |
| - }, |
| 140 | + "metadata": {}, |
169 | 141 | "outputs": [],
|
170 | 142 | "source": [
|
171 | 143 | "nested_series.nest[[\"t\", \"flux\"]].dtype"
|
|
177 | 149 | "metadata": {},
|
178 | 150 | "source": [
|
179 | 151 | "You can add new columns, drop existing ones, or modify the existing ones.\n",
|
180 |
| - "The modification is currently limited to the case when you replace the whole \"flat\" Series with a new one of the same length.\n", |
| 152 | + "These operations create a new nested Series; however, they create shallow copies of the rest of the fields, so they are quite efficient.\n", |
| 153 | + "\n", |
| 154 | + "In-place modification is currently limited to the case where you replace the whole \"flat\" Series with a new one of the same length and compatible dtype.\n", |
181 | 155 | "When modifying the nested data, only the column you are working with is changed, the rest of the data are not affected and not copied."
|
182 | 156 | ]
|
183 | 157 | },
|
184 | 158 | {
|
185 | 159 | "cell_type": "code",
|
186 | 160 | "execution_count": null,
|
187 | 161 | "id": "66ae5cc26fa17458",
|
188 |
| - "metadata": { |
189 |
| - "ExecuteTime": { |
190 |
| - "end_time": "2024-05-09T12:43:47.726619Z", |
191 |
| - "start_time": "2024-05-09T12:43:47.721070Z" |
192 |
| - } |
193 |
| - }, |
| 162 | + "metadata": {}, |
194 | 163 | "outputs": [],
|
195 | 164 | "source": [
|
196 | 165 | "new_series = nested_series.copy()\n",
|
197 | 166 | "\n",
|
198 | 167 | "# Change the data in-place\n",
|
199 | 168 | "new_series.nest[\"flux\"] = new_series.nest[\"flux\"] - new_series.nest[\"flux\"].mean()\n",
|
200 | 169 | "\n",
|
201 |
| - "# Add new column\n", |
202 |
| - "new_series.nest[\"lsst_band\"] = \"lsst_\" + new_series.nest[\"band\"]\n", |
| 170 | + "# Create a new series with a new column\n", |
| 171 | + "new_series = new_series.nest.with_field(\"lsst_band\", \"lsst_\" + new_series.nest[\"band\"])\n", |
203 | 172 | "\n",
|
204 |
| - "# Drop the column, .pop() method is also available\n", |
205 |
| - "del new_series.nest[\"band\"]\n", |
| 173 | + "# Create a new series with a column removed, you can also pass a list of columns to remove\n", |
| 174 | + "new_series = new_series.nest.without_field(\"band\")\n", |
206 | 175 | "\n",
|
207 | 176 | "# Add a new column with a python list instead of a Series\n",
|
208 |
| - "new_series.nest[\"new_column\"] = [1, 2] * (new_series.nest.flat_length // 2)\n", |
| 177 | + "new_series = new_series.nest.with_field(\n", |
| 178 | + " \"new_column\",\n", |
| 179 | + " [1, 2] * (new_series.nest.flat_length // 2),\n", |
| 180 | + ")\n", |
| 181 | + "\n", |
| 182 | + "# Create a new series, with a column dtype changed\n", |
| 183 | + "new_series = new_series.nest.with_field(\"t\", new_series.nest[\"t\"].astype(np.int8))\n", |
209 | 184 | "\n",
|
210 | 185 | "new_series.nest.to_flat()"
|
211 | 186 | ]
|
|
228 | 203 | "cell_type": "code",
|
229 | 204 | "execution_count": null,
|
230 | 205 | "id": "ce6d519d8d37ead3",
|
231 |
| - "metadata": { |
232 |
| - "ExecuteTime": { |
233 |
| - "end_time": "2024-05-09T12:43:47.768616Z", |
234 |
| - "start_time": "2024-05-09T12:43:47.764343Z" |
235 |
| - } |
236 |
| - }, |
| 206 | + "metadata": {}, |
237 | 207 | "outputs": [],
|
238 | 208 | "source": [
|
239 | 209 | "nested_series.nest.to_flat([\"flux\", \"t\"])"
|
|
243 | 213 | "cell_type": "code",
|
244 | 214 | "execution_count": null,
|
245 | 215 | "id": "2421b91387487995",
|
246 |
| - "metadata": { |
247 |
| - "ExecuteTime": { |
248 |
| - "end_time": "2024-05-09T12:43:47.798697Z", |
249 |
| - "start_time": "2024-05-09T12:43:47.795583Z" |
250 |
| - } |
251 |
| - }, |
| 216 | + "metadata": {}, |
252 | 217 | "outputs": [],
|
253 | 218 | "source": [
|
254 | 219 | "lists_df = nested_series.nest.to_lists() # may also accept a list of fields (nested columns) to get\n",
|
|
267 | 232 | "cell_type": "code",
|
268 | 233 | "execution_count": null,
|
269 | 234 | "id": "f2c205e95affb9ba",
|
270 |
| - "metadata": { |
271 |
| - "ExecuteTime": { |
272 |
| - "end_time": "2024-05-09T12:43:47.833034Z", |
273 |
| - "start_time": "2024-05-09T12:43:47.827805Z" |
274 |
| - } |
275 |
| - }, |
| 235 | + "metadata": {}, |
276 | 236 | "outputs": [],
|
277 | 237 | "source": [
|
278 |
| - "new_series = nested_series.copy()\n", |
279 |
| - "\n", |
280 | 238 | "# Adjust each time to be relative to the first observation\n",
|
281 | 239 | "dt = new_series.nest.to_lists()[\"t\"].apply(lambda t: t - t.min())\n",
|
282 |
| - "new_series.nest.set_list_field(\"dt\", dt)\n", |
| 240 | + "new_series = new_series.nest.with_list_field(\"dt\", dt)\n", |
283 | 241 | "new_series.nest.to_flat()"
|
284 | 242 | ]
|
285 | 243 | },
|
|
313 | 271 | "cell_type": "code",
|
314 | 272 | "execution_count": null,
|
315 | 273 | "id": "8ef96243c6d74aff",
|
316 |
| - "metadata": { |
317 |
| - "ExecuteTime": { |
318 |
| - "end_time": "2024-05-09T12:43:47.875752Z", |
319 |
| - "start_time": "2024-05-09T12:43:47.872293Z" |
320 |
| - } |
321 |
| - }, |
| 274 | + "metadata": {}, |
322 | 275 | "outputs": [],
|
323 | 276 | "source": [
|
324 | 277 | "struct_series = pd.Series(nested_series, dtype=nested_series.dtype.to_pandas_arrow_dtype())\n",
|
|
329 | 282 | "cell_type": "code",
|
330 | 283 | "execution_count": null,
|
331 | 284 | "id": "422e719861ae40f6",
|
332 |
| - "metadata": { |
333 |
| - "ExecuteTime": { |
334 |
| - "end_time": "2024-05-09T12:43:47.925465Z", |
335 |
| - "start_time": "2024-05-09T12:43:47.922965Z" |
336 |
| - } |
337 |
| - }, |
| 285 | + "metadata": {}, |
338 | 286 | "outputs": [],
|
339 | 287 | "source": [
|
340 | 288 | "nested_series.equals(pd.Series(struct_series, dtype=NestedDtype.from_pandas_arrow_dtype(struct_series.dtype)))"
|
|
364 | 312 | "cell_type": "code",
|
365 | 313 | "execution_count": null,
|
366 | 314 | "id": "926f2c9fcffc5f03",
|
367 |
| - "metadata": { |
368 |
| - "ExecuteTime": { |
369 |
| - "end_time": "2024-05-09T12:43:47.937490Z", |
370 |
| - "start_time": "2024-05-09T12:43:47.933878Z" |
371 |
| - } |
372 |
| - }, |
| 315 | + "metadata": {}, |
373 | 316 | "outputs": [],
|
374 | 317 | "source": [
|
375 | 318 | "new_series = pack(nested_series.nest.to_flat())\n",
|
|
380 | 323 | "cell_type": "code",
|
381 | 324 | "execution_count": null,
|
382 | 325 | "id": "3a1d2025c232ac82",
|
383 |
| - "metadata": { |
384 |
| - "ExecuteTime": { |
385 |
| - "end_time": "2024-05-09T12:43:47.969831Z", |
386 |
| - "start_time": "2024-05-09T12:43:47.964948Z" |
387 |
| - } |
388 |
| - }, |
| 326 | + "metadata": {}, |
389 | 327 | "outputs": [],
|
390 | 328 | "source": [
|
391 | 329 | "series_from_flat = pack(\n",
|
|
422 | 360 | "cell_type": "code",
|
423 | 361 | "execution_count": null,
|
424 | 362 | "id": "2de4619726ab3d5c",
|
425 |
| - "metadata": { |
426 |
| - "ExecuteTime": { |
427 |
| - "end_time": "2024-05-09T12:43:47.991261Z", |
428 |
| - "start_time": "2024-05-09T12:43:47.986129Z" |
429 |
| - } |
430 |
| - }, |
| 363 | + "metadata": {}, |
431 | 364 | "outputs": [],
|
432 | 365 | "source": [
|
433 | 366 | "series_from_pack = pack(\n",
|
|
454 | 387 | "cell_type": "code",
|
455 | 388 | "execution_count": null,
|
456 | 389 | "id": "9c63ae45dd0b6a29",
|
457 |
| - "metadata": { |
458 |
| - "ExecuteTime": { |
459 |
| - "end_time": "2024-05-09T12:43:47.995869Z", |
460 |
| - "start_time": "2024-05-09T12:43:47.992016Z" |
461 |
| - } |
462 |
| - }, |
| 390 | + "metadata": {}, |
463 | 391 | "outputs": [],
|
464 | 392 | "source": [
|
465 | 393 | "series_from_pack = pack(\n",
|
|
500 | 428 | "cell_type": "code",
|
501 | 429 | "execution_count": null,
|
502 | 430 | "id": "1284d9b536b9e784",
|
503 |
| - "metadata": { |
504 |
| - "ExecuteTime": { |
505 |
| - "end_time": "2024-05-09T12:43:48.000441Z", |
506 |
| - "start_time": "2024-05-09T12:43:47.996620Z" |
507 |
| - } |
508 |
| - }, |
| 431 | + "metadata": {}, |
509 | 432 | "outputs": [],
|
510 | 433 | "source": [
|
511 | 434 | "series_from_dtype = pd.Series(\n",
|
|
531 | 454 | "cell_type": "code",
|
532 | 455 | "execution_count": null,
|
533 | 456 | "id": "b7c7fd878bc97f68",
|
534 |
| - "metadata": { |
535 |
| - "ExecuteTime": { |
536 |
| - "end_time": "2024-05-09T12:43:48.004677Z", |
537 |
| - "start_time": "2024-05-09T12:43:48.001129Z" |
538 |
| - } |
539 |
| - }, |
| 457 | + "metadata": {}, |
540 | 458 | "outputs": [],
|
541 | 459 | "source": [
|
542 | 460 | "series_pa_type = pa.struct({\"t\": pa.list_(pa.float64()), \"band\": pa.list_(pa.string())})\n",
|
|
568 | 486 | "cell_type": "code",
|
569 | 487 | "execution_count": null,
|
570 | 488 | "id": "e837d25dcb0a2b4d",
|
571 |
| - "metadata": { |
572 |
| - "ExecuteTime": { |
573 |
| - "end_time": "2024-05-09T12:43:48.015257Z", |
574 |
| - "start_time": "2024-05-09T12:43:48.013217Z" |
575 |
| - } |
576 |
| - }, |
| 489 | + "metadata": {}, |
577 | 490 | "outputs": [],
|
578 | 491 | "source": [
|
579 | 492 | "pa_struct_array = pa.StructArray.from_arrays(\n",
|
|
611 | 524 | "cell_type": "code",
|
612 | 525 | "execution_count": null,
|
613 | 526 | "id": "116c902ea8681c9e",
|
614 |
| - "metadata": { |
615 |
| - "ExecuteTime": { |
616 |
| - "end_time": "2024-05-09T12:43:48.040801Z", |
617 |
| - "start_time": "2024-05-09T12:43:48.038106Z" |
618 |
| - } |
619 |
| - }, |
| 527 | + "metadata": {}, |
620 | 528 | "outputs": [],
|
621 | 529 | "source": [
|
622 | 530 | "# Convert to pd.ArrowDtype Series of struct-arrays\n",
|
|
641 | 549 | "cell_type": "code",
|
642 | 550 | "execution_count": null,
|
643 | 551 | "id": "30ea40dee30795d1",
|
644 |
| - "metadata": { |
645 |
| - "ExecuteTime": { |
646 |
| - "end_time": "2024-05-09T12:43:48.055678Z", |
647 |
| - "start_time": "2024-05-09T12:43:48.050677Z" |
648 |
| - } |
649 |
| - }, |
| 552 | + "metadata": {}, |
650 | 553 | "outputs": [],
|
651 | 554 | "source": [
|
652 | 555 | "for element in nested_series:\n",
|
|
665 | 568 | "cell_type": "code",
|
666 | 569 | "execution_count": null,
|
667 | 570 | "id": "81f6c1f98dfc26a9",
|
668 |
| - "metadata": { |
669 |
| - "ExecuteTime": { |
670 |
| - "end_time": "2024-05-09T12:43:48.060166Z", |
671 |
| - "start_time": "2024-05-09T12:43:48.056425Z" |
672 |
| - } |
673 |
| - }, |
| 571 | + "metadata": {}, |
674 | 572 | "outputs": [],
|
675 | 573 | "source": [
|
676 | 574 | "nested_elements = list(nested_series)\n",
|
|
689 | 587 | "cell_type": "code",
|
690 | 588 | "execution_count": null,
|
691 | 589 | "id": "69ed758c48c55015",
|
692 |
| - "metadata": { |
693 |
| - "ExecuteTime": { |
694 |
| - "end_time": "2024-05-09T12:43:48.063115Z", |
695 |
| - "start_time": "2024-05-09T12:43:48.060863Z" |
696 |
| - } |
697 |
| - }, |
| 590 | + "metadata": {}, |
698 | 591 | "outputs": [],
|
699 | 592 | "source": [
|
700 | 593 | "nested_series_with_na = pack([None, pd.NA, {\"t\": [1, 2], \"flux\": [0.1, None]}])\n",
|
|
707 | 600 | "cell_type": "code",
|
708 | 601 | "execution_count": null,
|
709 | 602 | "id": "99ce9d18bc69ae49",
|
710 |
| - "metadata": { |
711 |
| - "ExecuteTime": { |
712 |
| - "end_time": "2024-05-09T12:43:48.088986Z", |
713 |
| - "start_time": "2024-05-09T12:43:48.086255Z" |
714 |
| - } |
715 |
| - }, |
| 603 | + "metadata": {}, |
716 | 604 | "outputs": [],
|
717 | 605 | "source": [
|
718 | 606 | "# Would have empty pd.DataFrame for top-level missed data\n",
|
|
0 commit comments