-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdataframe_loop_example.py
71 lines (62 loc) · 2.45 KB
/
dataframe_loop_example.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
# ======================================================================
# https://stackoverflow.com/questions/7837722/what-is-the-most-efficient-way-to-loop-through-dataframes-with-pandas
#
# EDIT 2020/11/10
#
# For what it is worth, here is an updated benchmark with some other alternatives
# (perf with MacBookPro 2,4 GHz Intel Core i9 8 cores 32 Go 2667 MHz DDR4)
# ======================================================================
#!/usr/bin/env python
import sys
import tqdm # tqdm is a Python library that allows you to output a smart
# progress bar by wrapping around any iterable. A tqdm progress
# bar not only shows you how much time has elapsed, but also
# shows the estimated time remaining for the iterable.
import time
import pandas as pd
B = []
t = pd.DataFrame({'a': range(0, 10000), 'b': range(10000, 20000)})
for _ in tqdm.tqdm(range(10)): # Wrapping the iterable, i.e. 'run the test 10 times'.
C = []
A = time.time()
for i,r in t.iterrows():
C.append((r['a'], r['b']))
B.append({"method": "iterrows", "time": time.time()-A})
C = []
A = time.time()
for ir in t.itertuples():
C.append((ir[1], ir[2]))
B.append({"method": "itertuples", "time": time.time()-A})
C = []
A = time.time()
# Nuertey Odzeyem addendum: the following zip() approach also works
# for getting at the index of the DataFrame, r being the row(s):
for r in zip(t.index, t['a'], t['b']):
C.append((r[1], r[2]))
#print("t.index")
#print(r[0])
B.append({"method": "zip", "time": time.time()-A})
C = []
A = time.time()
for r in zip(*t.to_dict("list").values()):
C.append((r[0], r[1]))
B.append({"method": "zip + to_dict('list')", "time": time.time()-A})
C = []
A = time.time()
for r in t.to_dict("records"):
C.append((r["a"], r["b"]))
B.append({"method": "to_dict('records')", "time": time.time()-A})
A = time.time()
t.agg(tuple, axis=1).tolist()
B.append({"method": "agg", "time": time.time()-A})
A = time.time()
t.apply(tuple, axis=1).tolist()
B.append({"method": "apply", "time": time.time()-A})
print()
print(f'Python {sys.version} on {sys.platform}')
print(f"Pandas version {pd.__version__}")
# Create a DataFrame from the list of dictionary values, and then groupby
# the culprit column "method".
print(
pd.DataFrame(B).groupby("method").agg(["mean", "std"]).xs("time", axis=1).sort_values("mean")
)