-
Notifications
You must be signed in to change notification settings - Fork 0
/
aggregate.py
35 lines (26 loc) · 888 Bytes
/
aggregate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
import pandas as pd
from pathlib import Path
import json
# Fields copied from every item into the aggregated table, in column order.
columns = ["name", "full_name", "description", "created_at", "pushed_at"]

# Upper bound on the number of characters kept from any string field.
MAX_TEXT_LENGTH = 500


def clean_value(value, max_length=MAX_TEXT_LENGTH):
    """Return *value* truncated and reduced to ASCII when it is a string.

    A string is first cut to ``max_length`` characters and then every
    non-ASCII character is dropped, so the result may be shorter than
    ``max_length``. Any non-string value (e.g. ``None`` or a number) is
    returned unchanged.
    """
    if not isinstance(value, str):
        return value
    return value[:max_length].encode("ascii", errors="ignore").decode("ascii")


# One list per item, holding the cleaned values in ``columns`` order.
rows = []
for p in Path("data").glob("results*.json"):
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with p.open(encoding="utf-8") as f:
        d = json.load(f)
    for item in d["items"]:
        # Shows which fields each item offers (kept from the original script).
        print(item.keys())
        # A missing column raises KeyError, exactly as the original lookup did.
        rows.append([clean_value(item[col]) for col in columns])
# Turn the collected rows into a table ordered by "created_at", descending,
# then echo it to stdout and store it as CSV without the index column.
df = pd.DataFrame(data=rows, columns=columns).sort_values(
    by="created_at",
    ascending=False,
)
print(df)
df.to_csv(path_or_buf="data/aggregated_results.csv", index=False)
# Number of rows to draw for the sample file, when that many are available.
SAMPLE_SIZE = 100

# DataFrame.sample (without replacement) raises ValueError when asked for more
# rows than the frame holds, so cap the request at the table's length.
sample = df.sample(n=min(SAMPLE_SIZE, len(df)))
# Empty placeholder column, written first in the output file; presumably it is
# filled in by hand afterwards -- TODO confirm.
sample["framework"] = ""
sample = sample[["framework", "full_name", "description"]]
sample.to_csv("data/sample.csv", index=False)