-
Notifications
You must be signed in to change notification settings - Fork 0
/
1xml2csv.py
executable file
·100 lines (78 loc) · 2.63 KB
/
1xml2csv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
#!/usr/bin/env python3
# by Dr. Torben Menke https://entorb.net
# https://github.com/entorb/analyze-apple-health
"""
Read Apple health exported data.
apple_health_export/export.xml
export in csv and Excel format
out/data-raw.*
Note: Excel has a limit of 1,048,576 rows...
TODO: checkout export_cda.xml as well
"""
# based on https://towardsdatascience.com/analyse-your-health-with-python-and-apple-health-11c12894aae2
# requirements:
# pip3 install pandas
# import numpy as np
import time
from pathlib import Path
import pandas as pd
from defusedxml.ElementTree import parse as XMLparse # fixes S405 # noqa: N812
# from xml.etree.ElementTree import parse as XMLparse
Path("out").mkdir(exist_ok=True)
print("read xml data")
timelast = time.time()
# create element tree object
tree = XMLparse("apple_health_export/export.xml")
# For every health record, extract the attributes into a dictionary (columns).
# Then create a list (rows).
root = tree.getroot()
record_list = [x.attrib for x in root.iter("Record")]
print(f"{int(time.time() - timelast)}s")
print("convert to DataFrame")
timelast = time.time()
# create DataFrame from a list (rows) of dictionaries (columns)
df = pd.DataFrame(record_list)
print(f"{int(time.time() - timelast)}s")
# print("export out/data-raw-1.tsv")
# timelast = time.time()
# df.to_csv(
# "out/data-raw-1.tsv",
# sep="\t",
# line_terminator="\n",
# )
# print("%ds" % (time.time() - timelast))
print("column modifications")
timelast = time.time()
df.index.name = "row"
# date fixes
for col in ["creationDate", "startDate", "endDate"]:
# proper type to dates
df[col] = pd.to_datetime(df[col])
# Remove timezone from columns
df[col] = df[col].dt.tz_localize(tz=None)
# convert value to numeric or NaN if fails
df["value"] = pd.to_numeric(df["value"], errors="coerce")
# how to treat nun-numeric values
# a ) filling with 1.0 (= one time) makes it easier to aggregate
# df["value"] = df["value"].fillna(1.0)
# b) just drop value=na rows
df = df[df["value"].notna()]
# shorter observation names: use vectorized replace function
df["type"] = df["type"].str.replace("HKQuantityTypeIdentifier", "")
df["type"] = df["type"].str.replace("HKCategoryTypeIdentifier", "")
print(f"{int(time.time() - timelast)}s")
print("export out/data-raw2.tsv")
timelast = time.time()
df.to_csv(
"out/data-raw-2.tsv",
sep="\t",
lineterminator="\n",
)
print(f"{int(time.time() - timelast)}s")
if len(df.index) >= 1048576 - 1:
print("too many rows for Excel export")
else:
print("export out/data-raw-2.xlsx")
timelast = time.time()
df.to_excel("out/data-raw-2.xlsx")
print(f"{int(time.time() - timelast)}s")