-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
2522f1a
commit 39823e3
Showing
4 changed files
with
194 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
187 changes: 187 additions & 0 deletions
187
projects/data_cleaning/clean_project_data_v4_final2.qmd
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,187 @@ | ||
```{.python filename="clean_project_data_v4_final.py"} | ||
import pandas as pd | ||
import numpy as np | ||
from datetime import datetime, timedelta | ||
import matplotlib.pyplot as plt | ||
# Create date range | ||
date_rng = pd.date_range(start="1/1/2020", end="1/31/2020", freq="D") | ||
# Sample time series data with DateTimeIndex | ||
data1 = pd.Series([1, 2, -1, 4, 5, 20, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, | ||
21, 22, 24, 24, 24, 24, 24, 24, 29, 30, 31], index=date_rng) | ||
data2 = pd.Series([5, 6, 200, 8, 9, 10, 11, 12, 300, 14, 15, 16, 17, 18, 19, 20, 21, 22, | ||
23, 24, 25, 26, 27, 27, 27, 30, 31, 32, 33, 34, 35], index=date_rng) | ||
data3 = pd.Series([15, 16, 11, 18, 400, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, | ||
32, 33, 34, 35, 36, 37, 38, 39, 45, 45, 45, 45, 45, 45], index=date_rng) | ||
# Cleaning data1 | ||
print("\nCleaning data1") | ||
data1_original = data1.copy() | ||
# Checking for jumps | ||
print("Checking for jumps in data1") | ||
max_jump=10 | ||
prev_value = data1.iloc[0] | ||
for t, value in data1.items(): | ||
if abs(value - prev_value) <= max_jump: | ||
# "Value ok" | ||
data1[t] = value | ||
prev_value = value | ||
else: | ||
data1[t] = np.nan | ||
print("Jump detected and value removed on", t, ":", value) | ||
print(f"Data removed: {data1_original[~data1_original.isin(data1)]}") | ||
# print("Data1 after jump check:", data1) | ||
# Checking for values in range | ||
min_val = 0 | ||
max_val = 50 | ||
for t, value in data1.items(): | ||
# print("Checking value on", t, ":", value) | ||
if min_val <= value <= max_val: | ||
pass | ||
# print("Value ok:", value) | ||
else: | ||
data1[t] = np.nan | ||
print("Value removed:", value) | ||
print(f"Data removed: {data1_original[~data1_original.isin(data1)]}") | ||
# print("Data1 after range check:", data1) | ||
# Checking for flat periods | ||
print("Checking for flat periods in data1") | ||
flat_period = 5 | ||
i = 0 | ||
while i < len(data1) - flat_period: | ||
if len(set(data1[i: i + flat_period + 1])) == 1: | ||
print("Removing flat period starting at index:", i) | ||
data1[i: i + flat_period + 1] = np.nan | ||
i += flat_period | ||
else: | ||
i += 1 | ||
print(f"Data removed: {data1_original[~data1_original.isin(data1)]}") | ||
# print("Data1 after flat period check:", data1) | ||
# Cleaning data2 | ||
print("\nCleaning data2") | ||
data2_original = data2.copy() | ||
# Checking for jumps | ||
print("Checking for jumps in data2") | ||
max_jump=10 | ||
prev_value = data2.iloc[0] | ||
for t, value in data2.items(): | ||
if abs(value - prev_value) <= max_jump: | ||
# "Value ok" | ||
data2[t] = value | ||
prev_value = value | ||
else: | ||
data2[t] = np.nan | ||
print("Jump detected and value removed on", t, ":", value) | ||
print(f"Data removed: {data2_original[~data2_original.isin(data2)]}") | ||
# print("data2 after jump check:", data2) | ||
# Checking for values in range | ||
min_val = 0 | ||
max_val = 50 | ||
for t, value in data2.items(): | ||
# print("Checking value on", t, ":", value) | ||
if min_val <= value <= max_val: | ||
pass | ||
# print("Value ok:", value) | ||
else: | ||
data2[t] = np.nan | ||
print("Value removed:", value) | ||
print(f"Data removed: {data2_original[~data2_original.isin(data2)]}") | ||
# print("data2 after range check:", data2) | ||
# Checking for flat periods | ||
print("Checking for flat periods in data2") | ||
flat_period = 5 | ||
i = 0 | ||
while i < len(data2) - flat_period: | ||
if len(set(data2[i: i + flat_period + 1])) == 1: | ||
print("Removing flat period starting at index:", i) | ||
data2[i: i + flat_period + 1] = np.nan | ||
i += flat_period | ||
else: | ||
i += 1 | ||
print(f"Data removed: {data2_original[~data2_original.isin(data2)]}") | ||
# print("data2 after flat period check:", data2) | ||
# print("Final cleaned data2:", data2) | ||
# Cleaning data3 | ||
print("\nCleaning data3") | ||
data3_original = data3.copy() | ||
# Checking for jumps | ||
print("Checking for jumps in data3") | ||
max_jump=10 | ||
prev_value = data3.iloc[0] | ||
for t, value in data3.items(): | ||
if abs(value - prev_value) <= max_jump: | ||
# "Value ok" | ||
data3[t] = value | ||
prev_value = value | ||
else: | ||
data3[t] = np.nan | ||
print("Jump detected and value removed on", t, ":", value) | ||
print(f"Data removed: {data3_original[~data3_original.isin(data3)]}") | ||
# print("data3 after jump check:", data3) | ||
# Checking for values in range | ||
min_val = 0 | ||
max_val = 50 | ||
for t, value in data3.items(): | ||
# print("Checking value on", t, ":", value) | ||
if min_val <= value <= max_val: | ||
pass | ||
# print("Value ok:", value) | ||
else: | ||
data3[t] = np.nan | ||
print("Value removed:", value) | ||
print(f"Data removed: {data3_original[~data3_original.isin(data3)]}") | ||
# print("data3 after range check:", data3) | ||
# Checking for flat periods | ||
print("Checking for flat periods in data3") | ||
flat_period = 5 | ||
i = 0 | ||
while i < len(data3) - flat_period: | ||
if len(set(data3[i: i + flat_period + 1])) == 1: | ||
print("Removing flat period starting at index:", i) | ||
data3[i: i + flat_period + 1] = np.nan | ||
i += flat_period | ||
else: | ||
i += 1 | ||
print(f"Data removed: {data3_original[~data3_original.isin(data3)]}") | ||
# print("data3 after flat period check:", data3) | ||
# print("Final cleaned data3:", data3) | ||
## plot data showing outliers as red dots | ||
plt.figure(figsize=(10, 5)) | ||
plt.plot(data1_original, '.', color="red") | ||
plt.plot(data1, '.', color="green") | ||
plt.title("Data1") | ||
plt.show() | ||
plt.figure(figsize=(10, 5)) | ||
plt.plot(data2_original, '.', color="red") | ||
plt.plot(data2, '.', color="green") | ||
plt.title("Data2") | ||
plt.show() | ||
plt.figure(figsize=(10, 5)) | ||
plt.plot(data3_original, '.', color="red") | ||
plt.plot(data3, '.', color="green") | ||
plt.title("Data3") | ||
plt.show() | ||
``` | ||
|