# clean_project_data_v4_final.py
from datetime import datetime, timedelta

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
+
def removed_values(original: pd.Series, cleaned: pd.Series) -> pd.Series:
    """Return the entries of *original* that were blanked out in *cleaned*.

    An entry counts as removed when it is NaN in *cleaned* but not NaN in
    *original* at the same index label.  Comparing per position (rather than
    with ``isin``) matters: a removed value must still be reported even if an
    equal value survives elsewhere in the series.
    """
    return original[cleaned.isna() & original.notna()]


def remove_jumps(series: pd.Series, max_jump: float = 10) -> pd.Series:
    """Return a float copy of *series* with sudden jumps replaced by NaN.

    Each value is compared with the last *accepted* value; if the absolute
    difference exceeds *max_jump* the value is replaced by NaN and reported.
    The first non-NaN value is always accepted as the baseline.  Values that
    are already NaN are skipped.  The input series is not modified.
    """
    # Work on a float copy: NaN cannot be stored in an integer Series.
    cleaned = series.astype(float)
    prev_value = None
    for pos, (t, value) in enumerate(series.items()):
        if pd.isna(value):
            continue
        if prev_value is None or abs(value - prev_value) <= max_jump:
            prev_value = value
        else:
            cleaned.iloc[pos] = np.nan
            print("Jump detected and value removed on", t, ":", value)
    return cleaned


def remove_out_of_range(
    series: pd.Series, min_val: float = 0, max_val: float = 50
) -> pd.Series:
    """Return a float copy of *series* with values outside the range set to NaN.

    Both bounds are inclusive.  Values that are already NaN are left alone
    and are not reported again.  The input series is not modified.
    """
    cleaned = series.astype(float)
    # Comparisons with NaN are False, so existing NaNs are never flagged.
    out_of_range = (cleaned < min_val) | (cleaned > max_val)
    for value in cleaned[out_of_range]:
        print("Value removed:", value)
    return cleaned.mask(out_of_range)


def remove_flat_periods(series: pd.Series, flat_period: int = 5) -> pd.Series:
    """Return a float copy of *series* with long constant runs set to NaN.

    A run of more than *flat_period* consecutive equal values (i.e. at least
    ``flat_period + 1`` samples) is treated as a flat period and the whole
    run is removed, however long it is.  NaN never equals itself, so gaps
    left by earlier checks are not mistaken for flat periods.  The input
    series is not modified.
    """
    cleaned = series.astype(float)
    # Compare against an untouched snapshot so writes don't affect run detection.
    values = cleaned.to_numpy(copy=True)
    n = len(values)
    start = 0
    while start < n:
        end = start + 1
        while end < n and values[end] == values[start]:
            end += 1
        if end - start > flat_period:
            print("Removing flat period starting at index:", start)
            cleaned.iloc[start:end] = np.nan
        start = end
    return cleaned


def clean_series(
    series: pd.Series,
    name: str,
    max_jump: float = 10,
    min_val: float = 0,
    max_val: float = 50,
    flat_period: int = 5,
) -> pd.Series:
    """Run the jump, range and flat-period checks on *series* in that order.

    Progress and the cumulative set of removed values are printed after each
    step.  *name* is only used in the printed messages.  Returns a new float
    Series in which removed values are NaN; *series* itself is left untouched.
    """
    print(f"\nCleaning {name}")

    print(f"Checking for jumps in {name}")
    cleaned = remove_jumps(series, max_jump)
    print(f"Data removed: {removed_values(series, cleaned)}")

    # Checking for values in range
    cleaned = remove_out_of_range(cleaned, min_val, max_val)
    print(f"Data removed: {removed_values(series, cleaned)}")

    print(f"Checking for flat periods in {name}")
    cleaned = remove_flat_periods(cleaned, flat_period)
    print(f"Data removed: {removed_values(series, cleaned)}")

    return cleaned


# Create date range
date_rng = pd.date_range(start="1/1/2020", end="1/31/2020", freq="D")

# Sample time series data with DateTimeIndex
data1 = pd.Series([1, 2, -1, 4, 5, 20, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
                   21, 22, 24, 24, 24, 24, 24, 24, 29, 30, 31], index=date_rng)
data2 = pd.Series([5, 6, 200, 8, 9, 10, 11, 12, 300, 14, 15, 16, 17, 18, 19, 20, 21, 22,
                   23, 24, 25, 26, 27, 27, 27, 30, 31, 32, 33, 34, 35], index=date_rng)
data3 = pd.Series([15, 16, 11, 18, 400, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
                   32, 33, 34, 35, 36, 37, 38, 39, 45, 45, 45, 45, 45, 45], index=date_rng)

# Keep the raw series for reporting and plotting; the cleaned result (with
# removed values set to NaN) is rebound to the original name.
data1_original = data1.copy()
data1 = clean_series(data1_original, "data1")

data2_original = data2.copy()
data2 = clean_series(data2_original, "data2")

data3_original = data3.copy()
data3 = clean_series(data3_original, "data3")
+
# Plot each series: the raw data is drawn first in red and the cleaned data
# on top in green, so only the removed points (outliers) stay visible as red dots.
for title, original, cleaned in (
    ("Data1", data1_original, data1),
    ("Data2", data2_original, data2),
    ("Data3", data3_original, data3),
):
    plt.figure(figsize=(10, 5))
    plt.plot(original, '.', color="red", label="removed")
    plt.plot(cleaned, '.', color="green", label="kept")
    plt.title(title)
    plt.legend()
    plt.show()