
Commit 4d1b4bf

Author: Michael Fuest
Commit message: fixed some more preprocessing
Parent: c70fa37

File tree: 5 files changed (+41, -71 lines)


config/data_config.yaml

Lines changed: 2 additions & 3 deletions

@@ -11,14 +11,13 @@ datasets:
     ]
     metadata_columns: ["dataid",
         "building_type",
-        "pv",
+        #"pv",
         "solar",
         "car1",
         "city",
         "state",
         "total_square_footage",
-        "house_construction_year",
-        "total_amount_of_pv"
+        "house_construction_year"
     ]
   goinerdata:
     path: "home/fuest/EnData/data/goinerdata/"

config/model_config.yaml

Lines changed: 2 additions & 0 deletions

@@ -12,6 +12,8 @@ conditioning_vars: # for each desired conditioning variable, add the name and nu
   car1: 2
   city: 7
   state: 3
+  total_square_footage: 5
+  house_construction_year: 5
 
 diffcharge:
   batch_size: 64
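
Note: each entry maps a conditioning variable to its number of categories, and the two new entries use 5, matching the five bins that pd.cut produces in datasets/utils.py below. A minimal sketch of how such counts are commonly consumed, assuming embedding-based conditioning (the model code is not part of this commit, and the 16-dimensional embedding size is an arbitrary choice):

import torch
import torch.nn as nn
import yaml

with open("config/model_config.yaml") as f:
    cfg = yaml.safe_load(f)

# e.g. {"car1": 2, "city": 7, "state": 3, "total_square_footage": 5, ...};
# assumes conditioning_vars is a top-level key, as the hunk header suggests.
conditioning_vars = cfg["conditioning_vars"]

# One embedding table per conditioning variable.
embeddings = nn.ModuleDict(
    {name: nn.Embedding(n_categories, 16) for name, n_categories in conditioning_vars.items()}
)

# Integer codes produced by encode_conditioning_variables index the tables.
code = torch.tensor([3])  # a square-footage bin in [0, 4]
vec = embeddings["total_square_footage"](code)  # shape (1, 16)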

datasets/pecanstreet.py

Lines changed: 18 additions & 18 deletions

@@ -11,8 +11,7 @@
 import yaml
 from torch.utils.data import Dataset
 
-from datasets.utils import encode_categorical_variables
-from datasets.utils import encode_numerical_variables
+from datasets.utils import encode_conditioning_variables
 
 warnings.filterwarnings("ignore", category=pd.errors.SettingWithCopyWarning)
 ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

@@ -90,18 +89,16 @@ def load_and_preprocess_data(
             raise FileNotFoundError(f"Metadata file not found at {metadata_csv_path}")
 
         metadata = pd.read_csv(metadata_csv_path, usecols=metadata_columns)
+
         if "solar" in metadata.columns:
             metadata.rename(columns={"solar": "has_solar"}, inplace=True)
 
         data = self._load_full_data(path, data_columns)
         user_flags = self._set_user_flags(metadata, data)
         data = self._preprocess_data(data)
         data = pd.merge(data, metadata, on="dataid", how="left")
-        data = encode_categorical_variables(data)
-        data = encode_numerical_variables(
-            data,
-            ["total_square_footage", "house_construction_year", "total_amount_of_pv"],
-        )
+        data = self._handle_missing_data(data)
+        data = encode_conditioning_variables(data)
         return data, metadata, user_flags
 
     def _load_full_data(self, path: str, columns: List[str]) -> pd.DataFrame:

@@ -193,31 +190,24 @@ def _calculate_and_store_statistics(self, data: pd.DataFrame, column: str) -> Di
         """
 
         def calculate_stats(group):
-            # Concatenate all time series arrays in the group
            all_values = np.concatenate(group[column].values)
-
-            # Standardization statistics
            mean = np.mean(all_values)
            std = np.std(all_values)
 
-            # Perform standardization on all_values
            standardized = (all_values - mean) / (std + 1e-8)
 
-            # Min-Max scaling statistics on standardized data
            z_min = np.min(standardized)
            z_max = np.max(standardized)
 
            return pd.Series({"mean": mean, "std": std, "z_min": z_min, "z_max": z_max})
 
         if self.normalization_method == "group":
-            # Group by dataid, month, and weekday
            grouped_stats = data.groupby(["dataid", "month", "weekday"]).apply(
                calculate_stats
            )
            return grouped_stats.to_dict(orient="index")
 
         elif self.normalization_method == "date":
-            # Group by month and weekday
            grouped_stats = data.groupby(["month", "weekday"]).apply(calculate_stats)
            return grouped_stats.to_dict(orient="index")

@@ -251,15 +241,12 @@ def normalize_and_scale_row(row):
             z_min = stats["z_min"]
             z_max = stats["z_max"]
 
-            # Standardization
            values = np.array(row[column])
            standardized = (values - mean) / (std + 1e-8)
 
-            # Optional Clipping after Standardization
            if self.threshold:
                standardized = np.clip(standardized, *self.threshold)
 
-            # Min-Max Scaling on standardized data
            scaled = (standardized - z_min) / (z_max - z_min + 1e-8)
 
            return scaled

@@ -293,6 +280,13 @@ def _preprocess_solar(self, data: pd.DataFrame) -> pd.DataFrame:
 
         return solar_data
 
+    def _handle_missing_data(self, data: pd.DataFrame) -> pd.DataFrame:
+        data["car1"] = data["car1"].fillna("no")
+        data["has_solar"] = data["has_solar"].fillna("no")
+
+        assert data.isna().sum().sum() == 0, "Missing data remaining!"
+        return data
+
     @staticmethod
     def _merge_columns_into_timeseries(df: pd.DataFrame) -> pd.DataFrame:
         """

@@ -632,7 +626,13 @@ def __getitem__(self, idx: int) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
             "car1": torch.tensor(sample["car1"], dtype=torch.long),
             "city": torch.tensor(sample["city"], dtype=torch.long),
             "state": torch.tensor(sample["state"], dtype=torch.long),
-            "has_solar": torch.tensor(sample["has_solar"], dtype=torch.long),  # Updated
+            "has_solar": torch.tensor(sample["has_solar"], dtype=torch.long),
+            "total_square_footage": torch.tensor(
+                sample["total_square_footage"], dtype=torch.long
+            ),
+            "house_construction_year": torch.tensor(
+                sample["house_construction_year"], dtype=torch.long
+            ),
         }
 
         return (torch.tensor(time_series, dtype=torch.float32), conditioning_vars)
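
Note: taken together, the normalization hunks implement a two-stage scheme: per-group standardization, optional clipping via the threshold argument set in main.py, then min-max scaling using the min/max of the standardized data. A self-contained sketch of that pipeline, collapsing the group-wise bookkeeping of _calculate_and_store_statistics into one function (the sample numbers are made up):

import numpy as np

def normalize_series(values, threshold=(-6, 6), eps=1e-8):
    values = np.asarray(values, dtype=float)

    # Standardization statistics (cf. calculate_stats)
    mean, std = values.mean(), values.std()
    standardized = (values - mean) / (std + eps)

    # Min-max statistics are computed on the standardized data
    # (cf. _calculate_and_store_statistics)
    z_min, z_max = standardized.min(), standardized.max()

    # Optional clipping after standardization (cf. normalize_and_scale_row)
    if threshold:
        standardized = np.clip(standardized, *threshold)

    # Min-max scaling on the standardized data
    return (standardized - z_min) / (z_max - z_min + eps)

scaled = normalize_series([0.2, 0.8, 30.0, 1.5])  # result lies in [0, 1]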

datasets/utils.py

Lines changed: 14 additions & 45 deletions

@@ -81,57 +81,26 @@ def split_dataset(dataset: Dataset, val_split: float = 0.1) -> Tuple[Dataset, Da
     return train_dataset, val_dataset
 
 
-def encode_categorical_variables(data: pd.DataFrame, columns: List[str]):
+def encode_conditioning_variables(data: pd.DataFrame) -> pd.DataFrame:
     """
-    Encodes categorical variables in a DataFrame to integer codes.
+    Takes conditioning columns (e.g. city, total square footage), and converts it into integer encoded mappings. Discretizes numerical conditioning
+    variables into categorical bins.
 
     Args:
-        data (pd.DataFrame): Input DataFrame containing categorical variables.
-        columns (List[str]): List of column names to transform.
+        data (pd.DataFrame): The data whose cols are being encoded.
 
     Returns:
-        df_encoded (pd.DataFrame): DataFrame with categorical variables encoded as integer codes.
-        mappings (dict): Dictionary mapping column names to their category-to-code mappings.
+        data (pd.DataFrame): The data frame that now has integer codes where numerical and categorical values used to be.
     """
-    df_encoded = data.copy()
-    mappings = {}
+    for col in data.columns:
 
-    for col in columns:
-        df_encoded[col] = df_encoded[col].astype("category")
-
-        category_to_code = dict(enumerate(df_encoded[col].cat.categories))
-        code_to_category = {v: k for k, v in category_to_code.items()}
-        df_encoded[col] = df_encoded[col].cat.codes
-
-        mappings[col] = {
-            "category_to_code": {cat: code for code, cat in category_to_code.items()},
-            "code_to_category": code_to_category,
-        }
-
-    return df_encoded
-
-
-def encode_numerical_variables(data: pd.DataFrame, columns: List[str]) -> pd.DataFrame:
-    """
-    Takes numerical conditioning columns (e.g. total square footage), and converts it into integer encoded mappings.
-
-    Args:
-        data (pd.DataFrame): The data whose numerical cols are being encoded.
-        columns (List[str]): The column names of numerical columns that need to be encoded.
-
-    Returns:
-        data (pd.DataFrame): The data frame that now has integer codes where numerical values used to be.
-    """
-    for col in columns:
-
-        data[col] = pd.to_numeric(data[col], errors="coerce")
-        data[col]
-
-        if data[col].isnull().all():
-            raise ValueError(f"Column '{col}' contains no valid numeric values.")
-
-        data[col] = pd.cut(
-            data[col], bins=5, labels=[0, 1, 2, 3, 4], include_lowest=True
-        ).astype(int)
+        if col in ["dataid", "timeseries", "month", "weekday", "date_day"]:
+            continue
 
+        if pd.api.types.is_numeric_dtype(data[col]):
+            data[col] = pd.cut(
+                data[col], bins=5, labels=[0, 1, 2, 3, 4], include_lowest=True
+            ).astype(int)
+        else:
+            data[col] = data[col].astype("category").cat.codes
     return data
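
Note: a toy usage example of the new encode_conditioning_variables; apart from the skip-list names, the columns and values below are illustrative:

import pandas as pd

from datasets.utils import encode_conditioning_variables

df = pd.DataFrame(
    {
        "dataid": [1, 2, 3, 4, 5],  # in the skip list, left untouched
        "city": ["austin", "ithaca", "austin", "newyork", "ithaca"],
        "total_square_footage": [900, 1500, 2200, 3100, 4000],
    }
)

encoded = encode_conditioning_variables(df)
# "city" becomes categorical codes 0..2 (categories in sorted order);
# "total_square_footage" is cut into 5 equal-width bins labeled 0..4.
print(encoded)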

main.py

Lines changed: 5 additions & 5 deletions

@@ -8,7 +8,7 @@ def evaluate_individual_user_models(
     full_dataset = PecanStreetDataManager(
         normalize=normalize,
         include_generation=include_generation,
-        threshold=(-10, 10),
+        threshold=(-6, 6),
         normalization_method="group",
     )
     evaluator = Evaluator(full_dataset, model_name)

@@ -27,7 +27,7 @@ def evaluate_single_dataset_model(
         normalize=normalize,
         include_generation=include_generation,
         normalization_method=normalization_method,
-        threshold=(-5, 5),
+        threshold=(-6, 6),
     )
     evaluator = Evaluator(full_dataset, model_name)
     # evaluator.evaluate_all_users()

@@ -40,9 +40,9 @@ def main():
     # evaluate_individual_user_models("acgan", include_generation=True)
     # evaluate_individual_user_models("acgan", include_generation=False, normalization_method="date")
     evaluate_single_dataset_model(
-        "acgan",
-        geography="newyork",
-        include_generation=False,
+        "diffusion_ts",
+        geography="austin",
+        include_generation=True,
         normalization_method="date",
     )
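
Note: both evaluation entry points now clip standardized values at plus/minus 6 (previously plus/minus 10 and plus/minus 5); per the normalize_and_scale_row hunk above, the threshold is applied with np.clip after standardization:

import numpy as np

standardized = np.array([-12.0, -3.0, 0.0, 4.5, 9.0])
print(np.clip(standardized, -6, 6))  # [-6.  -3.   0.   4.5  6. ]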
