Commit

fixed some more preprocessing
Michael Fuest committed Sep 24, 2024
1 parent c70fa37 commit 4d1b4bf
Showing 5 changed files with 41 additions and 71 deletions.
5 changes: 2 additions & 3 deletions config/data_config.yaml
@@ -11,14 +11,13 @@ datasets:
]
metadata_columns: ["dataid",
"building_type",
"pv",
#"pv",
"solar",
"car1",
"city",
"state",
"total_square_footage",
"house_construction_year",
"total_amount_of_pv"
"house_construction_year"
]
goinerdata:
path: "home/fuest/EnData/data/goinerdata/"
2 changes: 2 additions & 0 deletions config/model_config.yaml
@@ -12,6 +12,8 @@ conditioning_vars: # for each desired conditioning variable, add the name and number of categories
car1: 2
city: 7
state: 3
total_square_footage: 5
house_construction_year: 5

diffcharge:
batch_size: 64
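For context, cardinalities like the ones added above are typically used to size embedding tables for conditioning. A minimal sketch, assuming a PyTorch model and an invented embedding_dim; the names and counts come from the config diff, everything else is illustrative:

import torch
import torch.nn as nn

# Names and category counts from model_config.yaml above;
# embedding_dim is an assumed hyperparameter, not part of this commit.
conditioning_vars = {"car1": 2, "city": 7, "state": 3,
                     "total_square_footage": 5, "house_construction_year": 5}
embedding_dim = 8

embeddings = nn.ModuleDict(
    {name: nn.Embedding(n_categories, embedding_dim)
     for name, n_categories in conditioning_vars.items()}
)

# Integer codes would come from the dataset's __getitem__ (see pecanstreet.py below)
codes = {name: torch.tensor([1]) for name in conditioning_vars}
conditioning = torch.cat([embeddings[name](codes[name]) for name in conditioning_vars], dim=-1)
print(conditioning.shape)  # torch.Size([1, 40])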
36 changes: 18 additions & 18 deletions datasets/pecanstreet.py
@@ -11,8 +11,7 @@
import yaml
from torch.utils.data import Dataset

from datasets.utils import encode_categorical_variables
from datasets.utils import encode_numerical_variables
from datasets.utils import encode_conditioning_variables

warnings.filterwarnings("ignore", category=pd.errors.SettingWithCopyWarning)
ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
@@ -90,18 +89,16 @@ def load_and_preprocess_data(
raise FileNotFoundError(f"Metadata file not found at {metadata_csv_path}")

metadata = pd.read_csv(metadata_csv_path, usecols=metadata_columns)

if "solar" in metadata.columns:
metadata.rename(columns={"solar": "has_solar"}, inplace=True)

data = self._load_full_data(path, data_columns)
user_flags = self._set_user_flags(metadata, data)
data = self._preprocess_data(data)
data = pd.merge(data, metadata, on="dataid", how="left")
data = encode_categorical_variables(data)
data = encode_numerical_variables(
data,
["total_square_footage", "house_construction_year", "total_amount_of_pv"],
)
data = self._handle_missing_data(data)
data = encode_conditioning_variables(data)
return data, metadata, user_flags

def _load_full_data(self, path: str, columns: List[str]) -> pd.DataFrame:
@@ -193,31 +190,24 @@ def _calculate_and_store_statistics(self, data: pd.DataFrame, column: str) -> Dict
"""

def calculate_stats(group):
# Concatenate all time series arrays in the group
all_values = np.concatenate(group[column].values)

# Standardization statistics
mean = np.mean(all_values)
std = np.std(all_values)

# Perform standardization on all_values
standardized = (all_values - mean) / (std + 1e-8)

# Min-Max scaling statistics on standardized data
z_min = np.min(standardized)
z_max = np.max(standardized)

return pd.Series({"mean": mean, "std": std, "z_min": z_min, "z_max": z_max})

if self.normalization_method == "group":
# Group by dataid, month, and weekday
grouped_stats = data.groupby(["dataid", "month", "weekday"]).apply(
calculate_stats
)
return grouped_stats.to_dict(orient="index")

elif self.normalization_method == "date":
# Group by month and weekday
grouped_stats = data.groupby(["month", "weekday"]).apply(calculate_stats)
return grouped_stats.to_dict(orient="index")

@@ -251,15 +241,12 @@ def normalize_and_scale_row(row):
z_min = stats["z_min"]
z_max = stats["z_max"]

# Standardization
values = np.array(row[column])
standardized = (values - mean) / (std + 1e-8)

# Optional Clipping after Standardization
if self.threshold:
standardized = np.clip(standardized, *self.threshold)

# Min-Max Scaling on standardized data
scaled = (standardized - z_min) / (z_max - z_min + 1e-8)

return scaled
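The corresponding inverse transform is not shown in this hunk. A sketch, assuming the stats dictionary produced by _calculate_and_store_statistics, undoes the min-max scaling and then the standardization; values that were clipped are not exactly recoverable:

import numpy as np

def unnormalize(scaled, stats):
    # Undo min-max scaling back to z-scores, then undo standardization
    standardized = scaled * (stats["z_max"] - stats["z_min"] + 1e-8) + stats["z_min"]
    return standardized * (stats["std"] + 1e-8) + stats["mean"]

# Invented stats for illustration; real ones come from _calculate_and_store_statistics
stats = {"mean": 2.0, "std": 1.5, "z_min": -1.2, "z_max": 1.2}
values = np.array([1.0, 2.5, 3.5])
standardized = np.clip((values - stats["mean"]) / (stats["std"] + 1e-8), -6, 6)
scaled = (standardized - stats["z_min"]) / (stats["z_max"] - stats["z_min"] + 1e-8)
print(unnormalize(scaled, stats))  # ≈ [1.0, 2.5, 3.5], exact up to the 1e-8 epsilons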
@@ -293,6 +280,13 @@ def _preprocess_solar(self, data: pd.DataFrame) -> pd.DataFrame:

return solar_data

def _handle_missing_data(self, data: pd.DataFrame) -> pd.DataFrame:
data["car1"] = data["car1"].fillna("no")
data["has_solar"] = data["has_solar"].fillna("no")

assert data.isna().sum().sum() == 0, "Missing data remaining!"
return data

@staticmethod
def _merge_columns_into_timeseries(df: pd.DataFrame) -> pd.DataFrame:
"""
@@ -632,7 +626,13 @@ def __getitem__(self, idx: int) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
"car1": torch.tensor(sample["car1"], dtype=torch.long),
"city": torch.tensor(sample["city"], dtype=torch.long),
"state": torch.tensor(sample["state"], dtype=torch.long),
"has_solar": torch.tensor(sample["has_solar"], dtype=torch.long), # Updated
"has_solar": torch.tensor(sample["has_solar"], dtype=torch.long),
"total_square_footage": torch.tensor(
sample["total_square_footage"], dtype=torch.long
),
"house_construction_year": torch.tensor(
sample["house_construction_year"], dtype=torch.long
),
}

return (torch.tensor(time_series, dtype=torch.float32), conditioning_vars)
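For illustration, the (time_series, conditioning_vars) tuple returned above collates naturally with a default DataLoader, which stacks the dict values batch-wise. DummyDataset and its shapes are invented stand-ins, not the repository's dataset class:

import torch
from torch.utils.data import DataLoader, Dataset

class DummyDataset(Dataset):
    def __len__(self):
        return 8

    def __getitem__(self, idx):
        series = torch.zeros(96, dtype=torch.float32)  # invented: one day at 15-min resolution
        conditioning_vars = {
            "city": torch.tensor(idx % 7, dtype=torch.long),
            "has_solar": torch.tensor(idx % 2, dtype=torch.long),
        }
        return series, conditioning_vars

for series, conditioning_vars in DataLoader(DummyDataset(), batch_size=4):
    print(series.shape, conditioning_vars["city"].shape)  # torch.Size([4, 96]) torch.Size([4])
    break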
59 changes: 14 additions & 45 deletions datasets/utils.py
@@ -81,57 +81,26 @@ def split_dataset(dataset: Dataset, val_split: float = 0.1) -> Tuple[Dataset, Dataset]:
return train_dataset, val_dataset


def encode_categorical_variables(data: pd.DataFrame, columns: List[str]):
def encode_conditioning_variables(data: pd.DataFrame) -> pd.DataFrame:
"""
Encodes categorical variables in a DataFrame to integer codes.
Takes conditioning columns (e.g. city, total square footage) and converts them into integer-encoded mappings. Discretizes numerical
conditioning variables into categorical bins.
Args:
data (pd.DataFrame): Input DataFrame containing categorical variables.
columns (List[str]): List of column names to transform.
data (pd.DataFrame): The DataFrame whose columns are being encoded.
Returns:
df_encoded (pd.DataFrame): DataFrame with categorical variables encoded as integer codes.
mappings (dict): Dictionary mapping column names to their category-to-code mappings.
data (pd.DataFrame): The DataFrame with integer codes in place of the original numerical and categorical values.
"""
df_encoded = data.copy()
mappings = {}
for col in data.columns:

for col in columns:
df_encoded[col] = df_encoded[col].astype("category")

category_to_code = dict(enumerate(df_encoded[col].cat.categories))
code_to_category = {v: k for k, v in category_to_code.items()}
df_encoded[col] = df_encoded[col].cat.codes

mappings[col] = {
"category_to_code": {cat: code for code, cat in category_to_code.items()},
"code_to_category": code_to_category,
}

return df_encoded


def encode_numerical_variables(data: pd.DataFrame, columns: List[str]) -> pd.DataFrame:
"""
Takes numerical conditioning columns (e.g. total square footage), and converts it into integer encoded mappings.
Args:
data (pd.DataFrame): The data whose numerical cols are being encoded.
columns (List[str]): The column names of numerical columns that need to be encoded.
Returns:
data (pd.DataFrame): The data frame that now has integer codes where numerical values used to be.
"""
for col in columns:

data[col] = pd.to_numeric(data[col], errors="coerce")
data[col]

if data[col].isnull().all():
raise ValueError(f"Column '{col}' contains no valid numeric values.")

data[col] = pd.cut(
data[col], bins=5, labels=[0, 1, 2, 3, 4], include_lowest=True
).astype(int)
if col in ["dataid", "timeseries", "month", "weekday", "date_day"]:
continue

if pd.api.types.is_numeric_dtype(data[col]):
data[col] = pd.cut(
data[col], bins=5, labels=[0, 1, 2, 3, 4], include_lowest=True
).astype(int)
else:
data[col] = data[col].astype("category").cat.codes
return data
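A toy run of the new encoding logic, with invented data: numeric columns are cut into 5 equal-width bins and everything else becomes category codes:

import pandas as pd

df = pd.DataFrame({
    "total_square_footage": [800, 1500, 2200, 3100, 4000],
    "city": ["austin", "newyork", "austin", "boulder", "austin"],
})
# Numeric column: 5 equal-width bins labeled 0..4
df["total_square_footage"] = pd.cut(
    df["total_square_footage"], bins=5, labels=[0, 1, 2, 3, 4], include_lowest=True
).astype(int)
# Categorical column: alphabetical category codes (austin=0, boulder=1, newyork=2)
df["city"] = df["city"].astype("category").cat.codes
print(df["total_square_footage"].tolist())  # [0, 1, 2, 3, 4]
print(df["city"].tolist())                  # [0, 2, 0, 1, 0]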
10 changes: 5 additions & 5 deletions main.py
@@ -8,7 +8,7 @@ def evaluate_individual_user_models(
full_dataset = PecanStreetDataManager(
normalize=normalize,
include_generation=include_generation,
threshold=(-10, 10),
threshold=(-6, 6),
normalization_method="group",
)
evaluator = Evaluator(full_dataset, model_name)
@@ -27,7 +27,7 @@ def evaluate_single_dataset_model(
normalize=normalize,
include_generation=include_generation,
normalization_method=normalization_method,
threshold=(-5, 5),
threshold=(-6, 6),
)
evaluator = Evaluator(full_dataset, model_name)
# evaluator.evaluate_all_users()
@@ -40,9 +40,9 @@ def main():
# evaluate_individual_user_models("acgan", include_generation=True)
# evaluate_individual_user_models("acgan", include_generation=False, normalization_method="date")
evaluate_single_dataset_model(
"acgan",
geography="newyork",
include_generation=False,
"diffusion_ts",
geography="austin",
include_generation=True,
normalization_method="date",
)
