From f629f4bbcdcd8720db776593a8950e476aea7e59 Mon Sep 17 00:00:00 2001 From: Francisco Javier Arceo Date: Thu, 10 Oct 2024 13:55:26 -0400 Subject: [PATCH 1/7] feat: Adding additional mechanism to generate raw data Signed-off-by: Francisco Javier Arceo --- data/generate_data.py | 149 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 data/generate_data.py diff --git a/data/generate_data.py b/data/generate_data.py new file mode 100644 index 0000000..70c5471 --- /dev/null +++ b/data/generate_data.py @@ -0,0 +1,149 @@ +import pandas as pd +import numpy as np +import uuid +from datetime import datetime, timedelta +import random + + +def generate_random_transactions( + users_df: pd.DataFrame, max_transactions: int = 11, max_days_back=365 +) -> pd.DataFrame: + # Predefined lists of categories and locations + transaction_categories = [ + "Groceries", + "Utilities", + "Entertainment", + "Dining", + "Travel", + "Health", + "Education", + "Shopping", + "Automotive", + "Rent", + ] + cities_and_states = [ + ("New York", "NY"), + ("Los Angeles", "CA"), + ("Chicago", "IL"), + ("Houston", "TX"), + ("Phoenix", "AZ"), + ("Philadelphia", "PA"), + ("San Antonio", "TX"), + ("San Diego", "CA"), + ("Dallas", "TX"), + ("San Jose", "CA"), + ] + transactions_list = [] + + for i, row in users_df.iterrows(): + num_transactions = np.random.randint(1, max_transactions) + for j in range(num_transactions): + # Random date within the last 10-max_days_back (default 365) days + random_days = np.random.randint(10, max_days_back) + date_of_transaction = datetime.now() - timedelta(days=random_days) + city, state = random.choice(cities_and_states) + if j == (num_transactions - 1): + date_of_transaction == row["created"] + + transactions_list.append( + { + "user_id": row["user_id"], + "created": date_of_transaction, + "updated": date_of_transaction, + "date_of_transaction": date_of_transaction, + "transaction_amount": round(np.random.uniform(10, 1000), 2), + "transaction_category": random.choice(transaction_categories), + "card_token": str(uuid.uuid4()), + "city": city, + "state": state, + } + ) + + return pd.DataFrame(transactions_list) + + +def calculate_point_in_time_features(label_dataset, transactions_df) -> pd.DataFrame: + label_dataset["created"] = pd.to_datetime(label_dataset["created"]) + transactions_df["transaction_timestamp"] = pd.to_datetime( + transactions_df["date_of_transaction"] + ) + + # Get all transactions before the created time + transactions_before = pd.merge( + label_dataset[["user_id", "created"]], transactions_df, on="user_id" + ) + transactions_before = transactions_before[ + transactions_before["transaction_timestamp"] < transactions_before["created_x"] + ] + transactions_before["days_between_transactions"] = ( + transactions_before["transaction_timestamp"] - transactions_before["created_x"] + ).dt.days + + # Group by user_id and created to calculate features + features = ( + transactions_before.groupby(["user_id", "created_x"]) + .agg( + num_prev_transactions=("transaction_amount", "count"), + avg_prev_transaction_amount=("transaction_amount", "mean"), + max_prev_transaction_amount=("transaction_amount", "max"), + stdv_prev_transaction_amount=("transaction_amount", "std"), + days_since_last_transaction=("days_between_transactions", "min"), + days_since_first_transaction=("days_between_transactions", "max"), + ) + .reset_index() + .fillna(0) + ) + + final_df = ( + pd.merge( + label_dataset, + features, + left_on=["user_id", "created"], + right_on=["user_id", "created_x"], + how="left", + ) + .reset_index(drop=True) + .drop("created_x", axis=1) + ) + + return final_df + + +def main(): + train = pd.read_csv("train.csv") + test = pd.read_csv("test.csv") + valid = pd.read_csv("validate.csv") + df = pd.concat([train, test, valid], axis=0).reset_index(drop=True) + + df["user_id"] = [f"user_{i}" for i in range(df.shape[0])] + df["transaction_id"] = [f"txn_{i}" for i in range(df.shape[0])] + + for date_col in ["created", "updated"]: + df[date_col] = pd.Timestamp.now() + + label_dataset = pd.DataFrame( + df[ + [ + "user_id", + "fraud", + "created", + "updated", + "distance_from_home", + "distance_from_last_transaction", + "ratio_to_median_purchase_price", + ] + ] + ) + + user_purchase_history = generate_random_transactions( + users_df=df[df["repeat_retailer"] == 1], + max_transactions=5, + max_days_back=365, + ) + user_purchase_history.to_parquet("raw_transaction_datasource.parquet") + finaldf = calculate_point_in_time_features(label_dataset, user_purchase_history) + finaldf.to_parquet("final_data.parquet") + + +if __name__ == "__main__": + main() From 89ad706882f487c803db74e29f6f1c4d70dd1852 Mon Sep 17 00:00:00 2001 From: Francisco Javier Arceo Date: Thu, 10 Oct 2024 13:55:56 -0400 Subject: [PATCH 2/7] Renaming file Signed-off-by: Francisco Javier Arceo --- data/{generate_data.py => generate_raw_data.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename data/{generate_data.py => generate_raw_data.py} (100%) diff --git a/data/generate_data.py b/data/generate_raw_data.py similarity index 100% rename from data/generate_data.py rename to data/generate_raw_data.py From 71eeeef509a6a2d468d69b2a73a4d4da2f8d07e8 Mon Sep 17 00:00:00 2001 From: Francisco Javier Arceo Date: Fri, 11 Oct 2024 16:57:02 -0400 Subject: [PATCH 3/7] updated to use absolute path so script can be run in main repo Signed-off-by: Francisco Javier Arceo --- data/generate_raw_data.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/data/generate_raw_data.py b/data/generate_raw_data.py index 70c5471..2d87b45 100644 --- a/data/generate_raw_data.py +++ b/data/generate_raw_data.py @@ -1,3 +1,4 @@ +import os import pandas as pd import numpy as np import uuid @@ -110,9 +111,10 @@ def calculate_point_in_time_features(label_dataset, transactions_df) -> pd.DataF def main(): - train = pd.read_csv("train.csv") - test = pd.read_csv("test.csv") - valid = pd.read_csv("validate.csv") + script_dir = os.path.dirname(os.path.abspath(__file__)) + train = pd.read_csv(os.path.join(script_dir, "train.csv")) + test = pd.read_csv(os.path.join(script_dir, "test.csv")) + valid = pd.read_csv(os.path.join(script_dir, "validate.csv")) df = pd.concat([train, test, valid], axis=0).reset_index(drop=True) df["user_id"] = [f"user_{i}" for i in range(df.shape[0])] From ab1ac9791657e210bb16f2698f038ef8ea54a347 Mon Sep 17 00:00:00 2001 From: Francisco Javier Arceo Date: Fri, 11 Oct 2024 16:58:57 -0400 Subject: [PATCH 4/7] exporting data using absolute path as well Signed-off-by: Francisco Javier Arceo --- data/generate_raw_data.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/data/generate_raw_data.py b/data/generate_raw_data.py index 2d87b45..1deff25 100644 --- a/data/generate_raw_data.py +++ b/data/generate_raw_data.py @@ -142,9 +142,11 @@ def main(): max_transactions=5, max_days_back=365, ) - user_purchase_history.to_parquet("raw_transaction_datasource.parquet") + user_purchase_history.to_parquet( + os.path.join(script_dir, "raw_transaction_datasource.parquet") + ) finaldf = calculate_point_in_time_features(label_dataset, user_purchase_history) - finaldf.to_parquet("final_data.parquet") + finaldf.to_parquet(os.path.join(script_dir, "final_data.parquet")) if __name__ == "__main__": From bf33f5d4cc31b34e818d1504c0553fb8845280fc Mon Sep 17 00:00:00 2001 From: Francisco Javier Arceo Date: Fri, 3 Jan 2025 11:18:46 -0500 Subject: [PATCH 5/7] Merged back the original features and added more progress output Signed-off-by: Francisco Javier Arceo --- data/generate_raw_data.py | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/data/generate_raw_data.py b/data/generate_raw_data.py index 1deff25..c309793 100644 --- a/data/generate_raw_data.py +++ b/data/generate_raw_data.py @@ -35,6 +35,8 @@ def generate_random_transactions( ("San Jose", "CA"), ] transactions_list = [] + total_users = users_df.shape[0] + batch = total_users // 10 for i, row in users_df.iterrows(): num_transactions = np.random.randint(1, max_transactions) @@ -59,6 +61,13 @@ def generate_random_transactions( "state": state, } ) + if (i % batch) == 0: + formatted_i = f"{i:,}" + percent_complete = i / total_users * 100 + print( + f"{formatted_i:>{len(f'{total_users:,}')}} of {total_users:,} " + f"({percent_complete:.0f}%) complete" + ) return pd.DataFrame(transactions_list) @@ -111,10 +120,15 @@ def calculate_point_in_time_features(label_dataset, transactions_df) -> pd.DataF def main(): + print("loading data...") script_dir = os.path.dirname(os.path.abspath(__file__)) train = pd.read_csv(os.path.join(script_dir, "train.csv")) test = pd.read_csv(os.path.join(script_dir, "test.csv")) valid = pd.read_csv(os.path.join(script_dir, "validate.csv")) + train["set"] = "train" + test["set"] = "test" + valid["set"] = "valid" + df = pd.concat([train, test, valid], axis=0).reset_index(drop=True) df["user_id"] = [f"user_{i}" for i in range(df.shape[0])] @@ -137,16 +151,24 @@ def main(): ] ) + print("generating transaction level data...") user_purchase_history = generate_random_transactions( - users_df=df[df["repeat_retailer"] == 1], + users_df=df[df["repeat_retailer"] == 1].reset_index(drop=True), max_transactions=5, max_days_back=365, ) user_purchase_history.to_parquet( os.path.join(script_dir, "raw_transaction_datasource.parquet") ) + print("calculating point in time features...") finaldf = calculate_point_in_time_features(label_dataset, user_purchase_history) + print("merging final dataset...") + finaldf = finaldf.merge( + df[["user_id", "created", "used_chip", "used_pin_number", "online_order"]], + on=["user_id", "created"], + ) finaldf.to_parquet(os.path.join(script_dir, "final_data.parquet")) + print("...data processing complete.") if __name__ == "__main__": From 4a87143a91b5624a7680eae911a822b7f47160b0 Mon Sep 17 00:00:00 2001 From: Francisco Javier Arceo Date: Fri, 3 Jan 2025 11:22:34 -0500 Subject: [PATCH 6/7] adding the set column Signed-off-by: Francisco Javier Arceo --- data/generate_raw_data.py | 1 + 1 file changed, 1 insertion(+) diff --git a/data/generate_raw_data.py b/data/generate_raw_data.py index c309793..d058043 100644 --- a/data/generate_raw_data.py +++ b/data/generate_raw_data.py @@ -144,6 +144,7 @@ def main(): "fraud", "created", "updated", + "set", "distance_from_home", "distance_from_last_transaction", "ratio_to_median_purchase_price", From 0f73d753dd82142edbb04c183f18f10a4b879ecd Mon Sep 17 00:00:00 2001 From: Francisco Arceo Date: Tue, 7 Jan 2025 12:59:11 -0500 Subject: [PATCH 7/7] Update data/generate_raw_data.py Co-authored-by: Helber Belmiro --- data/generate_raw_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/generate_raw_data.py b/data/generate_raw_data.py index d058043..93dbfa5 100644 --- a/data/generate_raw_data.py +++ b/data/generate_raw_data.py @@ -46,7 +46,7 @@ def generate_random_transactions( date_of_transaction = datetime.now() - timedelta(days=random_days) city, state = random.choice(cities_and_states) if j == (num_transactions - 1): - date_of_transaction == row["created"] + date_of_transaction = row["created"] transactions_list.append( {