From 3f3ec0afd1fab6fad248c082378d13fb2cd3d408 Mon Sep 17 00:00:00 2001 From: Sunyanan Choochotkaew Date: Thu, 28 Sep 2023 14:49:56 +0900 Subject: [PATCH] fix/update model training and export Signed-off-by: Sunyanan Choochotkaew --- cmd/main.py | 17 +- model_training/benchmark/stressng.yaml | 321 +++++++++++-------------- model_training/script.sh | 5 +- src/train/exporter/exporter.py | 20 +- src/train/exporter/validator.py | 1 - src/train/pipeline.py | 7 +- src/util/loader.py | 33 ++- src/util/train_types.py | 2 +- 8 files changed, 204 insertions(+), 202 deletions(-) diff --git a/cmd/main.py b/cmd/main.py index 944faa41..c40bb52a 100644 --- a/cmd/main.py +++ b/cmd/main.py @@ -327,7 +327,8 @@ def extract(args): if args.output: save_csv(data_path, "extracted_" + args.output, feature_power_data) query = feature_to_query(FeatureGroups[fg][0]) - query_results[query][[TIMESTAMP_COL, query]].groupby([TIMESTAMP_COL]).sum().to_csv(args.output[0:-4]+"_raw.csv") + raw_data = query_results[query][[TIMESTAMP_COL, query]].groupby([TIMESTAMP_COL]).sum() + save_csv(data_path, "extracted_" + args.output[0:-4]+"_raw.csv", raw_data) return feature_power_data, power_cols def isolate(args): @@ -407,10 +408,12 @@ def train(args): print_cols = ["feature_group", "model_name", "mae"] print("AbsPower pipeline results:") metadata_df = load_pipeline_metadata(pipeline.path, energy_source, ModelOutputType.AbsPower.name) - print(metadata_df.sort_values(by=[ERROR_KEY])[print_cols]) + if metadata_df is not None: + print(metadata_df.sort_values(by=[ERROR_KEY])[print_cols]) print("DynPower pipeline results:") metadata_df = load_pipeline_metadata(pipeline.path, energy_source, ModelOutputType.DynPower.name) - print(metadata_df.sort_values(by=[ERROR_KEY])[print_cols]) + if metadata_df is not None: + print(metadata_df.sort_values(by=[ERROR_KEY])[print_cols]) warnings.resetwarnings() @@ -616,7 +619,7 @@ def _summary_plot(energy_source, summary_df, output_folder, name): sns.barplot(data=data, x="Feature 
Group", y="MAE", hue="Model", ax=ax) ax.set_title(component) ax.set_ylabel("MAE (Watt)") - ax.set_ylim((0, 50)) + ax.set_ylim((0, 100)) if i < col_num-1: ax.set_xlabel("") ax.legend(bbox_to_anchor=(1.05, 1.05)) @@ -671,7 +674,8 @@ def plot(args): from estimate import default_predicted_col_func from sklearn.preprocessing import MaxAbsScaler - best_result_map, power_labels_map, best_model_id_map, _ = estimate(args) + best_result_map, power_labels_map, best_model_id_map, summary_df = estimate(args) + print(summary_df) for energy_source, best_restult in best_result_map.items(): best_restult = best_restult.reset_index() power_labels = power_labels_map[energy_source] @@ -737,7 +741,7 @@ def export(args): machine_path = get_machine_path(output_path, args.version, machine_id) collect_date, _ = extract_time(args.benchmark) - exporter.export(pipeline_path, machine_path, version=args.version, publisher=args.publisher, collect_date=collect_date, include_raw=args.include_raw) + exporter.export(data_path, pipeline_path, machine_path, machine_id=machine_id, version=args.version, publisher=args.publisher, collect_date=collect_date, include_raw=args.include_raw) args.energy_source = ",".join(PowerSourceMap.keys()) @@ -839,7 +843,6 @@ def plot_scenario(args): data_filename = get_general_filename(args.target_data, energy_source, None, ot, args.extractor, args.isolator) + "_" + args.scenario _ts_plot(power_data, power_cols, "Power source: {} ({})".format(energy_source, args.scenario), output_folder, data_filename, ylabel="Power (W)") - if __name__ == "__main__": # set model top path to data path os.environ['MODEL_PATH'] = data_path diff --git a/model_training/benchmark/stressng.yaml b/model_training/benchmark/stressng.yaml index 4b0f43d8..2ac2ef0c 100644 --- a/model_training/benchmark/stressng.yaml +++ b/model_training/benchmark/stressng.yaml @@ -96,202 +96,173 @@ spec: # # We first execute the baseline scenarios. 
# cpuFrequency;useOrNotHT;mainWorkload;numInstances;extraParam;extraParamValue - - "none;sleep;none;none;none" # capture the OS/backgroud power consumption - - "3600000;cpu;1;none;none" # the incremental power from the previous scenarios is activation + workload power - - "3600000;cpu;2;none;none" # the incremental power from the previous scenarios is only the workload power + - none;sleep;none;none;none # capture the OS/background power consumption + - 3600000;cpu;1;none;none # the incremental power from the previous scenarios is activation + workload power + - 3600000;cpu;2;none;none # the incremental power from the previous scenarios is only the workload power # # Then we execute the all other workloads. # cpu: is used to stress the CPU # The baselineMachine has 32 CPUs with 2 hyperthreads - - "2000000;cpu;4;none;none" - - "2000000;cpu;8;none;none" - - "2000000;cpu;15;none;none" # max CPU cores in baselineMachine (15) - - "2000000;cpu;24;none;none" - - "2000000;cpu;32;none;none" # max HT cores in baselineMachine (32) - - "2800000;cpu;4;none;none" - - "2800000;cpu;8;none;none" - - "2800000;cpu;15;none;none" # max CPU cores in baselineMachine (15) - - "2800000;cpu;24;none;none" - - "2800000;cpu;32;none;none" # max HT cores in baselineMachine (32) - - "3600000;cpu;4;none;none" - - "3600000;cpu;8;none;none" - - "3600000;cpu;15;none;none" # max CPU cores in baselineMachine (15) - - "3600000;cpu;24;none;none" - - "3600000;cpu;32;none;none" # max HT cores in baselineMachine (32) + # max CPU cores in baselineMachine (15) + # max HT cores in baselineMachine (32) + - 2000000;cpu;4;none;none + - 2000000;cpu;8;none;none + - 2000000;cpu;15;none;none + - 2000000;cpu;24;none;none + - 2000000;cpu;32;none;none + - 2000000;cpu;40;none;none + - 2800000;cpu;4;none;none + - 2800000;cpu;8;none;none + - 2800000;cpu;15;none;none + - 2800000;cpu;24;none;none + - 2800000;cpu;32;none;none + - 2800000;cpu;40;none;none + - 3600000;cpu;4;none;none + - 3600000;cpu;8;none;none + - 
3600000;cpu;15;none;none + - 3600000;cpu;24;none;none + - 3600000;cpu;32;none;none + - 3600000;cpu;40;none;none # # branch: is used to stress branch by branch to 1024 randomly selected locations and hence exercise # the CPU branch prediction logic - - "2000000;branch;4;none;none" - - "2000000;branch;8;none;none" - - "2000000;branch;15;none;none" # max CPU cores in baselineMachine (15) - - "2000000;branch;24;none;none" - - "2000000;branch;32;none;none" # max HT cores in baselineMachine (32) - - "2800000;branch;4;none;none" - - "2800000;branch;8;none;none" - - "2800000;branch;15;none;none" # max CPU cores in baselineMachine (15) - - "2800000;branch;24;none;none" - - "2800000;branch;32;none;none" # max HT cores in baselineMachine (32) - - "3600000;branch;4;none;none" - - "3600000;branch;8;none;none" - - "3600000;branch;15;none;none" # max CPU cores in baselineMachine (15) - - "3600000;branch;24;none;none" - - "3600000;branch;32;none;none" # max HT cores in baselineMachine (32) - # - # cyclic: is used to stress linux schedulers with cyclic nanosecond sleeps - - "2000000;cyclic;4;none;none" - - "2000000;cyclic;8;none;none" - - "2000000;cyclic;15;none;none" # max CPU cores in baselineMachine (15) - - "2000000;cyclic;24;none;none" - - "2000000;cyclic;32;none;none" # max HT cores in baselineMachine (32) - - "2800000;cyclic;4;none;none" - - "2800000;cyclic;8;none;none" - - "2800000;cyclic;15;none;none" # max CPU cores in baselineMachine (15) - - "2800000;cyclic;24;none;none" - - "2800000;cyclic;32;none;none" # max HT cores in baselineMachine (32) - - "3600000;cyclic;4;none;none" - - "3600000;cyclic;8;none;none" - - "3600000;cyclic;15;none;none" # max CPU cores in baselineMachine (15) - - "3600000;cyclic;24;none;none" - - "3600000;cyclic;32;none;none" # max HT cores in baselineMachine (32) + - 2000000;branch;4;none;none + - 2000000;branch;8;none;none + - 2000000;branch;15;none;none + - 2000000;branch;24;none;none + - 2000000;branch;32;none;none + - 
2000000;branch;40;none;none + - 2800000;branch;4;none;none + - 2800000;branch;8;none;none + - 2800000;branch;15;none;none + - 2800000;branch;24;none;none + - 2800000;branch;32;none;none + - 2800000;branch;40;none;none + - 3600000;branch;4;none;none + - 3600000;branch;8;none;none + - 3600000;branch;15;none;none + - 3600000;branch;24;none;none + - 3600000;branch;32;none;none + - 3600000;branch;40;none;none # # regs: start N workers exercising CPU generic registers - - "2000000;regs;4;none;none" - - "2000000;regs;8;none;none" - - "2000000;regs;15;none;none" # max CPU cores in baselineMachine (15) - - "2000000;regs;24;none;none" - - "2000000;regs;32;none;none" # max HT cores in baselineMachine (32) - - "2800000;regs;4;none;none" - - "2800000;regs;8;none;none" - - "2800000;regs;15;none;none" # max CPU cores in baselineMachine (15) - - "2800000;regs;24;none;none" - - "2800000;regs;32;none;none" # max HT cores in baselineMachine (32) - - "3600000;regs;4;none;none" - - "3600000;regs;8;none;none" - - "3600000;regs;15;none;none" # max CPU cores in baselineMachine (15) - - "3600000;regs;24;none;none" - - "3600000;regs;32;none;none" # max HT cores in baselineMachine (32) + - 2000000;regs;4;none;none + - 2000000;regs;8;none;none + - 2000000;regs;15;none;none + - 2000000;regs;24;none;none + - 2000000;regs;32;none;none + - 2000000;regs;40;none;none + - 2800000;regs;4;none;none + - 2800000;regs;8;none;none + - 2800000;regs;15;none;none + - 2800000;regs;24;none;none + - 2800000;regs;32;none;none + - 2800000;regs;40;none;none + - 3600000;regs;4;none;none + - 3600000;regs;8;none;none + - 3600000;regs;15;none;none + - 3600000;regs;24;none;none + - 3600000;regs;32;none;none + - 3600000;regs;40;none;none # # l1cache: is used to stress CPU level 1 cache with reads and writes - - "2000000;l1cache;4;none;none" - - "2000000;l1cache;8;none;none" - - "2000000;l1cache;15;none;none" # max CPU cores in baselineMachine (15) - - "2000000;l1cache;24;none;none" - - "2000000;l1cache;32;none;none" # 
max HT cores in baselineMachine (32) - - "2800000;l1cache;4;none;none" - - "2800000;l1cache;8;none;none" - - "2800000;l1cache;15;none;none" # max CPU cores in baselineMachine (15) - - "2800000;l1cache;24;none;none" - - "2800000;l1cache;32;none;none" # max HT cores in baselineMachine (32) - - "3600000;l1cache;4;none;none" - - "3600000;l1cache;8;none;none" - - "3600000;l1cache;15;none;none" # max CPU cores in baselineMachine (15) - - "3600000;l1cache;24;none;none" - - "3600000;l1cache;32;none;none" # max HT cores in baselineMachine (32) + - 2000000;l1cache;4;none;none + - 2000000;l1cache;8;none;none + - 2000000;l1cache;15;none;none + - 2000000;l1cache;24;none;none + - 2000000;l1cache;32;none;none + - 2000000;l1cache;40;none;none + - 2800000;l1cache;4;none;none + - 2800000;l1cache;8;none;none + - 2800000;l1cache;15;none;none + - 2800000;l1cache;24;none;none + - 2800000;l1cache;32;none;none + - 2800000;l1cache;40;none;none + - 3600000;l1cache;4;none;none + - 3600000;l1cache;8;none;none + - 3600000;l1cache;15;none;none + - 3600000;l1cache;24;none;none + - 3600000;l1cache;32;none;none + - 3600000;l1cache;40;none;none # # cache: is used to stress the CPU cache with random wide spread memory read and writes to thrash the CPU cache - - "2000000;cache;4;none;none" - - "2000000;cache;8;none;none" - - "2000000;cache;15;none;none" # max CPU cores in baselineMachine (15) - - "2000000;cache;24;none;none" - - "2000000;cache;32;none;none" # max HT cores in baselineMachine (32) - - "2800000;cache;4;none;none" - - "2800000;cache;8;none;none" - - "2800000;cache;15;none;none" # max CPU cores in baselineMachine (15) - - "2800000;cache;24;none;none" - - "2800000;cache;32;none;none" # max HT cores in baselineMachine (32) - - "3600000;cache;4;none;none" - - "3600000;cache;8;none;none" - - "3600000;cache;15;none;none" # max CPU cores in baselineMachine (15) - - "3600000;cache;24;none;none" - - "3600000;cache;32;none;none" # max HT cores in baselineMachine (32) + - 2000000;cache;4;none;none 
+ - 2000000;cache;8;none;none + - 2000000;cache;15;none;none + - 2000000;cache;24;none;none + - 2000000;cache;32;none;none + - 2000000;cache;40;none;none + - 2800000;cache;4;none;none + - 2800000;cache;8;none;none + - 2800000;cache;15;none;none + - 2800000;cache;24;none;none + - 2800000;cache;32;none;none + - 2800000;cache;40;none;none + - 3600000;cache;4;none;none + - 3600000;cache;8;none;none + - 3600000;cache;15;none;none + - 3600000;cache;24;none;none + - 3600000;cache;32;none;none + - 3600000;cache;40;none;none # # stream: "Sustainable Memory Bandwidth in High Performance Computers" benchmarking tool by John D. McCalpin - - "2000000;stream;4;none;none" - - "2000000;stream;8;none;none" - - "2000000;stream;15;none;none" # max CPU cores in baselineMachine (15) - - "2000000;stream;24;none;none" - - "2000000;stream;32;none;none" # max HT cores in baselineMachine (32) - - "2800000;stream;4;none;none" - - "2800000;stream;8;none;none" - - "2800000;stream;15;none;none" # max CPU cores in baselineMachine (15) - - "2800000;stream;24;none;none" - - "2800000;stream;32;none;none" # max HT cores in baselineMachine (32) - - "3600000;stream;4;none;none" - - "3600000;stream;8;none;none" - - "3600000;stream;15;none;none" # max CPU cores in baselineMachine (15) - - "3600000;stream;24;none;none" - - "3600000;stream;32;none;none" # max HT cores in baselineMachine (32) + - 2000000;stream;4;none;none + - 2000000;stream;8;none;none + - 2000000;stream;15;none;none + - 2000000;stream;24;none;none + - 2000000;stream;32;none;none + - 2000000;stream;40;none;none + - 2800000;stream;4;none;none + - 2800000;stream;8;none;none + - 2800000;stream;15;none;none + - 2800000;stream;24;none;none + - 2800000;stream;32;none;none + - 2800000;stream;40;none;none + - 3600000;stream;4;none;none + - 3600000;stream;8;none;none + - 3600000;stream;15;none;none + - 3600000;stream;24;none;none + - 3600000;stream;32;none;none + - 3600000;stream;40;none;none # # A common recommendation is to use around 80-90% of 
the available memory for stress testing. # The baselineMachine has 20Gi free, we make two tests 80% # --vm-rw: is used to stress the virtual memory subsystem by allocating memory pages and continuously # writing and reading data to and from them. This simulates a scenario where memory is frequently used # and modified. This test stress both memory allocation and data access. - - "2000000;vm-rw;4;vm-rw-bytes;16G" - - "2000000;vm-rw;8;vm-rw-bytes;16G" - - "2000000;vm-rw;15;vm-rw-bytes;16G" # max CPU cores in baselineMachine (15) - - "2000000;vm-rw;24;vm-rw-bytes;16G" - - "2000000;vm-rw;32;vm-rw-bytes;16G" # max HT cores in baselineMachine (32) - - "2800000;vm-rw;4;vm-rw-bytes;16G" - - "2800000;vm-rw;8;vm-rw-bytes;16G" - - "2800000;vm-rw;15;vm-rw-bytes;16G" # max CPU cores in baselineMachine (15) - - "2800000;vm-rw;24;vm-rw-bytes;16G" - - "2800000;vm-rw;32;vm-rw-bytes;16G" # max HT cores in baselineMachine (32) - - "3600000;vm-rw;4;vm-rw-bytes;16G" - - "3600000;vm-rw;8;vm-rw-bytes;16G" - - "3600000;vm-rw;15;vm-rw-bytes;16G" # max CPU cores in baselineMachine (15) - - "3600000;vm-rw;24;vm-rw-bytes;16G" - - "3600000;vm-rw;32;vm-rw-bytes;16G" # max HT cores in baselineMachine (32) - # - # --iomix: is used to stress a mix of sequential, random and memory mapped read/write operations as - # well as random copy file read/writes, forced sync'ing and (if run as root) cache dropping. 
- - "2000000;iomix;4;none;none" - - "2000000;iomix;8;none;none" - - "2000000;iomix;15;none;none" # max CPU cores in baselineMachine (15) - - "2000000;iomix;24;none;none" - - "2000000;iomix;32;none;none" # max HT cores in baselineMachine (32) - - "2800000;iomix;4;none;none" - - "2800000;iomix;8;none;none" - - "2800000;iomix;15;none;none" # max CPU cores in baselineMachine (15) - - "2800000;iomix;24;none;none" - - "2800000;iomix;32;none;none" # max HT cores in baselineMachine (32) - - "3600000;iomix;4;none;none" - - "3600000;iomix;8;none;none" - - "3600000;iomix;15;none;none" # max CPU cores in baselineMachine (15) - - "3600000;iomix;24;none;none" - - "3600000;iomix;32;none;none" # max HT cores in baselineMachine (32) - # - # pipe: is used to stress pipe write operations - - "2000000;pipe;4;none;none" - - "2000000;pipe;8;none;none" - - "2000000;pipe;15;none;none" # max CPU cores in baselineMachine (15) - - "2000000;pipe;24;none;none" - - "2000000;pipe;32;none;none" # max HT cores in baselineMachine (32) - - "2800000;pipe;4;none;none" - - "2800000;pipe;8;none;none" - - "2800000;pipe;15;none;none" # max CPU cores in baselineMachine (15) - - "2800000;pipe;24;none;none" - - "2800000;pipe;32;none;none" # max HT cores in baselineMachine (32) - - "3600000;pipe;4;none;none" - - "3600000;pipe;8;none;none" - - "3600000;pipe;15;none;none" # max CPU cores in baselineMachine (15) - - "3600000;pipe;24;none;none" - - "3600000;pipe;32;none;none" # max HT cores in baselineMachine (32) + - 2000000;vm-rw;4;vm-rw-bytes;15G + - 2000000;vm-rw;8;vm-rw-bytes;7G + - 2000000;vm-rw;15;vm-rw-bytes;4G + - 2000000;vm-rw;24;vm-rw-bytes;2G + - 2000000;vm-rw;32;vm-rw-bytes;1G + - 2800000;vm-rw;4;vm-rw-bytes;15G + - 2800000;vm-rw;8;vm-rw-bytes;7G + - 2800000;vm-rw;15;vm-rw-bytes;4G + - 2800000;vm-rw;24;vm-rw-bytes;2G + - 2800000;vm-rw;32;vm-rw-bytes;1G + - 3600000;vm-rw;4;vm-rw-bytes;15G + - 3600000;vm-rw;8;vm-rw-bytes;7G + - 3600000;vm-rw;15;vm-rw-bytes;4G + - 3600000;vm-rw;24;vm-rw-bytes;2G + - 
3600000;vm-rw;32;vm-rw-bytes;1G # # sctp: is used to stress the network performing SCTP send/receives - - "2000000;sctp;4;none;none" - - "2000000;sctp;8;none;none" - - "2000000;sctp;15;none;none" # max CPU cores in baselineMachine (15) - - "2000000;sctp;24;none;none" - - "2000000;sctp;32;none;none" # max HT cores in baselineMachine (32) - - "2800000;sctp;4;none;none" - - "2800000;sctp;8;none;none" - - "2800000;sctp;15;none;none" # max CPU cores in baselineMachine (15) - - "2800000;sctp;24;none;none" - - "2800000;sctp;32;none;none" # max HT cores in baselineMachine (32) - - "3600000;sctp;4;none;none" - - "3600000;sctp;8;none;none" - - "3600000;sctp;15;none;none" # max CPU cores in baselineMachine (15) - - "3600000;sctp;24;none;none" - - "3600000;sctp;32;none;none" # max HT cores in baselineMachine (32) + - 2000000;sctp;4;none;none + - 2000000;sctp;8;none;none + - 2000000;sctp;15;none;none + - 2000000;sctp;24;none;none + - 2000000;sctp;32;none;none + - 2000000;sctp;40;none;none + - 2800000;sctp;4;none;none + - 2800000;sctp;8;none;none + - 2800000;sctp;15;none;none + - 2800000;sctp;24;none;none + - 2800000;sctp;32;none;none + - 2800000;sctp;40;none;none + - 3600000;sctp;4;none;none + - 3600000;sctp;8;none;none + - 3600000;sctp;15;none;none + - 3600000;sctp;24;none;none + - 3600000;sctp;32;none;none + - 3600000;sctp;40;none;none sequential: true \ No newline at end of file diff --git a/model_training/script.sh b/model_training/script.sh index f880bcca..ec1b8d2d 100755 --- a/model_training/script.sh +++ b/model_training/script.sh @@ -14,6 +14,7 @@ export VERSION=${VERSION-v0.6} export PIPELINE_PREFIX=${PIPELINE_PREFIX-"std_"} export CPE_DATAPATH=${CPE_DATAPATH-"$(pwd)/data"} export ENTRYPOINT_IMG=${ENTRYPOINT_IMG-"quay.io/sustainable_computing_io/kepler_model_server:v0.6"} +export MODEL_PATH=$CPE_DATAPATH mkdir -p $HOME/bin export PATH=$HOME/bin:$PATH @@ -113,7 +114,7 @@ function wait_for_benchmark() { function save_benchmark() { BENCHMARK=$1 BENCHMARK_NS=$2 - kubectl 
get benchmark $BENCHMARK -n ${BENCHMARK_NS} -ojson > data/${BENCHMARK}.json + kubectl get benchmark $BENCHMARK -n ${BENCHMARK_NS} -ojson > $CPE_DATAPATH/${BENCHMARK}.json } function collect_idle() { @@ -180,7 +181,7 @@ function quick_collect() { } function train() { - train_model stressng_kepler_query,coremark_kepler_query,parsec_kepler_query ${VERSION} + train_model stressng_kepler_query ${VERSION} } function quick_train() { diff --git a/src/train/exporter/exporter.py b/src/train/exporter/exporter.py index cfec8420..da58fd75 100644 --- a/src/train/exporter/exporter.py +++ b/src/train/exporter/exporter.py @@ -16,8 +16,9 @@ from loader import load_csv, load_pipeline_metadata, get_model_group_path, load_metadata, load_train_args, get_preprocess_folder, get_general_filename from saver import WEIGHT_FILENAME, save_pipeline_metadata, save_train_args from format import time_to_str +from writer import generate_pipeline_page, generate_validation_results, append_version_readme -def export(pipeline_path, machine_path, version, publisher, collect_date, include_raw=False): +def export(data_path, pipeline_path, machine_path, machine_id, version, publisher, collect_date, include_raw=False): if not validate_arguments(pipeline_path): return @@ -47,7 +48,9 @@ def export(pipeline_path, machine_path, version, publisher, collect_date, includ extractor = pipeline_metadata["extractor"] isolator = pipeline_metadata["isolator"] + mae_validated_df_map = dict() for energy_source in PowerSourceMap.keys(): + mae_validated_df_map[energy_source] = dict() for ot in ModelOutputType: metadata_df = load_pipeline_metadata(pipeline_path, energy_source, ot.name) if metadata_df is None: @@ -80,8 +83,19 @@ def export(pipeline_path, machine_path, version, publisher, collect_date, includ save_pipeline_metadata(out_pipeline_path, pipeline_metadata, energy_source, ot.name, mae_validated_df) print("Exported models for {}/{}".format(energy_source, ot.name)) print(mae_validated_df) + 
mae_validated_df_map[energy_source][ot.name] = mae_validated_df else: print("No valid models exported for {}/{}".format(energy_source, ot.name)) - + + train_args = load_train_args(pipeline_path) + train_args["machine_id"] = machine_id + # save train args - save_train_args(out_pipeline_path, load_train_args(pipeline_path)) \ No newline at end of file + save_train_args(out_pipeline_path, train_args) + + # generate document + generate_pipeline_page(data_path, machine_path, train_args) + generate_validation_results(machine_path, train_args, mae_validated_df_map) + append_version_readme(machine_path, train_args, pipeline_metadata, include_raw) + + diff --git a/src/train/exporter/validator.py b/src/train/exporter/validator.py index f8b4993f..851f952c 100644 --- a/src/train/exporter/validator.py +++ b/src/train/exporter/validator.py @@ -6,7 +6,6 @@ from loader import load_train_args from config import ERROR_KEY -from train_types import PowerSourceMap required_benchmark = ["stressng_kepler_query"] diff --git a/src/train/pipeline.py b/src/train/pipeline.py index 10656f2d..57f6c005 100644 --- a/src/train/pipeline.py +++ b/src/train/pipeline.py @@ -163,7 +163,7 @@ def save_metadata(self): all_metadata = get_all_metadata(model_toppath, self.name) for energy_source, model_type_metadata in all_metadata.items(): for model_type, metadata_df in model_type_metadata.items(): - metadata_df = metadata_df.sort_values(by=[ERROR_KEY]) + metadata_df = metadata_df.sort_values(by=["feature_group", ERROR_KEY]) save_pipeline_metadata(self.path, self.metadata, energy_source, model_type, metadata_df) def print_pipeline_process_end(self, energy_source, feature_group, abs_data, dyn_data): @@ -214,7 +214,10 @@ def initial_trainers(trainer_names, node_level, pipeline_name, target_energy_sou energy_components = PowerSourceMap[energy_source] for feature_group in valid_feature_groups: for trainer_name in trainer_names: - trainer_class = load_class("trainer", trainer_name) + try: + trainer_class = 
load_class("trainer", trainer_name) + except: + continue trainer = trainer_class(energy_components, feature_group.name, energy_source, node_level, pipeline_name=pipeline_name) trainers += [trainer] return trainers diff --git a/src/util/loader.py b/src/util/loader.py index e27d9a34..6b711e84 100644 --- a/src/util/loader.py +++ b/src/util/loader.py @@ -17,12 +17,14 @@ CHECKPOINT_FOLDERNAME = 'checkpoint' PREPROCESS_FOLDERNAME = "preprocessed_data" -default_init_model_url = "https://raw.githubusercontent.com/sustainable-computing-io/kepler-model-db/main/models/" -default_init_pipeline_name = "Linux-4.15.0-213-generic-x86_64_v0.6" +default_init_model_url = "https://raw.githubusercontent.com/sustainable-computing-io/kepler-model-db/main/models/v0.6/nx12" +default_init_pipeline_name = "std_v0.6" default_trainer_name = "GradientBoostingRegressorTrainer" default_node_type = "1" any_node_type = -1 -default_feature_group = FeatureGroup.KubeletOnly +default_feature_group = FeatureGroup.BPFOnly + +trainers_with_weight = ["SGDRegressorTrainer"] def load_json(path, name): if ".json" not in name: @@ -236,20 +238,29 @@ def get_download_output_path(download_path, energy_source, output_type): energy_source_path = assure_path(os.path.join(download_path, energy_source)) return os.path.join(energy_source_path, output_type.name) -def get_url(output_type, feature_group=default_feature_group, trainer_name=default_trainer_name, node_type=default_node_type, model_topurl=default_init_model_url, energy_source="rapl", pipeline_name=default_init_pipeline_name): +def get_url(output_type, feature_group=default_feature_group, trainer_name=default_trainer_name, node_type=default_node_type, model_topurl=default_init_model_url, energy_source="rapl", pipeline_name=default_init_pipeline_name, model_name=None, weight=False): group_path = get_model_group_path(model_topurl, output_type=output_type, feature_group=feature_group, energy_source=energy_source, pipeline_name=pipeline_name, assure=False) - 
model_name = get_model_name(trainer_name, node_type) - return os.path.join(group_path, model_name + ".zip") - -def get_pipeline_url(model_topurl=default_init_model_url, pipeline_name=default_init_pipeline_name): - return os.path.join(model_topurl, pipeline_name + ".zip") + if model_name is None: + model_name = get_model_name(trainer_name, node_type) + file_ext = ".zip" + if weight: + file_ext = ".json" + return os.path.join(group_path, model_name + file_ext) + +def get_pipeline_url(model_topurl=default_init_model_url, pipeline_name=default_init_pipeline_name, weight=False): + file_ext = ".zip" + if weight: + file_ext = ".json" + return os.path.join(model_topurl, pipeline_name + file_ext) def class_to_json(class_obj): return json.loads(json.dumps(class_obj.__dict__)) -def get_machine_path(output_path, version, machine_id): +def get_machine_path(output_path, version, machine_id, assure=True): export_path = os.path.join(output_path, version, machine_id) - return assure_path(export_path) + if assure: + return assure_path(export_path) + return export_path def get_preprocess_folder(pipeline_path, assure=True): preprocess_folder = os.path.join(pipeline_path, PREPROCESS_FOLDERNAME) diff --git a/src/util/train_types.py b/src/util/train_types.py index 051e6798..a4fbb90b 100644 --- a/src/util/train_types.py +++ b/src/util/train_types.py @@ -84,7 +84,7 @@ def deep_sort(elements): FeatureGroup.AcceleratorOnly: deep_sort(ACCELERATE_FEATURES), } -SingleSourceFeatures = [FeatureGroup.CounterOnly.name, FeatureGroup.CgroupOnly.name, FeatureGroup.BPFOnly.name, FeatureGroup.KubeletOnly.name] +SingleSourceFeatures = [FeatureGroup.CounterOnly.name, FeatureGroup.CgroupOnly.name, FeatureGroup.BPFOnly.name, FeatureGroup.BPFIRQ.name, FeatureGroup.KubeletOnly.name] def is_single_source_feature_group(fg): return fg.name in SingleSourceFeatures