Commit 8f84268: fix: mean calculation

Signed-off-by: Sunil Thaha <[email protected]>
sthaha committed Jul 22, 2024
1 parent c01af44 commit 8f84268
Showing 17 changed files with 252 additions and 199 deletions.
39 changes: 29 additions & 10 deletions src/estimate/__init__.py
@@ -1,12 +1,31 @@
 import os
 import sys
-cur_path = os.path.join(os.path.dirname(__file__), '.')
-sys.path.append(cur_path)
-
-model_path = os.path.join(os.path.dirname(__file__), 'model')
-sys.path.append(model_path)
-
-from estimate_common import compute_error
-
-from model import load_model, get_background_containers
-from model import default_predicted_col_func, get_predicted_power_colname, get_predicted_background_power_colname, get_dynamic_power_colname, get_predicted_dynamic_power_colname, get_predicted_dynamic_background_power_colname, get_label_power_colname, get_reconstructed_power_colname, default_idle_predicted_col_func
+from .model.estimate_common import compute_error
+from .model.model import load_model, get_background_containers
+from .model.model import (
+    default_predicted_col_func,
+    get_predicted_power_colname,
+    get_predicted_background_power_colname,
+    get_dynamic_power_colname,
+    get_predicted_dynamic_power_colname,
+    get_predicted_dynamic_background_power_colname,
+    get_label_power_colname,
+    get_reconstructed_power_colname,
+    default_idle_predicted_col_func,
+)
+
+# fmt: off
+__all__ = [
+    'compute_error',
+    'load_model',
+    'get_background_containers',
+    'default_predicted_col_func',
+    'get_predicted_power_colname',
+    'get_predicted_background_power_colname',
+    'get_dynamic_power_colname',
+    'get_predicted_dynamic_power_colname',
+    'get_predicted_dynamic_background_power_colname',
+    'get_label_power_colname',
+    'get_reconstructed_power_colname',
+    'default_idle_predicted_col_func',
+]
+# fmt: on
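
The structural half of this commit, visible here and in the files below, is replacing sys.path.append hacks with package-relative imports: the old form resolved "from model import ..." against whatever happened to be on the path (and could collide with any other module named model), while the new form is anchored to the package itself. A minimal sketch of what callers get, assuming src/ is on PYTHONPATH:

    # Package-qualified imports now work regardless of the current directory:
    from estimate import compute_error, load_model

    # Star-imports are well-defined too: with __all__ set, this pulls in
    # exactly the names listed above and nothing else.
    from estimate import *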
Empty file added src/estimate/model/__init__.py (an empty __init__.py that makes the directory an importable package).
47 changes: 28 additions & 19 deletions src/train/__init__.py
@@ -1,22 +1,31 @@
-# comonly used within train module
+# commonly used within train module
 
-import os
-import sys
-cur_path = os.path.join(os.path.dirname(__file__), '.')
-sys.path.append(cur_path)
-util_path = os.path.join(os.path.dirname(__file__), '..', 'util')
-sys.path.append(util_path)
-extractor_path = os.path.join(os.path.dirname(__file__), 'extractor')
-sys.path.append(extractor_path)
-isolator_path = os.path.join(os.path.dirname(__file__), 'isolator')
-sys.path.append(isolator_path)
+from .pipeline import NewPipeline, load_class
+from .profiler.profiler import Profiler, generate_profiles
+from .extractor.extractor import DefaultExtractor
+from .extractor.smooth_extractor import SmoothExtractor
+from .profiler.node_type_index import NodeTypeIndexCollection, NodeTypeSpec
+from .isolator.isolator import MinIdleIsolator, ProfileBackgroundIsolator, NoneIsolator
+from .isolator.train_isolator import TrainIsolator
 
-from extractor import DefaultExtractor
-from smooth_extractor import SmoothExtractor
-from profiler.profiler import Profiler, generate_profiles
-from profiler.node_type_index import NodeTypeIndexCollection, NodeTypeSpec
-from isolator import MinIdleIsolator, ProfileBackgroundIsolator, NoneIsolator
-from train_isolator import TrainIsolator
-from pipeline import NewPipeline, load_class
-
-DefaultProfiler = Profiler(extractor=DefaultExtractor())
+DefaultProfiler = Profiler(extractor=DefaultExtractor())
+
+# fmt: off
+__all__ = [
+    'NewPipeline',
+    'load_class',
+    'DefaultExtractor',
+    'SmoothExtractor',
+    'Profiler',
+    'generate_profiles',
+    'NodeTypeIndexCollection',
+    'NodeTypeSpec',
+    'MinIdleIsolator',
+    'ProfileBackgroundIsolator',
+    'NoneIsolator',
+    'TrainIsolator',
+    'DefaultProfiler'
+]
+# fmt: on
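
The # fmt: off / # fmt: on pair tells the formatter (Black) to leave the hand-written __all__ list alone. The package also now exposes a pre-wired default instance; a short consumer sketch, assuming src/ is on PYTHONPATH:

    # DefaultProfiler already wraps DefaultExtractor, so simple callers can
    # skip constructing their own:
    from train import DefaultProfiler, NewPipeline

    # The composable pieces are importable from the package root as well:
    from train import MinIdleIsolator, SmoothExtractor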
24 changes: 8 additions & 16 deletions src/train/exporter/exporter.py
@@ -1,23 +1,15 @@
 import os
 import sys
 
 import datetime
 
-util_path = os.path.join(os.path.dirname(__file__), '..', '..', 'util')
-sys.path.append(util_path)
-
-cur_path = os.path.join(os.path.dirname(__file__))
-sys.path.append(cur_path)
-
-from validator import get_validated_export_items, BestModelCollection
-from loader import load_metadata, load_node_type_index, get_version_path, get_export_path
-from saver import save_pipeline_metadata, save_node_type_index
-from format import time_to_str
-from writer import generate_pipeline_page, generate_report_results, generate_pipeline_readme, append_version_readme, get_workload_content
-from config import ERROR_KEY
+from util.loader import load_metadata, load_node_type_index, get_version_path, get_export_path
+from util.saver import save_pipeline_metadata, save_node_type_index
+from util.format import time_to_str
+from util.config import ERROR_KEY
+from .validator import get_validated_export_items, BestModelCollection
+from .writer import generate_pipeline_page, generate_report_results, generate_pipeline_readme, append_version_readme, get_workload_content
 
 repo_url = "https://raw.githubusercontent.com/sustainable-computing-io/kepler-model-db/main/models"
 
 
 def export(data_path, pipeline_path, db_path, publisher, collect_date, inputs):
     # load pipeline metadata
     pipeline_metadata = load_metadata(pipeline_path)
@@ -70,4 +62,4 @@ def export(data_path, pipeline_path, db_path, publisher, collect_date, inputs):
     # add new pipeline item to version path
     append_version_readme(local_version_path, pipeline_metadata)
 
-    return local_export_path
\ No newline at end of file
+    return local_export_path
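
Note the convention this file now follows: shared helpers are imported absolutely from the top-level util package, while siblings inside train.exporter are imported relatively. A sketch of the layout this assumes (directory structure inferred from the paths in this commit):

    src/
      util/                  # absolute: from util.loader import load_metadata
      train/
        exporter/
          exporter.py        # this file
          validator.py       # relative: from .validator import BestModelCollection
          writer.py          # relative: from .writer import generate_pipeline_page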
Empty file added src/train/extractor/__init__.py
33 changes: 14 additions & 19 deletions src/train/extractor/extractor.py
@@ -1,20 +1,13 @@
 import os
 import sys
 import pandas as pd
 import numpy as np
 from abc import ABCMeta, abstractmethod
 
-util_path = os.path.join(os.path.dirname(__file__), '..', '..', 'util')
-sys.path.append(util_path)
-
-from train_types import FeatureGroups, FeatureGroup, SYSTEM_FEATURES
-from prom_types import TIMESTAMP_COL, SOURCE_COL, get_energy_unit, \
-    usage_ratio_query,node_info_query, \
-    energy_component_to_query, feature_to_query, \
-    pkg_id_column, container_id_cols, node_info_column
-from loader import default_node_type
-from extract_types import container_id_colname, ratio_to_col, component_to_col, get_unit_vals, accelerator_type_colname
-from preprocess import drop_zero_column, find_correlations
+from util.train_types import FeatureGroups, FeatureGroup, SYSTEM_FEATURES
+from util.prom_types import TIMESTAMP_COL, SOURCE_COL, get_energy_unit, usage_ratio_query, node_info_query, energy_component_to_query, feature_to_query, pkg_id_column, container_id_cols, node_info_column
+from util.loader import default_node_type
+from util.extract_types import container_id_colname, ratio_to_col, component_to_col, get_unit_vals, accelerator_type_colname
+from train.extractor.preprocess import drop_zero_column, find_correlations
 
 
 # append ratio for each unit
 def append_ratio_for_pkg(feature_power_data, is_aggr, query_results, power_columns):
@@ -31,7 +24,7 @@ def append_ratio_for_pkg(feature_power_data, is_aggr, query_results, power_columns):
     if is_aggr:
         ratio_df = ratio_df.groupby([TIMESTAMP_COL, pkg_id_column]).sum()[usage_ratio_query]
     else:
-        ratio_df[container_id_colname] = ratio_df[container_id_cols].apply(lambda x: '/'.join(x), axis=1)
+        ratio_df[container_id_colname] = ratio_df[container_id_cols].apply(lambda x: "/".join(x), axis=1)
         ratio_df = ratio_df.groupby([TIMESTAMP_COL, pkg_id_column, container_id_colname]).sum()[usage_ratio_query]
     ratio_colnames = []
     for unit_val in unit_vals:
@@ -43,12 +36,13 @@ def append_ratio_for_pkg(feature_power_data, is_aggr, query_results, power_columns):
         feature_power_data = feature_power_data.join(target_ratio_df).dropna()
         feature_power_data = feature_power_data.rename(columns={usage_ratio_query: ratio_colname})
         ratio_colnames +=[ratio_colname]
-    tmp_total_col = 'total_ratio'
+    tmp_total_col = "total_ratio"
    feature_power_data[tmp_total_col] = feature_power_data[ratio_colnames].sum(axis=1)
     for ratio_colname in ratio_colnames:
         feature_power_data[ratio_colname] /= feature_power_data[tmp_total_col]
     return feature_power_data.drop(columns=[tmp_total_col])
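
The tail of append_ratio_for_pkg renormalizes the per-package ratio columns so that each row sums to 1. A toy sketch of that step (column names invented):

    import pandas as pd

    df = pd.DataFrame({"ratio_pkg0": [0.6, 0.1], "ratio_pkg1": [0.2, 0.3]})
    total = df[["ratio_pkg0", "ratio_pkg1"]].sum(axis=1)
    normalized = df.div(total, axis=0)  # row 0 becomes 0.75/0.25, row 1 becomes 0.25/0.75
    print(normalized)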


 class Extractor(metaclass=ABCMeta):
     # extractor abstract:
     # return
@@ -65,12 +59,13 @@ def extract(self, query_results, feature_group):
     def get_name(self):
         return NotImplemented
 
 
 # extract data from query
 # for node-level
 # return DataFrame (index=timestamp, column=[features][power columns][node_type]), power_columns
 
-class DefaultExtractor(Extractor):
 
+
+class DefaultExtractor(Extractor):
     def get_name(self):
         return "default"
@@ -103,8 +98,8 @@ def extract(self, query_results, energy_components, feature_group, energy_source
         is_aggr = node_level and aggr
         if is_aggr:
             # sum stat of all containers
-            sum_feature = feature_power_data.groupby([TIMESTAMP_COL]).sum()[workload_features]
-            mean_power = feature_power_data.groupby([TIMESTAMP_COL]).mean()[power_columns]
+            sum_feature = feature_power_data.groupby([TIMESTAMP_COL])[workload_features].sum()
+            mean_power = feature_power_data.groupby([TIMESTAMP_COL])[power_columns].mean()
             feature_power_data = sum_feature.join(mean_power)
         else:
             feature_power_data = feature_power_data.groupby([TIMESTAMP_COL, container_id_colname]).sum()
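
This hunk is the headline fix. The old form aggregated every column first and selected afterwards: if the grouped frame carries any non-numeric column (container identifiers, for instance), pandas 2 raises TypeError when mean() reaches it, while sum() can appear to work because summing strings concatenates them. Selecting columns before aggregating sidesteps the error and avoids computing aggregates that are immediately discarded. A toy sketch, not the project's data:

    import pandas as pd

    df = pd.DataFrame({
        "timestamp": [1, 1, 2, 2],
        "container": ["a", "b", "a", "b"],   # non-numeric column
        "cpu_time": [10.0, 20.0, 30.0, 40.0],
        "package_power": [5.0, 5.0, 7.0, 7.0],
    })

    # Old: df.groupby(["timestamp"]).mean()["package_power"]
    # raises TypeError under pandas 2 because mean() also sees "container".

    # New: select first, aggregate only what is needed.
    sum_feature = df.groupby(["timestamp"])[["cpu_time"]].sum()
    mean_power = df.groupby(["timestamp"])[["package_power"]].mean()
    print(sum_feature.join(mean_power))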
@@ -148,7 +143,7 @@ def get_workload_feature_data(self, query_results, features):
 
         if all(col in aggr_query_data.columns for col in container_id_cols):
             aggr_query_data.rename(columns={query: feature}, inplace=True)
-            aggr_query_data[container_id_colname] = aggr_query_data[container_id_cols].apply(lambda x: '/'.join([str(xi) for xi in x]), axis=1)
+            aggr_query_data[container_id_colname] = aggr_query_data[container_id_cols].apply(lambda x: "/".join([str(xi) for xi in x]), axis=1)
             # separate for each container_id
             container_id_list = pd.unique(aggr_query_data[container_id_colname])
 
@@ -222,7 +217,7 @@ def get_power_data(self, query_results, energy_components, source):
         unit_col = get_energy_unit(component)  # such as package
         query = energy_component_to_query(component)
         if query not in query_results:
-            print(query, 'not in', query_results)
+            print(query, "not in", query_results)
             return None
         aggr_query_data = query_results[query].copy()
         # filter source
14 changes: 9 additions & 5 deletions src/train/extractor/preprocess.py
@@ -3,37 +3,41 @@
 
 import numpy as np
 
-util_path = os.path.join(os.path.dirname(__file__), '..', '..', 'util')
+util_path = os.path.join(os.path.dirname(__file__), "..", "..", "util")
 sys.path.append(util_path)
 
-model_path = os.path.join(os.path.dirname(__file__), '..', '..', 'estimate', 'model')
+model_path = os.path.join(os.path.dirname(__file__), "..", "..", "estimate", "model")
 sys.path.append(model_path)
 
 from train_types import PowerSourceMap
 from prom_types import TIMESTAMP_COL
 from extract_types import col_to_component
 from model import get_label_power_colname
 
+
 def drop_zero_column(data, cols):
     sum_col = "sum_val"
     data[sum_col] = data[cols].sum(axis=1)
     data = data.drop(data[data[sum_col] == 0].index)
     data = data.drop(columns=[sum_col])
     return data
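
drop_zero_column (shown above as context) drops sampling rows where the watched counters are all zero. A quick sketch on toy data (column names invented):

    import pandas as pd

    df = pd.DataFrame({"ts": [1, 2, 3], "cpu": [0, 5, 0], "mem": [0, 1, 2]})
    # the first row sums to 0 across ["cpu", "mem"], so it is dropped
    print(drop_zero_column(df, ["cpu", "mem"]))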


 def remove_outlier(df, workload_features, threshold=1):
     # Calculate the Z-score for each column
     z_scores = np.abs((df[workload_features] - df[workload_features].mean()) / df[workload_features].std())
     # Remove rows with outliers
     df_no_outliers = df[(z_scores < threshold).all(axis=1)]
     return df_no_outliers
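
remove_outlier keeps only rows whose features all fall within threshold standard deviations of their column mean; note the default threshold of 1 is aggressive compared to the usual 2 or 3. The same z-score filter on toy data:

    import pandas as pd
    import numpy as np

    df = pd.DataFrame({"cpu": [10.0, 11.0, 9.0, 100.0]})
    z = np.abs((df["cpu"] - df["cpu"].mean()) / df["cpu"].std())
    # z is roughly [0.50, 0.48, 0.52, 1.50]; threshold=1 drops the 100.0 row
    print(df[z < 1])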


 def time_filter(data, min_time, max_time):
     _data = data.reset_index()
     start_time = _data[TIMESTAMP_COL].min()
     _data = _data[(_data[TIMESTAMP_COL] >= start_time+min_time) & (_data[TIMESTAMP_COL] <= start_time+max_time)]
     return _data
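
time_filter trims a trace to a window relative to its first timestamp, e.g. to cut warm-up and cool-down samples. A sketch, assuming TIMESTAMP_COL is "timestamp" and using plain integers for brevity:

    import pandas as pd

    data = pd.DataFrame({"timestamp": [100, 105, 110, 115, 120]}).set_index("timestamp")
    # keep samples from start+5 through start+15: timestamps 105, 110, 115
    print(time_filter(data, 5, 15))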


 def get_extracted_power_labels(extracted_data, energy_components, label_cols):
     # mean over the same value across container-level
     extracted_power_labels = extracted_data[[TIMESTAMP_COL] + label_cols].groupby([TIMESTAMP_COL]).mean().sort_index()
@@ -43,9 +47,10 @@ def get_extracted_power_labels(extracted_data, energy_components, label_cols):
         extracted_power_labels[component_label_col] = extracted_power_labels[target_cols].sum(axis=1)
     return extracted_power_labels


 def find_correlations(energy_source, feature_power_data, power_columns, workload_features):
-    power_data = feature_power_data[power_columns].reset_index().groupby([TIMESTAMP_COL]).mean()
-    feature_data = feature_power_data[workload_features].reset_index().groupby([TIMESTAMP_COL]).sum()
+    power_data = feature_power_data.reset_index().groupby([TIMESTAMP_COL])[power_columns].mean()
+    feature_data = feature_power_data.reset_index().groupby([TIMESTAMP_COL])[workload_features].sum()
     energy_components = PowerSourceMap[energy_source]
     target_cols = [col for col in power_columns if col_to_component(col) == energy_components[0]]
     process_power_data = power_data.copy()
@@ -56,4 +61,3 @@ def find_correlations(energy_source, feature_power_data, power_columns, workload_features):
     join_data = feature_data.join(process_power_data[energy_source]).dropna()
     corr = join_data.corr()[[energy_source]]
     return corr.drop(index=energy_source)
-
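
find_correlations gets the same select-after-groupby treatment as the extractor above, then correlates per-timestamp feature sums against mean power. The tail of the function in miniature (column names invented):

    import pandas as pd

    power = pd.DataFrame({"timestamp": [1, 2, 3], "package_power": [5.0, 6.0, 8.0]}).set_index("timestamp")
    features = pd.DataFrame({"timestamp": [1, 2, 3], "cpu_time": [10.0, 30.0, 70.0]}).set_index("timestamp")

    joined = features.join(power).dropna()
    corr = joined.corr()[["package_power"]]      # each feature vs the power column
    print(corr.drop(index="package_power"))      # drop power's self-correlation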

11 changes: 2 additions & 9 deletions src/train/extractor/smooth_extractor.py
@@ -1,14 +1,8 @@
-import os
-import sys
-
-from extractor import DefaultExtractor, find_correlations
-util_path = os.path.join(os.path.dirname(__file__), '..', '..', 'util')
-sys.path.append(util_path)
-
-from train_types import FeatureGroups, FeatureGroup, SYSTEM_FEATURES
+from .extractor import DefaultExtractor
+from util.train_types import FeatureGroups, FeatureGroup, SYSTEM_FEATURES
 
 class SmoothExtractor(DefaultExtractor):
 
     def __init__(self, smooth_window=30):
         self.smooth_window = smooth_window

@@ -30,4 +24,3 @@ def extract(self, query_results, energy_components, feature_group, energy_source
         corr = find_correlations(energy_source, feature_power_data, power_columns, workload_features)
 
         return smoothed_data, power_columns, corr, features
-
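
The visible hunks don't show how the smoothing itself is applied, but the constructor's smooth_window=30 suggests a fixed-size window over samples. A generic rolling-mean sketch of that idea (not necessarily the class's exact method):

    import pandas as pd

    s = pd.Series([1.0, 9.0, 2.0, 8.0, 3.0, 7.0])
    smoothed = s.rolling(window=3, min_periods=1).mean()  # window akin to smooth_window
    print(smoothed)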

Empty file added src/train/isolator/__init__.py
(The remaining changed files in this commit were not loaded in this page capture.)
