diff --git a/build/config/ceems_api_server/ceems_api_server.yml b/build/config/ceems_api_server/ceems_api_server.yml index 2c416e84..d6a27f3d 100644 --- a/build/config/ceems_api_server/ceems_api_server.yml +++ b/build/config/ceems_api_server/ceems_api_server.yml @@ -299,15 +299,8 @@ clusters: [] # # An object of environment variables that will be injected while executing the # # CLI utilities to fetch compute unit data. # # - # # This is handy when executing CLI tools like `keystone` for openstack or `kubectl` - # # for k8s needs to source admin credentials. Those credentials can be set manually - # # here in this section. - # # # environment_variables: {} - # # OS_USERNAME: username # Openstack RC File - # # OS_PASSWORD: password # Openstack RC File - # # OS_TENANT_NAME: projectName # Openstack RC File - # # OS_AUTH_URL: https://identityHost:portNumber/v2.0 # Openstack RC File + # # NAME: value # Environment variable name value pair # # If the resource manager supports API server, configure the REST API # # server details here. @@ -459,23 +452,27 @@ clusters: [] # # Any other configuration needed to reach API server of the resource manager # # can be configured in this section. # # - # # Currently this section is used for both SLURM and Openstack resource managers + # # Currently this section is used for Openstack resource manager # # to configure API versions # # - # # For example, for SLURM if your API endpoints are of form `/slurm/v0.0.40/diag`, - # # the version is `v0.0.40`. - # # Docs: https://slurm.schedmd.com/rest_api.html - # # SLURM's REST API version can be set as `slurm: v0.0.40` - # # - # # In the case of Openstack, we need to fetch from different sources like identity, - # # compute and they use different versioning of API. They can be configured using - # # this section as well + # # In the case of Openstack, this section must have two keys `api_service_endpoints` + # # and `auth`. Both of these are compulsory. 
+ # # `api_service_endpoints` must provide API endpoints for compute and identity + # # services as provided in the service catalog of the Openstack cluster. `auth` must be the + # # same `auth` object that must be sent in the POST request to keystone to get an API token. # # - # extra_config: - # api_versions: {} - # # slurm: v0.0.40 # SLURM - # # identity: v3 # Openstack - # # compute: v2.1 # Openstack + # extra_config: {} + # # api_service_endpoints: + # # compute: https://openstack-nova.example.com/v2.1 + # # identity: https://openstack-keystone.example.com + # # auth: + # # identity: + # # methods: + # # - password + # # password: + # # user: + # # name: admin + # # password: supersecret # A list of Updaters that will be used to update the compute unit metrics. This update # step can be used to update the aggregate metrics of each compute unit in real time diff --git a/build/package/ceems_api_server/ceems_api_server.service b/build/package/ceems_api_server/ceems_api_server.service index 41af0be1..fbcfc31b 100644 --- a/build/package/ceems_api_server/ceems_api_server.service +++ b/build/package/ceems_api_server/ceems_api_server.service @@ -1,5 +1,5 @@ [Unit] -Description=Prometheus CEEMS API Server +Description=CEEMS API Server After=network-online.target [Service] diff --git a/cmd/mock_servers/main.go b/cmd/mock_servers/main.go index 79a06dbc..e4011817 100644 --- a/cmd/mock_servers/main.go +++ b/cmd/mock_servers/main.go @@ -239,6 +239,22 @@ func ServersHandler(w http.ResponseWriter, r *http.Request) { w.Write([]byte("KO")) } +// TokensHandler handles OS tokens. +func TokensHandler(w http.ResponseWriter, r *http.Request) { + decoder := json.NewDecoder(r.Body) + + var t map[string]interface{} + + if err := decoder.Decode(&t); err != nil { + w.Write([]byte("KO")) + + return + } + + w.Header().Add("X-Subject-Token", "apitokensecret") + w.WriteHeader(http.StatusCreated) +} + // UsersHandler handles OS users. 
func UsersHandler(w http.ResponseWriter, r *http.Request) { if data, err := os.ReadFile("pkg/api/testdata/openstack/identity/users.json"); err == nil { @@ -328,10 +344,11 @@ func osKSServer(ctx context.Context) { // Registering our handler functions, and creating paths. osKSMux := http.NewServeMux() + osKSMux.HandleFunc("/v3/auth/tokens", TokensHandler) osKSMux.HandleFunc("/v3/users", UsersHandler) osKSMux.HandleFunc("/v3/users/{id}/projects", ProjectsHandler) - log.Println("Started Prometheus on port", osKSPortNum) + log.Println("Started Openstack identity API server on port", osKSPortNum) log.Println("To close connection CTRL+C :-)") // Start server diff --git a/internal/common/helpers.go b/internal/common/helpers.go index 4beaaa83..9fa9d4d2 100644 --- a/internal/common/helpers.go +++ b/internal/common/helpers.go @@ -57,6 +57,51 @@ func GetUUIDFromString(stringSlice []string) (string, error) { return uuid.String(), err } +// ConvertMapI2MapS walks the given dynamic object recursively, and +// converts maps with interface{} key type to maps with string key type. +// This function comes handy if you want to marshal a dynamic object into +// JSON where maps with interface{} key type are not allowed. +// +// Recursion is implemented into values of the following types: +// +// -map[interface{}]interface{} +// -map[string]interface{} +// -[]interface{} +// +// When converting map[interface{}]interface{} to map[string]interface{}, +// fmt.Sprint() with default formatting is used to convert the key to a string key. 
+// +// Nicked from https://github.com/icza/dyno +func ConvertMapI2MapS(v interface{}) interface{} { + switch x := v.(type) { + case map[interface{}]interface{}: + m := map[string]interface{}{} + + for k, v2 := range x { + switch k2 := k.(type) { + case string: // Fast check if it's already a string + m[k2] = ConvertMapI2MapS(v2) + default: + m[fmt.Sprint(k)] = ConvertMapI2MapS(v2) + } + } + + v = m + + case []interface{}: + for i, v2 := range x { + x[i] = ConvertMapI2MapS(v2) + } + + case map[string]interface{}: + for k, v2 := range x { + x[k] = ConvertMapI2MapS(v2) + } + } + + return v +} + // MakeConfig reads config file, merges with passed default config and returns updated // config instance. func MakeConfig[T any](filePath string) (*T, error) { diff --git a/internal/common/helpers_test.go b/internal/common/helpers_test.go index 27aff6d3..a0f8e3e2 100644 --- a/internal/common/helpers_test.go +++ b/internal/common/helpers_test.go @@ -10,6 +10,7 @@ import ( "net/http/httptest" "os" "path/filepath" + "reflect" "testing" "github.com/mahendrapaipuri/ceems/pkg/grafana" @@ -110,6 +111,78 @@ func TestGetUuid(t *testing.T) { assert.Equal(t, expected, got, "mismatched UUIDs") } +func TestConvertMapI2MapS(t *testing.T) { + cases := []struct { + title string // Title of the test case + v interface{} // Input dynamic object + exp interface{} // Expected result + }{ + { + title: "nil value", + v: nil, + exp: nil, + }, + { + title: "string value", + v: "a", + exp: "a", + }, + { + title: "map[interfac{}]interface{} value", + v: map[interface{}]interface{}{ + "s": "s", + 1: 1, + }, + exp: map[string]interface{}{ + "s": "s", + "1": 1, + }, + }, + { + title: "nested maps and slices", + v: map[interface{}]interface{}{ + "s": "s", + 1: 1, + float64(0): []interface{}{ + 1, + "x", + map[interface{}]interface{}{ + "s": "s", + 2.0: 2, + }, + map[string]interface{}{ + "s": "s", + "1": 1, + }, + }, + }, + exp: map[string]interface{}{ + "s": "s", + "1": 1, + "0": []interface{}{ + 1, + 
"x", + map[string]interface{}{ + "s": "s", + "2": 2, + }, + map[string]interface{}{ + "s": "s", + "1": 1, + }, + }, + }, + }, + } + + for _, c := range cases { + v := ConvertMapI2MapS(c.v) + if !reflect.DeepEqual(v, c.exp) { + t.Errorf("[title: %s] Expected value: %v, got: %v", c.title, c.exp, c.v) + } + } +} + func TestMakeConfig(t *testing.T) { tmpDir := t.TempDir() configFile := ` diff --git a/pkg/api/resource/openstack/compute.go b/pkg/api/resource/openstack/compute.go index b1420378..5f1890c3 100644 --- a/pkg/api/resource/openstack/compute.go +++ b/pkg/api/resource/openstack/compute.go @@ -108,7 +108,7 @@ func (o *openstackManager) activeInstances(ctx context.Context, start time.Time, wg.Wait() // If no servers found, return error(s) - if len(allServers) == 0 { + if allErrs != nil { return nil, allErrs } @@ -257,6 +257,12 @@ func (o *openstackManager) fetchInstances(ctx context.Context, start time.Time, return nil, fmt.Errorf("failed to create request to fetch Openstack instances: %w", err) } + // Add token to request headers + req, err = o.addTokenHeader(ctx, req) + if err != nil { + return nil, fmt.Errorf("failed to rotate api token for openstack cluster: %w", err) + } + // Add query parameters q := req.URL.Query() q.Add("all_tenants", "true") @@ -264,7 +270,7 @@ func (o *openstackManager) fetchInstances(ctx context.Context, start time.Time, if deleted { q.Add("deleted", "true") q.Add("changes-since", start.Format(osTimeFormat)) - q.Add("changes-until", end.Format(osTimeFormat)) + q.Add("changes-before", end.Format(osTimeFormat)) } req.URL.RawQuery = q.Encode() diff --git a/pkg/api/resource/openstack/identity.go b/pkg/api/resource/openstack/identity.go index ffb8e0e8..b447b829 100644 --- a/pkg/api/resource/openstack/identity.go +++ b/pkg/api/resource/openstack/identity.go @@ -1,6 +1,7 @@ package openstack import ( + "bytes" "context" "errors" "fmt" @@ -17,6 +18,32 @@ const ( chunkSize = 256 ) +// rotateToken requests new API token from keystone. 
+func (o *openstackManager) rotateToken(ctx context.Context) error { + // Create a new POST request + req, err := http.NewRequestWithContext( + ctx, + http.MethodPost, + o.tokens().String(), + bytes.NewBuffer(o.auth), + ) + if err != nil { + return fmt.Errorf("failed to create request to rotate API token for openstack cluster: %w", err) + } + + // Get Token + o.apiToken, err = apiTokenRequest(req, o.client) + if err != nil { + return fmt.Errorf("failed to complete request to rotate token for openstack cluster: %w", err) + } + + // Set token expiry. By default Openstack tokens are 1 hour and we use a tolerance + // of 5 minutes just to account for clock skew to avoid failed requests + o.apiTokenExpiry = time.Now().Add(tokenExpiryDuration - 5*time.Minute) + + return nil +} + // updateUsersProjects updates users and projects of a given Openstack cluster. func (o *openstackManager) updateUsersProjects(ctx context.Context, current time.Time) error { // Fetch current users and projects @@ -43,6 +70,12 @@ func (o *openstackManager) fetchUsers(ctx context.Context) ([]User, error) { return nil, fmt.Errorf("failed to create request to fetch users for openstack cluster: %w", err) } + // Add token to request headers + req, err = o.addTokenHeader(ctx, req) + if err != nil { + return nil, fmt.Errorf("failed to rotate api token for openstack cluster: %w", err) + } + // Get response resp, err := apiRequest[UsersResponse](req, o.client) if err != nil { @@ -65,6 +98,12 @@ func (o *openstackManager) fetchUserProjects(ctx context.Context, userID string) return nil, fmt.Errorf("failed to create request to fetch user projects for openstack cluster: %w", err) } + // Add token to request headers + req, err = o.addTokenHeader(ctx, req) + if err != nil { + return nil, fmt.Errorf("failed to rotate api token for openstack cluster: %w", err) + } + // Get response resp, err := apiRequest[ProjectsResponse](req, o.client) if err != nil { diff --git a/pkg/api/resource/openstack/manager.go 
b/pkg/api/resource/openstack/manager.go index c2f642c1..ce564975 100644 --- a/pkg/api/resource/openstack/manager.go +++ b/pkg/api/resource/openstack/manager.go @@ -4,6 +4,7 @@ package openstack import ( "context" + "encoding/json" "errors" "fmt" "log/slog" @@ -13,18 +14,26 @@ import ( "slices" "time" + "github.com/mahendrapaipuri/ceems/internal/common" "github.com/mahendrapaipuri/ceems/pkg/api/base" "github.com/mahendrapaipuri/ceems/pkg/api/models" "github.com/mahendrapaipuri/ceems/pkg/api/resource" config_util "github.com/prometheus/common/config" ) -var novaMicroVersionHeaders = []string{ - "X-OpenStack-Nova-API-Version", - "OpenStack-API-Version", -} +const ( + tokenHeaderName = "X-Auth-Token" //nolint:gosec + subjTokenHeaderName = "X-Subject-Token" +) -var osTimeFormat = base.DatetimeLayout + "-0700" +var ( + novaMicroVersionHeaders = []string{ + "X-OpenStack-Nova-API-Version", + "OpenStack-API-Version", + } + osTimeFormat = base.DatetimeLayout + "-0700" + tokenExpiryDuration = 1 * time.Hour // Openstack tokens are valid for 1 hour +) type userProjectsCache struct { userModels []models.User @@ -38,15 +47,28 @@ type openstackManager struct { logger *slog.Logger cluster models.Cluster apiURLs map[string]*url.URL + auth []byte client *http.Client + apiToken string + apiTokenExpiry time.Time userProjectsCache userProjectsCache userProjectsCacheTTL time.Duration userProjectsLastUpdateTime time.Time } -type apiConfig struct { - ComputeAPIURL string `yaml:"compute_api_url"` - IdentityAPIURL string `yaml:"identity_api_url"` +type openstackConfig struct { + APIEndpoints struct { + Compute string `yaml:"compute"` + Identity string `yaml:"identity"` + } `yaml:"api_service_endpoints"` + AuthConfig interface{} `yaml:"auth"` +} + +// addAuthKey embeds AuthConfig as value under `auth` key. 
+func (c *openstackConfig) addAuthKey() { + obj := map[string]interface{}{} + obj["auth"] = c.AuthConfig + c.AuthConfig = obj } const openstackVMManager = "openstack" @@ -59,7 +81,7 @@ func init() { // New returns a new openstackManager that returns compute instances. func New(cluster models.Cluster, logger *slog.Logger) (resource.Fetcher, error) { // Make openstackManager configs from clusters - openstackManager := openstackManager{ + openstackManager := &openstackManager{ logger: logger, cluster: cluster, apiURLs: make(map[string]*url.URL, 2), @@ -98,9 +120,9 @@ func New(cluster models.Cluster, logger *slog.Logger) (resource.Fetcher, error) return nil, err } - // Fetch compute and identity API URLs from extra_config - apiConfig := &apiConfig{} - if err := cluster.Extra.Decode(apiConfig); err != nil { + // Fetch compute and identity API URLs and auth config from extra_config + osConfig := &openstackConfig{} + if err := cluster.Extra.Decode(osConfig); err != nil { logger.Error("Failed to decode extra_config for Openstack cluster", "id", cluster.ID, "err", err) return nil, err @@ -108,24 +130,35 @@ func New(cluster models.Cluster, logger *slog.Logger) (resource.Fetcher, error) // Ensure we have valid compute and identity API URLs // Unwrap original error to avoid leaking sensitive passwords in output - openstackManager.apiURLs["compute"], err = url.Parse(apiConfig.ComputeAPIURL) + openstackManager.apiURLs["compute"], err = url.Parse(osConfig.APIEndpoints.Compute) if err != nil { logger.Error("Failed to parse compute service API URL for Openstack cluster", "id", cluster.ID, "err", err) return nil, errors.Unwrap(err) } - openstackManager.apiURLs["identity"], err = url.Parse(apiConfig.IdentityAPIURL) + openstackManager.apiURLs["identity"], err = url.Parse(osConfig.APIEndpoints.Identity) if err != nil { logger.Error("Failed to parse identity service API URL for Openstack cluster", "id", cluster.ID, "err", err) return nil, errors.Unwrap(err) } - // // Get initial list of 
flavors - // if err = openstackManager.updateFlavors(context.Background()); err != nil { - // return nil, err - // } + // Convert auth to bytes to embed into requests later + osConfig.addAuthKey() + + if openstackManager.auth, err = json.Marshal(common.ConvertMapI2MapS(osConfig.AuthConfig)); err != nil { + logger.Error("Failed to marshal auth object for Openstack cluster", "id", cluster.ID, "err", err) + + return nil, errors.Unwrap(err) + } + + // Request first API token from keystone + if err := openstackManager.rotateToken(context.Background()); err != nil { + logger.Error("Failed to request API token for Openstack cluster", "id", cluster.ID, "err", err) + + return nil, errors.Unwrap(err) + } // Get initial users and projects if err = openstackManager.updateUsersProjects(context.Background(), time.Now()); err != nil { @@ -136,7 +169,7 @@ func New(cluster models.Cluster, logger *slog.Logger) (resource.Fetcher, error) logger.Info("Fetching VM instances from Openstack cluster", "id", cluster.ID) - return &openstackManager, nil + return openstackManager, nil } // FetchUnits fetches instances from openstack. @@ -168,6 +201,8 @@ func (o *openstackManager) FetchUsersProjects( if err := o.updateUsersProjects(ctx, current); err != nil { o.logger.Error("Failed to update users and projects data for Openstack cluster", "id", o.cluster.ID, "err", err) + + return nil, nil, err } } @@ -183,10 +218,10 @@ func (o *openstackManager) servers() *url.URL { return o.apiURLs["compute"].JoinPath("/servers/detail") } -// // flavors endpoint. -// func (o *openstackManager) flavors() *url.URL { -// return o.apiURLs["compute"].JoinPath("/flavors/detail") -// } +// tokens endpoint. +func (o *openstackManager) tokens() *url.URL { + return o.apiURLs["identity"].JoinPath("/v3/auth/tokens") +} // users endpoint. 
func (o *openstackManager) users() *url.URL { @@ -198,6 +233,22 @@ func (o *openstackManager) userProjects(id string) *url.URL { return o.apiURLs["identity"].JoinPath(fmt.Sprintf("/v3/users/%s/projects", id)) } +// addTokenHeader adds API token to request headers. +func (o *openstackManager) addTokenHeader(ctx context.Context, req *http.Request) (*http.Request, error) { + // Check if token is still valid. If not rotate token + if time.Now().After(o.apiTokenExpiry) { + if err := o.rotateToken(ctx); err != nil { + return nil, err + } + } + + // First remove any pre-configured tokens + req.Header.Del(tokenHeaderName) + req.Header.Add(tokenHeaderName, o.apiToken) + + return req, nil +} + // ping attempts to ping Openstack compute and identity API servers. func (o *openstackManager) ping(service string) error { if url, ok := o.apiURLs[service]; ok { diff --git a/pkg/api/resource/openstack/manager_test.go b/pkg/api/resource/openstack/manager_test.go index 9803f35d..572bec3b 100644 --- a/pkg/api/resource/openstack/manager_test.go +++ b/pkg/api/resource/openstack/manager_test.go @@ -2,6 +2,7 @@ package openstack import ( "context" + "encoding/json" "fmt" "io" "log/slog" @@ -174,6 +175,12 @@ func mockOSComputeAPIServer() *httptest.Server { // Start test server server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { if strings.Contains(r.URL.Path, "servers") { + if tokens := r.Header[tokenHeaderName]; len(tokens) == 0 { + w.WriteHeader(http.StatusForbidden) + + return + } + var fileName string if _, ok := r.URL.Query()["deleted"]; ok { fileName = "deleted" @@ -204,12 +211,24 @@ func mockOSIdentityAPIServer() *httptest.Server { // Start test server server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { if strings.HasSuffix(r.URL.Path, "users") { + if tokens := r.Header[tokenHeaderName]; len(tokens) == 0 { + w.WriteHeader(http.StatusForbidden) + + return + } + if data, err := 
os.ReadFile("../../testdata/openstack/identity/users.json"); err == nil { w.Write(data) return } } else if strings.Contains(r.URL.Path, "users") { + if tokens := r.Header[tokenHeaderName]; len(tokens) == 0 { + w.WriteHeader(http.StatusForbidden) + + return + } + pathParts := strings.Split(r.URL.Path, "/") userID := pathParts[len(pathParts)-2] @@ -218,6 +237,21 @@ func mockOSIdentityAPIServer() *httptest.Server { return } + } else if strings.HasSuffix(r.URL.Path, "tokens") { + decoder := json.NewDecoder(r.Body) + + var t map[string]interface{} + + if err := decoder.Decode(&t); err != nil { + w.Write([]byte("KO")) + + return + } + + w.Header().Add(subjTokenHeaderName, "apitokensecret") + w.WriteHeader(http.StatusCreated) + + return } else { w.Write([]byte("KO")) } @@ -229,8 +263,17 @@ func mockOSIdentityAPIServer() *httptest.Server { func mockConfig(computeAPIURL, identityAPIURL string) (yaml.Node, error) { config := ` --- -compute_api_url: %s -identity_api_url: %s` +api_service_endpoints: + compute: %s + identity: %s +auth: + identity: + methods: + - password + password: + user: + name: admin + password: supersecret` cfg := fmt.Sprintf(config, computeAPIURL, identityAPIURL) diff --git a/pkg/api/resource/openstack/request.go b/pkg/api/resource/openstack/request.go index 5db45b4e..b3cd950e 100644 --- a/pkg/api/resource/openstack/request.go +++ b/pkg/api/resource/openstack/request.go @@ -2,6 +2,7 @@ package openstack import ( "encoding/json" + "errors" "fmt" "io" "net/http" @@ -10,7 +11,7 @@ import ( // apiRequest makes the request using client and returns response. 
func apiRequest[T any](req *http.Request, client *http.Client) (T, error) { // Add necessary headers - req.Header.Add("Content-Type", "application/x-www-form-urlencoded") + req.Header.Add("Content-Type", "application/json") // Make request resp, err := client.Do(req) @@ -38,3 +39,28 @@ func apiRequest[T any](req *http.Request, client *http.Client) (T, error) { return data, nil } + +// apiTokenRequest makes the request using client and returns API token. +func apiTokenRequest(req *http.Request, client *http.Client) (string, error) { + // Add necessary headers + req.Header.Add("Content-Type", "application/json") + + // Make request + resp, err := client.Do(req) + if err != nil { + return "", err + } + defer resp.Body.Close() + + // Check status code + if resp.StatusCode != http.StatusCreated { + return "", fmt.Errorf("request failed with status: %d", resp.StatusCode) + } + + // Read X-Subject-Token from response headers + if tokens := resp.Header[subjTokenHeaderName]; len(tokens) > 0 { + return tokens[0], nil + } + + return "", errors.New("no X-Subject-Token header found in response") +} diff --git a/pkg/api/testdata/config.yml b/pkg/api/testdata/config.yml index 2b95994a..92fad79b 100644 --- a/pkg/api/testdata/config.yml +++ b/pkg/api/testdata/config.yml @@ -33,15 +33,21 @@ clusters: - tsdb-0 web: http_headers: - X-Auth-Token: - secrets: - - supersecrettoken X-OpenStack-Nova-API-Version: values: - latest extra_config: - compute_api_url: http://localhost:8080/v2.1 - identity_api_url: http://localhost:7070 + api_service_endpoints: + compute: http://localhost:8080/v2.1 + identity: http://localhost:7070 + auth: + identity: + methods: + - password + password: + user: + name: admin + password: supersecret - id: os-1 manager: openstack @@ -50,15 +56,20 @@ clusters: - tsdb-1 web: http_headers: - X-Auth-Token: - secrets: - - supersecrettoken X-OpenStack-Nova-API-Version: values: - latest extra_config: - compute_api_url: http://localhost:8080/v2.1 - identity_api_url: 
http://localhost:7070 + api_service_endpoints: + compute: http://localhost:8080/v2.1 + identity: http://localhost:7070 + auth: + identity: + methods: + - application_credential + application_credential: + id: 21dced0fd20347869b93710d2b98aae0 + secret: supersecret updaters: - id: tsdb-0 diff --git a/website/docs/components/ceems-api-server.md b/website/docs/components/ceems-api-server.md index 68561636..c905d6b6 100644 --- a/website/docs/components/ceems-api-server.md +++ b/website/docs/components/ceems-api-server.md @@ -6,19 +6,19 @@ sidebar_position: 2 ## Background -CEEMS exporter exports compute unit and node level metrics to Prometheus. But this is -not enough to be able to query the metrics from Prometheus efficiently. Especially, for -batch jobs we need at least the timestamps of when the job has started and ended and -on which compute nodes to efficiently query the metrics. Storing these meta data of -the compute units in Prometheus is not ideal as they are not time series and using storing meta -data as labels can increase the cardinality very rapidly. - -At the same time, we would like to show to the end users aggregate metrics of their usage -which needs to make queries to Prometheus every time they load their dashboards. The -CEEMS API server has been introduced into the stack to address these limitations. CEEMS -API server is meant to store and server compute unit meta data, aggregate metrics of -compute units, users and projects _via_ API end points. This data will be gathered from -the underlying resource manager and keep it in a local DB based on SQLite. +CEEMS exporter exports compute unit and node level metrics to Prometheus. But this is +not enough to be able to query the metrics from Prometheus efficiently. Especially, for +batch jobs we need at least the timestamps of when the job has started and ended and +on which compute nodes to efficiently query the metrics. 
Storing these meta data of +the compute units in Prometheus is not ideal as they are not time series and using storing meta +data as labels can increase the cardinality very rapidly. + +At the same time, we would like to show to the end users aggregate metrics of their usage +which needs to make queries to Prometheus every time they load their dashboards. The +CEEMS API server has been introduced into the stack to address these limitations. CEEMS +API server is meant to store and server compute unit meta data, aggregate metrics of +compute units, users and projects _via_ API end points. This data will be gathered from +the underlying resource manager and keep it in a local DB based on SQLite. :::important[IMPORTANT] @@ -28,19 +28,19 @@ process. ::: -Effectively, it acts as an abstraction layer for different -resource managers and it is capable to storing data from different resource managers. -The advantage of this approach is that it acts a single point of data collection for -different resource managers of a data center and users will be able to consult their +Effectively, it acts as an abstraction layer for different +resource managers and it is capable to storing data from different resource managers. +The advantage of this approach is that it acts a single point of data collection for +different resource managers of a data center and users will be able to consult their usage of their compute units in a unified way. 
:::note[NOTE] -If the usernames are identical for different resource managers, _i.e.,_ if a data center -has SLURM and Openstack clusters and user identities for these two clusters are provided -by the same Identity Provider (IDP), it is possible for the Operators to use a -single deployment of CEEMS that uses the same IDP and expose the compute unit metrics -of both SLURM and Openstack clusters with the same instance of Grafana with +If the usernames are identical for different resource managers, _i.e.,_ if a data center +has SLURM and Openstack clusters and user identities for these two clusters are provided +by the same Identity Provider (IDP), it is possible for the Operators to use a +single deployment of CEEMS that uses the same IDP and expose the compute unit metrics +of both SLURM and Openstack clusters with the same instance of Grafana with different dashboards. ::: @@ -52,17 +52,17 @@ CEEMS API server primarily serves for two objectives: - To store the compute unit information of different resource managers in a unified way. The information we need is very basic like unique identifier of compute unit, project it belongs to, owner, current state, when it has started, resources allocated, _etc_. -- To update aggregate metrics of each compute unit by querying the TSDB in realtime. -This allows the end users to view the usage of their workloads in realtime like CPU, +- To update aggregate metrics of each compute unit by querying the TSDB in realtime. +This allows the end users to view the usage of their workloads in realtime like CPU, energy, emissions, _etc_. - To keep latest copy of users and their associated projects to enforce access control. 
-When coupled with -[JSON API DataSource](https://grafana.github.io/grafana-json-datasource/installation/) or +When coupled with +[JSON API DataSource](https://grafana.github.io/grafana-json-datasource/installation/) or [Infinity DataSource](https://grafana.com/grafana/plugins/yesoreyeram-infinity-datasource/) -of Grafana, we can list the compute units of a user -along with the metadata and aggregate metrics of each unit. The stored metadata in the -CEEMS API server DB will allow us to construct the URLs for the Grafana dashboards for +of Grafana, we can list the compute units of a user +along with the metadata and aggregate metrics of each unit. The stored metadata in the +CEEMS API server DB will allow us to construct the URLs for the Grafana dashboards for each compute dynamically based on start and end time of each compute unit. ![User job list](/img/dashboards/job_list_user.png) @@ -71,53 +71,53 @@ each compute dynamically based on start and end time of each compute unit. ### Resource Managers -Now that it is clear that CEEMS _can_ support different resource managers, it is time -to explain how CEEMS actually supports them. CEEMS has its own DB schema that stores -compute units metrics and meta data. Let's take meta data of each compute unit as an -example. For example, SLURM exposes meta data of jobs using either `sacct` command or -SLURM REST API. Openstack does it too using Keystone and Nova API servers so does the -Kubernetes with its API server. However, all these managers expose these meta data -in different ways each having their own API spec. +Now that it is clear that CEEMS _can_ support different resource managers, it is time +to explain how CEEMS actually supports them. CEEMS has its own DB schema that stores +compute units metrics and meta data. Let's take meta data of each compute unit as an +example. For example, SLURM exposes meta data of jobs using either `sacct` command or +SLURM REST API. 
Openstack does it too using Keystone and Nova API servers so does the +Kubernetes with its API server. However, all these managers expose these meta data +in different ways each having their own API spec. -CEEMS API server must implement each of this resource manager to fetch compute unit +CEEMS API server must implement each of this resource manager to fetch compute unit meta data and store it in CEEMS API server's DB. This is done using factory design -pattern and implemented in an extensible way. That means operators can implement their -own custom third party resource managers and plug into CEEMS API server. Essentially, -this translates to implementing two interfaces, one for fetching compute units and one -for fetching users and projects/namespaces/tenants data from the underlying resource -manager. +pattern and implemented in an extensible way. That means operators can implement their +own custom third party resource managers and plug into CEEMS API server. Essentially, +this translates to implementing two interfaces, one for fetching compute units and one +for fetching users and projects/namespaces/tenants data from the underlying resource +manager. -Currently, CEEMS API server ships SLURM support and soon Openstack support +Currently, CEEMS API server ships SLURM support and soon Openstack support will be added. ### Updaters -As CEEMS API server must store aggregate metrics of each compute unit, it must query -some sort of external DB that stores time series metrics of the compute units to -estimate aggregate metrics. As CEEMS ships an exporter that is capable of exporting -metrics to a Prometheus TSDB, a straight forward approach is to deploy CEEMS exporter +As CEEMS API server must store aggregate metrics of each compute unit, it must query +some sort of external DB that stores time series metrics of the compute units to +estimate aggregate metrics. 
As CEEMS ships an exporter that is capable of exporting +metrics to a Prometheus TSDB, a straight forward approach is to deploy CEEMS exporter on compute nodes and query Prometheus to estimate aggregate metrics. -This is done using a sub-component of CEEMS API server called updater. The job of -updater is to update the compute units fetched by a given resource manager with the -aggregate metrics of that compute unit. Like in the case of resource manager, updater -uses factory design pattern and it is extensible. It is possible to use custom -third party tools to update the compute units with aggregate metrics. +This is done using a sub-component of CEEMS API server called updater. The job of +updater is to update the compute units fetched by a given resource manager with the +aggregate metrics of that compute unit. Like in the case of resource manager, updater +uses factory design pattern and it is extensible. It is possible to use custom +third party tools to update the compute units with aggregate metrics. -Currently, CEEMS API server ships TSDB updater which is capable of estimating aggregate +Currently, CEEMS API server ships TSDB updater which is capable of estimating aggregate metrics using Prometheus TSDB server. ## Multi cluster support -A single deployment of CEEMS API server must be able to fetch and serve aggregate metrics -of compute units of multiple clusters of either same resource manager or different -resource managers. This means single CEEMS API server can store and serve metrics data -of multiple SLURM, Openstack and Kubernetes clusters. +A single deployment of CEEMS API server must be able to fetch and serve aggregate metrics +of compute units of multiple clusters of either same resource manager or different +resource managers. This means single CEEMS API server can store and serve metrics data +of multiple SLURM, Openstack and Kubernetes clusters. 
-In the same way, irrespective of each cluster using its own dedicated TSDB or a shared -TSDB with other clusters, Updater sub component of CEEMS API server is capable of +In the same way, irrespective of each cluster using its own dedicated TSDB or a shared +TSDB with other clusters, Updater sub component of CEEMS API server is capable of estimating aggregate metrics of each compute unit. -More details on how to configuration of multi-clusters can be found in -[Configuration](../configuration/ceems-api-server.md) section and some example +More details on how to configure multi-clusters can be found in +[Configuration](../configuration/ceems-api-server.md) section and some example scenarios are discussed in [Advanced](../advanced/multi-cluster.md) section. diff --git a/website/docs/configuration/ceems-api-server.md b/website/docs/configuration/ceems-api-server.md index a63c2baf..5ee98585 100644 --- a/website/docs/configuration/ceems-api-server.md +++ b/website/docs/configuration/ceems-api-server.md @@ -124,15 +124,20 @@ clusters: - tsdb-0 web: http_headers: - X-Auth-Token: - secrets: - - supersecrettoken X-OpenStack-Nova-API-Version: values: - latest extra_config: - compute_api_url: https://openstack-nova.example.com/v2.1 - identity_api_url: https://openstack-keystone.example.com + api_service_endpoints: + compute: https://openstack-nova.example.com/v2.1 + identity: https://openstack-keystone.example.com + auth: + methods: + - password + password: + user: + name: admin + password: supersecret ``` Essentially it is a list of objects where each object describes a cluster. @@ -140,7 +145,7 @@ Essentially it is a list of objects where each object describes a cluster. - `id`: A unique identifier for each cluster. The identifier must stay consistent across CEEMS components, especially for CEEMS LB. More details can be found in [Configuring CEEMS LB](./ceems-lb.md) section. -- `manager`: Resource manager kind. 
Currently only **SLURM** and **Openstack** are +- `manager`: Resource manager kind. Currently only `slurm` and `openstack` are supported. - `updaters`: List of updaters to be used to update the aggregate metrics of the compute units. The order is important as compute units are updated in the same order @@ -161,6 +166,85 @@ All available options for the `web` configuration can be found in provided here. Currently, Openstack resource manager uses this section to configure the API URLs for compute and identity servers to fetch compute units, users and projects data. +### SLURM specific clusters configuration + +As stated before, currently fetching SLURM jobs using `sacct` command is the only supported +way. If the `sacct` binary is available on `PATH`, there is no need to provide any specific +configuration. However, if the binary is present in a non-standard location, it is necessary to +provide the path to the binary using `cli` section of the config. For example, if the absolute +path of `sacct` is `/opt/slurm/bin/sacct`, then we need to configure `cli` section as follows: + +```yaml +cli: + path: /opt/slurm/bin +``` + +A minimal full cluster configuration would be: + +```yaml +clusters: + - id: slurm-0 + manager: slurm + cli: + path: /opt/slurm/bin +``` + +The section `cli` also has `environment_variables` key to provide any environment variables +while executing `sacct` command in a sub-process. This section takes key-value pairs: + +```yaml +clusters: + - id: slurm-0 + manager: slurm + cli: + path: /opt/slurm/bin + environment_variables: + ENVVAR_NAME: ENVVAR_VALUE +``` + +### Openstack specific clusters configuration + +In the case of Openstack, `extra_config` section must be used to set up Openstack's API +and auth configs. The following keys in `extra_config` must be provided: + +- `api_service_endpoints`: This section must provide the API endpoints for compute and +identity services. 
+- `auth`: This is the same auth object that needs to be passed to Openstack's identity +service to get an API token. More details can be found in [Keystone's API docs](https://docs.openstack.org/api-ref/identity/v3/#authentication-and-token-management). + +An example that provides password auth method is shown below: + +```yaml +extra_config: + api_service_endpoints: + compute: https://openstack-nova.example.com/v2.1 + identity: https://openstack-keystone.example.com + auth: + identity: + methods: + - password + password: + user: + name: admin + password: supersecret +``` + +Similarly, the following example shows how to use application credentials: + +```yaml +extra_config: + api_service_endpoints: + compute: https://openstack-nova.example.com/v2.1 + identity: https://openstack-keystone.example.com + auth: + identity: + methods: + - application_credential + application_credential: + id: 21dced0fd20347869b93710d2b98aae0 + secret: supersecret +``` + :::important[IMPORTANT] It is important to configure the compute and identity API URLs as displayed by the @@ -171,6 +255,57 @@ as shown in the above config. ::: +:::note[NOTE] + +Admin level privileges must be available for configured auth object as CEEMS API server +needs to fetch the instances of **all** tenants and projects and it is only possible +with admin scope. + +::: + +It is advised to use application credentials instead of Admin password as it is +possible to scope the usage of application credentials to only compute and +identity services whereas admin account will give unrestricted access to all +cluster level resources. More details on how to create application credentials with +scopes can be found in [Keystone's docs](https://docs.openstack.org/keystone/latest/user/application_credentials.html). + +Openstack Nova (compute) uses micro versions for API and by default, CEEMS API +server uses the latest supported micro version. 
If a specific micro version is +desired it can be configured using `web.http_headers` section as follows: + +```yaml +web: + http_headers: + X-OpenStack-Nova-API-Version: + values: + - 2.12 +``` + +A sample full clusters config for Openstack is shown below: + +```yaml +clusters: + - id: os-0 + manager: openstack + web: + http_headers: + X-OpenStack-Nova-API-Version: + values: + - latest + extra_config: + api_service_endpoints: + compute: https://openstack-nova.example.com/v2.1 + identity: https://openstack-keystone.example.com + auth: + identity: + methods: + - password + password: + user: + name: admin + password: supersecret +``` + ## Updaters Configuration A sample updater config is shown below: @@ -277,15 +412,20 @@ clusters: manager: openstack web: http_headers: - X-Auth-Token: - secrets: - - supersecrettoken X-OpenStack-Nova-API-Version: values: - latest extra_config: - compute_api_url: https://openstack-nova.example.com/v2.1 - identity_api_url: https://openstack-keystone.example.com + api_service_endpoints: + compute: https://openstack-nova.example.com/v2.1 + identity: https://openstack-keystone.example.com + auth: + methods: + - password + password: + user: + name: admin + password: supersecret ``` Assuming CEEMS exporter is deployed on the compute nodes of both SLURM @@ -320,15 +460,20 @@ clusters: - tsdb-0 web: http_headers: - X-Auth-Token: - secrets: - - supersecrettoken X-OpenStack-Nova-API-Version: values: - latest extra_config: - compute_api_url: https://openstack-nova.example.com/v2.1 - identity_api_url: https://openstack-keystone.example.com + api_service_endpoints: + compute: https://openstack-nova.example.com/v2.1 + identity: https://openstack-keystone.example.com + auth: + methods: + - password + password: + user: + name: admin + password: supersecret updaters: - id: tsdb-0 diff --git a/website/docs/configuration/config-reference.md b/website/docs/configuration/config-reference.md index 1170525b..caf4d37e 100644 --- 
a/website/docs/configuration/config-reference.md +++ b/website/docs/configuration/config-reference.md @@ -34,6 +34,7 @@ character in the source label should be converted to an underscore * ``: a string that identifies updater type. Currently accepted values are `tsdb`. * ``: a valid PromQL query string. * ``: a valid load balancing strategy. Currently accepted values are `round-robin`, `least-connection` and `resource-based`. +* ``: a generic object The other placeholders are specified separately. @@ -365,26 +366,32 @@ web: # Any other configuration needed to reach API server of the resource manager # can be configured in this section. # -# Currently this section is used for both SLURM and Openstack resource managers +# Currently this section is used for Openstack resource manager # to configure API versions # -# For example, for SLURM if your API endpoints are of form `/slurm/v0.0.40/diag`, -# the version is `v0.0.40`. -# Docs: https://slurm.schedmd.com/rest_api.html -# SLURM's REST API version can be set as `slurm: v0.0.40` -# -# In the case of Openstack, we need to fetch from different sources like identity, -# compute and they use different versioning of API. They can be configured using -# this section as well +# In the case of Openstack, this section must have two keys `api_service_endpoints` +# and `auth`. Both of these are compulsory. +# `api_service_endpoints` must provide API endpoints for compute and identity +# services as provided in service catalog of Openstack cluster. `auth` must be the +# same `auth` object that must be sent in POST request to keystone to get an API token. # # Example: # -# slurm: v0.0.40 # SLURM -# identity: v3 # Openstack -# compute: v2.1 # Openstack +# extra_config: +# api_service_endpoints: +# compute: https://openstack-nova.example.com/v2.1 +# identity: https://openstack-keystone.example.com +# auth: +# identity: +# methods: +# - password +# password: +# user: +# name: admin +# password: supersecret # extra_config: - [ : ... 
] + [ : ... ] ``` ## ``