diff --git a/Makefile b/Makefile
index 4dd370b7..73660ce0 100644
--- a/Makefile
+++ b/Makefile
@@ -155,6 +155,8 @@ test-e2e: $(PROMTOOL) build pkg/collector/testdata/sys/.unpacked pkg/collector/t
 	./scripts/e2e-test.sh -s api-current-usage-admin-denied-query
 	./scripts/e2e-test.sh -s api-verify-pass-query
 	./scripts/e2e-test.sh -s api-verify-fail-query
+	./scripts/e2e-test.sh -s api-demo-units-query
+	./scripts/e2e-test.sh -s api-demo-usage-query
 	@env GOBIN=$(FIRST_GOPATH) ./scripts/e2e-test.sh -s lb-basic
 	@env GOBIN=$(FIRST_GOPATH) ./scripts/e2e-test.sh -s lb-forbid-user-query-db
 	@env GOBIN=$(FIRST_GOPATH) ./scripts/e2e-test.sh -s lb-allow-user-query-db
@@ -192,6 +194,8 @@ test-e2e-update: $(PROMTOOL) build pkg/collector/testdata/sys/.unpacked pkg/coll
 	./scripts/e2e-test.sh -s api-current-usage-admin-denied-query -u || true
 	./scripts/e2e-test.sh -s api-verify-pass-query -u || true
 	./scripts/e2e-test.sh -s api-verify-fail-query -u || true
+	./scripts/e2e-test.sh -s api-demo-units-query -u || true
+	./scripts/e2e-test.sh -s api-demo-usage-query -u || true
 	@env GOBIN=$(FIRST_GOPATH) ./scripts/e2e-test.sh -s lb-basic -u || true
 	@env GOBIN=$(FIRST_GOPATH) ./scripts/e2e-test.sh -s lb-forbid-user-query-db -u || true
 	@env GOBIN=$(FIRST_GOPATH) ./scripts/e2e-test.sh -s lb-allow-user-query-db -u || true
diff --git a/README.md b/README.md
index 82a7f645..b31777e2 100644
--- a/README.md
+++ b/README.md
@@ -9,10 +9,10 @@

 Compute Energy & Emissions Monitoring Stack (CEEMS) contains a Prometheus exporter to
-export metrics of compute instance units and a REST API server which is meant to be used
-as JSON datasource in Grafana that exposes the metadata and aggregated metrics of each
-compute unit. Optionally, it includes a TSDB load balancer that supports basic load
-balancing functionality based on retention periods of two or more TSDBs.
+export metrics of compute instance units and a REST API server that serves the
+metadata and aggregated metrics of each
+compute unit. Optionally, it includes a TSDB load balancer that supports basic access
+control on TSDB so that one user cannot access metrics of another user.

 "Compute Unit" in the current context has a wider scope. It can be a batch job in
 HPC, a VM in cloud, a pod in k8s, _etc_. The main objective of the repository is to quantify
@@ -90,7 +90,7 @@ series metrics of their compute units be it a batch job, a VM or a pod. The user
 also able to have information on total energy consumed and total emissions generated
 by their individual workloads, by their project/namespace.

-On the otherhand system admins will be able to list the consumption of energy, emissions,
+On the other hand, system admins will be able to list the consumption of energy, emissions,
 CPU time, memory, _etc_ for each projects/namespaces/users. This can be used to generate
 reports regularly on the energy usage of the data center.

@@ -110,7 +110,8 @@ This server can be used as in Grafana to construct dashboards for users. The DB
 contain aggregate metrics of each compute unit along with aggregate metrics of each
 project.

-- `ceems_lb`: This is a basic load balancer meant to work with TSDB instances.
+- `ceems_lb`: This is a basic load balancer meant to provide basic access control for
+TSDB so that one user cannot access metrics of another user.

 Currently, only SLURM is supported as a resource manager. In future support for Openstack
 and Kubernetes will be added.
@@ -275,7 +276,7 @@ based on these proportions.
 As discussed in the introduction, `ceems_api_server` exposes usage and compute unit
 details of users _via_ API end points. This data will be gathered from the underlying
 resource manager at a configured interval of time and
-keep it in a local DB.
+kept in a local DB. The API server has demo end points that serve mock data.

 ## CEEMS Load Balancer

@@ -518,12 +519,12 @@ The stats server can be started as follows:

 ```
 /path/to/ceems_api_server \
   --resource.manager.slurm \
-  --storage.data.path="/var/lib/ceems" \
+  --storage.data.path="data" \
   --log.level="debug"
 ```

 Data files like SQLite3 DB created for the server will be placed in
-`/var/lib/ceems` directory. Note that if this directory does exist,
+`data` directory. Note that if this directory does not exist,
 `ceems_api_server` will attempt to create one if it has enough privileges.
 If it fails to create, error will be shown up.
@@ -559,12 +560,19 @@ keeps the data for the past one year.
 ```
 /path/to/ceems_api_server \
   --resource.manager.slurm \
-  --storage.path.data="/var/lib/ceems" \
+  --storage.path.data="data" \
   --storage.data.update.interval="30m" \
   --storage.data.retention.period="1y" \
   --log.level="debug"
 ```

+Once the server is running, we can query for compute units and usage using API endpoints.
+To understand the data structure that will be returned by these end points, the server
+has demo end points that return mock data. For getting units data, users can open the
+browser and go to [localhost:9020/api/units/demo](http://localhost:9020/api/units/demo).
+Similarly, for usage data, it is enough to visit
+[localhost:9020/api/usage/demo](http://localhost:9020/api/usage/demo).
+
 ### `ceems_lb`

 A basic config file used by `ceems_lb` is as follows:
diff --git a/pkg/api/http/demo.go b/pkg/api/http/demo.go
new file mode 100644
index 00000000..8f216c44
--- /dev/null
+++ b/pkg/api/http/demo.go
@@ -0,0 +1,241 @@
+package http
+
+import (
+	"fmt"
+	"math/rand"
+	"slices"
+	"strconv"
+	"strings"
+	"time"
+
+	google_uuid "github.com/google/uuid"
+	"github.com/mahendrapaipuri/ceems/pkg/api/models"
+)
+
+// Number of units and usage stats to generate
+const (
+	numUnits       = 100
+	numUsage       = 50
+	maxInt64 int64 = 1<<63 - 1
+)
+
+// Resource manager specific definitions
+var (
+	resourceMgrs = []string{"slurm", "openstack", "k8s"}
+	states       = map[string][]string{
+		"openstack": {
+			"ACTIVE", "BUILD", "DELETED", "ERROR", "HARD_REBOOT", "MIGRATING",
+			"PAUSED", "REBOOT", "SHUTOFF", "SOFT_DELETED", "SUSPENDED", "UNKNOWN",
+		},
+		"k8s": {"Pending", "Running", "Succeeded", "Failed"},
+		"slurm": {
+			"CANCELLED", "COMPLETED", "PENDING", "RUNNING", "REQUEUED", "STOPPED",
+			"TIMEOUT",
+		},
+	}
+	runningStates = map[string]string{
+		"slurm":     "RUNNING",
+		"openstack": "ACTIVE",
+		"k8s":       "Running",
+	}
+	projects = map[string][]string{
+		"slurm":     {"acc1", "acc2", "acc3", "acc4", "acc5"},
+		"openstack": {"tenant1", "tenant2", "tenant3"},
+		"k8s":       {"ns1", "ns2", "ns3", "ns4", "ns5", "ns6"},
+	}
+	users = []string{
+		"user1", "user2", "user3", "user4", "user5", "user6", "user7",
+	}
+	allProjects []string
+)
+
+// Get a slice of all projects
+func init() {
+	for _, p := range projects {
+		allProjects = append(allProjects, p...)
+	}
+}
+
+// randomFloats returns a random float64 in the range [min, max)
+func randomFloats(min, max float64) models.JSONFloat {
+	return models.JSONFloat(min + rand.Float64()*(max-min))
+}
+
+// random returns a random number between min and max, excluding max
+func random(min, max int64) int64 {
+	return randomHelper(max-min-1) + min
+}
+
+// randomHelper returns a random int64 in the range [0, n]
+func randomHelper(n int64) int64 {
+	if n < maxInt64 {
+		return int64(rand.Int63n(int64(n + 1)))
+	}
+	x := int64(rand.Uint64())
+	for x > n {
+		x = int64(rand.Uint64())
+	}
+	return x
+}
+
+// mockUnits will generate units with randomised data
+func mockUnits() []models.Unit {
+	// Define mock group, projects
+	user := users[0]
+	group := "group"
+	numResourceMgrs := len(resourceMgrs)
+
+	// Current time in epoch
+	currentEpoch := time.Now().Local().Unix()
+
+	// Minimum start time. Using 1 day before current time
+	minStartTime := currentEpoch - 86400
+
+	// Minimum end time. Must be in the last 30min
+	minEndTime := currentEpoch - 1800
+
+	// Max waiting time between creation and start time in seconds
+	var maxWait int64 = 7200
+
+	// Generate units
+	var units = make([]models.Unit, numUnits)
+	for i := 0; i < numUnits; i++ {
+		resourceMgr := resourceMgrs[random(0, int64(numResourceMgrs))]
+
+		// Use manager specific uuid
+		var uuid string
+		if resourceMgr == "slurm" {
+			uuid = strconv.FormatInt(currentEpoch+int64(i), 10)
+		} else {
+			uuid = google_uuid.New().String()
+		}
+
+		// Get random project based on manager
+		project := projects[resourceMgr][random(0, 2)]
+
+		// Name is always demo followed by ID
+		name := fmt.Sprintf("demo-%d", numUnits-i)
+
+		// Generate a random start time based on current and min start time
+		startTimeTS := random(minStartTime, currentEpoch)
+		createTimeTS := startTimeTS - random(5, maxWait)
+		startedAt := time.Unix(startTimeTS, 0).Format(time.RFC1123)
+
+		// First 20 jobs must be running and rest should have different status
+		var state, endedAt string
+		var endTimeTS, elapsedRaw int64
+		if i < 20 {
+			state = runningStates[resourceMgr]
+			endTimeTS = 0
+			endedAt = "Unknown"
+			elapsedRaw = currentEpoch - startTimeTS
+		} else {
+			state = states[resourceMgr][random(0, int64(len(states[resourceMgr])))]
+			endTimeTS = random(minEndTime, currentEpoch)
+			endedAt = time.Unix(endTimeTS, 0).Format(time.RFC1123)
+			elapsedRaw = endTimeTS - startTimeTS
+		}
+
+		// If state is pending, starttime, elapsed must be zero
+		avgUsageFlag := models.JSONFloat(1)
+		if slices.Contains([]string{"PENDING", "Pending", "REQUEUED", "BUILD", "UNKNOWN"}, state) {
+			startTimeTS = 0
+			elapsedRaw = 0
+			startedAt = "Unknown"
+			endTimeTS = 0
+			endedAt = "Unknown"
+			avgUsageFlag = 0
+		}
+
+		units[i] = models.Unit{
+			ID:                  int64(i),
+			ResourceManager:     resourceMgr,
+			UUID:                uuid,
+			Name:                name,
+			Project:             project,
+			Usr:                 user,
+			Grp:                 group,
+			CreatedAt:           time.Unix(createTimeTS, 0).Format(time.RFC1123),
+			StartedAt:           startedAt,
+			EndedAt:             endedAt,
+			CreatedAtTS:         createTimeTS,
+			StartedAtTS:         startTimeTS,
+			EndedAtTS:           endTimeTS,
+			Elapsed:             time.Duration(elapsedRaw * int64(time.Second)).String(),
+			ElapsedRaw:          elapsedRaw,
+			State:               state,
+			TotalCPUBilling:     2 * elapsedRaw,
+			TotalGPUBilling:     5 * elapsedRaw,
+			TotalMiscBilling:    int64(0.5 * float64(elapsedRaw)),
+			AveCPUUsage:         avgUsageFlag * randomFloats(0, 100),
+			AveCPUMemUsage:      avgUsageFlag * randomFloats(0, 100),
+			TotalCPUEnergyUsage: models.JSONFloat(1.1 * float64(elapsedRaw)),
+			TotalCPUEmissions:   models.JSONFloat(17 * float64(elapsedRaw)),
+			AveGPUUsage:         avgUsageFlag * randomFloats(0, 100),
+			AveGPUMemUsage:      avgUsageFlag * randomFloats(0, 100),
+			TotalGPUEnergyUsage: models.JSONFloat(15 * float64(elapsedRaw)),
+			TotalGPUEmissions:   models.JSONFloat(158 * float64(elapsedRaw)),
+			TotalIOWriteHot:     models.JSONFloat(0.01 * float64(elapsedRaw)),
+			TotalIOReadHot:      models.JSONFloat(0.03 * float64(elapsedRaw)),
+			TotalIOWriteCold:    models.JSONFloat(0.008 * float64(elapsedRaw)),
+			TotalIOReadCold:     models.JSONFloat(0.01 * float64(elapsedRaw)),
+			TotalIngress:        models.JSONFloat(0.05 * float64(elapsedRaw)),
+			TotalOutgress:       models.JSONFloat(0.02 * float64(elapsedRaw)),
+		}
+	}
+	return units
+}
+
+// mockUsage will generate usage with randomised data
+func mockUsage() []models.Usage {
+	// Set user project map
+	var userProjectMap = make(map[string][]string)
+	for _, user := range users {
+		var userProjects []string
+		for i := 0; i < int(random(1, 4)); i++ {
+			userProjects = append(
+				userProjects, allProjects[int(random(0, int64(len(allProjects))))],
+			)
+		}
+		userProjectMap[user] = userProjects
+	}
+
+	// Generate usage
+	var usage []models.Usage
+	for user, prjs := range userProjectMap {
+		for _, prj := range prjs {
+			var resourceMgr string
+			if strings.HasPrefix(prj, "acc") {
+				resourceMgr = "slurm"
+			} else if strings.HasPrefix(prj, "tenant") {
+				resourceMgr = "openstack"
+			} else {
+				resourceMgr = "k8s"
+			}
+			usage = append(usage, models.Usage{
+				ResourceManager:     resourceMgr,
+				Project:             prj,
+				Usr:                 user,
+				NumUnits:            random(0, 1000),
+				TotalCPUBilling:     random(0, 1e5),
+				TotalGPUBilling:     random(0, 1e4),
+				TotalMiscBilling:    random(0, 1e3),
+				AveCPUUsage:         randomFloats(0, 100),
+				AveCPUMemUsage:      randomFloats(0, 100),
+				TotalCPUEnergyUsage: randomFloats(0, 5e3),
+				TotalCPUEmissions:   randomFloats(0, 50e3),
+				AveGPUUsage:         randomFloats(0, 100),
+				AveGPUMemUsage:      randomFloats(0, 100),
+				TotalGPUEnergyUsage: randomFloats(0, 50e3),
+				TotalGPUEmissions:   randomFloats(0, 500e3),
+				TotalIOWriteHot:     randomFloats(0, 5e2),
+				TotalIOReadHot:      randomFloats(0, 50e2),
+				TotalIOWriteCold:    randomFloats(0, 100),
+				TotalIOReadCold:     randomFloats(0, 1000),
+				TotalIngress:        randomFloats(0, 5e3),
+				TotalOutgress:       randomFloats(0, 2e3),
+			})
+		}
+	}
+	return usage
+}
diff --git a/pkg/api/http/middleware.go b/pkg/api/http/middleware.go
index a7370d9d..b84ed9ff 100644
--- a/pkg/api/http/middleware.go
+++ b/pkg/api/http/middleware.go
@@ -53,8 +53,8 @@ func (amw *authenticationMiddleware) Middleware(next http.Handler) http.Handler
 	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
 		var loggedUser string

-		// If requested URI is health or root "/" pass through
-		if strings.HasSuffix(r.URL.Path, "health") || r.URL.Path == "/" {
+		// If requested URI is health, demo or "/" pass through
+		if strings.HasSuffix(r.URL.Path, "health") || strings.HasSuffix(r.URL.Path, "demo") || r.URL.Path == "/" {
 			goto end
 		}

diff --git a/pkg/api/http/server.go b/pkg/api/http/server.go
index cddb7ad8..87950371 100644
--- a/pkg/api/http/server.go
+++ b/pkg/api/http/server.go
@@ -155,6 +155,9 @@ func NewCEEMSServer(c *Config) (*CEEMSServer, func(), error) {
 		Methods("GET")
 	router.HandleFunc(fmt.Sprintf("/api/%s/verify", unitsResourceName), server.verifyUnitsOwnership).
 		Methods("GET")
+	// A demo end point that returns mocked data for units and/or usage tables
+	router.HandleFunc("/api/{resource:(?:units|usage)}/demo", server.demo).Methods("GET")
+
 	// pprof debug end points
 	router.PathPrefix("/debug/").Handler(http.DefaultServeMux)

@@ -633,3 +636,48 @@ func (s *CEEMSServer) usageAdmin(w http.ResponseWriter, r *http.Request) {
 		s.globalUsage(r.URL.Query()["user"], w, r)
 	}
 }
+
+// GET /api/{units,usage}/demo
+// Return mocked data for different models
+func (s *CEEMSServer) demo(w http.ResponseWriter, r *http.Request) {
+	// Set headers
+	s.setHeaders(w)
+
+	// Get path parameter type
+	var resourceType string
+	var exists bool
+	if resourceType, exists = mux.Vars(r)["resource"]; !exists {
+		errorResponse(w, &apiError{errorBadData, errInvalidRequest}, s.logger, nil)
+		return
+	}
+
+	// handle units mock data
+	if resourceType == "units" {
+		units := mockUnits()
+		// Write response
+		w.WriteHeader(http.StatusOK)
+		unitsResponse := Response{
+			Status: "success",
+			Data:   units,
+		}
+		if err := json.NewEncoder(w).Encode(&unitsResponse); err != nil {
+			level.Error(s.logger).Log("msg", "Failed to encode response", "err", err)
+			w.Write([]byte("KO"))
+		}
+	}
+
+	// handle usage mock data
+	if resourceType == "usage" {
+		usage := mockUsage()
+		// Write response
+		w.WriteHeader(http.StatusOK)
+		usageResponse := Response{
+			Status: "success",
+			Data:   usage,
+		}
+		if err := json.NewEncoder(w).Encode(&usageResponse); err != nil {
+			level.Error(s.logger).Log("msg", "Failed to encode response", "err", err)
+			w.Write([]byte("KO"))
+		}
+	}
+}
diff --git a/pkg/api/testdata/output/e2e-test-api-demo-units-query.txt b/pkg/api/testdata/output/e2e-test-api-demo-units-query.txt
new file mode 100644
index 00000000..ae4ee13c
--- /dev/null
+++ b/pkg/api/testdata/output/e2e-test-api-demo-units-query.txt
@@ -0,0 +1 @@
+200
\ No newline at end of file
diff --git a/pkg/api/testdata/output/e2e-test-api-demo-usage-query.txt b/pkg/api/testdata/output/e2e-test-api-demo-usage-query.txt
new file mode 100644
index 00000000..ae4ee13c
--- /dev/null
+++ b/pkg/api/testdata/output/e2e-test-api-demo-usage-query.txt
@@ -0,0 +1 @@
+200
\ No newline at end of file
diff --git a/scripts/e2e-test.sh b/scripts/e2e-test.sh
index 1019413c..2277d7c7 100755
--- a/scripts/e2e-test.sh
+++ b/scripts/e2e-test.sh
@@ -138,6 +138,14 @@ then
     then
         desc="/api/units/verify end point test with fail request"
        fixture='pkg/api/testdata/output/e2e-test-api-verify-fail-query.txt'
+    elif [ "${scenario}" = "api-demo-units-query" ]
+    then
+        desc="/api/units/demo end point test"
+        fixture='pkg/api/testdata/output/e2e-test-api-demo-units-query.txt'
+    elif [ "${scenario}" = "api-demo-usage-query" ]
+    then
+        desc="/api/usage/demo end point test"
+        fixture='pkg/api/testdata/output/e2e-test-api-demo-usage-query.txt'
     fi

     logfile="${tmpdir}/ceems_api_server.log"
@@ -420,6 +428,12 @@ then
     elif [ "${scenario}" = "api-verify-fail-query" ]
     then
        get -H "X-Grafana-User: usr2" "127.0.0.1:${port}/api/units/verify?uuid=1479763&uuid=11508" > "${fixture_output}"
+    elif [ "${scenario}" = "api-demo-units-query" ]
+    then
+        get -s -o /dev/null -w "%{http_code}" "127.0.0.1:${port}/api/units/demo" > "${fixture_output}"
+    elif [ "${scenario}" = "api-demo-usage-query" ]
+    then
+        get -s -o /dev/null -w "%{http_code}" "127.0.0.1:${port}/api/usage/demo" > "${fixture_output}"
     fi

 elif [[ "${scenario}" =~ ^"lb" ]]
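
Once this change is applied and `ceems_api_server` is running, the new demo end points can be queried without any authentication headers, since the middleware change above lets paths ending in `demo` pass through. Below is a minimal, illustrative Go client that is not part of this patch: the port 9020 comes from the documentation above, and the lower-case `status`/`data` JSON field names are assumptions about how the `Response` envelope is serialised, so they may need to be adjusted to the actual JSON tags.

```
package main

import (
	"encoding/json"
	"fmt"
	"net/http"
)

// demoResponse mirrors the Response envelope written by the demo handler.
// The lower-case JSON tags are an assumption; adjust them if the actual
// tags on the Response struct differ.
type demoResponse struct {
	Status string            `json:"status"`
	Data   []json.RawMessage `json:"data"`
}

// fetchDemo requests mock data for either "units" or "usage" and prints a
// short summary of the response.
func fetchDemo(resource string) error {
	url := fmt.Sprintf("http://localhost:9020/api/%s/demo", resource)
	resp, err := http.Get(url)
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	var body demoResponse
	if err := json.NewDecoder(resp.Body).Decode(&body); err != nil {
		return err
	}
	fmt.Printf("%s: HTTP %d, status=%q, %d mock records\n",
		resource, resp.StatusCode, body.Status, len(body.Data))
	return nil
}

func main() {
	for _, resource := range []string{"units", "usage"} {
		if err := fetchDemo(resource); err != nil {
			fmt.Println("request failed:", err)
		}
	}
}
```

Both requests are expected to return HTTP 200 with a `success` status, which is what the new e2e fixtures `e2e-test-api-demo-units-query.txt` and `e2e-test-api-demo-usage-query.txt` assert.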