From dba383d6e5c450bc15d82d97289ab445d6361a64 Mon Sep 17 00:00:00 2001 From: Ekaterina Kazachkova Date: Tue, 20 Aug 2024 14:23:20 +0300 Subject: [PATCH] API: Show GPU stats on the node utilization page - fixes (#3640) * GPU Stats API: allow non-existing indices * GPU Stats API: fix for overlapping intervals * GPU Stats API: fix for GPU instance types loading during monitoring process * GPU Stats API: refactor load gpu charts --- .../HeapsterElasticRestHighLevelClient.java | 2 +- .../ESMonitoringManager.java | 35 +++++++++--------- .../client/pipeline/CloudPipelineAPI.java | 5 +++ .../cluster/AllowedInstanceAndPriceTypes.java | 12 +++---- .../monitor/rest/CloudPipelineAPIClient.java | 10 ++++++ .../monitor/service/InstanceTypesLoader.java | 36 ++++++++++++------- 6 files changed, 63 insertions(+), 37 deletions(-) diff --git a/api/src/main/java/com/epam/pipeline/dao/monitoring/metricrequester/HeapsterElasticRestHighLevelClient.java b/api/src/main/java/com/epam/pipeline/dao/monitoring/metricrequester/HeapsterElasticRestHighLevelClient.java index 0e9c69d63c..9f0552a363 100644 --- a/api/src/main/java/com/epam/pipeline/dao/monitoring/metricrequester/HeapsterElasticRestHighLevelClient.java +++ b/api/src/main/java/com/epam/pipeline/dao/monitoring/metricrequester/HeapsterElasticRestHighLevelClient.java @@ -50,7 +50,7 @@ private Request convertToRequest(final SearchRequest searchRequest) throws IOExc request.addParameter("allow_no_indices", "true"); request.addParameter("expand_wildcards", "open,closed"); request.addParameter("search_type", "query_then_fetch"); - request.addParameter("ignore_unavailable", "false"); + request.addParameter("ignore_unavailable", "true"); request.addParameter("batched_reduce_size", "512"); if (searchRequest.source() != null) { request.setEntity(createEntity(searchRequest.source(), XContentType.JSON)); diff --git a/api/src/main/java/com/epam/pipeline/manager/cluster/performancemonitoring/ESMonitoringManager.java b/api/src/main/java/com/epam/pipeline/manager/cluster/performancemonitoring/ESMonitoringManager.java index 30f763967b..0dc33a9eb8 100644 --- a/api/src/main/java/com/epam/pipeline/manager/cluster/performancemonitoring/ESMonitoringManager.java +++ b/api/src/main/java/com/epam/pipeline/manager/cluster/performancemonitoring/ESMonitoringManager.java @@ -328,16 +328,12 @@ private List getGpuCharts(final List loa if (!GpuMetricsGranularity.hasAggregations(loadTypes) && !GpuMetricsGranularity.hasDetails(loadTypes)) { return null; } - final List charts = new ArrayList<>(); - if (GpuMetricsGranularity.hasAggregations(loadTypes)) { - charts.addAll(requestCharts(aggregationRequester, interval, nodeName, start, end)); - } - if (GpuMetricsGranularity.hasDetails(loadTypes)) { - final GPUDetailsRequester detailsRequester = new GPUDetailsRequester(client); - charts.addAll(requestCharts(detailsRequester, interval, nodeName, start, end)); - } if (GpuMetricsGranularity.hasDetails(loadTypes) && GpuMetricsGranularity.hasAggregations(loadTypes)) { - return charts.stream() + final List charts = new ArrayList<>(aggregationRequester + .requestStats(nodeName, start, end, interval)); + final GPUDetailsRequester detailsRequester = new GPUDetailsRequester(client); + charts.addAll(detailsRequester.requestStats(nodeName, start, end, interval)); + return sortGpuCharts(charts.stream() .collect(Collectors.groupingBy(MonitoringStats::getStartTime, Collectors.reducing(this::mergeGpuStats))) .values().stream() @@ -346,15 +342,13 @@ private List getGpuCharts(final List loa .filter(stats -> stats.getGpuUsage() != null && stats.getGpuDetails() != null) .map(stats -> statsWithinRegion(stats, start, end, interval)) .filter(Optional::isPresent) - .map(Optional::get) - .sorted(Comparator.comparing(MonitoringStats::getStartTime, - Comparator.comparing(this::asMonitoringDateTime))) - .collect(Collectors.toList()); + .map(Optional::get)); } - return charts.stream() - .sorted(Comparator.comparing(MonitoringStats::getStartTime, - Comparator.comparing(this::asMonitoringDateTime))) - .collect(Collectors.toList()); + if (GpuMetricsGranularity.hasAggregations(loadTypes)) { + return sortGpuCharts(requestCharts(aggregationRequester, interval, nodeName, start, end).stream()); + } + final GPUDetailsRequester detailsRequester = new GPUDetailsRequester(client); + return sortGpuCharts(requestCharts(detailsRequester, interval, nodeName, start, end).stream()); } private List requestCharts(final AbstractMetricRequester requester, final Duration interval, @@ -365,4 +359,11 @@ private List requestCharts(final AbstractMetricRequester reques .filter(Objects::nonNull) .collect(Collectors.toList()); } + + private List sortGpuCharts(final Stream stats) { + return stats + .sorted(Comparator.comparing(MonitoringStats::getStartTime, + Comparator.comparing(this::asMonitoringDateTime))) + .collect(Collectors.toList()); + } } diff --git a/cloud-pipeline-common/model/src/main/java/com/epam/pipeline/client/pipeline/CloudPipelineAPI.java b/cloud-pipeline-common/model/src/main/java/com/epam/pipeline/client/pipeline/CloudPipelineAPI.java index f772a55b38..a80f8bf960 100644 --- a/cloud-pipeline-common/model/src/main/java/com/epam/pipeline/client/pipeline/CloudPipelineAPI.java +++ b/cloud-pipeline-common/model/src/main/java/com/epam/pipeline/client/pipeline/CloudPipelineAPI.java @@ -17,6 +17,7 @@ package com.epam.pipeline.client.pipeline; import com.epam.pipeline.entity.app.ApplicationInfo; +import com.epam.pipeline.entity.cluster.AllowedInstanceAndPriceTypes; import com.epam.pipeline.entity.cluster.InstanceType; import com.epam.pipeline.entity.cluster.NodeDisk; import com.epam.pipeline.entity.cluster.NodeInstance; @@ -297,6 +298,10 @@ Call>> loadRunsActivityStats(@Query(FROM) String from, @GET("cluster/instance/loadAll") Call>> loadAllInstanceTypes(); + @GET("cluster/instance/allowed") + Call> loadAllowedInstanceAndPriceTypesForRegion( + @Query(REGION_ID) Long regionId); + @GET("cluster/node/{id}/disks") Call>> loadNodeDisks(@Path(ID) String nodeId); diff --git a/cloud-pipeline-common/model/src/main/java/com/epam/pipeline/entity/cluster/AllowedInstanceAndPriceTypes.java b/cloud-pipeline-common/model/src/main/java/com/epam/pipeline/entity/cluster/AllowedInstanceAndPriceTypes.java index 93810278d5..e1b7c4fc7b 100644 --- a/cloud-pipeline-common/model/src/main/java/com/epam/pipeline/entity/cluster/AllowedInstanceAndPriceTypes.java +++ b/cloud-pipeline-common/model/src/main/java/com/epam/pipeline/entity/cluster/AllowedInstanceAndPriceTypes.java @@ -23,12 +23,12 @@ @Value public class AllowedInstanceAndPriceTypes { - @JsonProperty("allowed.instance.types") - private final List allowedInstanceTypes; + @JsonProperty("cluster.allowed.instance.types") + List allowedInstanceTypes; - @JsonProperty("allowed.instance.docker.types") - private final List allowedInstanceDockerTypes; + @JsonProperty("cluster.allowed.instance.types.docker") + List allowedInstanceDockerTypes; - @JsonProperty("allowed.price.types") - private final List allowedPriceTypes; + @JsonProperty("cluster.allowed.price.types") + List allowedPriceTypes; } diff --git a/monitoring-service/src/main/java/com/epam/pipeline/monitor/rest/CloudPipelineAPIClient.java b/monitoring-service/src/main/java/com/epam/pipeline/monitor/rest/CloudPipelineAPIClient.java index 762563f29b..6a8be906f0 100644 --- a/monitoring-service/src/main/java/com/epam/pipeline/monitor/rest/CloudPipelineAPIClient.java +++ b/monitoring-service/src/main/java/com/epam/pipeline/monitor/rest/CloudPipelineAPIClient.java @@ -19,10 +19,12 @@ import com.epam.pipeline.client.pipeline.CloudPipelineAPI; import com.epam.pipeline.client.pipeline.CloudPipelineApiBuilder; import com.epam.pipeline.client.pipeline.CloudPipelineApiExecutor; +import com.epam.pipeline.entity.cluster.AllowedInstanceAndPriceTypes; import com.epam.pipeline.entity.cluster.InstanceType; import com.epam.pipeline.entity.cluster.pool.NodePool; import com.epam.pipeline.entity.pipeline.PipelineRun; import com.epam.pipeline.entity.preference.Preference; +import com.epam.pipeline.entity.region.AbstractCloudRegion; import com.epam.pipeline.vo.cluster.pool.NodePoolUsage; import com.epam.pipeline.vo.user.OnlineUsers; import org.apache.commons.collections4.ListUtils; @@ -96,6 +98,14 @@ public List loadAllInstanceTypes() { return ListUtils.emptyIfNull(executor.execute(cloudPipelineAPI.loadAllInstanceTypes())); } + public List loadAllRegions() { + return ListUtils.emptyIfNull(executor.execute(cloudPipelineAPI.loadAllRegions())); + } + + public AllowedInstanceAndPriceTypes loadAllowedInstanceAndPriceTypesForRegion(final Long regionId) { + return executor.execute(cloudPipelineAPI.loadAllowedInstanceAndPriceTypesForRegion(regionId)); + } + public void archiveRuns() { executor.execute(cloudPipelineAPI.archiveRuns()); } diff --git a/monitoring-service/src/main/java/com/epam/pipeline/monitor/service/InstanceTypesLoader.java b/monitoring-service/src/main/java/com/epam/pipeline/monitor/service/InstanceTypesLoader.java index c2ae1a854f..c5ddd96457 100644 --- a/monitoring-service/src/main/java/com/epam/pipeline/monitor/service/InstanceTypesLoader.java +++ b/monitoring-service/src/main/java/com/epam/pipeline/monitor/service/InstanceTypesLoader.java @@ -17,20 +17,24 @@ package com.epam.pipeline.monitor.service; import com.epam.pipeline.entity.cluster.InstanceType; +import com.epam.pipeline.entity.region.AbstractCloudRegion; import com.epam.pipeline.monitor.rest.CloudPipelineAPIClient; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; +import org.apache.commons.collections4.ListUtils; import org.apache.commons.lang3.StringUtils; import org.springframework.scheduling.annotation.Scheduled; import org.springframework.stereotype.Service; import javax.annotation.PostConstruct; +import java.util.Objects; import java.util.Set; import java.util.concurrent.ConcurrentHashMap; import java.util.stream.Collectors; +import java.util.stream.Stream; /** - * Refreshes GPU instance types + * Refreshes GPU instance types across all regions */ @Service @RequiredArgsConstructor @@ -42,24 +46,30 @@ public class InstanceTypesLoader { @PostConstruct public void init() { - refreshInstances(); - } - - @Scheduled(fixedDelayString = "${refresh.instances.timeout:86400000}") - public void refreshInstances() { try { - final Set loadedTypes = client.loadAllInstanceTypes().stream() - .filter(it -> it.getGpu() > 0) - .map(InstanceType::getName) - .filter(StringUtils::isNotBlank) - .collect(Collectors.toSet()); - gpuInstanceTypes.clear(); - gpuInstanceTypes.addAll(loadedTypes); + refreshInstances(); } catch (Exception e) { log.error(e.getMessage(), e); } } + @Scheduled(fixedDelayString = "${refresh.instances.timeout:86400000}") + public void refreshInstances() { + final Set loadedTypes = client.loadAllRegions().stream() + .map(AbstractCloudRegion::getId) + .map(client::loadAllowedInstanceAndPriceTypesForRegion) + .filter(Objects::nonNull) + .flatMap(data -> Stream.concat( + ListUtils.emptyIfNull(data.getAllowedInstanceDockerTypes()).stream(), + ListUtils.emptyIfNull(data.getAllowedInstanceTypes()).stream())) + .filter(it -> it.getGpu() > 0) + .map(InstanceType::getName) + .filter(StringUtils::isNotBlank) + .collect(Collectors.toSet()); + gpuInstanceTypes.clear(); + gpuInstanceTypes.addAll(loadedTypes); + } + public Set loadGpuInstanceTypes() { return gpuInstanceTypes; }