Skip to content

Commit

Permalink
Make Health API more resilient to multi-version clusters (elastic#105789)
Browse files Browse the repository at this point in the history

First check whether the full cluster supports a specific indicator (feature) before we mark an indicator as "unknown" when (meta) data is missing from the cluster state.
  • Loading branch information
nielsbauman committed Mar 4, 2024
1 parent 2103adc commit 7529dc9
Show file tree
Hide file tree
Showing 10 changed files with 157 additions and 42 deletions.
6 changes: 6 additions & 0 deletions docs/changelog/105789.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
pr: 105789
summary: Make Health API more resilient to multi-version clusters
area: Health
type: bug
issues:
- 90183
Original file line number Diff line number Diff line change
@@ -1,10 +1,8 @@
---
"cluster health basic test":
- skip:
version: all
reason: "AwaitsFix https://github.com/elastic/elasticsearch/issues/90183"
# version: "- 8.3.99"
# reason: "health was only added in 8.2.0, and master_is_stable in 8.4.0"
version: "- 8.3.99"
reason: "health was only added in 8.2.0, and master_is_stable in 8.4.0"

- do:
health_report: { }
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,21 @@
import org.elasticsearch.features.NodeFeature;

import java.util.Map;
import java.util.Set;

/**
 * Feature specification for the health API: declares the {@link NodeFeature}s that the health
 * indicators check before deciding how to report missing health data.
 */
public class HealthFeatures implements FeatureSpecification {

// Feature identifiers; each one is registered through one of the two methods below.
public static final NodeFeature SUPPORTS_HEALTH = new NodeFeature("health.supports_health");
public static final NodeFeature SUPPORTS_SHARDS_CAPACITY_INDICATOR = new NodeFeature("health.shards_capacity_indicator");
public static final NodeFeature SUPPORTS_EXTENDED_REPOSITORY_INDICATOR = new NodeFeature("health.extended_repository_indicator");

/**
 * @return the features published directly by this specification; currently only
 *         {@link #SUPPORTS_EXTENDED_REPOSITORY_INDICATOR}
 */
@Override
public Set<NodeFeature> getFeatures() {
return Set.of(SUPPORTS_EXTENDED_REPOSITORY_INDICATOR);
}

/**
 * @return a map from feature to an associated {@link Version}
 *         (presumably the release from which the feature is assumed present; confirm against
 *         {@code FeatureSpecification})
 */
@Override
public Map<NodeFeature, Version> getHistoricalFeatures() {
// NOTE(review): this listing is a diff view; the first return looks like the pre-change line
// and the second its replacement, which additionally registers SUPPORTS_SHARDS_CAPACITY_INDICATOR.
return Map.of(SUPPORTS_HEALTH, Version.V_8_5_0);
return Map.of(SUPPORTS_HEALTH, Version.V_8_5_0, SUPPORTS_SHARDS_CAPACITY_INDICATOR, Version.V_8_8_0);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,9 @@
import org.elasticsearch.cluster.routing.RoutingNodes;
import org.elasticsearch.cluster.service.ClusterService;
import org.elasticsearch.common.util.set.Sets;
import org.elasticsearch.features.FeatureService;
import org.elasticsearch.health.Diagnosis;
import org.elasticsearch.health.HealthFeatures;
import org.elasticsearch.health.HealthIndicatorDetails;
import org.elasticsearch.health.HealthIndicatorImpact;
import org.elasticsearch.health.HealthIndicatorResult;
Expand Down Expand Up @@ -71,9 +73,11 @@ public class DiskHealthIndicatorService implements HealthIndicatorService {
private static final String IMPACT_CLUSTER_FUNCTIONALITY_UNAVAILABLE_ID = "cluster_functionality_unavailable";

private final ClusterService clusterService;
private final FeatureService featureService;

public DiskHealthIndicatorService(ClusterService clusterService) {
public DiskHealthIndicatorService(ClusterService clusterService, FeatureService featureService) {
this.clusterService = clusterService;
this.featureService = featureService;
}

@Override
Expand All @@ -83,8 +87,18 @@ public String name() {

@Override
public HealthIndicatorResult calculate(boolean verbose, int maxAffectedResourcesCount, HealthInfo healthInfo) {
ClusterState clusterState = clusterService.state();
Map<String, DiskHealthInfo> diskHealthInfoMap = healthInfo.diskInfoByNode();
if (diskHealthInfoMap == null || diskHealthInfoMap.isEmpty()) {
if (featureService.clusterHasFeature(clusterState, HealthFeatures.SUPPORTS_HEALTH) == false) {
return createIndicator(
HealthStatus.GREEN,
"No disk usage data available. The cluster currently has mixed versions (an upgrade may be in progress).",
HealthIndicatorDetails.EMPTY,
List.of(),
List.of()
);
}
/*
* If there is no disk health info, that either means that a new health node was just elected, or something is seriously
* wrong with health data collection on the health node. Either way, we immediately return UNKNOWN. If there are at least
Expand All @@ -98,7 +112,6 @@ public HealthIndicatorResult calculate(boolean verbose, int maxAffectedResources
Collections.emptyList()
);
}
ClusterState clusterState = clusterService.state();
logNodesMissingHealthInfo(diskHealthInfoMap, clusterState);

DiskHealthAnalyzer diskHealthAnalyzer = new DiskHealthAnalyzer(diskHealthInfoMap, clusterState);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,9 @@
import org.elasticsearch.cluster.service.ClusterService;
import org.elasticsearch.common.TriFunction;
import org.elasticsearch.common.settings.Setting;
import org.elasticsearch.features.FeatureService;
import org.elasticsearch.health.Diagnosis;
import org.elasticsearch.health.HealthFeatures;
import org.elasticsearch.health.HealthIndicatorDetails;
import org.elasticsearch.health.HealthIndicatorImpact;
import org.elasticsearch.health.HealthIndicatorResult;
Expand Down Expand Up @@ -90,9 +92,11 @@ public class ShardsCapacityHealthIndicatorService implements HealthIndicatorServ
);

private final ClusterService clusterService;
private final FeatureService featureService;

public ShardsCapacityHealthIndicatorService(ClusterService clusterService) {
public ShardsCapacityHealthIndicatorService(ClusterService clusterService, FeatureService featureService) {
this.clusterService = clusterService;
this.featureService = featureService;
}

@Override
Expand All @@ -105,6 +109,15 @@ public HealthIndicatorResult calculate(boolean verbose, int maxAffectedResources
var state = clusterService.state();
var healthMetadata = HealthMetadata.getFromClusterState(state);
if (healthMetadata == null || healthMetadata.getShardLimitsMetadata() == null) {
if (featureService.clusterHasFeature(state, HealthFeatures.SUPPORTS_SHARDS_CAPACITY_INDICATOR) == false) {
return createIndicator(
HealthStatus.GREEN,
"No shard limits configured yet. The cluster currently has mixed versions (an upgrade may be in progress).",
HealthIndicatorDetails.EMPTY,
List.of(),
List.of()
);
}
return unknownIndicator();
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1189,9 +1189,9 @@ private Module loadDiagnosticServices(

var serverHealthIndicatorServices = Stream.of(
new StableMasterHealthIndicatorService(coordinationDiagnosticsService, clusterService),
new RepositoryIntegrityHealthIndicatorService(clusterService),
new DiskHealthIndicatorService(clusterService),
new ShardsCapacityHealthIndicatorService(clusterService)
new RepositoryIntegrityHealthIndicatorService(clusterService, featureService),
new DiskHealthIndicatorService(clusterService, featureService),
new ShardsCapacityHealthIndicatorService(clusterService, featureService)
);
var pluginHealthIndicatorServices = pluginsService.filterPlugins(HealthPlugin.class)
.flatMap(plugin -> plugin.getHealthIndicatorServices().stream());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,9 @@
import org.elasticsearch.cluster.metadata.RepositoriesMetadata;
import org.elasticsearch.cluster.metadata.RepositoryMetadata;
import org.elasticsearch.cluster.service.ClusterService;
import org.elasticsearch.features.FeatureService;
import org.elasticsearch.health.Diagnosis;
import org.elasticsearch.health.HealthFeatures;
import org.elasticsearch.health.HealthIndicatorDetails;
import org.elasticsearch.health.HealthIndicatorImpact;
import org.elasticsearch.health.HealthIndicatorResult;
Expand Down Expand Up @@ -59,6 +61,8 @@ public class RepositoryIntegrityHealthIndicatorService implements HealthIndicato
public static final String NO_REPOS_CONFIGURED = "No snapshot repositories configured.";
public static final String ALL_REPOS_HEALTHY = "All repositories are healthy.";
public static final String NO_REPO_HEALTH_INFO = "No repository health info.";
public static final String MIXED_VERSIONS =
"No repository health info. The cluster currently has mixed versions (an upgrade may be in progress).";

public static final List<HealthIndicatorImpact> IMPACTS = List.of(
new HealthIndicatorImpact(
Expand Down Expand Up @@ -95,9 +99,11 @@ public class RepositoryIntegrityHealthIndicatorService implements HealthIndicato
);

private final ClusterService clusterService;
private final FeatureService featureService;

public RepositoryIntegrityHealthIndicatorService(ClusterService clusterService) {
public RepositoryIntegrityHealthIndicatorService(ClusterService clusterService, FeatureService featureService) {
this.clusterService = clusterService;
this.featureService = featureService;
}

@Override
Expand Down Expand Up @@ -128,7 +134,7 @@ public HealthIndicatorResult calculate(boolean verbose, int maxAffectedResources
/**
* Analyzer for the cluster's repositories health; aids in constructing a {@link HealthIndicatorResult}.
*/
static class RepositoryHealthAnalyzer {
class RepositoryHealthAnalyzer {
private final ClusterState clusterState;
private final int totalRepositories;
private final List<String> corruptedRepositories;
Expand All @@ -137,6 +143,7 @@ static class RepositoryHealthAnalyzer {
private final Set<String> invalidRepositories = new HashSet<>();
private final Set<String> nodesWithInvalidRepos = new HashSet<>();
private final HealthStatus healthStatus;
private boolean clusterHasFeature = true;

private RepositoryHealthAnalyzer(
ClusterState clusterState,
Expand Down Expand Up @@ -167,7 +174,15 @@ private RepositoryHealthAnalyzer(
|| invalidRepositories.isEmpty() == false) {
healthStatus = YELLOW;
} else if (repositoriesHealthByNode.isEmpty()) {
healthStatus = UNKNOWN;
// No node reported repository health info. Store the un-negated answer in clusterHasFeature:
// getSymptom() returns MIXED_VERSIONS for a GREEN status only when this flag is false, so a
// negated value here would make the mixed-version case report ALL_REPOS_HEALTHY instead.
clusterHasFeature = featureService.clusterHasFeature(
    clusterState,
    HealthFeatures.SUPPORTS_EXTENDED_REPOSITORY_INDICATOR
);
if (clusterHasFeature) {
    // The whole cluster supports the indicator, so missing info is genuinely unknown.
    healthStatus = UNKNOWN;
} else {
    // Mixed-version cluster: missing info is expected, so do not raise an alarm.
    healthStatus = GREEN;
}
} else {
healthStatus = GREEN;
}
Expand All @@ -179,7 +194,7 @@ public HealthStatus getHealthStatus() {

public String getSymptom() {
if (healthStatus == GREEN) {
return ALL_REPOS_HEALTHY;
return clusterHasFeature ? ALL_REPOS_HEALTHY : MIXED_VERSIONS;
} else if (healthStatus == UNKNOWN) {
return NO_REPO_HEALTH_INFO;
}
Expand Down
Loading

0 comments on commit 7529dc9

Please sign in to comment.