diff --git a/api/v1/storagecluster_types.go b/api/v1/storagecluster_types.go
index ca3a38e21d..49982648c2 100644
--- a/api/v1/storagecluster_types.go
+++ b/api/v1/storagecluster_types.go
@@ -198,21 +198,22 @@ type ManageCephCluster struct {
 	// default DOWN/OUT interval) when it is draining. This is only relevant when `managePodBudgets` is `true` in cephCluster CR.
 	// The default value is `30` minutes.
 	OsdMaintenanceTimeout time.Duration `json:"osdMaintenanceTimeout,omitempty"`
-	// FullRatio is the ratio at which the cluster is considered full and ceph will stop accepting writes. Default is 0.95.
+	// NearFullRatio is the ratio at which the cluster is considered nearly full and will raise a ceph health warning. Default is 0.75.
 	// +kubebuilder:validation:Minimum=0.0
 	// +kubebuilder:validation:Maximum=1.0
 	// +nullable
-	FullRatio *float64 `json:"fullRatio,omitempty"`
-	// NearFullRatio is the ratio at which the cluster is considered nearly full and will raise a ceph health warning. Default is 0.85.
+	NearFullRatio *float64 `json:"nearFullRatio,omitempty"`
+	// BackfillFullRatio is the ratio at which the cluster is too full for backfill. Backfill will be disabled if above this threshold. Default is 0.80.
 	// +kubebuilder:validation:Minimum=0.0
 	// +kubebuilder:validation:Maximum=1.0
 	// +nullable
-	NearFullRatio *float64 `json:"nearFullRatio,omitempty"`
-	// BackfillFullRatio is the ratio at which the cluster is too full for backfill. Backfill will be disabled if above this threshold. Default is 0.90.
+	BackfillFullRatio *float64 `json:"backfillFullRatio,omitempty"`
+	// FullRatio is the ratio at which the cluster is considered full and ceph will stop accepting writes. Default is 0.85.
 	// +kubebuilder:validation:Minimum=0.0
 	// +kubebuilder:validation:Maximum=1.0
 	// +nullable
-	BackfillFullRatio *float64 `json:"backfillFullRatio,omitempty"`
+	FullRatio *float64 `json:"fullRatio,omitempty"`
+
 	// Whether to allow updating the device class after the OSD is initially provisioned
 	AllowDeviceClassUpdate bool `json:"allowDeviceClassUpdate,omitempty"`
 }
diff --git a/api/v1/zz_generated.deepcopy.go b/api/v1/zz_generated.deepcopy.go
index 0e46712727..66d6046336 100644
--- a/api/v1/zz_generated.deepcopy.go
+++ b/api/v1/zz_generated.deepcopy.go
@@ -311,11 +311,6 @@ func (in *ManageCephCluster) DeepCopyInto(out *ManageCephCluster) {
 		*out = new(bool)
 		**out = **in
 	}
-	if in.FullRatio != nil {
-		in, out := &in.FullRatio, &out.FullRatio
-		*out = new(float64)
-		**out = **in
-	}
 	if in.NearFullRatio != nil {
 		in, out := &in.NearFullRatio, &out.NearFullRatio
 		*out = new(float64)
 		**out = **in
@@ -326,6 +321,11 @@ func (in *ManageCephCluster) DeepCopyInto(out *ManageCephCluster) {
 		*out = new(float64)
 		**out = **in
 	}
+	if in.FullRatio != nil {
+		in, out := &in.FullRatio, &out.FullRatio
+		*out = new(float64)
+		**out = **in
+	}
 }
 
 // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ManageCephCluster.
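Aside (illustrative, not part of the change): all three ratios are optional, nullable *float64 fields, so a caller sets them by taking the address of a value, and anything left nil keeps the default documented in the field comment. A minimal sketch, assuming the v4/v1 API package used in the vendor paths below and that ManagedResources and CephCluster are value structs as in this API; the ptr helper is a local convenience, not part of the PR:

package main

import (
	"fmt"

	ocsv1 "github.com/red-hat-storage/ocs-operator/api/v4/v1"
)

// ptr is a small local helper for taking the address of a float literal.
func ptr(v float64) *float64 { return &v }

func main() {
	sc := &ocsv1.StorageCluster{}
	// Override only the near-full warning threshold; backfillFullRatio and
	// fullRatio stay nil and keep their documented defaults (0.80 and 0.85).
	sc.Spec.ManagedResources.CephCluster.NearFullRatio = ptr(0.77)

	fmt.Println(*sc.Spec.ManagedResources.CephCluster.NearFullRatio) // 0.77
}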
diff --git a/config/crd/bases/ocs.openshift.io_storageclusters.yaml b/config/crd/bases/ocs.openshift.io_storageclusters.yaml
index 4e15c8b556..e55d70578a 100644
--- a/config/crd/bases/ocs.openshift.io_storageclusters.yaml
+++ b/config/crd/bases/ocs.openshift.io_storageclusters.yaml
@@ -752,7 +752,7 @@ spec:
                      backfillFullRatio:
                        description: BackfillFullRatio is the ratio at which the cluster
                          is too full for backfill. Backfill will be disabled if above
-                          this threshold. Default is 0.90.
+                          this threshold. Default is 0.80.
                        maximum: 1
                        minimum: 0
                        nullable: true
@@ -764,7 +764,7 @@ spec:
                      fullRatio:
                        description: FullRatio is the ratio at which the cluster is
                          considered full and ceph will stop accepting writes. Default
-                          is 0.95.
+                          is 0.85.
                        maximum: 1
                        minimum: 0
                        nullable: true
@@ -782,7 +782,7 @@ spec:
                      nearFullRatio:
                        description: NearFullRatio is the ratio at which the cluster
                          is considered nearly full and will raise a ceph health warning.
-                          Default is 0.85.
+                          Default is 0.75.
                        maximum: 1
                        minimum: 0
                        nullable: true
diff --git a/controllers/storagecluster/cephcluster.go b/controllers/storagecluster/cephcluster.go
index 5768a6e2ee..2ca6aa1d22 100644
--- a/controllers/storagecluster/cephcluster.go
+++ b/controllers/storagecluster/cephcluster.go
@@ -1157,16 +1157,39 @@ func createPrometheusRules(r *StorageClusterReconciler, sc *ocsv1.StorageCluster
 		return err
 	}
 	applyLabels(getCephClusterMonitoringLabels(*sc), &prometheusRule.ObjectMeta)
-	replaceTokens := []exprReplaceToken{
+
+	replaceTokens := []replaceToken{
 		{
 			recordOrAlertName: "CephMgrIsAbsent",
-			wordInExpr:        "openshift-storage",
+			wordToReplace:     "openshift-storage",
 			replaceWith:       sc.Namespace,
 		},
 	}
+
+	// if nearFullRatio/backfillFullRatio/fullRatio are specified on the StorageCluster CR, replace the values in the prometheus rule accordingly
+	specifiedNearFullRatio := sc.Spec.ManagedResources.CephCluster.NearFullRatio
+	specifiedBackfillFullRatio := sc.Spec.ManagedResources.CephCluster.BackfillFullRatio
+	specifiedFullRatio := sc.Spec.ManagedResources.CephCluster.FullRatio
+
+	if specifiedNearFullRatio != nil {
+		replaceTokens = append(replaceTokens,
+			createReplaceToken("", "", "75%", fmt.Sprintf("%.2f%%", *specifiedNearFullRatio*100)),
+			createReplaceToken("", "", "0.75", fmt.Sprintf("%f", *specifiedNearFullRatio)))
+	}
+	if specifiedBackfillFullRatio != nil {
+		replaceTokens = append(replaceTokens,
+			createReplaceToken("", "", "80%", fmt.Sprintf("%.2f%%", *specifiedBackfillFullRatio*100)),
+			createReplaceToken("", "", "0.80", fmt.Sprintf("%f", *specifiedBackfillFullRatio)))
+	}
+	if specifiedFullRatio != nil {
+		replaceTokens = append(replaceTokens,
+			createReplaceToken("", "", "85%", fmt.Sprintf("%.2f%%", *specifiedFullRatio*100)),
+			createReplaceToken("", "", "0.85", fmt.Sprintf("%f", *specifiedFullRatio)))
+	}
+
 	// nothing to replace in external mode
 	if name != prometheusExternalRuleName {
-		changePromRuleExpr(prometheusRule, replaceTokens)
+		changePromRule(prometheusRule, replaceTokens)
 	}
 
 	if err := createOrUpdatePrometheusRule(r, prometheusRule); err != nil {
@@ -1189,38 +1212,64 @@ func applyLabels(labels map[string]string, t *metav1.ObjectMeta) {
 	}
 }
 
-type exprReplaceToken struct {
+type replaceToken struct {
 	groupName         string
 	recordOrAlertName string
-	wordInExpr        string
+	wordToReplace     string
 	replaceWith       string
 }
 
-func changePromRuleExpr(promRules *monitoringv1.PrometheusRule, replaceTokens []exprReplaceToken) {
-	if promRules == nil {
+func createReplaceToken(groupName, recordOrAlertName, wordToReplace, replaceWith string) replaceToken {
+	return replaceToken{
+		groupName:         groupName,
+		recordOrAlertName: recordOrAlertName,
+		wordToReplace:     wordToReplace,
+		replaceWith:       replaceWith,
+	}
+}
+
+// changePromRule replaces the wordToReplace with replaceWith in the PrometheusRule
+// This can be used to update the values in the PrometheusRule dynamically
+func changePromRule(promRule *monitoringv1.PrometheusRule, tokens []replaceToken) {
+	if promRule == nil {
 		return
 	}
-	for _, eachToken := range replaceTokens {
-		// if both the words, one being replaced and the one replacing it, are same
-		// then we don't have to do anything
-		if eachToken.replaceWith == eachToken.wordInExpr {
+
+	// Iterate over each token for replacements
+	for _, token := range tokens {
+		// Skip if the word and replacement are the same
+		if token.replaceWith == token.wordToReplace {
 			continue
 		}
-		for gIndx, currGroup := range promRules.Spec.Groups {
-			if eachToken.groupName != "" && eachToken.groupName != currGroup.Name {
+
+		// Iterate through all groups in the Prometheus rule
+		for groupIdx := range promRule.Spec.Groups {
+			group := &promRule.Spec.Groups[groupIdx]
+			// If groupName is specified, ensure it matches; otherwise, apply to all groups
+			if token.groupName != "" && token.groupName != group.Name {
 				continue
 			}
-			for rIndx, currRule := range currGroup.Rules {
-				if eachToken.recordOrAlertName != "" {
-					if currRule.Record != "" && currRule.Record != eachToken.recordOrAlertName {
-						continue
-					} else if currRule.Alert != "" && currRule.Alert != eachToken.recordOrAlertName {
-						continue
+
+			// Iterate through the rules in the group
+			for ruleIdx := range group.Rules {
+				rule := &group.Rules[ruleIdx]
+				// If recordOrAlertName is specified, ensure it matches; otherwise, apply to all rules
+				if token.recordOrAlertName == "" || rule.Record == token.recordOrAlertName || rule.Alert == token.recordOrAlertName {
+					// Update the annotations in the rule
+					if rule.Annotations != nil {
+						// Update description if it exists
+						if description, exists := rule.Annotations["description"]; exists {
+							newDescription := strings.Replace(description, token.wordToReplace, token.replaceWith, -1)
+							rule.Annotations["description"] = newDescription
+						}
+					}
+					// Update the expression field in the rule
+					exprStr := rule.Expr.String()
+					if exprStr != "" {
+						newExpr := strings.Replace(exprStr, token.wordToReplace, token.replaceWith, -1)
+						rule.Expr = intstr.Parse(newExpr)
 					}
 				}
-				exprStr := currRule.Expr.String()
-				newExpr := strings.Replace(exprStr, eachToken.wordInExpr, eachToken.replaceWith, -1)
-				promRules.Spec.Groups[gIndx].Rules[rIndx].Expr = intstr.Parse(newExpr)
 			}
 		}
 	}
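Aside (illustrative, not part of the change): the two format verbs used when building the ratio tokens are worth seeing side by side — "%.2f%%" produces the human-readable percentage that presumably appears in alert descriptions, while "%f" produces the decimal threshold used in PromQL expressions. A self-contained sketch with an example value of 0.77:

package main

import "fmt"

func main() {
	nearFullRatio := 0.77 // example value a user might set for nearFullRatio

	// Replacement for the "75%" text, e.g. in an alert description.
	fmt.Println(fmt.Sprintf("%.2f%%", nearFullRatio*100)) // prints: 77.00%

	// Replacement for the "0.75" threshold, e.g. in a PromQL expression.
	fmt.Println(fmt.Sprintf("%f", nearFullRatio)) // prints: 0.770000
}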
diff --git a/controllers/storagecluster/cephcluster_test.go b/controllers/storagecluster/cephcluster_test.go
index 17a6db4e59..817e2ffeb8 100644
--- a/controllers/storagecluster/cephcluster_test.go
+++ b/controllers/storagecluster/cephcluster_test.go
@@ -1041,29 +1041,40 @@ func TestParsePrometheusRules(t *testing.T) {
 }
 
 func TestChangePrometheusExprFunc(t *testing.T) {
-	prometheusRules, err := parsePrometheusRule(localPrometheusRules)
+	prometheusRule, err := parsePrometheusRule(localPrometheusRules)
 	assert.NilError(t, err)
-	var changeTokens = []exprReplaceToken{
-		{recordOrAlertName: "CephMgrIsAbsent", wordInExpr: "openshift-storage", replaceWith: "new-namespace"},
+	var changeTokens = []replaceToken{
+		{recordOrAlertName: "CephMgrIsAbsent", wordToReplace: "openshift-storage", replaceWith: "new-namespace"},
 		// when alert or record name is not specified,
 		// the change should affect all the expressions which has the 'wordInExpr'
-		{recordOrAlertName: "", wordInExpr: "ceph_pool_stored_raw", replaceWith: "new_ceph_pool_stored_raw"},
+		{recordOrAlertName: "", wordToReplace: "ceph_pool_stored_raw", replaceWith: "new_ceph_pool_stored_raw"},
+		{recordOrAlertName: "", wordToReplace: "0.75", replaceWith: "0.775"},
+		{recordOrAlertName: "", wordToReplace: "85%", replaceWith: "92.50%"},
 	}
-	changePromRuleExpr(prometheusRules, changeTokens)
-	alertNameAndChangedExpr := [][2]string{
+	changePromRule(prometheusRule, changeTokens)
+
+	recordOrAlertNameAndReplacedWord := [][2]string{
 		{"CephMgrIsAbsent", "new-namespace"},
 		{"CephPoolQuotaBytesNearExhaustion", "new_ceph_pool_stored_raw"},
 		{"CephPoolQuotaBytesCriticallyExhausted", "new_ceph_pool_stored_raw"},
+		{"CephClusterNearFull", "0.775"},
+		{"CephOSDNearFull", "0.775"},
+		{"CephClusterNearFull", "92.50%"},
+		{"CephClusterCriticallyFull", "92.50%"},
+		{"CephClusterReadOnly", "92.50%"},
 	}
-	for _, grp := range prometheusRules.Spec.Groups {
+	for _, grp := range prometheusRule.Spec.Groups {
 		for _, rule := range grp.Rules {
-			for _, eachAlertChanged := range alertNameAndChangedExpr {
-				alertName := eachAlertChanged[0]
-				changeStr := eachAlertChanged[1]
+			for _, eachChange := range recordOrAlertNameAndReplacedWord {
+				alertName := eachChange[0]
+				changeStr := eachChange[1]
 				if rule.Alert != alertName {
 					continue
 				}
-				assert.Assert(t, strings.Contains(rule.Expr.String(), changeStr))
+				assert.Assert(t,
+					strings.Contains(rule.Expr.String(), changeStr) ||
+						(rule.Annotations != nil && strings.Contains(rule.Annotations["description"], changeStr)),
+					fmt.Sprintf("Expected '%s' to be found in either Expr or Annotations for alert %s", changeStr, alertName))
 			}
 		}
 	}
diff --git a/deploy/csv-templates/crds/ocs/ocs.openshift.io_storageclusters.yaml b/deploy/csv-templates/crds/ocs/ocs.openshift.io_storageclusters.yaml
index 4e15c8b556..e55d70578a 100644
--- a/deploy/csv-templates/crds/ocs/ocs.openshift.io_storageclusters.yaml
+++ b/deploy/csv-templates/crds/ocs/ocs.openshift.io_storageclusters.yaml
@@ -752,7 +752,7 @@ spec:
                      backfillFullRatio:
                        description: BackfillFullRatio is the ratio at which the cluster
                          is too full for backfill. Backfill will be disabled if above
-                          this threshold. Default is 0.90.
+                          this threshold. Default is 0.80.
                        maximum: 1
                        minimum: 0
                        nullable: true
@@ -764,7 +764,7 @@ spec:
                      fullRatio:
                        description: FullRatio is the ratio at which the cluster is
                          considered full and ceph will stop accepting writes. Default
-                          is 0.95.
+                          is 0.85.
                        maximum: 1
                        minimum: 0
                        nullable: true
@@ -782,7 +782,7 @@ spec:
                      nearFullRatio:
                        description: NearFullRatio is the ratio at which the cluster
                          is considered nearly full and will raise a ceph health warning.
-                          Default is 0.85.
+                          Default is 0.75.
                        maximum: 1
                        minimum: 0
                        nullable: true
diff --git a/deploy/ocs-operator/manifests/storagecluster.crd.yaml b/deploy/ocs-operator/manifests/storagecluster.crd.yaml
index 4e15c8b556..e55d70578a 100644
--- a/deploy/ocs-operator/manifests/storagecluster.crd.yaml
+++ b/deploy/ocs-operator/manifests/storagecluster.crd.yaml
@@ -752,7 +752,7 @@ spec:
                      backfillFullRatio:
                        description: BackfillFullRatio is the ratio at which the cluster
                          is too full for backfill. Backfill will be disabled if above
-                          this threshold. Default is 0.90.
+                          this threshold. Default is 0.80.
                        maximum: 1
                        minimum: 0
                        nullable: true
@@ -764,7 +764,7 @@ spec:
                      fullRatio:
                        description: FullRatio is the ratio at which the cluster is
                          considered full and ceph will stop accepting writes. Default
-                          is 0.95.
+                          is 0.85.
                        maximum: 1
                        minimum: 0
                        nullable: true
@@ -782,7 +782,7 @@ spec:
                      nearFullRatio:
                        description: NearFullRatio is the ratio at which the cluster
                          is considered nearly full and will raise a ceph health warning.
-                          Default is 0.85.
+                          Default is 0.75.
                        maximum: 1
                        minimum: 0
                        nullable: true
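Aside (illustrative, not part of the change): Ceph generally expects nearfull < backfillfull < full, and nothing in the CRD schema above enforces that ordering across the three independent fields. A hypothetical guard, shown only as a sketch — validateFullRatios is not a function in this PR:

package main

import "fmt"

// validateFullRatios checks the ordering Ceph generally expects, falling back to
// the defaults documented in the CRD when a field is left unset (nil).
func validateFullRatios(nearFull, backfillFull, full *float64) error {
	n, b, f := 0.75, 0.80, 0.85
	if nearFull != nil {
		n = *nearFull
	}
	if backfillFull != nil {
		b = *backfillFull
	}
	if full != nil {
		f = *full
	}
	if n >= b || b >= f {
		return fmt.Errorf("expected nearFullRatio (%v) < backfillFullRatio (%v) < fullRatio (%v)", n, b, f)
	}
	return nil
}

func main() {
	bad := 0.90
	fmt.Println(validateFullRatios(&bad, nil, nil)) // nearFull 0.90 >= backfillFull 0.80 -> error
}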
diff --git a/metrics/vendor/github.com/red-hat-storage/ocs-operator/api/v4/v1/storagecluster_types.go b/metrics/vendor/github.com/red-hat-storage/ocs-operator/api/v4/v1/storagecluster_types.go
index ca3a38e21d..49982648c2 100644
--- a/metrics/vendor/github.com/red-hat-storage/ocs-operator/api/v4/v1/storagecluster_types.go
+++ b/metrics/vendor/github.com/red-hat-storage/ocs-operator/api/v4/v1/storagecluster_types.go
@@ -198,21 +198,22 @@ type ManageCephCluster struct {
 	// default DOWN/OUT interval) when it is draining. This is only relevant when `managePodBudgets` is `true` in cephCluster CR.
 	// The default value is `30` minutes.
 	OsdMaintenanceTimeout time.Duration `json:"osdMaintenanceTimeout,omitempty"`
-	// FullRatio is the ratio at which the cluster is considered full and ceph will stop accepting writes. Default is 0.95.
+	// NearFullRatio is the ratio at which the cluster is considered nearly full and will raise a ceph health warning. Default is 0.75.
 	// +kubebuilder:validation:Minimum=0.0
 	// +kubebuilder:validation:Maximum=1.0
 	// +nullable
-	FullRatio *float64 `json:"fullRatio,omitempty"`
-	// NearFullRatio is the ratio at which the cluster is considered nearly full and will raise a ceph health warning. Default is 0.85.
+	NearFullRatio *float64 `json:"nearFullRatio,omitempty"`
+	// BackfillFullRatio is the ratio at which the cluster is too full for backfill. Backfill will be disabled if above this threshold. Default is 0.80.
 	// +kubebuilder:validation:Minimum=0.0
 	// +kubebuilder:validation:Maximum=1.0
 	// +nullable
-	NearFullRatio *float64 `json:"nearFullRatio,omitempty"`
-	// BackfillFullRatio is the ratio at which the cluster is too full for backfill. Backfill will be disabled if above this threshold. Default is 0.90.
+	BackfillFullRatio *float64 `json:"backfillFullRatio,omitempty"`
+	// FullRatio is the ratio at which the cluster is considered full and ceph will stop accepting writes. Default is 0.85.
 	// +kubebuilder:validation:Minimum=0.0
 	// +kubebuilder:validation:Maximum=1.0
 	// +nullable
-	BackfillFullRatio *float64 `json:"backfillFullRatio,omitempty"`
+	FullRatio *float64 `json:"fullRatio,omitempty"`
+
 	// Whether to allow updating the device class after the OSD is initially provisioned
 	AllowDeviceClassUpdate bool `json:"allowDeviceClassUpdate,omitempty"`
 }
diff --git a/metrics/vendor/github.com/red-hat-storage/ocs-operator/api/v4/v1/zz_generated.deepcopy.go b/metrics/vendor/github.com/red-hat-storage/ocs-operator/api/v4/v1/zz_generated.deepcopy.go
index 0e46712727..66d6046336 100644
--- a/metrics/vendor/github.com/red-hat-storage/ocs-operator/api/v4/v1/zz_generated.deepcopy.go
+++ b/metrics/vendor/github.com/red-hat-storage/ocs-operator/api/v4/v1/zz_generated.deepcopy.go
@@ -311,11 +311,6 @@ func (in *ManageCephCluster) DeepCopyInto(out *ManageCephCluster) {
 		*out = new(bool)
 		**out = **in
 	}
-	if in.FullRatio != nil {
-		in, out := &in.FullRatio, &out.FullRatio
-		*out = new(float64)
-		**out = **in
-	}
 	if in.NearFullRatio != nil {
 		in, out := &in.NearFullRatio, &out.NearFullRatio
 		*out = new(float64)
 		**out = **in
@@ -326,6 +321,11 @@ func (in *ManageCephCluster) DeepCopyInto(out *ManageCephCluster) {
 		*out = new(float64)
 		**out = **in
 	}
+	if in.FullRatio != nil {
+		in, out := &in.FullRatio, &out.FullRatio
+		*out = new(float64)
+		**out = **in
+	}
 }
 
 // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ManageCephCluster.
diff --git a/vendor/github.com/red-hat-storage/ocs-operator/api/v4/v1/storagecluster_types.go b/vendor/github.com/red-hat-storage/ocs-operator/api/v4/v1/storagecluster_types.go
index ca3a38e21d..49982648c2 100644
--- a/vendor/github.com/red-hat-storage/ocs-operator/api/v4/v1/storagecluster_types.go
+++ b/vendor/github.com/red-hat-storage/ocs-operator/api/v4/v1/storagecluster_types.go
@@ -198,21 +198,22 @@ type ManageCephCluster struct {
 	// default DOWN/OUT interval) when it is draining. This is only relevant when `managePodBudgets` is `true` in cephCluster CR.
 	// The default value is `30` minutes.
 	OsdMaintenanceTimeout time.Duration `json:"osdMaintenanceTimeout,omitempty"`
-	// FullRatio is the ratio at which the cluster is considered full and ceph will stop accepting writes. Default is 0.95.
+	// NearFullRatio is the ratio at which the cluster is considered nearly full and will raise a ceph health warning. Default is 0.75.
 	// +kubebuilder:validation:Minimum=0.0
 	// +kubebuilder:validation:Maximum=1.0
 	// +nullable
-	FullRatio *float64 `json:"fullRatio,omitempty"`
-	// NearFullRatio is the ratio at which the cluster is considered nearly full and will raise a ceph health warning. Default is 0.85.
+	NearFullRatio *float64 `json:"nearFullRatio,omitempty"`
+	// BackfillFullRatio is the ratio at which the cluster is too full for backfill. Backfill will be disabled if above this threshold. Default is 0.80.
 	// +kubebuilder:validation:Minimum=0.0
 	// +kubebuilder:validation:Maximum=1.0
 	// +nullable
-	NearFullRatio *float64 `json:"nearFullRatio,omitempty"`
-	// BackfillFullRatio is the ratio at which the cluster is too full for backfill. Backfill will be disabled if above this threshold. Default is 0.90.
+	BackfillFullRatio *float64 `json:"backfillFullRatio,omitempty"`
+	// FullRatio is the ratio at which the cluster is considered full and ceph will stop accepting writes. Default is 0.85.
 	// +kubebuilder:validation:Minimum=0.0
 	// +kubebuilder:validation:Maximum=1.0
 	// +nullable
-	BackfillFullRatio *float64 `json:"backfillFullRatio,omitempty"`
+	FullRatio *float64 `json:"fullRatio,omitempty"`
+
 	// Whether to allow updating the device class after the OSD is initially provisioned
 	AllowDeviceClassUpdate bool `json:"allowDeviceClassUpdate,omitempty"`
 }
diff --git a/vendor/github.com/red-hat-storage/ocs-operator/api/v4/v1/zz_generated.deepcopy.go b/vendor/github.com/red-hat-storage/ocs-operator/api/v4/v1/zz_generated.deepcopy.go
index 0e46712727..66d6046336 100644
--- a/vendor/github.com/red-hat-storage/ocs-operator/api/v4/v1/zz_generated.deepcopy.go
+++ b/vendor/github.com/red-hat-storage/ocs-operator/api/v4/v1/zz_generated.deepcopy.go
@@ -311,11 +311,6 @@ func (in *ManageCephCluster) DeepCopyInto(out *ManageCephCluster) {
 		*out = new(bool)
 		**out = **in
 	}
-	if in.FullRatio != nil {
-		in, out := &in.FullRatio, &out.FullRatio
-		*out = new(float64)
-		**out = **in
-	}
 	if in.NearFullRatio != nil {
 		in, out := &in.NearFullRatio, &out.NearFullRatio
 		*out = new(float64)
 		**out = **in
@@ -326,6 +321,11 @@ func (in *ManageCephCluster) DeepCopyInto(out *ManageCephCluster) {
 		*out = new(float64)
 		**out = **in
 	}
+	if in.FullRatio != nil {
+		in, out := &in.FullRatio, &out.FullRatio
+		*out = new(float64)
+		**out = **in
+	}
 }
 
 // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ManageCephCluster.
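Aside (illustrative, not part of the change): at its core, changePromRule performs plain substring substitution with strings.Replace on both the rule expression and its description annotation. A standalone demonstration using made-up rule text — the description and expression below are examples, not the shipped alert definitions:

package main

import (
	"fmt"
	"strings"
)

func main() {
	// Example strings standing in for a CephClusterNearFull-style alert.
	description := "Storage cluster utilization has crossed 75% and will become read-only at 85%."
	expr := "ceph_cluster_total_used_raw_bytes / ceph_cluster_total_bytes > 0.75"

	// The same substitutions changePromRule would apply for nearFullRatio = 0.77.
	description = strings.Replace(description, "75%", fmt.Sprintf("%.2f%%", 0.77*100), -1)
	expr = strings.Replace(expr, "0.75", fmt.Sprintf("%f", 0.77), -1)

	fmt.Println(description) // ... crossed 77.00% and will become read-only at 85%.
	fmt.Println(expr)        // ... ceph_cluster_total_bytes > 0.770000
}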