Skip to content

Commit

Permalink
Merge pull request #2820 from openshift-cherrypick-robot/cherry-pick-…
Browse files Browse the repository at this point in the history
…2816-to-release-4.17

Bug 2303342: [release-4.17] Update the Prometheus rule alerts acc to specified value & correct the default values in API desc
  • Loading branch information
openshift-merge-bot[bot] authored Sep 26, 2024
2 parents f754e58 + 90e382b commit afaf9a4
Show file tree
Hide file tree
Showing 11 changed files with 138 additions and 75 deletions.
13 changes: 7 additions & 6 deletions api/v1/storagecluster_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -198,21 +198,22 @@ type ManageCephCluster struct {
// default DOWN/OUT interval) when it is draining. This is only relevant when `managePodBudgets` is `true` in cephCluster CR.
// The default value is `30` minutes.
OsdMaintenanceTimeout time.Duration `json:"osdMaintenanceTimeout,omitempty"`
// FullRatio is the ratio at which the cluster is considered full and ceph will stop accepting writes. Default is 0.95.
// NearFullRatio is the ratio at which the cluster is considered nearly full and will raise a ceph health warning. Default is 0.75.
// +kubebuilder:validation:Minimum=0.0
// +kubebuilder:validation:Maximum=1.0
// +nullable
FullRatio *float64 `json:"fullRatio,omitempty"`
// NearFullRatio is the ratio at which the cluster is considered nearly full and will raise a ceph health warning. Default is 0.85.
NearFullRatio *float64 `json:"nearFullRatio,omitempty"`
// BackfillFullRatio is the ratio at which the cluster is too full for backfill. Backfill will be disabled if above this threshold. Default is 0.80.
// +kubebuilder:validation:Minimum=0.0
// +kubebuilder:validation:Maximum=1.0
// +nullable
NearFullRatio *float64 `json:"nearFullRatio,omitempty"`
// BackfillFullRatio is the ratio at which the cluster is too full for backfill. Backfill will be disabled if above this threshold. Default is 0.90.
BackfillFullRatio *float64 `json:"backfillFullRatio,omitempty"`
// FullRatio is the ratio at which the cluster is considered full and ceph will stop accepting writes. Default is 0.85.
// +kubebuilder:validation:Minimum=0.0
// +kubebuilder:validation:Maximum=1.0
// +nullable
BackfillFullRatio *float64 `json:"backfillFullRatio,omitempty"`
FullRatio *float64 `json:"fullRatio,omitempty"`

// Whether to allow updating the device class after the OSD is initially provisioned
AllowDeviceClassUpdate bool `json:"allowDeviceClassUpdate,omitempty"`
}
Expand Down
10 changes: 5 additions & 5 deletions api/v1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 3 additions & 3 deletions config/crd/bases/ocs.openshift.io_storageclusters.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -752,7 +752,7 @@ spec:
backfillFullRatio:
description: BackfillFullRatio is the ratio at which the cluster
is too full for backfill. Backfill will be disabled if above
this threshold. Default is 0.90.
this threshold. Default is 0.80.
maximum: 1
minimum: 0
nullable: true
Expand All @@ -764,7 +764,7 @@ spec:
fullRatio:
description: FullRatio is the ratio at which the cluster is
considered full and ceph will stop accepting writes. Default
is 0.95.
is 0.85.
maximum: 1
minimum: 0
nullable: true
Expand All @@ -782,7 +782,7 @@ spec:
nearFullRatio:
description: NearFullRatio is the ratio at which the cluster
is considered nearly full and will raise a ceph health warning.
Default is 0.85.
Default is 0.75.
maximum: 1
minimum: 0
nullable: true
Expand Down
93 changes: 71 additions & 22 deletions controllers/storagecluster/cephcluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -1157,16 +1157,39 @@ func createPrometheusRules(r *StorageClusterReconciler, sc *ocsv1.StorageCluster
return err
}
applyLabels(getCephClusterMonitoringLabels(*sc), &prometheusRule.ObjectMeta)
replaceTokens := []exprReplaceToken{

replaceTokens := []replaceToken{
{
recordOrAlertName: "CephMgrIsAbsent",
wordInExpr: "openshift-storage",
wordToReplace: "openshift-storage",
replaceWith: sc.Namespace,
},
}

// if nearFullRatio/backfillFullRatio/fullRatio are specified on the StorageCluster CR, replace the values in the prometheus rule accordingly
specifiedNearFullRatio := sc.Spec.ManagedResources.CephCluster.NearFullRatio
specifiedBackfillFullRatio := sc.Spec.ManagedResources.CephCluster.BackfillFullRatio
specifiedFullRatio := sc.Spec.ManagedResources.CephCluster.FullRatio

if specifiedNearFullRatio != nil {
replaceTokens = append(replaceTokens,
createReplaceToken("", "", "75%", fmt.Sprintf("%.2f%%", *specifiedNearFullRatio*100)),
createReplaceToken("", "", "0.75", fmt.Sprintf("%f", *specifiedNearFullRatio)))
}
if specifiedBackfillFullRatio != nil {
replaceTokens = append(replaceTokens,
createReplaceToken("", "", "80%", fmt.Sprintf("%.2f%%", *specifiedBackfillFullRatio*100)),
createReplaceToken("", "", "0.80", fmt.Sprintf("%f", *specifiedBackfillFullRatio)))
}
if specifiedFullRatio != nil {
replaceTokens = append(replaceTokens,
createReplaceToken("", "", "85%", fmt.Sprintf("%.2f%%", *specifiedFullRatio*100)),
createReplaceToken("", "", "0.85", fmt.Sprintf("%f", *specifiedFullRatio)))
}

// nothing to replace in external mode
if name != prometheusExternalRuleName {
changePromRuleExpr(prometheusRule, replaceTokens)
changePromRule(prometheusRule, replaceTokens)
}

if err := createOrUpdatePrometheusRule(r, prometheusRule); err != nil {
Expand All @@ -1189,38 +1212,64 @@ func applyLabels(labels map[string]string, t *metav1.ObjectMeta) {
}
}

type exprReplaceToken struct {
type replaceToken struct {
groupName string
recordOrAlertName string
wordInExpr string
wordToReplace string
replaceWith string
}

func changePromRuleExpr(promRules *monitoringv1.PrometheusRule, replaceTokens []exprReplaceToken) {
if promRules == nil {
// createReplaceToken builds a replaceToken from its four string components.
// An empty groupName or recordOrAlertName leaves that filter unset, which
// changePromRule treats as "apply to every group" / "apply to every rule".
func createReplaceToken(groupName, recordOrAlertName, wordToReplace, replaceWith string) replaceToken {
	var token replaceToken
	token.groupName = groupName
	token.recordOrAlertName = recordOrAlertName
	token.wordToReplace = wordToReplace
	token.replaceWith = replaceWith
	return token
}

// changePromRule replaces the wordToReplace with replaceWith in the PrometheusRule
// This can be used to update the values in the PrometheusRule dynamically
func changePromRule(promRule *monitoringv1.PrometheusRule, tokens []replaceToken) {
if promRule == nil {
return
}
for _, eachToken := range replaceTokens {
// if both the words, one being replaced and the one replacing it, are same
// then we don't have to do anything
if eachToken.replaceWith == eachToken.wordInExpr {

// Iterate over each token for replacements
for _, token := range tokens {
// Skip if the word and replacement are the same
if token.replaceWith == token.wordToReplace {
continue
}
for gIndx, currGroup := range promRules.Spec.Groups {
if eachToken.groupName != "" && eachToken.groupName != currGroup.Name {

// Iterate through all groups in the Prometheus rule
for groupIdx := range promRule.Spec.Groups {
group := &promRule.Spec.Groups[groupIdx]
// If groupName is specified, ensure it matches; otherwise, apply to all groups
if token.groupName != "" && token.groupName != group.Name {
continue
}
for rIndx, currRule := range currGroup.Rules {
if eachToken.recordOrAlertName != "" {
if currRule.Record != "" && currRule.Record != eachToken.recordOrAlertName {
continue
} else if currRule.Alert != "" && currRule.Alert != eachToken.recordOrAlertName {
continue

// Iterate through the rules in the group
for ruleIdx := range group.Rules {
rule := &group.Rules[ruleIdx]
// If recordOrAlertName is specified, ensure it matches; otherwise, apply to all rules
if token.recordOrAlertName == "" || rule.Record == token.recordOrAlertName || rule.Alert == token.recordOrAlertName {
// Update the annotations in the rule
if rule.Annotations != nil {
// Update description if it exists
if description, exists := rule.Annotations["description"]; exists {
newDescription := strings.Replace(description, token.wordToReplace, token.replaceWith, -1)
rule.Annotations["description"] = newDescription
}
}
// Update the expression field in the rule
exprStr := rule.Expr.String()
if exprStr != "" {
newExpr := strings.Replace(exprStr, token.wordToReplace, token.replaceWith, -1)
rule.Expr = intstr.Parse(newExpr)
}
}
exprStr := currRule.Expr.String()
newExpr := strings.Replace(exprStr, eachToken.wordInExpr, eachToken.replaceWith, -1)
promRules.Spec.Groups[gIndx].Rules[rIndx].Expr = intstr.Parse(newExpr)
}
}
}
Expand Down
33 changes: 22 additions & 11 deletions controllers/storagecluster/cephcluster_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1041,29 +1041,40 @@ func TestParsePrometheusRules(t *testing.T) {
}

func TestChangePrometheusExprFunc(t *testing.T) {
prometheusRules, err := parsePrometheusRule(localPrometheusRules)
prometheusRule, err := parsePrometheusRule(localPrometheusRules)
assert.NilError(t, err)
var changeTokens = []exprReplaceToken{
{recordOrAlertName: "CephMgrIsAbsent", wordInExpr: "openshift-storage", replaceWith: "new-namespace"},
var changeTokens = []replaceToken{
{recordOrAlertName: "CephMgrIsAbsent", wordToReplace: "openshift-storage", replaceWith: "new-namespace"},
// when alert or record name is not specified,
// the change should affect all the expressions which have the 'wordToReplace'
{recordOrAlertName: "", wordInExpr: "ceph_pool_stored_raw", replaceWith: "new_ceph_pool_stored_raw"},
{recordOrAlertName: "", wordToReplace: "ceph_pool_stored_raw", replaceWith: "new_ceph_pool_stored_raw"},
{recordOrAlertName: "", wordToReplace: "0.75", replaceWith: "0.775"},
{recordOrAlertName: "", wordToReplace: "85%", replaceWith: "92.50%"},
}
changePromRuleExpr(prometheusRules, changeTokens)
alertNameAndChangedExpr := [][2]string{
changePromRule(prometheusRule, changeTokens)

recordOrAlertNameAndReplacedWord := [][2]string{
{"CephMgrIsAbsent", "new-namespace"},
{"CephPoolQuotaBytesNearExhaustion", "new_ceph_pool_stored_raw"},
{"CephPoolQuotaBytesCriticallyExhausted", "new_ceph_pool_stored_raw"},
{"CephClusterNearFull", "0.775"},
{"CephOSDNearFull", "0.775"},
{"CephClusterNearFull", "92.50%"},
{"CephClusterCriticallyFull", "92.50%"},
{"CephClusterReadOnly", "92.50%"},
}
for _, grp := range prometheusRules.Spec.Groups {
for _, grp := range prometheusRule.Spec.Groups {
for _, rule := range grp.Rules {
for _, eachAlertChanged := range alertNameAndChangedExpr {
alertName := eachAlertChanged[0]
changeStr := eachAlertChanged[1]
for _, eachChange := range recordOrAlertNameAndReplacedWord {
alertName := eachChange[0]
changeStr := eachChange[1]
if rule.Alert != alertName {
continue
}
assert.Assert(t, strings.Contains(rule.Expr.String(), changeStr))
assert.Assert(t,
strings.Contains(rule.Expr.String(), changeStr) ||
(rule.Annotations != nil && strings.Contains(rule.Annotations["description"], changeStr)),
fmt.Sprintf("Expected '%s' to be found in either Expr or Annotations for alert %s", changeStr, alertName))
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -752,7 +752,7 @@ spec:
backfillFullRatio:
description: BackfillFullRatio is the ratio at which the cluster
is too full for backfill. Backfill will be disabled if above
this threshold. Default is 0.90.
this threshold. Default is 0.80.
maximum: 1
minimum: 0
nullable: true
Expand All @@ -764,7 +764,7 @@ spec:
fullRatio:
description: FullRatio is the ratio at which the cluster is
considered full and ceph will stop accepting writes. Default
is 0.95.
is 0.85.
maximum: 1
minimum: 0
nullable: true
Expand All @@ -782,7 +782,7 @@ spec:
nearFullRatio:
description: NearFullRatio is the ratio at which the cluster
is considered nearly full and will raise a ceph health warning.
Default is 0.85.
Default is 0.75.
maximum: 1
minimum: 0
nullable: true
Expand Down
6 changes: 3 additions & 3 deletions deploy/ocs-operator/manifests/storagecluster.crd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -752,7 +752,7 @@ spec:
backfillFullRatio:
description: BackfillFullRatio is the ratio at which the cluster
is too full for backfill. Backfill will be disabled if above
this threshold. Default is 0.90.
this threshold. Default is 0.80.
maximum: 1
minimum: 0
nullable: true
Expand All @@ -764,7 +764,7 @@ spec:
fullRatio:
description: FullRatio is the ratio at which the cluster is
considered full and ceph will stop accepting writes. Default
is 0.95.
is 0.85.
maximum: 1
minimum: 0
nullable: true
Expand All @@ -782,7 +782,7 @@ spec:
nearFullRatio:
description: NearFullRatio is the ratio at which the cluster
is considered nearly full and will raise a ceph health warning.
Default is 0.85.
Default is 0.75.
maximum: 1
minimum: 0
nullable: true
Expand Down

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit afaf9a4

Please sign in to comment.