diff --git a/PROJECT b/PROJECT index 4bfc6c97792..4782b9fcf91 100644 --- a/PROJECT +++ b/PROJECT @@ -34,6 +34,7 @@ resources: path: github.com/opendatahub-io/opendatahub-operator/v2/apis/datasciencecluster/v1 version: v1 webhooks: + defaulting: true validation: true webhookVersion: v1 version: "3" diff --git a/README.md b/README.md index f66c5c3f7a3..89c7a7b3144 100644 --- a/README.md +++ b/README.md @@ -325,12 +325,17 @@ spec: managementState: Managed modelmeshserving: managementState: Managed + modelregistry: + managementState: Managed + registriesNamespace: "rhoai-model-registries" ray: managementState: Managed trainingoperator: managementState: Managed workbenches: managementState: Managed + modelregistry: + managementState: Managed ``` 2. Enable only Dashboard and Workbenches diff --git a/apis/datasciencecluster/v1/datasciencecluster_types.go b/apis/datasciencecluster/v1/datasciencecluster_types.go index b24633ad56b..d64753e07af 100644 --- a/apis/datasciencecluster/v1/datasciencecluster_types.go +++ b/apis/datasciencecluster/v1/datasciencecluster_types.go @@ -31,10 +31,12 @@ import ( "github.com/opendatahub-io/opendatahub-operator/v2/components/kserve" "github.com/opendatahub-io/opendatahub-operator/v2/components/kueue" "github.com/opendatahub-io/opendatahub-operator/v2/components/modelmeshserving" + "github.com/opendatahub-io/opendatahub-operator/v2/components/modelregistry" "github.com/opendatahub-io/opendatahub-operator/v2/components/ray" "github.com/opendatahub-io/opendatahub-operator/v2/components/trainingoperator" "github.com/opendatahub-io/opendatahub-operator/v2/components/trustyai" "github.com/opendatahub-io/opendatahub-operator/v2/components/workbenches" + "github.com/opendatahub-io/opendatahub-operator/v2/controllers/status" "github.com/opendatahub-io/opendatahub-operator/v2/pkg/cluster" ) @@ -80,6 +82,15 @@ type Components struct { //Training Operator component configuration. TrainingOperator trainingoperator.TrainingOperator `json:"trainingoperator,omitempty"` + + // ModelRegistry component configuration. + ModelRegistry modelregistry.ModelRegistry `json:"modelregistry,omitempty"` +} + +// ComponentsStatus defines the custom status of DataScienceCluster components. +type ComponentsStatus struct { + // ModelRegistry component status + ModelRegistry *status.ModelRegistryStatus `json:"modelregistry,omitempty"` } // DataScienceClusterStatus defines the observed state of DataScienceCluster. @@ -101,6 +112,10 @@ type DataScienceClusterStatus struct { // List of components with status if installed or not InstalledComponents map[string]bool `json:"installedComponents,omitempty"` + // Expose component's specific status + // +optional + Components ComponentsStatus `json:"components,omitempty"` + // Version and release type Release cluster.Release `json:"release,omitempty"` } diff --git a/apis/datasciencecluster/v1/zz_generated.deepcopy.go b/apis/datasciencecluster/v1/zz_generated.deepcopy.go index a9f4ff44cf1..035c38f876c 100644 --- a/apis/datasciencecluster/v1/zz_generated.deepcopy.go +++ b/apis/datasciencecluster/v1/zz_generated.deepcopy.go @@ -21,6 +21,7 @@ limitations under the License. package v1 import ( + "github.com/opendatahub-io/opendatahub-operator/v2/controllers/status" conditionsv1 "github.com/openshift/custom-resource-status/conditions/v1" corev1 "k8s.io/api/core/v1" runtime "k8s.io/apimachinery/pkg/runtime" @@ -39,6 +40,7 @@ func (in *Components) DeepCopyInto(out *Components) { in.Ray.DeepCopyInto(&out.Ray) in.TrustyAI.DeepCopyInto(&out.TrustyAI) in.TrainingOperator.DeepCopyInto(&out.TrainingOperator) + in.ModelRegistry.DeepCopyInto(&out.ModelRegistry) } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Components. @@ -51,6 +53,26 @@ func (in *Components) DeepCopy() *Components { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ComponentsStatus) DeepCopyInto(out *ComponentsStatus) { + *out = *in + if in.ModelRegistry != nil { + in, out := &in.ModelRegistry, &out.ModelRegistry + *out = new(status.ModelRegistryStatus) + **out = **in + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ComponentsStatus. +func (in *ComponentsStatus) DeepCopy() *ComponentsStatus { + if in == nil { + return nil + } + out := new(ComponentsStatus) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *DataScienceCluster) DeepCopyInto(out *DataScienceCluster) { *out = *in @@ -148,6 +170,7 @@ func (in *DataScienceClusterStatus) DeepCopyInto(out *DataScienceClusterStatus) (*out)[key] = val } } + in.Components.DeepCopyInto(&out.Components) in.Release.DeepCopyInto(&out.Release) } diff --git a/bundle/manifests/datasciencecluster.opendatahub.io_datascienceclusters.yaml b/bundle/manifests/datasciencecluster.opendatahub.io_datascienceclusters.yaml index 6972bf35135..1332c2fb646 100644 --- a/bundle/manifests/datasciencecluster.opendatahub.io_datascienceclusters.yaml +++ b/bundle/manifests/datasciencecluster.opendatahub.io_datascienceclusters.yaml @@ -385,6 +385,63 @@ spec: pattern: ^(Managed|Unmanaged|Force|Removed)$ type: string type: object + modelregistry: + description: ModelRegistry component configuration. + properties: + devFlags: + description: Add developer fields + properties: + manifests: + description: List of custom manifests for the given component + items: + properties: + contextDir: + default: "" + description: contextDir is the relative path to + the folder containing manifests in a repository + type: string + sourcePath: + default: "" + description: 'sourcePath is the subpath within contextDir + where kustomize builds start. Examples include + any sub-folder or path: `base`, `overlays/dev`, + `default`, `odh` etc' + type: string + uri: + default: "" + description: uri is the URI point to a git repo + with tag/branch. e.g https://github.com/org/repo/tarball/ + type: string + type: object + type: array + type: object + managementState: + description: "Set to one of the following values: \n - \"Managed\" + : the operator is actively managing the component and trying + to keep it active. It will only upgrade the component if + it is safe to do so \n - \"Removed\" : the operator is actively + managing the component and will not install it, or if it + is installed, the operator will try to remove it" + enum: + - Managed + - Removed + pattern: ^(Managed|Unmanaged|Force|Removed)$ + type: string + registriesNamespace: + default: rhoai-model-registries + description: Namespace for model registries to be installed, + configurable only once when model registry is enabled, defaults + to "rhoai-model-registries" + maxLength: 63 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?)?$ + type: string + type: object + x-kubernetes-validations: + - message: RegistriesNamespace is immutable when model registry + is Managed + rule: (self.managementState != 'Managed') || (oldSelf.registriesNamespace + == '') || (oldSelf.managementState != 'Managed')|| (self.registriesNamespace + == oldSelf.registriesNamespace) ray: description: Ray component configuration. properties: @@ -566,6 +623,16 @@ spec: status: description: DataScienceClusterStatus defines the observed state of DataScienceCluster. properties: + components: + description: Expose component's specific status + properties: + modelregistry: + description: ModelRegistry component status + properties: + registriesNamespace: + type: string + type: object + type: object conditions: description: Conditions describes the state of the DataScienceCluster resource. diff --git a/bundle/manifests/rhods-operator.clusterserviceversion.yaml b/bundle/manifests/rhods-operator.clusterserviceversion.yaml index a759df5c774..4eadd900113 100644 --- a/bundle/manifests/rhods-operator.clusterserviceversion.yaml +++ b/bundle/manifests/rhods-operator.clusterserviceversion.yaml @@ -46,6 +46,10 @@ metadata: "modelmeshserving": { "managementState": "Managed" }, + "modelregistry": { + "managementState": "Managed" + "registriesNamespace": "rhoai-model-registries" + }, "ray": { "managementState": "Managed" }, @@ -1032,6 +1036,32 @@ spec: - update - use - watch + - apiGroups: + - modelregistry.opendatahub.io + resources: + - modelregistries + verbs: + - create + - delete + - get + - list + - patch + - update + - watch + - apiGroups: + - modelregistry.opendatahub.io + resources: + - modelregistries/finalizers + verbs: + - update + - apiGroups: + - modelregistry.opendatahub.io + resources: + - modelregistries/status + verbs: + - get + - patch + - update - apiGroups: - monitoring.coreos.com resources: @@ -1793,6 +1823,26 @@ spec: name: Red Hat version: 2.14.0 webhookdefinitions: + - admissionReviewVersions: + - v1 + containerPort: 443 + deploymentName: opendatahub-operator-controller-manager + failurePolicy: Fail + generateName: mutate.operator.opendatahub.io + rules: + - apiGroups: + - datasciencecluster.opendatahub.io + apiVersions: + - v1 + operations: + - CREATE + - UPDATE + resources: + - datascienceclusters + sideEffects: None + targetPort: 9443 + type: MutatingAdmissionWebhook + webhookPath: /mutate-opendatahub-io-v1 - admissionReviewVersions: - v1 containerPort: 443 diff --git a/components/component.go b/components/component.go index 0730db60559..cd6d44c0972 100644 --- a/components/component.go +++ b/components/component.go @@ -132,6 +132,8 @@ func (c *Component) UpdatePrometheusConfig(_ client.Client, logger logr.Logger, TrustyAIARules string `yaml:"trustyai-alerting.rules"` KserveRRules string `yaml:"kserve-recording.rules"` KserveARules string `yaml:"kserve-alerting.rules"` + ModelRegistryRRules string `yaml:"model-registry-operator-recording.rules"` + ModelRegistryARules string `yaml:"model-registry-operator-alerting.rules"` } `yaml:"data"` } var configMap ConfigMap diff --git a/components/modelregistry/modelregistry.go b/components/modelregistry/modelregistry.go new file mode 100644 index 00000000000..23e4320fe47 --- /dev/null +++ b/components/modelregistry/modelregistry.go @@ -0,0 +1,222 @@ +// Package modelregistry provides utility functions to config ModelRegistry, an ML Model metadata repository service +// +groupName=datasciencecluster.opendatahub.io +package modelregistry + +import ( + "context" + "errors" + "fmt" + "path/filepath" + "strings" + "text/template" + + "github.com/go-logr/logr" + operatorv1 "github.com/openshift/api/operator/v1" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "sigs.k8s.io/controller-runtime/pkg/client" + + dsciv1 "github.com/opendatahub-io/opendatahub-operator/v2/apis/dscinitialization/v1" + infrav1 "github.com/opendatahub-io/opendatahub-operator/v2/apis/infrastructure/v1" + "github.com/opendatahub-io/opendatahub-operator/v2/components" + "github.com/opendatahub-io/opendatahub-operator/v2/pkg/cluster" + "github.com/opendatahub-io/opendatahub-operator/v2/pkg/conversion" + "github.com/opendatahub-io/opendatahub-operator/v2/pkg/deploy" + + _ "embed" +) + +const DefaultModelRegistryCert = "default-modelregistry-cert" + +var ( + ComponentName = "model-registry-operator" + DefaultModelRegistriesNamespace = "rhoai-model-registries" + Path = deploy.DefaultManifestPath + "/" + ComponentName + "/overlays/odh" + // we should not apply this label to the namespace, as it triggered namspace deletion during operator uninstall + // modelRegistryLabels = cluster.WithLabels( + // labels.ODH.OwnedNamespace, "true", + // ). +) + +// Verifies that ModelRegistry implements ComponentInterface. +var _ components.ComponentInterface = (*ModelRegistry)(nil) + +// ModelRegistry struct holds the configuration for the ModelRegistry component. +// The property `registriesNamespace` is immutable when `managementState` is `Managed` + +// +kubebuilder:object:generate=true +// +kubebuilder:validation:XValidation:rule="(self.managementState != 'Managed') || (oldSelf.registriesNamespace == '') || (oldSelf.managementState != 'Managed')|| (self.registriesNamespace == oldSelf.registriesNamespace)",message="RegistriesNamespace is immutable when model registry is Managed" +//nolint:lll + +type ModelRegistry struct { + components.Component `json:""` + + // Namespace for model registries to be installed, configurable only once when model registry is enabled, defaults to "rhoai-model-registries" + // +kubebuilder:default="rhoai-model-registries" + // +kubebuilder:validation:Pattern="^([a-z0-9]([-a-z0-9]*[a-z0-9])?)?$" + // +kubebuilder:validation:MaxLength=63 + RegistriesNamespace string `json:"registriesNamespace,omitempty"` +} + +func (m *ModelRegistry) OverrideManifests(ctx context.Context, _ cluster.Platform) error { + // If devflags are set, update default manifests path + if len(m.DevFlags.Manifests) != 0 { + manifestConfig := m.DevFlags.Manifests[0] + if err := deploy.DownloadManifests(ctx, ComponentName, manifestConfig); err != nil { + return err + } + // If overlay is defined, update paths + defaultKustomizePath := "overlays/odh" + if manifestConfig.SourcePath != "" { + defaultKustomizePath = manifestConfig.SourcePath + } + Path = filepath.Join(deploy.DefaultManifestPath, ComponentName, defaultKustomizePath) + } + + return nil +} + +func (m *ModelRegistry) GetComponentName() string { + return ComponentName +} + +func (m *ModelRegistry) ReconcileComponent(ctx context.Context, cli client.Client, logger logr.Logger, + owner metav1.Object, dscispec *dsciv1.DSCInitializationSpec, platform cluster.Platform, _ bool) error { + l := m.ConfigComponentLogger(logger, ComponentName, dscispec) + var imageParamMap = map[string]string{ + "IMAGES_MODELREGISTRY_OPERATOR": "RELATED_IMAGE_ODH_MODEL_REGISTRY_OPERATOR_IMAGE", + "IMAGES_GRPC_SERVICE": "RELATED_IMAGE_ODH_MLMD_GRPC_SERVER_IMAGE", + "IMAGES_REST_SERVICE": "RELATED_IMAGE_ODH_MODEL_REGISTRY_IMAGE", + } + enabled := m.GetManagementState() == operatorv1.Managed + monitoringEnabled := dscispec.Monitoring.ManagementState == operatorv1.Managed + + if enabled { + // return error if ServiceMesh is not enabled, as it's a required feature + if dscispec.ServiceMesh == nil || dscispec.ServiceMesh.ManagementState != operatorv1.Managed { + return errors.New("ServiceMesh needs to be set to 'Managed' in DSCI CR, it is required by Model Registry") + } + + if err := m.createDependencies(ctx, cli, dscispec); err != nil { + return err + } + + if m.DevFlags != nil { + // Download manifests and update paths + if err := m.OverrideManifests(ctx, platform); err != nil { + return err + } + } + + // Update image parameters only when we do not have customized manifests set + if (dscispec.DevFlags == nil || dscispec.DevFlags.ManifestsUri == "") && (m.DevFlags == nil || len(m.DevFlags.Manifests) == 0) { + extraParamsMap := map[string]string{ + "DEFAULT_CERT": DefaultModelRegistryCert, + } + if err := deploy.ApplyParams(Path, imageParamMap, extraParamsMap); err != nil { + return fmt.Errorf("failed to update image from %s : %w", Path, err) + } + } + + // Create model registries namespace + // We do not delete this namespace even when ModelRegistry is Removed or when operator is uninstalled. + ns, err := cluster.CreateNamespace(ctx, cli, m.RegistriesNamespace) + if err != nil { + return err + } + l.Info("created model registry namespace", "namespace", m.RegistriesNamespace) + // create servicemeshmember here, for now until post MVP solution + err = enrollToServiceMesh(ctx, cli, dscispec, ns) + if err != nil { + return err + } + l.Info("created model registry servicemesh member", "namespace", m.RegistriesNamespace) + } else { + err := m.removeDependencies(ctx, cli, dscispec) + if err != nil { + return err + } + } + + // Deploy ModelRegistry Operator + if err := deploy.DeployManifestsFromPath(ctx, cli, owner, Path, dscispec.ApplicationsNamespace, m.GetComponentName(), enabled); err != nil { + return err + } + l.Info("apply manifests done") + + // Create additional model registry resources, componentEnabled=true because these extras are never deleted! + if err := deploy.DeployManifestsFromPath(ctx, cli, owner, Path+"/extras", dscispec.ApplicationsNamespace, m.GetComponentName(), true); err != nil { + return err + } + l.Info("apply extra manifests done") + + if enabled { + if err := cluster.WaitForDeploymentAvailable(ctx, cli, m.GetComponentName(), dscispec.ApplicationsNamespace, 10, 1); err != nil { + return fmt.Errorf("deployment for %s is not ready to server: %w", ComponentName, err) + } + } + + // CloudService Monitoring handling + if platform == cluster.ManagedRhods { + if err := m.UpdatePrometheusConfig(cli, l, enabled && monitoringEnabled, ComponentName); err != nil { + return err + } + if err := deploy.DeployManifestsFromPath(ctx, cli, owner, + filepath.Join(deploy.DefaultManifestPath, "monitoring", "prometheus", "apps"), + dscispec.Monitoring.Namespace, + "prometheus", true); err != nil { + return err + } + l.Info("updating SRE monitoring done") + } + return nil +} + +func (m *ModelRegistry) createDependencies(ctx context.Context, cli client.Client, dscispec *dsciv1.DSCInitializationSpec) error { + // create DefaultModelRegistryCert + if err := cluster.PropagateDefaultIngressCertificate(ctx, cli, DefaultModelRegistryCert, dscispec.ServiceMesh.ControlPlane.Namespace); err != nil { + return err + } + return nil +} + +func (m *ModelRegistry) removeDependencies(ctx context.Context, cli client.Client, dscispec *dsciv1.DSCInitializationSpec) error { + // delete DefaultModelRegistryCert + certSecret := corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Name: DefaultModelRegistryCert, + Namespace: dscispec.ServiceMesh.ControlPlane.Namespace, + }, + } + // ignore error if the secret has already been removed + if err := cli.Delete(ctx, &certSecret); client.IgnoreNotFound(err) != nil { + return err + } + return nil +} + +//go:embed resources/servicemesh-member.tmpl.yaml +var smmTemplate string + +func enrollToServiceMesh(ctx context.Context, cli client.Client, dscispec *dsciv1.DSCInitializationSpec, namespace *corev1.Namespace) error { + tmpl, err := template.New("servicemeshmember").Parse(smmTemplate) + if err != nil { + return fmt.Errorf("error parsing servicemeshmember template: %w", err) + } + builder := strings.Builder{} + controlPlaneData := struct { + Namespace string + ControlPlane *infrav1.ControlPlaneSpec + }{Namespace: namespace.Name, ControlPlane: &dscispec.ServiceMesh.ControlPlane} + + if err = tmpl.Execute(&builder, controlPlaneData); err != nil { + return fmt.Errorf("error executing servicemeshmember template: %w", err) + } + + unstrObj, err := conversion.StrToUnstructured(builder.String()) + if err != nil || len(unstrObj) != 1 { + return fmt.Errorf("error converting servicemeshmember template: %w", err) + } + + return client.IgnoreAlreadyExists(cli.Create(ctx, unstrObj[0])) +} diff --git a/components/modelregistry/resources/servicemesh-member.tmpl.yaml b/components/modelregistry/resources/servicemesh-member.tmpl.yaml new file mode 100644 index 00000000000..8665f2ba54f --- /dev/null +++ b/components/modelregistry/resources/servicemesh-member.tmpl.yaml @@ -0,0 +1,9 @@ +apiVersion: maistra.io/v1 +kind: ServiceMeshMember +metadata: + name: default + namespace: {{.Namespace}} +spec: + controlPlaneRef: + namespace: {{ .ControlPlane.Namespace }} + name: {{ .ControlPlane.Name }} diff --git a/components/modelregistry/zz_generated.deepcopy.go b/components/modelregistry/zz_generated.deepcopy.go new file mode 100644 index 00000000000..86c4a17e14c --- /dev/null +++ b/components/modelregistry/zz_generated.deepcopy.go @@ -0,0 +1,39 @@ +//go:build !ignore_autogenerated + +/* +Copyright 2023. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Code generated by controller-gen. DO NOT EDIT. + +package modelregistry + +import () + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ModelRegistry) DeepCopyInto(out *ModelRegistry) { + *out = *in + in.Component.DeepCopyInto(&out.Component) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ModelRegistry. +func (in *ModelRegistry) DeepCopy() *ModelRegistry { + if in == nil { + return nil + } + out := new(ModelRegistry) + in.DeepCopyInto(out) + return out +} diff --git a/config/crd/bases/datasciencecluster.opendatahub.io_datascienceclusters.yaml b/config/crd/bases/datasciencecluster.opendatahub.io_datascienceclusters.yaml index 4590dcfe9d1..ba6a6770e85 100644 --- a/config/crd/bases/datasciencecluster.opendatahub.io_datascienceclusters.yaml +++ b/config/crd/bases/datasciencecluster.opendatahub.io_datascienceclusters.yaml @@ -398,6 +398,66 @@ spec: pattern: ^(Managed|Unmanaged|Force|Removed)$ type: string type: object + modelregistry: + description: ModelRegistry component configuration. + properties: + devFlags: + description: Add developer fields + properties: + manifests: + description: List of custom manifests for the given component + items: + properties: + contextDir: + default: manifests + description: contextDir is the relative path to + the folder containing manifests in a repository, + default value "manifests" + type: string + sourcePath: + default: "" + description: 'sourcePath is the subpath within contextDir + where kustomize builds start. Examples include + any sub-folder or path: `base`, `overlays/dev`, + `default`, `odh` etc.' + type: string + uri: + default: "" + description: uri is the URI point to a git repo + with tag/branch. e.g. https://github.com/org/repo/tarball/ + type: string + type: object + type: array + type: object + managementState: + description: |- + Set to one of the following values: + + - "Managed" : the operator is actively managing the component and trying to keep it active. + It will only upgrade the component if it is safe to do so + + - "Removed" : the operator is actively managing the component and will not install it, + or if it is installed, the operator will try to remove it + enum: + - Managed + - Removed + pattern: ^(Managed|Unmanaged|Force|Removed)$ + type: string + registriesNamespace: + default: rhoai-model-registries + description: Namespace for model registries to be installed, + configurable only once when model registry is enabled, defaults + to "rhoai-model-registries" + maxLength: 63 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?)?$ + type: string + type: object + x-kubernetes-validations: + - message: RegistriesNamespace is immutable when model registry + is Managed + rule: (self.managementState != 'Managed') || (oldSelf.registriesNamespace + == '') || (oldSelf.managementState != 'Managed')|| (self.registriesNamespace + == oldSelf.registriesNamespace) ray: description: Ray component configuration. properties: @@ -587,6 +647,16 @@ spec: status: description: DataScienceClusterStatus defines the observed state of DataScienceCluster. properties: + components: + description: Expose component's specific status + properties: + modelregistry: + description: ModelRegistry component status + properties: + registriesNamespace: + type: string + type: object + type: object conditions: description: Conditions describes the state of the DataScienceCluster resource. diff --git a/config/monitoring/prometheus/apps/prometheus-configs.yaml b/config/monitoring/prometheus/apps/prometheus-configs.yaml index b19ec0394ba..933421afca7 100644 --- a/config/monitoring/prometheus/apps/prometheus-configs.yaml +++ b/config/monitoring/prometheus/apps/prometheus-configs.yaml @@ -372,6 +372,31 @@ data: target_label: __address__ replacement: ${1}:8080 + - job_name: 'Model Registry Operator' + honor_labels: true + metrics_path: /metrics + scheme: https + tls_config: + insecure_skip_verify: true + params: + module: [http_2xx] + authorization: + credentials_file: /run/secrets/kubernetes.io/serviceaccount/token + kubernetes_sd_configs: + - role: endpoints + namespaces: + names: + - + relabel_configs: + - source_labels: [__meta_kubernetes_service_name] + regex: ^(model-registry-operator-controller-manager-metrics-service)$ + target_label: kubernetes_name + action: keep + - source_labels: [__address__] + regex: (.+):(\d+) + target_label: __address__ + replacement: ${1}:8443 + - job_name: 'RHOAI Metrics' honor_labels: true scheme: http @@ -1577,3 +1602,92 @@ data: labels: severity: warning instance: trustyai-service-operator-controller-manager + + model-registry-operator-recording.rules: | + groups: + - name: SLOs - Model Registry Operator + rules: + - expr: | + absent(up{job=~'Model Registry Operator'}) * 0 or vector(1) + labels: + instance: model-registry-operator + record: probe_success + - expr: | + 1 - min(avg_over_time(probe_success{instance="model-registry-operator"}[1d])) + labels: + instance: model-registry-operator + record: probe_success:burnrate1d + - expr: | + 1 - min(avg_over_time(probe_success{instance="model-registry-operator"}[1h])) + labels: + instance: model-registry-operator + record: probe_success:burnrate1h + - expr: | + 1 - min(avg_over_time(probe_success{instance="model-registry-operator"}[2h])) + labels: + instance: model-registry-operator + record: probe_success:burnrate2h + - expr: | + 1 - min(avg_over_time(probe_success{instance="model-registry-operator"}[30m])) + labels: + instance: model-registry-operator + record: probe_success:burnrate30m + - expr: | + 1 - min(avg_over_time(probe_success{instance="model-registry-operator"}[3d])) + labels: + instance: model-registry-operator + record: probe_success:burnrate3d + - expr: | + 1 - min(avg_over_time(probe_success{instance="model-registry-operator"}[5m])) + labels: + instance: model-registry-operator + record: probe_success:burnrate5m + - expr: | + 1 - min(avg_over_time(probe_success{instance="model-registry-operator"}[6h])) + labels: + instance: model-registry-operator + record: probe_success:burnrate6h + + model-registry-operator-alerting.rules: | + groups: + - name: SLOs-probe_success_model_controller + rules: + - alert: Model Registry Operator Probe Success Burn Rate + annotations: + message: 'High error budget burn for {{ $labels.instance }} (current value: {{ $value }}).' + triage: "https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Model-Serving/rhods-model-registry-operator-probe-success-burn-rate.md" + summary: Model Registry Operator Probe Success Burn Rate + expr: | + sum(probe_success:burnrate5m{instance=~"model-registry-operator"}) by (instance) > (14.40 * (1-0.98000)) + and + sum(probe_success:burnrate1h{instance=~"model-registry-operator"}) by (instance) > (14.40 * (1-0.98000)) + for: 2m + labels: + severity: critical + namespace: redhat-ods-applications + - alert: Model Registry Operator Probe Success Burn Rate + annotations: + message: 'High error budget burn for {{ $labels.instance }} (current value: {{ $value }}).' + triage: "https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Model-Serving/rhods-model-registry-operator-probe-success-burn-rate.md" + summary: Model Registry Operator Probe Success Burn Rate + expr: | + sum(probe_success:burnrate30m{instance=~"model-registry-operator"}) by (instance) > (6.00 * (1-0.98000)) + and + sum(probe_success:burnrate6h{instance=~"model-registry-operator"}) by (instance) > (6.00 * (1-0.98000)) + for: 15m + labels: + severity: critical + namespace: redhat-ods-applications + - alert: Model Registry Operator Probe Success Burn Rate + annotations: + message: 'High error budget burn for {{ $labels.instance }} (current value: {{ $value }}).' + triage: "https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Model-Serving/rhods-model-registry-operator-probe-success-burn-rate.md" + summary: Model Registry Operator Probe Success Burn Rate + expr: | + sum(probe_success:burnrate2h{instance=~"model-registry-operator"}) by (instance) > (3.00 * (1-0.98000)) + and + sum(probe_success:burnrate1d{instance=~"model-registry-operator"}) by (instance) > (3.00 * (1-0.98000)) + for: 1h + labels: + severity: warning + namespace: redhat-ods-applications diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml index 8ddcba22058..8327a66c0ed 100644 --- a/config/rbac/role.yaml +++ b/config/rbac/role.yaml @@ -504,6 +504,32 @@ rules: - update - use - watch +- apiGroups: + - modelregistry.opendatahub.io + resources: + - modelregistries + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - modelregistry.opendatahub.io + resources: + - modelregistries/finalizers + verbs: + - update +- apiGroups: + - modelregistry.opendatahub.io + resources: + - modelregistries/status + verbs: + - get + - patch + - update - apiGroups: - monitoring.coreos.com resources: diff --git a/config/samples/datasciencecluster_v1_datasciencecluster.yaml b/config/samples/datasciencecluster_v1_datasciencecluster.yaml index 4ffd8b417e8..922469140e2 100644 --- a/config/samples/datasciencecluster_v1_datasciencecluster.yaml +++ b/config/samples/datasciencecluster_v1_datasciencecluster.yaml @@ -39,4 +39,7 @@ spec: workbenches: managementState: "Managed" trustyai: - managementState: "Removed" + managementState: "Removed" + modelregistry: + managementState: "Managed" + registriesNamespace: "rhoai-model-registries" diff --git a/config/webhook/manifests.yaml b/config/webhook/manifests.yaml index 3dcfb31f3e3..9c1174d5173 100644 --- a/config/webhook/manifests.yaml +++ b/config/webhook/manifests.yaml @@ -1,5 +1,31 @@ --- apiVersion: admissionregistration.k8s.io/v1 +kind: MutatingWebhookConfiguration +metadata: + name: mutating-webhook-configuration +webhooks: +- admissionReviewVersions: + - v1 + clientConfig: + service: + name: webhook-service + namespace: system + path: /mutate-opendatahub-io-v1 + failurePolicy: Fail + name: mutate.operator.opendatahub.io + rules: + - apiGroups: + - datasciencecluster.opendatahub.io + apiVersions: + - v1 + operations: + - CREATE + - UPDATE + resources: + - datascienceclusters + sideEffects: None +--- +apiVersion: admissionregistration.k8s.io/v1 kind: ValidatingWebhookConfiguration metadata: name: validating-webhook-configuration diff --git a/controllers/datasciencecluster/datasciencecluster_controller.go b/controllers/datasciencecluster/datasciencecluster_controller.go index 1189e4b7d2f..cd1cec7ee8d 100644 --- a/controllers/datasciencecluster/datasciencecluster_controller.go +++ b/controllers/datasciencecluster/datasciencecluster_controller.go @@ -53,6 +53,7 @@ import ( dsciv1 "github.com/opendatahub-io/opendatahub-operator/v2/apis/dscinitialization/v1" "github.com/opendatahub-io/opendatahub-operator/v2/components" "github.com/opendatahub-io/opendatahub-operator/v2/components/datasciencepipelines" + "github.com/opendatahub-io/opendatahub-operator/v2/components/modelregistry" "github.com/opendatahub-io/opendatahub-operator/v2/controllers/status" "github.com/opendatahub-io/opendatahub-operator/v2/pkg/cluster" annotations "github.com/opendatahub-io/opendatahub-operator/v2/pkg/metadata/annotations" @@ -311,6 +312,8 @@ func (r *DataScienceClusterReconciler) reconcileSubComponent(ctx context.Context } err = component.ReconcileComponent(ctx, r.Client, r.Log, instance, r.DataScienceCluster.DSCISpec, platform, installedComponentValue) + // TODO: replace this hack with a full refactor of component status in the future + if err != nil { // reconciliation failed: log errors, raise event and update status accordingly instance = r.reportError(err, instance, "failed to reconcile "+componentName+" on DataScienceCluster") @@ -339,6 +342,15 @@ func (r *DataScienceClusterReconciler) reconcileSubComponent(ctx context.Context default: status.RemoveComponentCondition(&saved.Status.Conditions, componentName) } + + // TODO: replace this hack with a full refactor of component status in the future + if mr, isMR := component.(*modelregistry.ModelRegistry); isMR { + if enabled { + saved.Status.Components.ModelRegistry = &status.ModelRegistryStatus{RegistriesNamespace: mr.RegistriesNamespace} + } else { + saved.Status.Components.ModelRegistry = nil + } + } }) if err != nil { instance = r.reportError(err, instance, "failed to update DataScienceCluster status after reconciling "+componentName) diff --git a/controllers/datasciencecluster/kubebuilder_rbac.go b/controllers/datasciencecluster/kubebuilder_rbac.go index 7aa9fb3c6b5..36ead7c0ca3 100644 --- a/controllers/datasciencecluster/kubebuilder_rbac.go +++ b/controllers/datasciencecluster/kubebuilder_rbac.go @@ -122,6 +122,10 @@ package datasciencecluster // +kubebuilder:rbac:groups="monitoring.coreos.com",resources=probes,verbs=get;create;patch;delete;deletecollection // +kubebuilder:rbac:groups="monitoring.coreos.com",resources=prometheusrules,verbs=get;create;patch;delete;deletecollection +// +kubebuilder:rbac:groups=modelregistry.opendatahub.io,resources=modelregistries,verbs=get;list;watch;create;update;patch;delete +// +kubebuilder:rbac:groups=modelregistry.opendatahub.io,resources=modelregistries/status,verbs=get;update;patch +// +kubebuilder:rbac:groups=modelregistry.opendatahub.io,resources=modelregistries/finalizers,verbs=update + // +kubebuilder:rbac:groups="monitoring.coreos.com",resources=prometheuses/finalizers,verbs=get;create;patch;delete;deletecollection // +kubebuilder:rbac:groups="monitoring.coreos.com",resources=prometheuses/status,verbs=get;create;patch;delete;deletecollection diff --git a/controllers/status/status.go b/controllers/status/status.go index f831676ed2e..a09787a1aba 100644 --- a/controllers/status/status.go +++ b/controllers/status/status.go @@ -211,3 +211,8 @@ func SetComponentCondition(conditions *[]conditionsv1.Condition, component strin func RemoveComponentCondition(conditions *[]conditionsv1.Condition, component string) { conditionsv1.RemoveStatusCondition(conditions, conditionsv1.ConditionType(component+ReadySuffix)) } + +// ModelRegistryStatus struct holds the status for the ModelRegistry component. +type ModelRegistryStatus struct { + RegistriesNamespace string `json:"registriesNamespace,omitempty"` +} diff --git a/controllers/webhook/webhook.go b/controllers/webhook/webhook.go index 0ac19c4229b..87f463c770b 100644 --- a/controllers/webhook/webhook.go +++ b/controllers/webhook/webhook.go @@ -21,6 +21,7 @@ import ( "fmt" "net/http" + operatorv1 "github.com/openshift/api/operator/v1" admissionv1 "k8s.io/api/admission/v1" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" "k8s.io/apimachinery/pkg/runtime/schema" @@ -29,6 +30,8 @@ import ( "sigs.k8s.io/controller-runtime/pkg/webhook" "sigs.k8s.io/controller-runtime/pkg/webhook/admission" + dscv1 "github.com/opendatahub-io/opendatahub-operator/v2/apis/datasciencecluster/v1" + "github.com/opendatahub-io/opendatahub-operator/v2/components/modelregistry" "github.com/opendatahub-io/opendatahub-operator/v2/pkg/cluster/gvk" ) @@ -37,12 +40,12 @@ var log = ctrl.Log.WithName("rhoai-controller-webhook") //+kubebuilder:webhook:path=/validate-opendatahub-io-v1,mutating=false,failurePolicy=fail,sideEffects=None,groups=datasciencecluster.opendatahub.io;dscinitialization.opendatahub.io,resources=datascienceclusters;dscinitializations,verbs=create;delete,versions=v1,name=operator.opendatahub.io,admissionReviewVersions=v1 //nolint:lll -type OpenDataHubWebhook struct { +type OpenDataHubValidatingWebhook struct { Client client.Client Decoder *admission.Decoder } -func (w *OpenDataHubWebhook) SetupWithManager(mgr ctrl.Manager) { +func (w *OpenDataHubValidatingWebhook) SetupWithManager(mgr ctrl.Manager) { hookServer := mgr.GetWebhookServer() odhWebhook := &webhook.Admission{ Handler: w, @@ -74,7 +77,7 @@ func denyCountGtZero(ctx context.Context, cli client.Client, gvk schema.GroupVer return admission.Allowed("") } -func (w *OpenDataHubWebhook) checkDupCreation(ctx context.Context, req admission.Request) admission.Response { +func (w *OpenDataHubValidatingWebhook) checkDupCreation(ctx context.Context, req admission.Request) admission.Response { switch req.Kind.Kind { case "DataScienceCluster", "DSCInitialization": default: @@ -93,7 +96,7 @@ func (w *OpenDataHubWebhook) checkDupCreation(ctx context.Context, req admission fmt.Sprintf("Only one instance of %s object is allowed", req.Kind.Kind)) } -func (w *OpenDataHubWebhook) checkDeletion(ctx context.Context, req admission.Request) admission.Response { +func (w *OpenDataHubValidatingWebhook) checkDeletion(ctx context.Context, req admission.Request) admission.Response { if req.Kind.Kind == "DataScienceCluster" { return admission.Allowed("") } @@ -103,7 +106,7 @@ func (w *OpenDataHubWebhook) checkDeletion(ctx context.Context, req admission.Re fmt.Sprintln("Cannot delete DSCI object when DSC object still exists")) } -func (w *OpenDataHubWebhook) Handle(ctx context.Context, req admission.Request) admission.Response { +func (w *OpenDataHubValidatingWebhook) Handle(ctx context.Context, req admission.Request) admission.Response { var resp admission.Response resp.Allowed = true // initialize Allowed to be true in case Operation falls into "default" case @@ -121,3 +124,46 @@ func (w *OpenDataHubWebhook) Handle(ctx context.Context, req admission.Request) return admission.Allowed(fmt.Sprintf("Operation %s on %s allowed", req.Operation, req.Kind.Kind)) } + +//+kubebuilder:webhook:path=/mutate-opendatahub-io-v1,mutating=true,failurePolicy=fail,sideEffects=None,groups=datasciencecluster.opendatahub.io,resources=datascienceclusters,verbs=create;update,versions=v1,name=mutate.operator.opendatahub.io,admissionReviewVersions=v1 +//nolint:lll + +// OpenDataHubMutatingWebhook is a mutating webhook +// It currently only sets defaults for modelregiestry in datascienceclusters. +type OpenDataHubMutatingWebhook struct { + Client client.Client + Decoder *admission.Decoder +} + +func (w *OpenDataHubMutatingWebhook) SetupWithManager(mgr ctrl.Manager) { + hookServer := mgr.GetWebhookServer() + odhWebhook := &webhook.Admission{ + Handler: w, + } + hookServer.Register("/mutate-opendatahub-io-v1", odhWebhook) +} + +func (w *OpenDataHubMutatingWebhook) Handle(ctx context.Context, req admission.Request) admission.Response { + var resp admission.Response + resp.Allowed = true // initialize Allowed to be true in case Operation falls into "default" case + + dsc := &dscv1.DataScienceCluster{} + err := w.Decoder.Decode(req, dsc) + if err != nil { + return admission.Denied(fmt.Sprintf("failed to decode request body: %s", err)) + } + resp = w.setDSCDefaults(ctx, dsc) + return resp +} + +func (w *OpenDataHubMutatingWebhook) setDSCDefaults(_ context.Context, dsc *dscv1.DataScienceCluster) admission.Response { + // set default registriesNamespace if empty + if len(dsc.Spec.Components.ModelRegistry.RegistriesNamespace) == 0 && dsc.Spec.Components.ModelRegistry.ManagementState == operatorv1.Managed { + return admission.Patched("Property registriesNamespace set to default value", webhook.JSONPatchOp{ + Operation: "replace", + Path: "spec.components.modelregistry.registriesNamespace", + Value: modelregistry.DefaultModelRegistriesNamespace, + }) + } + return admission.Allowed("") +} diff --git a/controllers/webhook/webhook_suite_test.go b/controllers/webhook/webhook_suite_test.go index 87fbeb4b82e..7c58571ae81 100644 --- a/controllers/webhook/webhook_suite_test.go +++ b/controllers/webhook/webhook_suite_test.go @@ -47,6 +47,7 @@ import ( "github.com/opendatahub-io/opendatahub-operator/v2/components/datasciencepipelines" "github.com/opendatahub-io/opendatahub-operator/v2/components/kserve" "github.com/opendatahub-io/opendatahub-operator/v2/components/modelmeshserving" + "github.com/opendatahub-io/opendatahub-operator/v2/components/modelregistry" "github.com/opendatahub-io/opendatahub-operator/v2/components/ray" "github.com/opendatahub-io/opendatahub-operator/v2/components/trustyai" "github.com/opendatahub-io/opendatahub-operator/v2/components/workbenches" @@ -130,7 +131,12 @@ var _ = BeforeSuite(func() { }) Expect(err).NotTo(HaveOccurred()) - (&webhook.OpenDataHubWebhook{ + (&webhook.OpenDataHubValidatingWebhook{ + Client: mgr.GetClient(), + Decoder: admission.NewDecoder(mgr.GetScheme()), + }).SetupWithManager(mgr) + + (&webhook.OpenDataHubMutatingWebhook{ Client: mgr.GetClient(), Decoder: admission.NewDecoder(mgr.GetScheme()), }).SetupWithManager(mgr) @@ -200,6 +206,14 @@ var _ = Describe("DSC/DSCI webhook", func() { Expect(k8sClient.Delete(ctx, dscInstance)).Should(Succeed()) Expect(k8sClient.Delete(ctx, dsciInstance)).Should(Succeed()) }) + + It("Should set defaults in DSC instance", func(ctx context.Context) { + dscInstance := newDSC(nameBase+"-dsc-1", namespace) + Expect(k8sClient.Create(ctx, dscInstance)).Should(Succeed()) + Expect(dscInstance.Spec.Components.ModelRegistry.RegistriesNamespace). + Should(Equal(modelregistry.DefaultModelRegistriesNamespace)) + Expect(clearInstance(ctx, dscInstance)).Should(Succeed()) + }) }) func clearInstance(ctx context.Context, instance client.Object) error { diff --git a/docs/DESIGN.md b/docs/DESIGN.md index 87d2ef964a5..c5558009a73 100644 --- a/docs/DESIGN.md +++ b/docs/DESIGN.md @@ -62,6 +62,9 @@ To deploy ODH components seamlessly, ODH operator will watch two CRDs: name: knative-serving modelmeshserving: managementState: Managed + modelregistry: + managementState: Managed + registriesNamespace: "rhoai-model-registries" ray: managementState: Managed kueue: diff --git a/docs/api-overview.md b/docs/api-overview.md index 6edf0a162e5..18d957750fb 100644 --- a/docs/api-overview.md +++ b/docs/api-overview.md @@ -49,6 +49,7 @@ _Appears in:_ - [Kserve](#kserve) - [Kueue](#kueue) - [ModelMeshServing](#modelmeshserving) +- [ModelRegistry](#modelregistry) - [Ray](#ray) - [TrainingOperator](#trainingoperator) - [TrustyAI](#trustyai) @@ -230,6 +231,30 @@ _Appears in:_ +## datasciencecluster.opendatahub.io/modelregistry + +Package modelregistry provides utility functions to config ModelRegistry, an ML Model metadata repository service + + + +#### ModelRegistry + + + + + + + +_Appears in:_ +- [Components](#components) + +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `Component` _[Component](#component)_ | | | | +| `registriesNamespace` _string_ | Namespace for model registries to be installed, configurable only once when model registry is enabled, defaults to "rhoai-model-registries" | rhoai-model-registries | MaxLength: 63
Pattern: `^([a-z0-9]([-a-z0-9]*[a-z0-9])?)?$`
| + + + ## datasciencecluster.opendatahub.io/ray Package ray provides utility functions to config Ray as part of the stack @@ -361,6 +386,23 @@ _Appears in:_ | `ray` _[Ray](#ray)_ | Ray component configuration. | | | | `trustyai` _[TrustyAI](#trustyai)_ | TrustyAI component configuration. | | | | `trainingoperator` _[TrainingOperator](#trainingoperator)_ | Training Operator component configuration. | | | +| `modelregistry` _[ModelRegistry](#modelregistry)_ | ModelRegistry component configuration. | | | + + +#### ComponentsStatus + + + +ComponentsStatus defines the custom status of DataScienceCluster components. + + + +_Appears in:_ +- [DataScienceClusterStatus](#datascienceclusterstatus) + +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `modelregistry` _[ModelRegistryStatus](#modelregistrystatus)_ | ModelRegistry component status | | | #### ControlPlaneSpec @@ -436,6 +478,7 @@ _Appears in:_ | `relatedObjects` _[ObjectReference](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.25/#objectreference-v1-core) array_ | RelatedObjects is a list of objects created and maintained by this operator.
Object references will be added to this list after they have been created AND found in the cluster. | | | | `errorMessage` _string_ | | | | | `installedComponents` _object (keys:string, values:boolean)_ | List of components with status if installed or not | | | +| `components` _[ComponentsStatus](#componentsstatus)_ | Expose component's specific status | | | | `release` _[Release](#release)_ | Version and release type | | | diff --git a/docs/upgrade-testing.md b/docs/upgrade-testing.md index 8aa6d9f0a1a..9227e48f81a 100644 --- a/docs/upgrade-testing.md +++ b/docs/upgrade-testing.md @@ -136,6 +136,9 @@ spec: name: knative-serving modelmeshserving: managementState: Managed + modelregistry: + managementState: Managed + registriesNamespace: "rhoai-model-registries" ray: managementState: Managed kueue: @@ -147,4 +150,4 @@ spec: trustyai: managementState: Removed EOF -``` \ No newline at end of file +``` diff --git a/get_all_manifests.sh b/get_all_manifests.sh index f2c709c49b7..add60cff793 100755 --- a/get_all_manifests.sh +++ b/get_all_manifests.sh @@ -5,7 +5,7 @@ GITHUB_URL="https://github.com/" # update to use different git repo for legacy manifests MANIFEST_ORG="red-hat-data-services" -# component: notebook, dsp, kserve, dashbaord, cf/ray/kueue/trainingoperator, trustyai, modelmesh. +# component: notebook, dsp, kserve, dashbaord, cf/ray/kueue/trainingoperator, trustyai, modelmesh, modelregistry # in the format of "repo-org:repo-name:ref-name:source-folder:target-folder". declare -A COMPONENT_MANIFESTS=( ["codeflare"]="red-hat-data-services:codeflare-operator:rhoai-2.14:config:codeflare" @@ -21,6 +21,7 @@ declare -A COMPONENT_MANIFESTS=( ["kserve"]="red-hat-data-services:kserve:rhoai-2.14:config:kserve" ["odh-dashboard"]="red-hat-data-services:odh-dashboard:rhoai-2.14:manifests:dashboard" ["trainingoperator"]="red-hat-data-services:training-operator:rhoai-2.14:manifests:trainingoperator" + ["modelregistry"]="red-hat-data-services:model-registry-operator:rhoai-2.14:config:model-registry-operator" ) # Allow overwriting repo using flags component=repo diff --git a/main.go b/main.go index 1e327332571..88ae6b08e5d 100644 --- a/main.go +++ b/main.go @@ -57,6 +57,7 @@ import ( dscv1 "github.com/opendatahub-io/opendatahub-operator/v2/apis/datasciencecluster/v1" dsciv1 "github.com/opendatahub-io/opendatahub-operator/v2/apis/dscinitialization/v1" featurev1 "github.com/opendatahub-io/opendatahub-operator/v2/apis/features/v1" + "github.com/opendatahub-io/opendatahub-operator/v2/components/modelregistry" "github.com/opendatahub-io/opendatahub-operator/v2/controllers/certconfigmapgenerator" dscctrl "github.com/opendatahub-io/opendatahub-operator/v2/controllers/datasciencecluster" dscictrl "github.com/opendatahub-io/opendatahub-operator/v2/controllers/dscinitialization" @@ -207,7 +208,12 @@ func main() { //nolint:funlen,maintidx os.Exit(1) } - (&webhook.OpenDataHubWebhook{ + (&webhook.OpenDataHubValidatingWebhook{ + Client: mgr.GetClient(), + Decoder: admission.NewDecoder(mgr.GetScheme()), + }).SetupWithManager(mgr) + + (&webhook.OpenDataHubMutatingWebhook{ Client: mgr.GetClient(), Decoder: admission.NewDecoder(mgr.GetScheme()), }).SetupWithManager(mgr) @@ -345,13 +351,12 @@ func createDeploymentCacheConfig(platform cluster.Platform) map[string]cache.Con case cluster.ManagedRhods: // no need workbench NS, only SFS no Deployment namespaceConfigs["redhat-ods-monitoring"] = cache.Config{} namespaceConfigs["redhat-ods-applications"] = cache.Config{} - //TODO: if ModelReg has a RHOAI NS case cluster.SelfManagedRhods: namespaceConfigs["redhat-ods-applications"] = cache.Config{} - //TODO: if ModelReg has a RHOAI NS default: namespaceConfigs["opendatahub"] = cache.Config{} - namespaceConfigs["odh-model-registries"] = cache.Config{} } + // for modelregistry namespace + namespaceConfigs[modelregistry.DefaultModelRegistriesNamespace] = cache.Config{} return namespaceConfigs } diff --git a/pkg/upgrade/upgrade.go b/pkg/upgrade/upgrade.go index 449353fd6c2..3da1933bb6b 100644 --- a/pkg/upgrade/upgrade.go +++ b/pkg/upgrade/upgrade.go @@ -36,6 +36,7 @@ import ( "github.com/opendatahub-io/opendatahub-operator/v2/components/kserve" "github.com/opendatahub-io/opendatahub-operator/v2/components/kueue" "github.com/opendatahub-io/opendatahub-operator/v2/components/modelmeshserving" + "github.com/opendatahub-io/opendatahub-operator/v2/components/modelregistry" "github.com/opendatahub-io/opendatahub-operator/v2/components/ray" "github.com/opendatahub-io/opendatahub-operator/v2/components/trainingoperator" "github.com/opendatahub-io/opendatahub-operator/v2/components/trustyai" @@ -97,6 +98,9 @@ func CreateDefaultDSC(ctx context.Context, cli client.Client) error { TrustyAI: trustyai.TrustyAI{ Component: components.Component{ManagementState: operatorv1.Removed}, }, + ModelRegistry: modelregistry.ModelRegistry{ + Component: components.Component{ManagementState: operatorv1.Removed}, + }, }, }, } diff --git a/tests/e2e/creation_test.go b/tests/e2e/creation_test.go index f89d0bc5ec8..8fbfc83c63d 100644 --- a/tests/e2e/creation_test.go +++ b/tests/e2e/creation_test.go @@ -20,11 +20,13 @@ import ( "k8s.io/apimachinery/pkg/types" "k8s.io/apimachinery/pkg/util/wait" "k8s.io/client-go/util/retry" + "sigs.k8s.io/controller-runtime/pkg/client" dscv1 "github.com/opendatahub-io/opendatahub-operator/v2/apis/datasciencecluster/v1" dsciv1 "github.com/opendatahub-io/opendatahub-operator/v2/apis/dscinitialization/v1" infrav1 "github.com/opendatahub-io/opendatahub-operator/v2/apis/infrastructure/v1" "github.com/opendatahub-io/opendatahub-operator/v2/components" + "github.com/opendatahub-io/opendatahub-operator/v2/components/modelregistry" "github.com/opendatahub-io/opendatahub-operator/v2/pkg/cluster" "github.com/opendatahub-io/opendatahub-operator/v2/pkg/feature/serverless" "github.com/opendatahub-io/opendatahub-operator/v2/pkg/metadata/labels" @@ -58,7 +60,7 @@ func creationTestSuite(t *testing.T) { // DSC t.Run("Creation of DataScienceCluster instance", func(t *testing.T) { - err = testCtx.testDSCCreation() + err = testCtx.testDSCCreation(t) require.NoError(t, err, "error creating DataScienceCluster instance") }) t.Run("Creation of more than one of DataScienceCluster instance", func(t *testing.T) { @@ -78,7 +80,6 @@ func creationTestSuite(t *testing.T) { err = testCtx.validateDSCReady() require.NoError(t, err, "DataScienceCluster instance is not Ready") }) - // Kserve t.Run("Validate Knative resoruce", func(t *testing.T) { err = testCtx.validateDSC() @@ -90,15 +91,19 @@ func creationTestSuite(t *testing.T) { require.NoError(t, err, "error getting default cert secrets for Kserve") }) - // TODO: enable when ModelReg is added - // t.Run("Validate default model registry cert available", func(t *testing.T) { - // err = testCtx.testDefaultModelRegistryCertAvailable() - // require.NoError(t, err, "error getting default cert secret for ModelRegistry") - // }) - // t.Run("Validate model registry servicemeshmember available", func(t *testing.T) { - // err = testCtx.testMRServiceMeshMember() - // require.NoError(t, err, "error getting servicemeshmember for Model Registry") - // }) + // ModelReg + t.Run("Validate model registry cert config", func(t *testing.T) { + err = testCtx.validateModelRegistryConfig() + require.NoError(t, err, "error validating ModelRegistry config") + }) + t.Run("Validate default model registry cert available", func(t *testing.T) { + err = testCtx.testDefaultModelRegistryCertAvailable() + require.NoError(t, err, "error getting default cert secret for ModelRegistry") + }) + t.Run("Validate model registry servicemeshmember available", func(t *testing.T) { + err = testCtx.testMRServiceMeshMember() + require.NoError(t, err, "error getting servicemeshmember for Model Registry") + }) // reconcile t.Run("Validate Controller reconcile", func(t *testing.T) { @@ -150,12 +155,11 @@ func (tc *testContext) testDSCICreation() error { return nil } -func (tc *testContext) testDSCCreation() error { +func (tc *testContext) testDSCCreation(t *testing.T) error { + t.Helper() // Create DataScienceCluster resource if not already created - dscLookupKey := types.NamespacedName{Name: tc.testDsc.Name} - createdDSC := &dscv1.DataScienceCluster{} - existingDSCList := &dscv1.DataScienceClusterList{} + existingDSCList := &dscv1.DataScienceClusterList{} err := tc.customClient.List(tc.ctx, existingDSCList) if err == nil { if len(existingDSCList.Items) > 0 { @@ -164,6 +168,9 @@ func (tc *testContext) testDSCCreation() error { return nil } } + + dscLookupKey := types.NamespacedName{Name: tc.testDsc.Name} + createdDSC := &dscv1.DataScienceCluster{} err = tc.customClient.Get(tc.ctx, dscLookupKey, createdDSC) if err != nil { if k8serr.IsNotFound(err) { @@ -282,7 +289,7 @@ func (tc *testContext) testAllComponentCreation(t *testing.T) error { //nolint:f t.Run("Validate "+name, func(t *testing.T) { t.Parallel() err = tc.testComponentCreation(c) - require.NoError(t, err, "error validating component %v when "+c.GetManagementState()) + require.NoError(t, err, "error validating component %s when %v", name, c.GetManagementState()) }) } return nil @@ -436,6 +443,64 @@ func (tc *testContext) testDefaultCertsAvailable() error { return nil } +func (tc *testContext) testDefaultModelRegistryCertAvailable() error { + // return if MR is not set to Managed + if tc.testDsc.Spec.Components.ModelRegistry.ManagementState != operatorv1.Managed { + return nil + } + + // Get expected cert secrets + defaultIngressCtrl, err := cluster.FindAvailableIngressController(tc.ctx, tc.customClient) + if err != nil { + return fmt.Errorf("failed to get ingress controller: %w", err) + } + + defaultIngressCertName := cluster.GetDefaultIngressCertSecretName(defaultIngressCtrl) + + defaultIngressSecret, err := cluster.GetSecret(tc.ctx, tc.customClient, "openshift-ingress", defaultIngressCertName) + if err != nil { + return err + } + + // Verify secret from Control Plane namespace matches the default MR cert secret + defaultMRSecretName := modelregistry.DefaultModelRegistryCert + defaultMRSecret, err := cluster.GetSecret(tc.ctx, tc.customClient, tc.testDSCI.Spec.ServiceMesh.ControlPlane.Namespace, + defaultMRSecretName) + if err != nil { + return err + } + + if defaultMRSecret.Type != defaultIngressSecret.Type { + return fmt.Errorf("wrong type of MR cert secret is created for %v. Expected %v, Got %v", defaultMRSecretName, defaultIngressSecret.Type, defaultMRSecret.Type) + } + + if string(defaultIngressSecret.Data["tls.crt"]) != string(defaultMRSecret.Data["tls.crt"]) { + return fmt.Errorf("default MR cert secret not expected. Epected %v, Got %v", defaultIngressSecret.Data["tls.crt"], defaultMRSecret.Data["tls.crt"]) + } + + if string(defaultIngressSecret.Data["tls.key"]) != string(defaultMRSecret.Data["tls.key"]) { + return fmt.Errorf("default MR cert secret not expected. Epected %v, Got %v", defaultIngressSecret.Data["tls.crt"], defaultMRSecret.Data["tls.crt"]) + } + return nil +} + +func (tc *testContext) testMRServiceMeshMember() error { + if tc.testDsc.Spec.Components.ModelRegistry.ManagementState != operatorv1.Managed { + return nil + } + + // Get unstructured ServiceMeshMember + smm := unstructured.Unstructured{} + smm.SetAPIVersion("maistra.io/v1") + smm.SetKind("ServiceMeshMember") + err := tc.customClient.Get(tc.ctx, + client.ObjectKey{Namespace: modelregistry.DefaultModelRegistriesNamespace, Name: "default"}, &smm) + if err != nil { + return fmt.Errorf("failed to get servicemesh member: %w", err) + } + return nil +} + func (tc *testContext) testUpdateComponentReconcile() error { // Test Updating Dashboard Replicas appDeployments, err := tc.kubeClient.AppsV1().Deployments(tc.applicationsNamespace).List(tc.ctx, metav1.ListOptions{ @@ -542,3 +607,51 @@ func (tc *testContext) testUpdateDSCComponentEnabled() error { dashboardDeploymentName, tc.applicationsNamespace) } + +const testNs = "test-model-registries" + +func (tc *testContext) validateModelRegistryConfig() error { + // check immutable property registriesNamespace + if tc.testDsc.Spec.Components.ModelRegistry.ManagementState != operatorv1.Managed { + // allowed to set registriesNamespace to non-default + err := patchRegistriesNamespace(tc, testNs, testNs, false) + if err != nil { + return err + } + // allowed to set registriesNamespace back to default value + err = patchRegistriesNamespace(tc, modelregistry.DefaultModelRegistriesNamespace, + modelregistry.DefaultModelRegistriesNamespace, false) + if err != nil { + return err + } + } else { + // not allowed to change registriesNamespace + err := patchRegistriesNamespace(tc, testNs, modelregistry.DefaultModelRegistriesNamespace, true) + if err != nil { + return err + } + } + return nil +} + +func patchRegistriesNamespace(tc *testContext, namespace string, expected string, expectErr bool) error { + patchStr := fmt.Sprintf("{\"spec\":{\"components\":{\"modelregistry\":{\"registriesNamespace\":\"%s\"}}}}", namespace) + err := tc.customClient.Patch(tc.ctx, tc.testDsc, client.RawPatch(types.MergePatchType, []byte(patchStr))) + if err != nil { + if !expectErr { + return fmt.Errorf("unexpected error when setting registriesNamespace in DSC %s to %s: %w", + tc.testDsc.Name, namespace, err) + } + } else { + if expectErr { + return fmt.Errorf("unexpected success when setting registriesNamespace in DSC %s to %s", + tc.testDsc.Name, namespace) + } + } + // compare expected against returned registriesNamespace + if tc.testDsc.Spec.Components.ModelRegistry.RegistriesNamespace != expected { + return fmt.Errorf("expected registriesNamespace %s, got %s", + expected, tc.testDsc.Spec.Components.ModelRegistry.RegistriesNamespace) + } + return nil +} diff --git a/tests/e2e/helper_test.go b/tests/e2e/helper_test.go index 5273e12e0cc..e122ce36921 100644 --- a/tests/e2e/helper_test.go +++ b/tests/e2e/helper_test.go @@ -31,6 +31,7 @@ import ( "github.com/opendatahub-io/opendatahub-operator/v2/components/kserve" "github.com/opendatahub-io/opendatahub-operator/v2/components/kueue" "github.com/opendatahub-io/opendatahub-operator/v2/components/modelmeshserving" + "github.com/opendatahub-io/opendatahub-operator/v2/components/modelregistry" "github.com/opendatahub-io/opendatahub-operator/v2/components/ray" "github.com/opendatahub-io/opendatahub-operator/v2/components/trainingoperator" "github.com/opendatahub-io/opendatahub-operator/v2/components/trustyai" @@ -72,7 +73,6 @@ func (tc *testContext) waitForOperatorDeployment(name string, replicas int32) er } } } - log.Printf("Error in %s deployment", name) return false, nil }) @@ -168,6 +168,11 @@ func setupDSCInstance(name string) *dscv1.DataScienceCluster { ManagementState: operatorv1.Removed, }, }, + ModelRegistry: modelregistry.ModelRegistry{ + Component: components.Component{ + ManagementState: operatorv1.Removed, + }, + }, }, }, }