Add the KFP v2 example and update the tutorial (#1587)

* Add the KFP v2 benchmark test.
* Update the tutorial.

Fixes #1586

Signed-off-by: Ye Cao <[email protected]>
v6d-io · Oct 12, 2023 · fc436eb · fc436eb
1 parent 678117b
commit fc436eb
Show file tree

Hide file tree

Showing 20 changed files with 901 additions and 176 deletions.
diff --git a/docs/images/kubeflow_create_run.png b/docs/images/kubeflow_create_run.png
diff --git a/docs/images/kubeflow_upload_pipeline.png b/docs/images/kubeflow_upload_pipeline.png
diff --git a/...ials/kubernetes/efficient-data-sharing-in-kubeflow-with-vineyard-csi-driver.rst b/...ials/kubernetes/efficient-data-sharing-in-kubeflow-with-vineyard-csi-driver.rst
diff --git a/java/hive/hive b/java/hive/hive
diff --git a/k8s/examples/vineyard-csidriver/Makefile b/k8s/examples/vineyard-csidriver/Makefile
@@ -1,22 +1,30 @@
+# Image repository prefix; override with `make REGISTRY=...`. Note that the
+# pipeline-kfp-v2*.py definitions hard-code the default image names.
+REGISTRY := ghcr.io/v6d-io/v6d/kubeflow-example
+
+.PHONY: docker-build push-images
+
+# Build one image per pipeline step from the shared Dockerfile.
 docker-build:
 	docker build prepare-data/ -f Dockerfile \
 		--build-arg APP=prepare-data.py \
-		-t prepare-data
+		-t $(REGISTRY)/prepare-data
 
 	docker build preprocess/ -f Dockerfile \
 		--build-arg APP=preprocess.py \
-		-t preprocess-data
+		-t $(REGISTRY)/preprocess-data
 
 	docker build train/ -f Dockerfile \
 		--build-arg APP=train.py \
-		-t train-data
+		-t $(REGISTRY)/train-data
 
 	docker build test/ -f Dockerfile \
 		--build-arg APP=test.py \
-		-t test-data
+		-t $(REGISTRY)/test-data
 
-load-images:
-	kind load docker-image prepare-data
-	kind load docker-image preprocess-data
-	kind load docker-image train-data
-	kind load docker-image test-data
+# Push the images tagged by docker-build to $(REGISTRY).
+push-images:
+	docker push $(REGISTRY)/prepare-data
+	docker push $(REGISTRY)/preprocess-data
+	docker push $(REGISTRY)/train-data
+	docker push $(REGISTRY)/test-data
diff --git a/k8s/examples/vineyard-csidriver/pipeline-kfp-v2-with-vineyard.py b/k8s/examples/vineyard-csidriver/pipeline-kfp-v2-with-vineyard.py
@@ -0,0 +1,64 @@
+from kfp import dsl
+from kfp import kubernetes
+
+@dsl.container_component
+def PreProcess(data_multiplier: int):
+    return dsl.ContainerSpec(
+        image = 'ghcr.io/v6d-io/v6d/kubeflow-example/preprocess-data',
+        command = ['python3', 'preprocess.py'],
+        args = [f'--data_multiplier={data_multiplier}', '--with_vineyard=True'],
+    )
+
+@dsl.container_component
+def Train():
+    return dsl.ContainerSpec(
+        image = 'ghcr.io/v6d-io/v6d/kubeflow-example/train-data',
+        command = ['python3', 'train.py'],
+        args = ['--with_vineyard=True'],
+    )
+
+@dsl.container_component
+def Test():
+    return dsl.ContainerSpec(
+        image = 'ghcr.io/v6d-io/v6d/kubeflow-example/test-data',
+        command = ['python3', 'test.py'],
+        args = ['--with_vineyard=True'],
+    )
+
+def mount_pvc(component, pvc_name):
+    kubernetes.mount_pvc(
+        component,
+        pvc_name=pvc_name,
+        mount_path='/data',
+    )
+    kubernetes.mount_pvc(
+        component,
+        pvc_name="vineyard-objects",
+        mount_path='/vineyard/data',
+    )
+
+@dsl.pipeline(
+   name='Machine Learning Pipeline With Vineyard',
+   description='An example pipeline that trains and logs a regression model.'
+)
+def pipeline(data_multiplier: int):
+    vineyard_objects_pvc = kubernetes.CreatePVC(
+        # can also use pvc_name instead of pvc_name_suffix to use a pre-existing PVC
+        pvc_name='vineyard-objects',
+        access_modes=['ReadWriteMany'],
+        # the size does not matter, but it must not be empty
+        size='1Mi',
+        storage_class_name='vineyard-system.vineyardd-sample.csi',
+    )
+
+    comp1 = PreProcess(data_multiplier=data_multiplier).after(vineyard_objects_pvc)
+    mount_pvc(comp1, "benchmark-data")
+    comp2 = Train().after(comp1)
+    mount_pvc(comp2, "benchmark-data")
+    comp3 = Test().after(comp2)
+    mount_pvc(comp3, "benchmark-data")
+    kubernetes.DeletePVC(pvc_name="vineyard-objects").after(comp3)
+
+if __name__ == '__main__':
+    from kfp import compiler
+    compiler.Compiler().compile(pipeline, __file__[:-3]+ '.yaml')
diff --git a/k8s/examples/vineyard-csidriver/pipeline-kfp-v2-with-vineyard.yaml b/k8s/examples/vineyard-csidriver/pipeline-kfp-v2-with-vineyard.yaml
@@ -0,0 +1,220 @@
+# PIPELINE DEFINITION
+# Name: machine-learning-pipeline-with-vineyard
+# Description: An example pipeline that trains and logs a regression model.
+# Inputs:
+#    data_multiplier: int
+components:
+  comp-createpvc:
+    executorLabel: exec-createpvc
+    inputDefinitions:
+      parameters:
+        access_modes:
+          description: 'AccessModes to request for the provisioned PVC. May
+
+            be one or more of ``''ReadWriteOnce''``, ``''ReadOnlyMany''``, ``''ReadWriteMany''``,
+            or
+
+            ``''ReadWriteOncePod''``. Corresponds to `PersistentVolumeClaim.spec.accessModes
+            <https://kubernetes.io/docs/concepts/storage/persistent-volumes/#access-modes>`_.'
+          parameterType: LIST
+        annotations:
+          description: Annotations for the PVC's metadata. Corresponds to `PersistentVolumeClaim.metadata.annotations
+            <https://kubernetes.io/docs/reference/kubernetes-api/config-and-storage-resources/persistent-volume-claim-v1/#PersistentVolumeClaim>`_.
+          isOptional: true
+          parameterType: STRUCT
+        pvc_name:
+          description: 'Name of the PVC. Corresponds to `PersistentVolumeClaim.metadata.name
+            <https://kubernetes.io/docs/reference/kubernetes-api/config-and-storage-resources/persistent-volume-claim-v1/#PersistentVolumeClaim>`_.
+            Only one of ``pvc_name`` and ``pvc_name_suffix`` can
+
+            be provided.'
+          isOptional: true
+          parameterType: STRING
+        pvc_name_suffix:
+          description: 'Prefix to use for a dynamically generated name, which
+
+            will take the form ``<argo-workflow-name>-<pvc_name_suffix>``. Only one
+
+            of ``pvc_name`` and ``pvc_name_suffix`` can be provided.'
+          isOptional: true
+          parameterType: STRING
+        size:
+          description: The size of storage requested by the PVC that will be provisioned.
+            For example, ``'5Gi'``. Corresponds to `PersistentVolumeClaim.spec.resources.requests.storage
+            <https://kubernetes.io/docs/reference/kubernetes-api/config-and-storage-resources/persistent-volume-claim-v1/#PersistentVolumeClaimSpec>`_.
+          parameterType: STRING
+        storage_class_name:
+          defaultValue: ''
+          description: 'Name of StorageClass from which to provision the PV
+
+            to back the PVC. ``None`` indicates to use the cluster''s default
+
+            storage_class_name. Set to ``''''`` for a statically specified PVC.'
+          isOptional: true
+          parameterType: STRING
+        volume_name:
+          description: 'Pre-existing PersistentVolume that should back the
+
+            provisioned PersistentVolumeClaim. Used for statically
+
+            specified PV only. Corresponds to `PersistentVolumeClaim.spec.volumeName
+            <https://kubernetes.io/docs/reference/kubernetes-api/config-and-storage-resources/persistent-volume-claim-v1/#PersistentVolumeClaimSpec>`_.'
+          isOptional: true
+          parameterType: STRING
+    outputDefinitions:
+      parameters:
+        name:
+          parameterType: STRING
+  comp-deletepvc:
+    executorLabel: exec-deletepvc
+    inputDefinitions:
+      parameters:
+        pvc_name:
+          description: Name of the PVC to delete. Supports passing a runtime-generated
+            name, such as a name provided by ``kubernetes.CreatePvcOp().outputs['name']``.
+          parameterType: STRING
+  comp-preprocess:
+    executorLabel: exec-preprocess
+    inputDefinitions:
+      parameters:
+        data_multiplier:
+          parameterType: NUMBER_INTEGER
+  comp-test:
+    executorLabel: exec-test
+  comp-train:
+    executorLabel: exec-train
+deploymentSpec:
+  executors:
+    exec-createpvc:
+      container:
+        image: argostub/createpvc
+    exec-deletepvc:
+      container:
+        image: argostub/deletepvc
+    exec-preprocess:
+      container:
+        args:
+        - --data_multiplier={{$.inputs.parameters['data_multiplier']}}
+        - --with_vineyard=True
+        command:
+        - python3
+        - preprocess.py
+        image: ghcr.io/v6d-io/v6d/kubeflow-example/preprocess-data
+    exec-test:
+      container:
+        args:
+        - --with_vineyard=True
+        command:
+        - python3
+        - test.py
+        image: ghcr.io/v6d-io/v6d/kubeflow-example/test-data
+    exec-train:
+      container:
+        args:
+        - --with_vineyard=True
+        command:
+        - python3
+        - train.py
+        image: ghcr.io/v6d-io/v6d/kubeflow-example/train-data
+pipelineInfo:
+  description: An example pipeline that trains and logs a regression model.
+  name: machine-learning-pipeline-with-vineyard
+root:
+  dag:
+    tasks:
+      createpvc:
+        cachingOptions:
+          enableCache: true
+        componentRef:
+          name: comp-createpvc
+        inputs:
+          parameters:
+            access_modes:
+              runtimeValue:
+                constant:
+                - ReadWriteMany
+            pvc_name:
+              runtimeValue:
+                constant: vineyard-objects
+            size:
+              runtimeValue:
+                constant: 1Mi
+            storage_class_name:
+              runtimeValue:
+                constant: vineyard-system.vineyardd-sample.csi
+        taskInfo:
+          name: createpvc
+      deletepvc:
+        cachingOptions:
+          enableCache: true
+        componentRef:
+          name: comp-deletepvc
+        dependentTasks:
+        - test
+        inputs:
+          parameters:
+            pvc_name:
+              runtimeValue:
+                constant: vineyard-objects
+        taskInfo:
+          name: deletepvc
+      preprocess:
+        cachingOptions:
+          enableCache: true
+        componentRef:
+          name: comp-preprocess
+        dependentTasks:
+        - createpvc
+        inputs:
+          parameters:
+            data_multiplier:
+              componentInputParameter: data_multiplier
+        taskInfo:
+          name: preprocess
+      test:
+        cachingOptions:
+          enableCache: true
+        componentRef:
+          name: comp-test
+        dependentTasks:
+        - train
+        taskInfo:
+          name: test
+      train:
+        cachingOptions:
+          enableCache: true
+        componentRef:
+          name: comp-train
+        dependentTasks:
+        - preprocess
+        taskInfo:
+          name: train
+  inputDefinitions:
+    parameters:
+      data_multiplier:
+        parameterType: NUMBER_INTEGER
+schemaVersion: 2.1.0
+sdkVersion: kfp-2.3.0
+---
+platforms:
+  kubernetes:
+    deploymentSpec:
+      executors:
+        exec-preprocess:
+          pvcMount:
+          - constant: benchmark-data
+            mountPath: /data
+          - constant: vineyard-objects
+            mountPath: /vineyard/data
+        exec-test:
+          pvcMount:
+          - constant: benchmark-data
+            mountPath: /data
+          - constant: vineyard-objects
+            mountPath: /vineyard/data
+        exec-train:
+          pvcMount:
+          - constant: benchmark-data
+            mountPath: /data
+          - constant: vineyard-objects
+            mountPath: /vineyard/data
diff --git a/k8s/examples/vineyard-csidriver/pipeline-kfp-v2.py b/k8s/examples/vineyard-csidriver/pipeline-kfp-v2.py
@@ -0,0 +1,47 @@
+from kfp import dsl
+from kfp import kubernetes
+
+@dsl.container_component
+def PreProcess(data_multiplier: int):
+    return dsl.ContainerSpec(
+        image = 'ghcr.io/v6d-io/v6d/kubeflow-example/preprocess-data',
+        command = ['python3', 'preprocess.py'],
+        args=[f'--data_multiplier={data_multiplier}'],
+    )
+
+@dsl.container_component
+def Train():
+    return dsl.ContainerSpec(
+        image='ghcr.io/v6d-io/v6d/kubeflow-example/train-data',
+        command = ['python3', 'train.py'],
+    )
+
+@dsl.container_component
+def Test():
+    return dsl.ContainerSpec(
+        image='ghcr.io/v6d-io/v6d/kubeflow-example/test-data',
+        command = ['python3', 'test.py'],
+    )
+
+def mount_pvc(component, pvc_name):
+    kubernetes.mount_pvc(
+        component,
+        pvc_name=pvc_name,
+        mount_path='/data',
+    )
+
+@dsl.pipeline(
+   name='Machine Learning Pipeline',
+   description='An example pipeline that trains and logs a regression model.'
+)
+def pipeline(data_multiplier: int):
+    comp1 = PreProcess(data_multiplier=data_multiplier)
+    mount_pvc(comp1, "benchmark-data")
+    comp2 = Train().after(comp1)
+    mount_pvc(comp2, "benchmark-data")
+    comp3 = Test().after(comp2)
+    mount_pvc(comp3, "benchmark-data")
+
+if __name__ == '__main__':
+    from kfp import compiler
+    compiler.Compiler().compile(pipeline, __file__[:-3]+ '.yaml')