Skip to content

Commit

Permalink
Add the KFP v2 example and update the tutorial (#1587)
Browse files Browse the repository at this point in the history
* Add the KFP v2 benchmark test.
* Update the tutorial.

Fixes #1586

Signed-off-by: Ye Cao <[email protected]>
  • Loading branch information
dashanji authored Oct 12, 2023
1 parent 678117b commit fc436eb
Show file tree
Hide file tree
Showing 20 changed files with 901 additions and 176 deletions.
Binary file added docs/images/kubeflow_create_run.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added docs/images/kubeflow_upload_pipeline.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.

Large diffs are not rendered by default.

Empty file added java/hive/hive
Empty file.
19 changes: 10 additions & 9 deletions k8s/examples/vineyard-csidriver/Makefile
Original file line number Diff line number Diff line change
@@ -1,22 +1,23 @@
# Registry prefix used in the $(REGISTRY)/... image tags and pushes below.
REGISTRY := "ghcr.io/v6d-io/v6d/kubeflow-example"
# Build one image per pipeline stage (prepare-data, preprocess, train, test)
# from the same Dockerfile, passing the stage's script name as the APP build
# argument.
docker-build:
docker build prepare-data/ -f Dockerfile \
--build-arg APP=prepare-data.py \
-t prepare-data
-t $(REGISTRY)/prepare-data

docker build preprocess/ -f Dockerfile \
--build-arg APP=preprocess.py \
-t preprocess-data
-t $(REGISTRY)/preprocess-data

docker build train/ -f Dockerfile \
--build-arg APP=train.py \
-t train-data
-t $(REGISTRY)/train-data

docker build test/ -f Dockerfile \
--build-arg APP=test.py \
-t test-data
-t $(REGISTRY)/test-data

# Load the locally tagged stage images into a kind cluster.
load-images:
kind load docker-image prepare-data
kind load docker-image preprocess-data
kind load docker-image train-data
kind load docker-image test-data
# Push the registry-tagged stage images.
push-images:
docker push $(REGISTRY)/prepare-data
docker push $(REGISTRY)/preprocess-data
docker push $(REGISTRY)/train-data
docker push $(REGISTRY)/test-data
64 changes: 64 additions & 0 deletions k8s/examples/vineyard-csidriver/pipeline-kfp-v2-with-vineyard.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
from kfp import dsl
from kfp import kubernetes

# Preprocessing stage: runs preprocess.py inside the preprocess-data image,
# forwarding the pipeline's data multiplier and passing --with_vineyard=True.
@dsl.container_component
def PreProcess(data_multiplier: int):
    entrypoint = ['python3', 'preprocess.py']
    flags = [f'--data_multiplier={data_multiplier}', '--with_vineyard=True']
    return dsl.ContainerSpec(
        image='ghcr.io/v6d-io/v6d/kubeflow-example/preprocess-data',
        command=entrypoint,
        args=flags,
    )

# Training stage: runs train.py inside the train-data image with
# --with_vineyard=True.
@dsl.container_component
def Train():
    entrypoint = ['python3', 'train.py']
    flags = ['--with_vineyard=True']
    return dsl.ContainerSpec(
        image='ghcr.io/v6d-io/v6d/kubeflow-example/train-data',
        command=entrypoint,
        args=flags,
    )

# Testing stage: runs test.py inside the test-data image with
# --with_vineyard=True.
@dsl.container_component
def Test():
    entrypoint = ['python3', 'test.py']
    flags = ['--with_vineyard=True']
    return dsl.ContainerSpec(
        image='ghcr.io/v6d-io/v6d/kubeflow-example/test-data',
        command=entrypoint,
        args=flags,
    )

def mount_pvc(component, pvc_name):
    # Attach two claims to the given task, in this order:
    #   1. the caller-supplied claim at /data
    #   2. the fixed "vineyard-objects" claim at /vineyard/data
    claim_mounts = (
        (pvc_name, '/data'),
        ("vineyard-objects", '/vineyard/data'),
    )
    for claim, target_dir in claim_mounts:
        kubernetes.mount_pvc(
            component,
            pvc_name=claim,
            mount_path=target_dir,
        )

# Pipeline wiring: create the "vineyard-objects" claim, run
# PreProcess -> Train -> Test strictly in sequence (each with the
# "benchmark-data" claim and the vineyard claim mounted via mount_pvc),
# then delete the "vineyard-objects" claim once Test has run.
@dsl.pipeline(
    name='Machine Learning Pipeline With Vineyard',
    description='An example pipeline that trains and logs a regression model.',
)
def pipeline(data_multiplier: int):
    # pvc_name gives the claim a fixed name; CreatePVC alternatively accepts
    # pvc_name_suffix for a generated name (only one of the two may be set).
    # The requested size is nominal (the original author notes it does not
    # matter) but it must not be left empty.
    create_claim = kubernetes.CreatePVC(
        pvc_name='vineyard-objects',
        access_modes=['ReadWriteMany'],
        size='1Mi',
        storage_class_name='vineyard-system.vineyardd-sample.csi',
    )

    # Stage 1: preprocessing waits for the claim to be created.
    preprocess_task = PreProcess(data_multiplier=data_multiplier)
    preprocess_task.after(create_claim)
    mount_pvc(preprocess_task, "benchmark-data")

    # Stage 2: training runs after preprocessing.
    train_task = Train()
    train_task.after(preprocess_task)
    mount_pvc(train_task, "benchmark-data")

    # Stage 3: testing runs after training.
    test_task = Test()
    test_task.after(train_task)
    mount_pvc(test_task, "benchmark-data")

    # Cleanup: remove the claim only after the last stage.
    delete_claim = kubernetes.DeletePVC(pvc_name="vineyard-objects")
    delete_claim.after(test_task)

if __name__ == '__main__':
    import os

    from kfp import compiler

    # Compile the pipeline to a YAML file that sits next to this script and
    # shares its base name. splitext() strips whatever extension the script
    # has, instead of assuming it is exactly three characters (".py") long.
    output_path = os.path.splitext(__file__)[0] + '.yaml'
    compiler.Compiler().compile(pipeline, output_path)
220 changes: 220 additions & 0 deletions k8s/examples/vineyard-csidriver/pipeline-kfp-v2-with-vineyard.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,220 @@
# PIPELINE DEFINITION
# Name: machine-learning-pipeline-with-vineyard
# Description: An example pipeline that trains and logs a regression model.
# Inputs:
# data_multiplier: int
components:
comp-createpvc:
executorLabel: exec-createpvc
inputDefinitions:
parameters:
access_modes:
description: 'AccessModes to request for the provisioned PVC. May
be one or more of ``''ReadWriteOnce''``, ``''ReadOnlyMany''``, ``''ReadWriteMany''``,
or
``''ReadWriteOncePod''``. Corresponds to `PersistentVolumeClaim.spec.accessModes
<https://kubernetes.io/docs/concepts/storage/persistent-volumes/#access-modes>`_.'
parameterType: LIST
annotations:
description: Annotations for the PVC's metadata. Corresponds to `PersistentVolumeClaim.metadata.annotations
<https://kubernetes.io/docs/reference/kubernetes-api/config-and-storage-resources/persistent-volume-claim-v1/#PersistentVolumeClaim>`_.
isOptional: true
parameterType: STRUCT
pvc_name:
description: 'Name of the PVC. Corresponds to `PersistentVolumeClaim.metadata.name
<https://kubernetes.io/docs/reference/kubernetes-api/config-and-storage-resources/persistent-volume-claim-v1/#PersistentVolumeClaim>`_.
Only one of ``pvc_name`` and ``pvc_name_suffix`` can
be provided.'
isOptional: true
parameterType: STRING
pvc_name_suffix:
description: 'Suffix to use for a dynamically generated name, which
will take the form ``<argo-workflow-name>-<pvc_name_suffix>``. Only one
of ``pvc_name`` and ``pvc_name_suffix`` can be provided.'
isOptional: true
parameterType: STRING
size:
description: The size of storage requested by the PVC that will be provisioned.
For example, ``'5Gi'``. Corresponds to `PersistentVolumeClaim.spec.resources.requests.storage
<https://kubernetes.io/docs/reference/kubernetes-api/config-and-storage-resources/persistent-volume-claim-v1/#PersistentVolumeClaimSpec>`_.
parameterType: STRING
storage_class_name:
defaultValue: ''
description: 'Name of StorageClass from which to provision the PV
to back the PVC. ``None`` indicates to use the cluster''s default
storage_class_name. Set to ``''''`` for a statically specified PVC.'
isOptional: true
parameterType: STRING
volume_name:
description: 'Pre-existing PersistentVolume that should back the
provisioned PersistentVolumeClaim. Used for statically
specified PV only. Corresponds to `PersistentVolumeClaim.spec.volumeName
<https://kubernetes.io/docs/reference/kubernetes-api/config-and-storage-resources/persistent-volume-claim-v1/#PersistentVolumeClaimSpec>`_.'
isOptional: true
parameterType: STRING
outputDefinitions:
parameters:
name:
parameterType: STRING
comp-deletepvc:
executorLabel: exec-deletepvc
inputDefinitions:
parameters:
pvc_name:
description: Name of the PVC to delete. Supports passing a runtime-generated
name, such as a name provided by ``kubernetes.CreatePvcOp().outputs['name']``.
parameterType: STRING
comp-preprocess:
executorLabel: exec-preprocess
inputDefinitions:
parameters:
data_multiplier:
parameterType: NUMBER_INTEGER
comp-test:
executorLabel: exec-test
comp-train:
executorLabel: exec-train
deploymentSpec:
executors:
exec-createpvc:
container:
image: argostub/createpvc
exec-deletepvc:
container:
image: argostub/deletepvc
exec-preprocess:
container:
args:
- --data_multiplier={{$.inputs.parameters['data_multiplier']}}
- --with_vineyard=True
command:
- python3
- preprocess.py
image: ghcr.io/v6d-io/v6d/kubeflow-example/preprocess-data
exec-test:
container:
args:
- --with_vineyard=True
command:
- python3
- test.py
image: ghcr.io/v6d-io/v6d/kubeflow-example/test-data
exec-train:
container:
args:
- --with_vineyard=True
command:
- python3
- train.py
image: ghcr.io/v6d-io/v6d/kubeflow-example/train-data
pipelineInfo:
description: An example pipeline that trains and logs a regression model.
name: machine-learning-pipeline-with-vineyard
root:
dag:
tasks:
createpvc:
cachingOptions:
enableCache: true
componentRef:
name: comp-createpvc
inputs:
parameters:
access_modes:
runtimeValue:
constant:
- ReadWriteMany
pvc_name:
runtimeValue:
constant: vineyard-objects
size:
runtimeValue:
constant: 1Mi
storage_class_name:
runtimeValue:
constant: vineyard-system.vineyardd-sample.csi
taskInfo:
name: createpvc
deletepvc:
cachingOptions:
enableCache: true
componentRef:
name: comp-deletepvc
dependentTasks:
- test
inputs:
parameters:
pvc_name:
runtimeValue:
constant: vineyard-objects
taskInfo:
name: deletepvc
preprocess:
cachingOptions:
enableCache: true
componentRef:
name: comp-preprocess
dependentTasks:
- createpvc
inputs:
parameters:
data_multiplier:
componentInputParameter: data_multiplier
taskInfo:
name: preprocess
test:
cachingOptions:
enableCache: true
componentRef:
name: comp-test
dependentTasks:
- train
taskInfo:
name: test
train:
cachingOptions:
enableCache: true
componentRef:
name: comp-train
dependentTasks:
- preprocess
taskInfo:
name: train
inputDefinitions:
parameters:
data_multiplier:
parameterType: NUMBER_INTEGER
schemaVersion: 2.1.0
sdkVersion: kfp-2.3.0
---
platforms:
kubernetes:
deploymentSpec:
executors:
exec-preprocess:
pvcMount:
- constant: benchmark-data
mountPath: /data
- constant: vineyard-objects
mountPath: /vineyard/data
exec-test:
pvcMount:
- constant: benchmark-data
mountPath: /data
- constant: vineyard-objects
mountPath: /vineyard/data
exec-train:
pvcMount:
- constant: benchmark-data
mountPath: /data
- constant: vineyard-objects
mountPath: /vineyard/data
47 changes: 47 additions & 0 deletions k8s/examples/vineyard-csidriver/pipeline-kfp-v2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
from kfp import dsl
from kfp import kubernetes

# Preprocessing stage: runs preprocess.py inside the preprocess-data image,
# forwarding the pipeline's data multiplier on the command line.
@dsl.container_component
def PreProcess(data_multiplier: int):
    entrypoint = ['python3', 'preprocess.py']
    flags = [f'--data_multiplier={data_multiplier}']
    return dsl.ContainerSpec(
        image='ghcr.io/v6d-io/v6d/kubeflow-example/preprocess-data',
        command=entrypoint,
        args=flags,
    )

# Training stage: runs train.py inside the train-data image with no extra
# arguments.
@dsl.container_component
def Train():
    entrypoint = ['python3', 'train.py']
    return dsl.ContainerSpec(
        image='ghcr.io/v6d-io/v6d/kubeflow-example/train-data',
        command=entrypoint,
    )

# Testing stage: runs test.py inside the test-data image with no extra
# arguments.
@dsl.container_component
def Test():
    entrypoint = ['python3', 'test.py']
    return dsl.ContainerSpec(
        image='ghcr.io/v6d-io/v6d/kubeflow-example/test-data',
        command=entrypoint,
    )

def mount_pvc(component, pvc_name):
    # Attach the named claim to the given task at /data.
    data_dir = '/data'
    kubernetes.mount_pvc(component, pvc_name=pvc_name, mount_path=data_dir)

# Pipeline wiring: run PreProcess -> Train -> Test strictly in sequence,
# each with the "benchmark-data" claim mounted via mount_pvc.
@dsl.pipeline(
    name='Machine Learning Pipeline',
    description='An example pipeline that trains and logs a regression model.',
)
def pipeline(data_multiplier: int):
    # Stage 1: preprocessing, parameterised by the pipeline input.
    preprocess_task = PreProcess(data_multiplier=data_multiplier)
    mount_pvc(preprocess_task, "benchmark-data")

    # Stage 2: training runs after preprocessing.
    train_task = Train()
    train_task.after(preprocess_task)
    mount_pvc(train_task, "benchmark-data")

    # Stage 3: testing runs after training.
    test_task = Test()
    test_task.after(train_task)
    mount_pvc(test_task, "benchmark-data")

if __name__ == '__main__':
    import os

    from kfp import compiler

    # Compile the pipeline to a YAML file that sits next to this script and
    # shares its base name. splitext() strips whatever extension the script
    # has, instead of assuming it is exactly three characters (".py") long.
    output_path = os.path.splitext(__file__)[0] + '.yaml'
    compiler.Compiler().compile(pipeline, output_path)
Loading

0 comments on commit fc436eb

Please sign in to comment.