Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

host network promoted to spec field #294

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions apis/inference/v1alpha1/elasticbatchjob_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,9 @@ type ElasticBatchJobSpec struct {
// "Worker": ReplicaSpec,
// }
ElasticBatchReplicaSpecs map[common.ReplicaType]*common.ReplicaSpec `json:"elasticBatchReplicaSpecs"`

// NetworkMode defines network mode for intra job communicating.
NetworkMode *common.NetworkMode `json:"networkmode,omitempty"`
}

// +genclient
Expand Down
5 changes: 5 additions & 0 deletions apis/inference/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 3 additions & 0 deletions apis/training/v1alpha1/elasticdljob_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,9 @@ type ElasticDLJobSpec struct {
// "Master": ElasticDLReplicaSpec,
// }
ElasticDLReplicaSpecs map[common.ReplicaType]*common.ReplicaSpec `json:"elasticdlReplicaSpecs"`

// NetworkMode defines network mode for intra job communicating.
NetworkMode *common.NetworkMode `json:"networkmode,omitempty"`
}

// +genclient
Expand Down
3 changes: 3 additions & 0 deletions apis/training/v1alpha1/marsjob_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,9 @@ type MarsJobSpec struct {
// MarsReplicaSpecs is a map of MarsReplicaType(key) to ReplicaSpec(value),
// specifying replicas and template of each type.
MarsReplicaSpecs map[commonv1.ReplicaType]*commonv1.ReplicaSpec `json:"marsReplicaSpecs"`

// NetworkMode defines network mode for intra job communicating.
NetworkMode *commonv1.NetworkMode `json:"networkmode,omitempty"`
}

// MarsJobStatus defines the observed state of MarsJob
Expand Down
3 changes: 3 additions & 0 deletions apis/training/v1alpha1/mpijob_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,9 @@ type MPIJobSpec struct {

// LegacySpec reserves the deprecated fields for backward compatibility.
*MPIJobLegacySpec `json:",inline"`

// NetworkMode defines network mode for intra job communicating.
NetworkMode *apiv1.NetworkMode `json:"networkmode,omitempty"`
}

// MPIJobLegacySpec is a collection of legacy fields that were used in v1alpha1/v1alpha2 but
Expand Down
3 changes: 3 additions & 0 deletions apis/training/v1alpha1/pytorchjob_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,9 @@ type PyTorchJobSpec struct {
// CacheBackend is used to configure the cache engine for job
// +optional
CacheBackend *cachev1alpha1.CacheBackendSpec `json:"cacheBackend"`

// NetworkMode defines network mode for intra job communicating.
NetworkMode *common.NetworkMode `json:"networkmode,omitempty"`
}

// PyTorchJobStatus defines the observed state of PyTorchJob
Expand Down
3 changes: 3 additions & 0 deletions apis/training/v1alpha1/tfjob_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,9 @@ type TFJobSpec struct {
// CacheBackend is used to configure the cache engine for job
// +optional
CacheBackend *cachev1alpha1.CacheBackendSpec `json:"cacheBackend"`

// NetworkMode defines network mode for intra job communicating.
NetworkMode *commonv1.NetworkMode `json:"networkmode,omitempty"`
}

// +genclient
Expand Down
3 changes: 3 additions & 0 deletions apis/training/v1alpha1/xdljob_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,9 @@ type XDLJobSpec struct {
// MinFinishWorkPercentage takes precedence over MinFinishWorkerNum if both are
// specified.
MinFinishWorkerPercentage *int32 `json:"minFinishWorkRate,omitempty"`

// NetworkMode defines network mode for intra job communicating.
NetworkMode *v1.NetworkMode `json:"networkmode,omitempty"`
}

// XDLJobStatus defines the observed state of XDLJob
Expand Down
3 changes: 3 additions & 0 deletions apis/training/v1alpha1/xgboostjob_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,9 @@ type XGBoostJobSpec struct {
// "Worker": ReplicaSpec,
// }
XGBReplicaSpecs map[commonv1.ReplicaType]*commonv1.ReplicaSpec `json:"xgbReplicaSpecs"`

// NetworkMode defines network mode for intra job communicating.
NetworkMode *commonv1.NetworkMode `json:"networkmode,omitempty"`
}

// XGBoostJobStatus defines the observed state of XGBoostJob
Expand Down
35 changes: 35 additions & 0 deletions apis/training/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions config/crd/bases/inference.kubedl.io_elasticbatchjobs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3069,6 +3069,8 @@ spec:
type: object
type: object
type: object
networkmode:
type: string
schedulingPolicy:
properties:
minAvailable:
Expand Down
2 changes: 2 additions & 0 deletions config/crd/bases/training.kubedl.io_elasticdljobs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3063,6 +3063,8 @@ spec:
type: object
type: object
type: object
networkmode:
type: string
schedulingPolicy:
properties:
minAvailable:
Expand Down
2 changes: 2 additions & 0 deletions config/crd/bases/training.kubedl.io_marsjobs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3063,6 +3063,8 @@ spec:
type: object
type: object
type: object
networkmode:
type: string
schedulingPolicy:
properties:
minAvailable:
Expand Down
2 changes: 2 additions & 0 deletions config/crd/bases/training.kubedl.io_mpijobs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3075,6 +3075,8 @@ spec:
type: object
type: object
type: object
networkmode:
type: string
processingResourceType:
type: string
processingUnits:
Expand Down
2 changes: 2 additions & 0 deletions config/crd/bases/training.kubedl.io_pytorchjobs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,8 @@ spec:
type: object
type: object
type: object
networkmode:
type: string
pytorchReplicaSpecs:
additionalProperties:
properties:
Expand Down
2 changes: 2 additions & 0 deletions config/crd/bases/training.kubedl.io_tfjobs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,8 @@ spec:
type: object
type: object
type: object
networkmode:
type: string
schedulingPolicy:
properties:
minAvailable:
Expand Down
2 changes: 2 additions & 0 deletions config/crd/bases/training.kubedl.io_xdljobs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,8 @@ spec:
minFinishWorkRate:
format: int32
type: integer
networkmode:
type: string
schedulingPolicy:
properties:
minAvailable:
Expand Down
2 changes: 2 additions & 0 deletions config/crd/bases/training.kubedl.io_xgboostjobs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,8 @@ spec:
required:
- schedule
type: object
networkmode:
type: string
schedulingPolicy:
properties:
minAvailable:
Expand Down
2 changes: 1 addition & 1 deletion controllers/elasticbatch/elasticbatchjob_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ func (r *ElasticBatchJobReconciler) Reconcile(_ context.Context, req ctrl.Reques
// Set default properties for elasticbatch job.
r.scheme.Default(elasticbatchJob)

result, err := r.ctrl.ReconcileJobs(elasticbatchJob, elasticbatchJob.Spec.ElasticBatchReplicaSpecs, elasticbatchJob.Status, &elasticbatchJob.Spec.RunPolicy, nil, nil)
result, err := r.ctrl.ReconcileJobs(elasticbatchJob, elasticbatchJob.Spec.ElasticBatchReplicaSpecs, elasticbatchJob.Status, &elasticbatchJob.Spec.RunPolicy, nil, nil, elasticbatchJob.Spec.NetworkMode)
if err != nil {
log.Error(err, "elasticbatch job reconcile failed")
return result, err
Expand Down
2 changes: 1 addition & 1 deletion controllers/elasticdl/elasticdljob_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ func (r *ElasticDLJobReconciler) Reconcile(_ context.Context, req ctrl.Request)
// Set default properties for elasicdl job.
r.scheme.Default(elasticdlJob)

result, err := r.ctrl.ReconcileJobs(elasticdlJob, elasticdlJob.Spec.ElasticDLReplicaSpecs, elasticdlJob.Status, &elasticdlJob.Spec.RunPolicy, nil, nil)
result, err := r.ctrl.ReconcileJobs(elasticdlJob, elasticdlJob.Spec.ElasticDLReplicaSpecs, elasticdlJob.Status, &elasticdlJob.Spec.RunPolicy, nil, nil, elasticdlJob.Spec.NetworkMode)
if err != nil {
log.Error(err, "elasticdl job reconcile failed")
return result, err
Expand Down
2 changes: 1 addition & 1 deletion controllers/mars/marsjob_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ func (r *MarsJobReconciler) Reconcile(_ context.Context, req ctrl.Request) (ctrl

r.scheme.Default(marsJob)

result, err := r.ctrl.ReconcileJobs(marsJob, marsJob.Spec.MarsReplicaSpecs, marsJob.Status.JobStatus, &marsJob.Spec.RunPolicy, nil, nil)
result, err := r.ctrl.ReconcileJobs(marsJob, marsJob.Spec.MarsReplicaSpecs, marsJob.Status.JobStatus, &marsJob.Spec.RunPolicy, nil, nil, marsJob.Spec.NetworkMode)
if err != nil {
log.Error(err, "mars job reconcile failed")
return result, err
Expand Down
2 changes: 1 addition & 1 deletion controllers/mpi/mpijob_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@ func (r *MPIJobReconciler) Reconcile(_ context.Context, req ctrl.Request) (ctrl.
// Set default properties for tensorflow job.
r.scheme.Default(mpiJob)

result, err = r.ctrl.ReconcileJobs(mpiJob, mpiJob.Spec.MPIReplicaSpecs, mpiJob.Status, &mpiJob.Spec.RunPolicy, nil, nil)
result, err = r.ctrl.ReconcileJobs(mpiJob, mpiJob.Spec.MPIReplicaSpecs, mpiJob.Status, &mpiJob.Spec.RunPolicy, nil, nil, mpiJob.Spec.NetworkMode)
if err != nil {
log.Error(err, "mpi job reconcile failed")
return result, err
Expand Down
4 changes: 2 additions & 2 deletions controllers/pytorch/pytorchjob_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,7 @@ func (r *PytorchJobReconciler) Reconcile(_ context.Context, req ctrl.Request) (c
// Set default properties for pytorch job.
r.scheme.Default(pytorchJob)

result, err := r.ctrl.ReconcileJobs(pytorchJob, pytorchJob.Spec.PyTorchReplicaSpecs, pytorchJob.Status, &pytorchJob.Spec.RunPolicy, pytorchJob.Spec.ModelVersion, pytorchJob.Spec.CacheBackend)
result, err := r.ctrl.ReconcileJobs(pytorchJob, pytorchJob.Spec.PyTorchReplicaSpecs, pytorchJob.Status, &pytorchJob.Spec.RunPolicy, pytorchJob.Spec.ModelVersion, pytorchJob.Spec.CacheBackend, pytorchJob.Spec.NetworkMode)
if err != nil {
log.Error(err, "pytorch job reconcile failed")
return result, err
Expand Down Expand Up @@ -241,7 +241,7 @@ func (r *PytorchJobReconciler) SetClusterSpec(ctx context.Context, job interface
}

masterRole := rtype == strings.ToLower(string(training.PyTorchReplicaTypeMaster))
if masterHostPort, ok := job_controller.GetHostNetworkPortFromContext(ctx, "master", "0"); job_controller.EnableHostNetwork(pytorchJob) && ok {
if masterHostPort, ok := job_controller.GetHostNetworkPortFromContext(ctx, "master", "0"); job_controller.EnableHostNetwork(pytorchJob.Spec.NetworkMode) && ok {
if masterRole || features.KubeDLFeatureGates.Enabled(features.HostNetWithHeadlessSvc) {
masterPort = masterHostPort
}
Expand Down
2 changes: 1 addition & 1 deletion controllers/tensorflow/tensorflow.go
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,7 @@ func genClusterSpec(ctx context.Context, tfJob *training.TFJob, selfType, selfIn
selfPort := port
// Set endpoint port as selected hostnetwork port so that tensorflow worker process could listen
// to correct port by TF_CONFIG[cluster].
if job_controller.EnableHostNetwork(tfJob) && rt == selfType && strconv.Itoa(int(i)) == selfIndex {
if job_controller.EnableHostNetwork(tfJob.Spec.NetworkMode) && rt == selfType && strconv.Itoa(int(i)) == selfIndex {
hostPort, ok := job_controller.GetHostNetworkPortFromContext(ctx, selfType, selfIndex)
if ok {
selfPort = hostPort
Expand Down
2 changes: 1 addition & 1 deletion controllers/tensorflow/tfjob_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,7 @@ func (r *TFJobReconciler) Reconcile(_ context.Context, req ctrl.Request) (ctrl.R
// Set default properties for tensorflow job.
r.scheme.Default(tfJob)

result, err := r.ctrl.ReconcileJobs(tfJob, tfJob.Spec.TFReplicaSpecs, tfJob.Status, &tfJob.Spec.RunPolicy, tfJob.Spec.ModelVersion, tfJob.Spec.CacheBackend)
result, err := r.ctrl.ReconcileJobs(tfJob, tfJob.Spec.TFReplicaSpecs, tfJob.Status, &tfJob.Spec.RunPolicy, tfJob.Spec.ModelVersion, tfJob.Spec.CacheBackend, tfJob.Spec.NetworkMode)
if err != nil {
log.Error(err, "tensorflow job reconcile failed")
return result, err
Expand Down
2 changes: 1 addition & 1 deletion controllers/xdl/xdljob_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ func (r *XDLJobReconciler) Reconcile(_ context.Context, request reconcile.Reques
// Set default properties for xdl job.
r.scheme.Default(xdlJob)

result, err := r.ctrl.ReconcileJobs(xdlJob, xdlJob.Spec.XDLReplicaSpecs, xdlJob.Status, &xdlJob.Spec.RunPolicy, nil, nil)
result, err := r.ctrl.ReconcileJobs(xdlJob, xdlJob.Spec.XDLReplicaSpecs, xdlJob.Status, &xdlJob.Spec.RunPolicy, nil, nil, xdlJob.Spec.NetworkMode)
if err != nil {
log.Error(err, "xdl job reconcile failed.")
return result, err
Expand Down
2 changes: 1 addition & 1 deletion controllers/xgboost/xgboostjob_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ func (r *XgboostJobReconciler) Reconcile(_ context.Context, req reconcile.Reques
// Set default properties for xgboost job
r.scheme.Default(xgboostjob)

result, err := r.ctrl.ReconcileJobs(xgboostjob, xgboostjob.Spec.XGBReplicaSpecs, xgboostjob.Status.JobStatus, &xgboostjob.Spec.RunPolicy, nil, nil)
result, err := r.ctrl.ReconcileJobs(xgboostjob, xgboostjob.Spec.XGBReplicaSpecs, xgboostjob.Status.JobStatus, &xgboostjob.Spec.RunPolicy, nil, nil, xgboostjob.Spec.NetworkMode)
if err != nil {
log.Error(err, "xgboost job reconcile failed")
return result, err
Expand Down
10 changes: 0 additions & 10 deletions pkg/job_controller/api/v1/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,6 @@ const (
AnnotationGitSyncConfig = KubeDLPrefix + "/git-sync-config"
// AnnotationTenancyInfo annotate tenancy information.
AnnotationTenancyInfo = KubeDLPrefix + "/tenancy"
// AnnotationNetworkMode annotate job network mode.
AnnotationNetworkMode = KubeDLPrefix + "/network-mode"
// AnnotationEnableElasticTraining indicates job enables elastic training.
AnnotationEnableElasticTraining = KubeDLPrefix + "/enable-elastic-training"
// AnnotationElasticScaleState indicates current progress of elastic scaling (inflight | done)
Expand Down Expand Up @@ -75,14 +73,6 @@ const (
JobReplicaTypeAIMaster ReplicaType = "AIMaster"
)

// NetworkMode defines network mode for intra job communicating.
type NetworkMode string

const (
// HostNetworkMode indicates that replicas use host-network to communicate with each other.
HostNetworkMode NetworkMode = "host"
)

const (
ElasticScaleInflight = "inflight"
ElasticScaleDone = "done"
Expand Down
2 changes: 2 additions & 0 deletions pkg/job_controller/api/v1/interface.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (
)

// ControllerInterface defines the Interface to be implemented by custom operators. e.g. tf-operator needs to implement this interface
// +k8s:deepcopy-gen=false
type ControllerInterface interface {
//ControllerName Returns the Controller name
ControllerName() string
Expand Down Expand Up @@ -72,6 +73,7 @@ type ControllerInterface interface {
}

// ElasticScaling defines the interface to be implemented by custom workload elastic behaviors.
// +k8s:deepcopy-gen=false
type ElasticScaling interface {
// EnableElasticScaling indicates workload enables elastic scaling or not.
EnableElasticScaling(job v1.Object, runPolicy *RunPolicy) bool
Expand Down
8 changes: 8 additions & 0 deletions pkg/job_controller/api/v1/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -312,3 +312,11 @@ type DAGCondition struct {
// OnPhase defines at which phase the upstream replica will trigger this condition.
OnPhase v1.PodPhase `json:"onPhase"`
}

// NetworkMode defines network mode for intra job communicating.
type NetworkMode string

const (
// HostNetworkMode indicates that replicas use host-network to communicate with each other.
HostNetworkMode NetworkMode = "host"
)
Loading