Skip to content

Commit

Permalink
koordlet: rdma device inject
Browse files Browse the repository at this point in the history
Signed-off-by: wangjianyu.wjy <[email protected]>
  • Loading branch information
wangjianyu.wjy committed Nov 29, 2024
1 parent 1afffbe commit d16c933
Show file tree
Hide file tree
Showing 3 changed files with 139 additions and 2 deletions.
9 changes: 9 additions & 0 deletions pkg/koordlet/runtimehooks/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ import (
"github.com/koordinator-sh/koordinator/pkg/koordlet/runtimehooks/hooks/cpuset"
"github.com/koordinator-sh/koordinator/pkg/koordlet/runtimehooks/hooks/gpu"
"github.com/koordinator-sh/koordinator/pkg/koordlet/runtimehooks/hooks/groupidentity"
"github.com/koordinator-sh/koordinator/pkg/koordlet/runtimehooks/hooks/rdma"
"github.com/koordinator-sh/koordinator/pkg/koordlet/runtimehooks/hooks/resctrl"
"github.com/koordinator-sh/koordinator/pkg/koordlet/runtimehooks/hooks/tc"
"github.com/koordinator-sh/koordinator/pkg/koordlet/runtimehooks/hooks/terwayqos"
Expand Down Expand Up @@ -60,6 +61,12 @@ const (
// beta: v1.1
GPUEnvInject featuregate.Feature = "GPUEnvInject"

// RDMADeviceInject injects rdma device info according to allocate result from koord-scheduler.
//
// owner: @ZiMengSheng
// alpha: v1.6
RDMADeviceInject featuregate.Feature = "RDMADeviceInject"

// BatchResource sets request and limits of cpu and memory on cgroup file according batch resources.
//
// owner: @saintube @zwzhang0107
Expand Down Expand Up @@ -101,6 +108,7 @@ var (
GroupIdentity: {Default: true, PreRelease: featuregate.Beta},
CPUSetAllocator: {Default: true, PreRelease: featuregate.Beta},
GPUEnvInject: {Default: false, PreRelease: featuregate.Alpha},
RDMADeviceInject: {Default: false, PreRelease: featuregate.Alpha},
BatchResource: {Default: true, PreRelease: featuregate.Beta},
CPUNormalization: {Default: false, PreRelease: featuregate.Alpha},
CoreSched: {Default: false, PreRelease: featuregate.Alpha},
Expand All @@ -113,6 +121,7 @@ var (
GroupIdentity: groupidentity.Object(),
CPUSetAllocator: cpuset.Object(),
GPUEnvInject: gpu.Object(),
RDMADeviceInject: rdma.Object(),
BatchResource: batchresource.Object(),
CPUNormalization: cpunormalization.Object(),
CoreSched: coresched.Object(),
Expand Down
120 changes: 120 additions & 0 deletions pkg/koordlet/runtimehooks/hooks/rdma/rdma.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
/*
Copyright 2022 The Koordinator Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package rdma

import (
"fmt"
"os"
"path/filepath"

"github.com/containerd/nri/pkg/api"
"k8s.io/klog/v2"

ext "github.com/koordinator-sh/koordinator/apis/extension"
schedulingv1alpha1 "github.com/koordinator-sh/koordinator/apis/scheduling/v1alpha1"
"github.com/koordinator-sh/koordinator/pkg/koordlet/runtimehooks/hooks"
"github.com/koordinator-sh/koordinator/pkg/koordlet/runtimehooks/protocol"
rmconfig "github.com/koordinator-sh/koordinator/pkg/runtimeproxy/config"
)

const (
IBDevDir = "/dev/infiniband"
RDMACM = "/dev/infiniband/rdma_cm"
SysBusPci = "/sys/bus/pci/devices"
)

type rdmaPlugin struct{}

func (p *rdmaPlugin) Register(op hooks.Options) {
klog.V(5).Infof("register hook %v", "rdma device inject")
hooks.Register(rmconfig.PreCreateContainer, "rdma device inject", "inject NVIDIA_VISIBLE_DEVICES env into container", p.InjectDevice)
}

var singleton *rdmaPlugin

func Object() *rdmaPlugin {
if singleton == nil {
singleton = &rdmaPlugin{}
}
return singleton
}

func (p *rdmaPlugin) InjectDevice(proto protocol.HooksProtocol) error {
containerCtx := proto.(*protocol.ContainerContext)
if containerCtx == nil {
return fmt.Errorf("container protocol is nil for plugin gpu")
}
containerReq := containerCtx.Request
alloc, err := ext.GetDeviceAllocations(containerReq.PodAnnotations)
if err != nil {
return err
}
devices, ok := alloc[schedulingv1alpha1.RDMA]
if !ok || len(devices) == 0 {
klog.V(5).Infof("no rdma alloc info in pod anno, %s", containerReq.PodMeta.Name)
return nil
}
containerCtx.Response.AddContainerDevices = []*api.LinuxDevice{
{
Path: RDMACM,
Type: "c",
FileMode: &api.OptionalFileMode{
Value: 0666,
},
},
}

for _, device := range devices {
for _, vf := range device.Extension.VirtualFunctions {
uverbsOfVF, err := getUVerbsViaPciAdd(vf.BusID)
if err != nil {
return err
}
containerCtx.Response.AddContainerDevices = append(containerCtx.Response.AddContainerDevices,
&api.LinuxDevice{
Path: uverbsOfVF,
Type: "c",
FileMode: &api.OptionalFileMode{
Value: 0666,
},
})
}
uverbs, err := getUVerbsViaPciAdd(device.ID)
if err != nil {
return err
}
containerCtx.Response.AddContainerDevices = append(containerCtx.Response.AddContainerDevices,
&api.LinuxDevice{
Path: uverbs,
Type: "c",
FileMode: &api.OptionalFileMode{
Value: 0666,
},
})
}
return nil
}

func getUVerbsViaPciAdd(pciAddress string) (string, error) {
pciDir := filepath.Join(SysBusPci, pciAddress, "infiniband_verbs")
files, err := os.ReadDir(pciDir)
if err != nil || len(files) == 0 {
klog.Errorf("fail read pciDir %v", err)
return "", fmt.Errorf("failed to get uverbs: %s", err.Error())
}
return filepath.Join(IBDevDir, files[0].Name()), nil
}
12 changes: 10 additions & 2 deletions pkg/koordlet/runtimehooks/protocol/container_context.go
Original file line number Diff line number Diff line change
Expand Up @@ -181,8 +181,9 @@ func (c *ContainerRequest) FromReconciler(podMeta *statesinformer.PodMeta, conta
}

type ContainerResponse struct {
Resources Resources
AddContainerEnvs map[string]string
Resources Resources
AddContainerEnvs map[string]string
AddContainerDevices []*api.LinuxDevice
}

func (c *ContainerResponse) ProxyDone(resp *runtimeapi.ContainerResourceHookResponse) {
Expand Down Expand Up @@ -279,6 +280,13 @@ func (c *ContainerContext) NriDone(executor resourceexecutor.ResourceUpdateExecu
}
}

if len(c.Response.AddContainerDevices) != 0 {
for i := range c.Response.AddContainerDevices {
adjust.AddDevice(c.Response.AddContainerDevices[i])
}

}

c.Update()

return adjust, update, nil
Expand Down

0 comments on commit d16c933

Please sign in to comment.