Skip to content

Commit

Permalink
Merge pull request #22 from elezar/fix-devices
Browse files Browse the repository at this point in the history
Fix construction of linked devices.
  • Loading branch information
elezar authored Jan 11, 2024
2 parents 8fc3087 + b0ec32c commit 02af3d8
Show file tree
Hide file tree
Showing 29 changed files with 1,468 additions and 29 deletions.
38 changes: 38 additions & 0 deletions examples/devices/main.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
/**
# Copyright 2024 NVIDIA CORPORATION
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
**/

package main

import (
"fmt"
"os"

"github.com/NVIDIA/go-gpuallocator/gpuallocator"
)

// main obtains the device list from gpuallocator.NewDevices and prints the
// number of devices followed by the details of each one to standard output.
//
// If the device list cannot be constructed, the error is written to standard
// error and the process exits with status 1.
func main() {
	devices, err := gpuallocator.NewDevices()
	if err != nil {
		// Report failures on stderr so that diagnostics are not mixed into
		// the regular output of the example.
		fmt.Fprintf(os.Stderr, "error getting devices: %v\n", err)
		os.Exit(1)
	}

	fmt.Printf("Found %d devices:\n", len(devices))
	for i, device := range devices {
		fmt.Printf("device %d:\n", i)
		fmt.Printf("%s\n", device.Details())
	}
}
7 changes: 5 additions & 2 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,12 @@ module github.com/NVIDIA/go-gpuallocator

go 1.20

require github.com/NVIDIA/go-nvlib v0.0.0-20231116150931-9fd385bace0d
require github.com/NVIDIA/go-nvlib v0.0.0-20240109130712-11603560817a

require github.com/NVIDIA/go-nvml v0.12.0-1.0.20231020145430-e06766c5e74f // indirect
require (
github.com/NVIDIA/go-nvml v0.12.0-1.0.20231020145430-e06766c5e74f // indirect
github.com/google/uuid v1.4.0 // indirect
)

replace (
k8s.io/api => k8s.io/api v0.18.2
Expand Down
6 changes: 4 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
github.com/NVIDIA/go-nvlib v0.0.0-20231116150931-9fd385bace0d h1:XxRHS7eNkZVcPpZZmUcoT4oO8FEcoYKn06sooQh5niU=
github.com/NVIDIA/go-nvlib v0.0.0-20231116150931-9fd385bace0d/go.mod h1:HPFNPAYqQeoos58MKUboWsdZMu71EzSQrbmd+QBRD40=
github.com/NVIDIA/go-nvlib v0.0.0-20240109130712-11603560817a h1:EH7wiaq9+NYDgCBJEcGa3HTO2Sz6dRlmO2y9yMxA5jE=
github.com/NVIDIA/go-nvlib v0.0.0-20240109130712-11603560817a/go.mod h1:U82N6/xKp6OnoqpALBH0C5SO59Buu4sX1Z3rQtBsBKQ=
github.com/NVIDIA/go-nvml v0.12.0-1.0.20231020145430-e06766c5e74f h1:FTblgO87K1vPB8tcwM5EOFpFf6UpsrlDpErPm25mFWE=
github.com/NVIDIA/go-nvml v0.12.0-1.0.20231020145430-e06766c5e74f/go.mod h1:7ruy85eOM73muOc/I37euONSwEyFqZsv5ED9AogD4G0=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/google/uuid v1.4.0 h1:MtMxsa51/r9yyhkyLsVeVt0B+BGQZzpQiTQ4eHZ8bc4=
github.com/google/uuid v1.4.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
Expand Down
21 changes: 11 additions & 10 deletions gpuallocator/device.go
Original file line number Diff line number Diff line change
Expand Up @@ -98,36 +98,37 @@ func (o *deviceListBuilder) build() (DeviceList, error) {
_ = o.nvmllib.Shutdown()
}()

nvmlDevices, err := o.devicelib.GetDevices()
if err != nil {
return nil, fmt.Errorf("failed to get devices: %v", err)
}

var devices DeviceList
err := o.devicelib.VisitDevices(func(i int, d device.Device) error {
for i, d := range nvmlDevices {
device, err := newDevice(i, d)
if err != nil {
return fmt.Errorf("failed to construct linked device: %v", err)
return nil, fmt.Errorf("failed to construct linked device: %v", err)
}
devices = append(devices, device)
return nil
})
if err != nil {
return nil, fmt.Errorf("failed to get devices: %v", err)
}

for i, d1 := range devices {
for j, d2 := range devices {
for i, d1 := range nvmlDevices {
for j, d2 := range nvmlDevices {
if i != j {
p2plink, err := links.GetP2PLink(d1, d2)
if err != nil {
return nil, fmt.Errorf("error getting P2PLink for devices (%v, %v): %v", i, j, err)
}
if p2plink != links.P2PLinkUnknown {
d1.Links[d2.Index] = append(d1.Links[d2.Index], P2PLink{d2, p2plink})
devices[i].Links[j] = append(devices[i].Links[j], P2PLink{devices[j], p2plink})
}

nvlink, err := links.GetNVLink(d1, d2)
if err != nil {
return nil, fmt.Errorf("error getting NVLink for devices (%v, %v): %v", i, j, err)
}
if nvlink != links.P2PLinkUnknown {
d1.Links[d2.Index] = append(d1.Links[d2.Index], P2PLink{d2, nvlink})
devices[i].Links[j] = append(devices[i].Links[j], P2PLink{devices[j], nvlink})
}
}
}
Expand Down
78 changes: 67 additions & 11 deletions internal/links/device.go
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,62 @@ const (
EighteenNVLINKLinks
)

// String returns the string representation of the P2PLink type.
//
// Each link type handled below is rendered as the name of its constant.
// Any other value is rendered as "UNKNOWN (<n>)", where <n> is the value
// converted to an unsigned integer.
func (l P2PLinkType) String() string {
	switch l {
	case P2PLinkCrossCPU:
		return "P2PLinkCrossCPU"
	case P2PLinkSameCPU:
		return "P2PLinkSameCPU"
	case P2PLinkHostBridge:
		return "P2PLinkHostBridge"
	case P2PLinkMultiSwitch:
		return "P2PLinkMultiSwitch"
	case P2PLinkSingleSwitch:
		return "P2PLinkSingleSwitch"
	case P2PLinkSameBoard:
		return "P2PLinkSameBoard"
	case SingleNVLINKLink:
		return "SingleNVLINKLink"
	case TwoNVLINKLinks:
		return "TwoNVLINKLinks"
	case ThreeNVLINKLinks:
		return "ThreeNVLINKLinks"
	case FourNVLINKLinks:
		return "FourNVLINKLinks"
	case FiveNVLINKLinks:
		return "FiveNVLINKLinks"
	case SixNVLINKLinks:
		return "SixNVLINKLinks"
	case SevenNVLINKLinks:
		return "SevenNVLINKLinks"
	case EightNVLINKLinks:
		return "EightNVLINKLinks"
	case NineNVLINKLinks:
		return "NineNVLINKLinks"
	case TenNVLINKLinks:
		return "TenNVLINKLinks"
	case ElevenNVLINKLinks:
		return "ElevenNVLINKLinks"
	case TwelveNVLINKLinks:
		return "TwelveNVLINKLinks"
	case ThirteenNVLINKLinks:
		return "ThirteenNVLINKLinks"
	case FourteenNVLINKLinks:
		return "FourteenNVLINKLinks"
	case FifteenNVLINKLinks:
		return "FifteenNVLINKLinks"
	case SixteenNVLINKLinks:
		return "SixteenNVLINKLinks"
	case SeventeenNVLINKLinks:
		return "SeventeenNVLINKLinks"
	case EighteenNVLINKLinks:
		return "EighteenNVLINKLinks"
	default:
		// Values without an explicit case above are reported together with
		// their numeric value so that they remain distinguishable in output.
		return fmt.Sprintf("UNKNOWN (%v)", uint(l))
	}
}

// GetP2PLink gets the peer-to-peer connectivity between two devices.
func GetP2PLink(dev1 device.Device, dev2 device.Device) (P2PLinkType, error) {
level, ret := dev1.GetTopologyCommonAncestor(dev2)
Expand Down Expand Up @@ -149,23 +205,23 @@ func getAllNvLinkRemotePciInfo(dev device.Device) ([]PciInfo, error) {
var pciInfos []PciInfo
for i := 0; i < nvml.NVLINK_MAX_LINKS; i++ {
state, ret := dev.GetNvLinkState(i)
if ret == nvml.ERROR_NOT_SUPPORTED {
if ret == nvml.ERROR_NOT_SUPPORTED || ret == nvml.ERROR_INVALID_ARGUMENT {
continue
}
if ret != nvml.SUCCESS {
return nil, fmt.Errorf("failed to get nvlink state: %v", ret)
}

if state == nvml.FEATURE_ENABLED {
pciInfo, ret := dev.GetNvLinkRemotePciInfo(i)
if ret == nvml.ERROR_NOT_SUPPORTED {
continue
}
if ret != nvml.SUCCESS {
return nil, fmt.Errorf("failed to get remote pci info: %v", ret)
}
pciInfos = append(pciInfos, PciInfo(pciInfo))
if state != nvml.FEATURE_ENABLED {
continue
}
pciInfo, ret := dev.GetNvLinkRemotePciInfo(i)
if ret == nvml.ERROR_NOT_SUPPORTED || ret == nvml.ERROR_INVALID_ARGUMENT {
continue
}
if ret != nvml.SUCCESS {
return nil, fmt.Errorf("failed to get remote pci info: %v", ret)
}
pciInfos = append(pciInfos, PciInfo(pciInfo))
}

return pciInfos, nil
Expand Down
94 changes: 94 additions & 0 deletions vendor/github.com/NVIDIA/go-nvlib/pkg/nvlib/device/identifier.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

11 changes: 8 additions & 3 deletions vendor/github.com/NVIDIA/go-nvlib/pkg/nvml/device.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 02af3d8

Please sign in to comment.