Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions cmd/vgpu/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ func init() {
rootCmd.Flags().UintVar(&config.DeviceSplitCount, "device-split-count", 2, "the number for NVIDIA device split")
rootCmd.Flags().UintVar(&config.GPUMemoryFactor, "gpu-memory-factor", 1, "the default gpu memory block size is 1MB")
rootCmd.Flags().Float64Var(&config.DeviceCoresScaling, "device-cores-scaling", 1.0, "the ratio for NVIDIA device cores scaling")
rootCmd.Flags().Float64Var(&config.DeviceMemoryScaling, "device-memory-scaling", 1.0, "the ratio for NVIDIA device memory scaling")
rootCmd.Flags().StringVar(&config.NodeName, "node-name", viper.GetString("node-name"), "node name")

rootCmd.PersistentFlags().AddGoFlagSet(util.GlobalFlagSet())
Expand Down Expand Up @@ -104,13 +105,13 @@ func start() error {
klog.Info("Starting OS watcher.")
sigs := NewOSWatcher(syscall.SIGHUP, syscall.SIGINT, syscall.SIGTERM, syscall.SIGQUIT)

nvidiaCfg := util.LoadNvidiaConfig()
nvidiaCfg := util.LoadNvidiaConfig(migStrategyFlag)

cache := nvidiadevice.NewDeviceCache()
cache.Start()
defer cache.Stop()

register := nvidiadevice.NewDeviceRegister(cache)
register := nvidiadevice.NewDeviceRegister(cache, nvidiaCfg)
register.Start()
defer register.Stop()

Expand All @@ -122,7 +123,7 @@ restart:
p.Stop()
}
klog.Info("Retreiving plugins.")
migStrategy, err := nvidiadevice.NewMigStrategy(migStrategyFlag)
migStrategy, err := nvidiadevice.NewMigStrategy(nvidiaCfg.MigStrategy)
if err != nil {
return fmt.Errorf("error creating MIG strategy: %v", err)
}
Expand Down
2 changes: 1 addition & 1 deletion docker/Dockerfile.ubuntu20.04
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ FROM ubuntu:20.04 AS builder
ARG TARGETARCH
RUN apt-get update
RUN apt-get -y install ca-certificates g++ wget
RUN wget -qO- https://storage.googleapis.com/golang/go1.23.7.linux-${TARGETARCH}.tar.gz | tar -C /usr/local -zx
RUN wget -qO- https://storage.googleapis.com/golang/go1.24.6.linux-${TARGETARCH}.tar.gz | tar -C /usr/local -zx
ENV GOPATH=/go
ENV PATH=$GOPATH/bin:/usr/local/go/bin:$PATH
WORKDIR /go/src/volcano.sh/devices
Expand Down
16 changes: 9 additions & 7 deletions pkg/plugin/vgpu/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ type NvidiaConfig struct {
DisableCoreLimit bool `yaml:"disableCoreLimit"`
MigGeometriesList []AllowedMigGeometries `yaml:"knownMigGeometries"`
GPUMemoryFactor uint `yaml:"gpuMemoryFactor"`
MigStrategy string `yaml:"migStrategy"`
}

var (
Expand Down Expand Up @@ -70,13 +71,14 @@ func Device() device.Interface {
}

var (
DeviceSplitCount uint
GPUMemoryFactor uint
Mode string
DeviceCoresScaling float64
NodeName string
RuntimeSocketFlag string
DisableCoreLimit bool
DeviceSplitCount uint
GPUMemoryFactor uint
Mode string
DeviceCoresScaling float64
DeviceMemoryScaling float64
NodeName string
RuntimeSocketFlag string
DisableCoreLimit bool
)

type MigTemplate struct {
Expand Down
14 changes: 9 additions & 5 deletions pkg/plugin/vgpu/plugin.go
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ func (m *NvidiaDevicePlugin) initialize() {
m.server = grpc.NewServer([]grpc.ServerOption{}...)
m.health = make(chan *Device)
m.stop = make(chan interface{})
m.virtualDevices, _ = util.GetDevices(config.GPUMemoryFactor)
m.virtualDevices, _ = util.GetDevices(config.GPUMemoryFactor, m.schedulerConfig.DeviceMemoryScaling, m.migStrategy)
}

func (m *NvidiaDevicePlugin) cleanup() {
Expand Down Expand Up @@ -374,7 +374,7 @@ func (m *NvidiaDevicePlugin) ListAndWatch(e *pluginapi.Empty, s pluginapi.Device
}

} else {
_ = s.Send(&pluginapi.ListAndWatchResponse{Devices: m.apiDevices()})
_ = s.Send(&pluginapi.ListAndWatchResponse{Devices: m.apiDevices(m.schedulerConfig.DeviceCoreScaling)})
for {
select {
case <-m.stop:
Expand All @@ -383,7 +383,7 @@ func (m *NvidiaDevicePlugin) ListAndWatch(e *pluginapi.Empty, s pluginapi.Device
// FIXME: there is no way to recover from the Unhealthy state.
//d.Health = pluginapi.Unhealthy
log.Printf("'%s' device marked unhealthy: %s", m.resourceName, d.ID)
_ = s.Send(&pluginapi.ListAndWatchResponse{Devices: m.apiDevices()})
_ = s.Send(&pluginapi.ListAndWatchResponse{Devices: m.apiDevices(m.schedulerConfig.DeviceCoreScaling)})
}
}
}
Expand Down Expand Up @@ -559,7 +559,7 @@ func (m *NvidiaDevicePlugin) deviceIDsFromUUIDs(uuids []string) []string {
return uuids
}

func (m *NvidiaDevicePlugin) apiDevices() []*pluginapi.Device {
func (m *NvidiaDevicePlugin) apiDevices(coreScaling float64) []*pluginapi.Device {
if strings.Compare(m.migStrategy, "mixed") == 0 {
var pdevs []*pluginapi.Device
for _, d := range m.cachedDevices {
Expand Down Expand Up @@ -589,7 +589,11 @@ func (m *NvidiaDevicePlugin) apiDevices() []*pluginapi.Device {
if strings.Compare(m.resourceName, util.ResourceCores) == 0 {
for _, dev := range devices {
i := 0
for i < 100 {
cores := 100
if strings.Compare(m.migStrategy, "none") == 0 {
cores = int(coreScaling * float64(100))
}
for i < cores {
res = append(res, &pluginapi.Device{
ID: fmt.Sprintf("%v-core-%v", dev.ID, i),
Health: dev.Health,
Expand Down
27 changes: 16 additions & 11 deletions pkg/plugin/vgpu/register.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,16 +30,18 @@ import (
type DevListFunc func() []*Device

type DeviceRegister struct {
deviceCache *DeviceCache
unhealthy chan *Device
stopCh chan struct{}
deviceCache *DeviceCache
unhealthy chan *Device
nvidiaConfig *config.NvidiaConfig
stopCh chan struct{}
}

func NewDeviceRegister(deviceCache *DeviceCache) *DeviceRegister {
func NewDeviceRegister(deviceCache *DeviceCache, nvidiaConfig *config.NvidiaConfig) *DeviceRegister {
return &DeviceRegister{
deviceCache: deviceCache,
unhealthy: make(chan *Device),
stopCh: make(chan struct{}),
deviceCache: deviceCache,
nvidiaConfig: nvidiaConfig,
unhealthy: make(chan *Device),
stopCh: make(chan struct{}),
}
}

Expand Down Expand Up @@ -75,12 +77,15 @@ func (r *DeviceRegister) apiDevices() *[]*util.DeviceInfo {
}

klog.V(3).Infoln("nvml registered device id=", dev.ID, "memory=", memory.Total, "type=", model)

registeredmem := int32(memory.Total/(1024*1024)) / int32(config.GPUMemoryFactor)
memTotal := memory.Total / (1024 * 1024)
if r.nvidiaConfig.MigStrategy == MigStrategyNone {
memTotal = uint64(float64(memTotal) * r.nvidiaConfig.DeviceMemoryScaling)
}
registeredmem := int32(memTotal / uint64(config.GPUMemoryFactor))
klog.V(3).Infoln("GPUMemoryFactor=", config.GPUMemoryFactor, "registeredmem=", registeredmem)
res = append(res, &util.DeviceInfo{
Id: dev.ID,
Count: int32(config.DeviceSplitCount),
Count: int32(r.nvidiaConfig.DeviceSplitCount),
Devmem: registeredmem,
Mode: config.Mode,
Type: fmt.Sprintf("%v-%v", "NVIDIA", model),
Expand All @@ -98,7 +103,7 @@ func (r *DeviceRegister) RegisterInAnnotation() error {
klog.Errorln("get node error", err.Error())
return err
}
encodeddevices := util.EncodeNodeDevices(*devices)
encodeddevices := util.EncodeNodeDevices(*devices, r.nvidiaConfig.MigStrategy, r.nvidiaConfig.DeviceCoreScaling)
annos[util.NodeHandshake] = "Reported " + time.Now().String()
annos[util.NodeNvidiaDeviceRegistered] = encodeddevices
klog.Infoln("Reporting devices", encodeddevices, "in", time.Now().String())
Expand Down
54 changes: 40 additions & 14 deletions pkg/plugin/vgpu/util/util.go
Original file line number Diff line number Diff line change
Expand Up @@ -144,10 +144,14 @@ func DecodeNodeDevices(str string) []*DeviceInfo {
return retval
}

func EncodeNodeDevices(dlist []*DeviceInfo) string {
func EncodeNodeDevices(dlist []*DeviceInfo, migStrategy string, coreScaling float64) string {
tmp := ""
core := "100"
if strings.Compare(migStrategy, "none") == 0 {
core = strconv.FormatInt(int64(coreScaling*100), 10)
}
for _, val := range dlist {
tmp += val.Id + "," + strconv.FormatInt(int64(val.Count), 10) + "," + strconv.Itoa(int(val.Devmem)) + "," + val.Type + "," + strconv.FormatBool(val.Health) + "," + val.Mode + ":"
tmp += val.Id + "," + strconv.FormatInt(int64(val.Count), 10) + "," + strconv.Itoa(int(val.Devmem)) + "," + core + "," + val.Type + "," + strconv.FormatBool(val.Health) + "," + val.Mode + ":"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The string format here needs to be modified very carefully. It should be modified in synchronization with the volcano-scheduler and keep compatible.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

volcano APPROVED the PR:
volcano-sh/volcano#4597

}
klog.V(3).Infoln("Encoded node Devices", tmp)
return tmp
Expand All @@ -171,19 +175,22 @@ func EncodePodDevices(pd PodDevices) string {
return strings.Join(ss, ";")
}

func DecodeContainerDevices(str string) ContainerDevices {
func DecodeContainerDevices(str string) (ContainerDevices, error) {
if len(str) == 0 {
return ContainerDevices{}
return ContainerDevices{}, nil
}
cd := strings.Split(str, ":")
contdev := ContainerDevices{}
tmpdev := ContainerDevice{}
if len(str) == 0 {
return contdev
return contdev, nil
}
for _, val := range cd {
if strings.Contains(val, ",") {
tmpstr := strings.Split(val, ",")
if len(tmpstr) < 4 {
return contdev, fmt.Errorf("invalid container device format: %s", val)
}
tmpdev.UUID = tmpstr[0]
tmpdev.Type = tmpstr[1]
devmem, _ := strconv.ParseInt(tmpstr[2], 10, 32)
Expand All @@ -193,23 +200,30 @@ func DecodeContainerDevices(str string) ContainerDevices {
contdev = append(contdev, tmpdev)
}
}
return contdev
return contdev, nil
}

func DecodePodDevices(str string) PodDevices {
func DecodePodDevices(str string) (PodDevices, error) {
if len(str) == 0 {
return PodDevices{}
return PodDevices{}, nil
}
var pd PodDevices
for _, s := range strings.Split(str, ";") {
cd := DecodeContainerDevices(s)
cd, err := DecodeContainerDevices(s)
if err != nil {
return nil, err
}
pd = append(pd, cd)
}
return pd
return pd, nil
}

func GetNextDeviceRequest(dtype string, p v1.Pod) (v1.Container, ContainerDevices, error) {
pdevices := DecodePodDevices(p.Annotations[AssignedIDsToAllocateAnnotations])
pdevices, err := DecodePodDevices(p.Annotations[AssignedIDsToAllocateAnnotations])
if err != nil {
klog.Errorf("failed to decode pod devices: %v", err)
return v1.Container{}, nil, err
}
klog.Infoln("pdevices=", pdevices)
res := ContainerDevices{}
for idx, val := range pdevices {
Expand All @@ -228,7 +242,11 @@ func GetNextDeviceRequest(dtype string, p v1.Pod) (v1.Container, ContainerDevice
}

func EraseNextDeviceTypeFromAnnotation(dtype string, p v1.Pod) error {
pdevices := DecodePodDevices(p.Annotations[AssignedIDsToAllocateAnnotations])
pdevices, err := DecodePodDevices(p.Annotations[AssignedIDsToAllocateAnnotations])
if err != nil {
klog.Errorf("failed to decode pod devices: %v", err)
return err
}
res := PodDevices{}
found := false
for _, val := range pdevices {
Expand Down Expand Up @@ -383,7 +401,7 @@ func GenerateVirtualDeviceID(id uint, fakeCounter uint) string {
}

// GetDevices returns virtual devices and all physical devices by index.
func GetDevices(gpuMemoryFactor uint) ([]*pluginapi.Device, map[uint]string) {
func GetDevices(gpuMemoryFactor uint, memoryScaling float64, migStrategy string) ([]*pluginapi.Device, map[uint]string) {
n, ret := config.Nvml().DeviceGetCount()
if ret != nvml.SUCCESS {
klog.Fatalf("call nvml.DeviceGetCount with error: %v", ret)
Expand All @@ -407,6 +425,9 @@ func GetDevices(gpuMemoryFactor uint) ([]*pluginapi.Device, map[uint]string) {
klog.Fatalf("call GetMemoryInfo with error: %v", ret)
}
deviceGPUMemory := uint(memory.Total / (1024 * 1024))
if strings.Compare(migStrategy, "none") == 0 {
deviceGPUMemory = uint(float64(deviceGPUMemory) * memoryScaling)
}
for j := uint(0); j < deviceGPUMemory/gpuMemoryFactor; j++ {
klog.V(4).Infof("adding virtual device: %d", j)
fakeID := GenerateVirtualDeviceID(id, j)
Expand Down Expand Up @@ -543,7 +564,7 @@ func ExtractMigTemplatesFromUUID(uuid string) (string, int, error) {
return templateGroupName, pos, nil
}

func LoadNvidiaConfig() *config.NvidiaConfig {
func LoadNvidiaConfig(migStrategyFlag string) *config.NvidiaConfig {
configs, err := LoadConfigFromCM("volcano-vgpu-device-config")
if err != nil {
klog.InfoS("configMap not found", err.Error())
Expand All @@ -555,6 +576,8 @@ func LoadNvidiaConfig() *config.NvidiaConfig {
nvidiaConfig.DeviceSplitCount = config.DeviceSplitCount
nvidiaConfig.DeviceCoreScaling = config.DeviceCoresScaling
nvidiaConfig.GPUMemoryFactor = config.GPUMemoryFactor
nvidiaConfig.DeviceMemoryScaling = config.DeviceMemoryScaling
nvidiaConfig.MigStrategy = migStrategyFlag
if err := readFromConfigFile(&nvidiaConfig); err != nil {
klog.InfoS("readFrom device cm error", err.Error())
}
Expand Down Expand Up @@ -583,6 +606,9 @@ func readFromConfigFile(sConfig *config.NvidiaConfig) error {
if val.Devicecorescaling > 0 {
sConfig.DeviceCoreScaling = val.Devicecorescaling
}
if val.Migstrategy != "" {
sConfig.MigStrategy = val.Migstrategy
}
if val.Devicesplitcount > 0 {
sConfig.DeviceSplitCount = val.Devicesplitcount
}
Expand Down
Loading