Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions doc/config.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,7 @@ After making changes, restart the volcano-vgpu-device-plugin and volcano-schedul
## Node Configs

**Note:**
All the configurations listed below are managed within the `volcano-vgpu-node-config` ConfigMap.
You can update these configurations using the following methods:
`volcano-vgpu-device-plugin` supports per-node configuration of the device plugin's behavior. All of these settings are managed centrally in the `volcano-vgpu-node-config` ConfigMap, and you can update them using the following methods:

```bash
kubectl edit configmap volcano-vgpu-node-config -n <namespace>
Expand All @@ -56,3 +55,8 @@ String type, `hami-core` for using hami-core for container resource limitation,
Integer type, device memory oversubscription on that node
* `devicecorescaling`:
Integer type, device core oversubscription on that node
* `devicesplitcount`: Allowed number of tasks sharing a device.
* `filterdevices`: Devices to exclude from registration with HAMi.
* `uuid`: UUIDs of devices to ignore.
* `index`: Indexes of devices to ignore.
* A device is ignored by HAMi if it appears in either the `uuid` list or the `index` list.
45 changes: 45 additions & 0 deletions pkg/plugin/vgpu/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -146,3 +146,48 @@ type DevicePluginConfigs struct {
FilterDevice *FilterDevice `json:"filterdevices"`
} `json:"nodeconfig"`
}

// filterOnce guards the one-time construction of the lookup sets below.
var filterOnce sync.Once

// Lookup sets used to decide whether a device is filtered. Both are nil until
// they are populated.
var (
	// uuidMap is the set of device UUIDs to filter.
	uuidMap map[string]struct{}
	// indexMap is the set of device indexes to filter.
	indexMap map[uint]struct{}
)

// FilterDeviceToRegister reports whether the GPU identified by uuid and index
// is listed in the configured device filter.
//
// The lookup sets are initialized on first call through filterOnce. The
// function returns true when uuid is present in the UUID set or index is
// present in the index set, and false otherwise, including when both sets are
// empty.
func FilterDeviceToRegister(uuid string, index int) bool {
	filterOnce.Do(initFilter)

	if _, ok := uuidMap[uuid]; ok {
		return true
	}

	// A negative index cannot name a device. Reject it before converting so it
	// does not wrap around to a large unsigned key.
	if index < 0 {
		return false
	}

	_, ok := indexMap[uint(index)]
	return ok
}

func initFilter() {
uuidMap = make(map[string]struct{})
indexMap = make(map[uint]struct{})
if DevicePluginFilterDevice == nil {
return
}

if len(DevicePluginFilterDevice.UUID) > 0 {
uuidMap = make(map[string]struct{}, len(DevicePluginFilterDevice.UUID))
for _, u := range DevicePluginFilterDevice.UUID {
uuidMap[u] = struct{}{}
}
}

if len(DevicePluginFilterDevice.Index) > 0 {
indexMap = make(map[uint]struct{}, len(DevicePluginFilterDevice.Index))
for _, idx := range DevicePluginFilterDevice.Index {
indexMap[idx] = struct{}{}
}
}
}
8 changes: 8 additions & 0 deletions pkg/plugin/vgpu/nvidia.go
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,14 @@ func (g *GpuDeviceManager) Devices() []*Device {
d, ret := config.Nvml().DeviceGetHandleByIndex(i)
check(ret)

uuid, ret := d.GetUUID()
check(ret)
// Filter GPU device
if config.FilterDeviceToRegister(uuid, i) {
klog.V(5).Infof("Filtering out GPU device index=%d, uuid=%s", i, uuid)
continue
}

migMode, _, ret := d.GetMigMode()
if ret != nvml.SUCCESS {
if ret == nvml.ERROR_NOT_SUPPORTED {
Expand Down
Loading