Skip to content

Commit

Permalink
Merge pull request #10 from TymonLee/main_add_vgpu_monitor
Browse files Browse the repository at this point in the history
add vGPU monitor container
  • Loading branch information
archlitchi authored Aug 12, 2024
2 parents 18eb946 + 567494c commit 1a14603
Show file tree
Hide file tree
Showing 15 changed files with 1,135 additions and 32 deletions.
7 changes: 7 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,13 @@ volcano-scheduler-metrics records every GPU usage and limitation, visit the foll
curl {volcano scheduler cluster ip}:8080/metrics
```

You can also collect the **GPU utilization**, **GPU memory usage**, **pods' GPU memory limitations** and **pods' GPU memory usage** metrics on nodes by visiting the following addresses:

```
curl {volcano device plugin cluster ip}:9394/metrics
```
![img](./doc/vgpu_device_plugin_metrics.png)

# Issues and Contributing
[Checkout the Contributing document!](CONTRIBUTING.md)

Expand Down
17 changes: 17 additions & 0 deletions cmd/vGPUmonitor/build.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#!/usr/bin/env bash
# Copyright 2024 The HAMi Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

protoc --go_out=. --go_opt=paths=source_relative --go-grpc_out=. --go-grpc_opt=paths=source_relative noderpc/noderpc.proto
go build
103 changes: 103 additions & 0 deletions cmd/vGPUmonitor/cudevshr.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
/*
Copyright 2024 The HAMi Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package main

import (
"errors"
"fmt"
"os"
"syscall"
"unsafe"

"golang.org/x/exp/mmap"
)

type deviceMemory struct {
contextSize uint64
moduleSize uint64
bufferSize uint64
offset uint64
total uint64
}

type shrregProcSlotT struct {
pid int32
hostpid int32
used [16]deviceMemory
monitorused [16]uint64
status int32
}

type uuid struct {
uuid [96]byte
}

type semT struct {
sem [32]byte
}

type sharedRegionT struct {
initializedFlag int32
smInitFlag int32
ownerPid uint32
sem semT
num uint64
uuids [16]uuid

limit [16]uint64
smLimit [16]uint64
procs [1024]shrregProcSlotT

procnum int32
utilizationSwitch int32
recentKernel int32
priority int32
}

type nvidiaCollector struct {
// Exposed for testing
cudevshrPath string
at *mmap.ReaderAt
cudaCache *sharedRegionT
}

func mmapcachefile(filename string, nc *nvidiaCollector) error {
var m = &sharedRegionT{}
f, err := os.OpenFile(filename, os.O_RDWR, 0666)
if err != nil {
fmt.Println("openfile error=", err.Error())
return err
}
data, err := syscall.Mmap(int(f.Fd()), 0, int(unsafe.Sizeof(*m)), syscall.PROT_WRITE|syscall.PROT_READ, syscall.MAP_SHARED)
if err != nil {
return err
}
var cachestr *sharedRegionT = *(**sharedRegionT)(unsafe.Pointer(&data))
fmt.Println("sizeof=", unsafe.Sizeof(*m), "cachestr=", cachestr.utilizationSwitch, cachestr.recentKernel)
nc.cudaCache = cachestr
return nil
}

func getvGPUMemoryInfo(nc *nvidiaCollector) (*sharedRegionT, error) {
if len(nc.cudevshrPath) > 0 {
if nc.cudaCache == nil {
mmapcachefile(nc.cudevshrPath, nc)
}
return nc.cudaCache, nil
}
return &sharedRegionT{}, errors.New("not found path")
}
135 changes: 135 additions & 0 deletions cmd/vGPUmonitor/feedback.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
/*
Copyright 2024 The HAMi Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package main

import (
"time"

"github.com/NVIDIA/go-nvml/pkg/nvml"
"k8s.io/klog/v2"
)

type UtilizationPerDevice []int

var srPodList map[string]podusage

func init() {
srPodList = make(map[string]podusage)
}

func CheckBlocking(utSwitchOn map[string]UtilizationPerDevice, p int, pu podusage) bool {
for _, devuuid := range pu.sr.uuids {
_, ok := utSwitchOn[string(devuuid.uuid[:])]
if ok {
for i := 0; i < p; i++ {
if utSwitchOn[string(devuuid.uuid[:])][i] > 0 {
return true
}
}
return false
}
}
return false
}

// Check whether task with higher priority use GPU or there are other tasks with the same priority.
func CheckPriority(utSwitchOn map[string]UtilizationPerDevice, p int, pu podusage) bool {
for _, devuuid := range pu.sr.uuids {
_, ok := utSwitchOn[string(devuuid.uuid[:])]
if ok {
for i := 0; i < p; i++ {
if utSwitchOn[string(devuuid.uuid[:])][i] > 0 {
return true
}
}
if utSwitchOn[string(devuuid.uuid[:])][p] > 1 {
return true
}
}
}
return false
}

func Observe(srlist *map[string]podusage) error {
utSwitchOn := map[string]UtilizationPerDevice{}

for idx, val := range *srlist {
if val.sr == nil {
continue
}
if val.sr.recentKernel > 0 {
(*srlist)[idx].sr.recentKernel--
if (*srlist)[idx].sr.recentKernel > 0 {
for _, devuuid := range val.sr.uuids {
// Null device condition
if devuuid.uuid[0] == 0 {
continue
}
if len(utSwitchOn[string(devuuid.uuid[:])]) == 0 {
utSwitchOn[string(devuuid.uuid[:])] = []int{0, 0}
}
utSwitchOn[string(devuuid.uuid[:])][val.sr.priority]++
}
}
}
}
for idx, val := range *srlist {
if val.sr == nil {
continue
}
if CheckBlocking(utSwitchOn, int(val.sr.priority), val) {
if (*srlist)[idx].sr.recentKernel >= 0 {
klog.Infof("utSwitchon=%v", utSwitchOn)
klog.Infof("Setting Blocking to on %v", idx)
(*srlist)[idx].sr.recentKernel = -1
}
} else {
if (*srlist)[idx].sr.recentKernel < 0 {
klog.Infof("utSwitchon=%v", utSwitchOn)
klog.Infof("Setting Blocking to off %v", idx)
(*srlist)[idx].sr.recentKernel = 0
}
}
if CheckPriority(utSwitchOn, int(val.sr.priority), val) {
if (*srlist)[idx].sr.utilizationSwitch != 1 {
klog.Infof("utSwitchon=%v", utSwitchOn)
klog.Infof("Setting UtilizationSwitch to on %v", idx)
(*srlist)[idx].sr.utilizationSwitch = 1
}
} else {
if (*srlist)[idx].sr.utilizationSwitch != 0 {
klog.Infof("utSwitchon=%v", utSwitchOn)
klog.Infof("Setting UtilizationSwitch to off %v", idx)
(*srlist)[idx].sr.utilizationSwitch = 0
}
}
}
return nil
}

func watchAndFeedback() {
nvml.Init()
for {
time.Sleep(time.Second * 5)
err := monitorPath(srPodList)
if err != nil {
klog.Errorf("monitorPath failed %v", err.Error())
}
klog.Infof("WatchAndFeedback srPodList=%v", srPodList)
Observe(&srPodList)
}
}
34 changes: 34 additions & 0 deletions cmd/vGPUmonitor/main.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
/*
Copyright 2024 The HAMi Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package main

import (
"k8s.io/klog/v2"
)

func main() {
if err := ValidateEnvVars(); err != nil {
klog.Fatalf("Failed to validate environment variables: %v", err)
}
errchannel := make(chan error)
go initMetrics()
go watchAndFeedback()
for {
err := <-errchannel
klog.Errorf("failed to serve: %v", err)
}
}
Loading

0 comments on commit 1a14603

Please sign in to comment.