Skip to content

Commit b92fc47

Browse files
author
zhitianli
committed
add vgpu monitor container to be able to collect vgpu related metrics on nodes
1 parent 3f3a5f1 commit b92fc47

File tree

16 files changed

+2044
-32
lines changed

16 files changed

+2044
-32
lines changed

cmd/vGPUmonitor/build.sh

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
#!/usr/bin/env bash
2+
# Copyright 2024 The HAMi Authors.
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
# Stop at the first failing command (and on unset variables or failed pipeline
# stages) so that a protoc failure is not followed by a build anyway.
set -euo pipefail

# Generate the Go and gRPC stubs for noderpc/noderpc.proto; with
# paths=source_relative the output lands next to the .proto file.
protoc --go_out=. --go_opt=paths=source_relative --go-grpc_out=. --go-grpc_opt=paths=source_relative noderpc/noderpc.proto

# Build the package in the current directory.
go build

cmd/vGPUmonitor/cudevshr.go

Lines changed: 151 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,151 @@
1+
/*
2+
Copyright 2024 The HAMi Authors.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package main
18+
19+
import (
20+
"bytes"
21+
"encoding/binary"
22+
"errors"
23+
"fmt"
24+
"os"
25+
"syscall"
26+
"unsafe"
27+
28+
"golang.org/x/exp/mmap"
29+
)
30+
31+
// maxDevices is the number of per-device entries setProcSlot walks for each
// process slot.
const maxDevices = 16
32+
33+
// deviceMemory holds five consecutive uint64 counters for one device of one
// process slot. getDeviceUsedMemory sums the total field across slots.
//
// The field order is significant: mmapcachefile reinterprets mapped file
// bytes as the structs in this file, so fields must not be reordered.
type deviceMemory struct {
	contextSize, moduleSize, bufferSize, offset, total uint64
}
40+
41+
type shrregProcSlotT struct {
42+
pid int32
43+
hostpid int32
44+
used [16]deviceMemory
45+
monitorused [16]uint64
46+
status int32
47+
}
48+
49+
// uuid is a fixed 96-byte buffer; sharedRegionT stores 16 of them.
// NOTE(review): presumably one device UUID string per entry — confirm.
type uuid struct{ uuid [96]byte }
52+
53+
// semT reserves 32 opaque bytes inside sharedRegionT. Nothing in this file
// reads or writes them.
// NOTE(review): presumably the footprint of a semaphore owned by the writer — confirm.
type semT struct{ sem [32]byte }
56+
57+
type sharedRegionT struct {
58+
initializedFlag int32
59+
smInitFlag int32
60+
ownerPid uint32
61+
sem semT
62+
num uint64
63+
uuids [16]uuid
64+
65+
limit [16]uint64
66+
smLimit [16]uint64
67+
procs [1024]shrregProcSlotT
68+
69+
procnum int32
70+
utilizationSwitch int32
71+
recentKernel int32
72+
priority int32
73+
}
74+
75+
type SharedRegionInfoT struct {
76+
pid int32
77+
fd int32
78+
initStatus int16
79+
sharedRegion sharedRegionT
80+
}
81+
82+
type nvidiaCollector struct {
83+
// Exposed for testing
84+
cudevshrPath string
85+
at *mmap.ReaderAt
86+
cudaCache *sharedRegionT
87+
}
88+
89+
func setProcSlot(offset int64, at *mmap.ReaderAt) (shrregProcSlotT, error) {
90+
temp := shrregProcSlotT{}
91+
buff := make([]byte, 4)
92+
at.ReadAt(buff, offset)
93+
bytesbuffer := bytes.NewBuffer(buff)
94+
binary.Read(bytesbuffer, binary.LittleEndian, &temp.pid)
95+
var monitorused uint64
96+
//fmt.Println("pid==", temp.pid, "buff=", buff)
97+
buff = make([]byte, 8)
98+
for i := 0; i < maxDevices; i++ {
99+
at.ReadAt(buff, offset+8+8*int64(i))
100+
bytesbuffer = bytes.NewBuffer(buff)
101+
binary.Read(bytesbuffer, binary.LittleEndian, &temp.used[i])
102+
}
103+
for i := 0; i < maxDevices; i++ {
104+
at.ReadAt(buff, offset+8+8*16+8*int64(i))
105+
bytesbuffer = bytes.NewBuffer(buff)
106+
binary.Read(bytesbuffer, binary.LittleEndian, &monitorused)
107+
if monitorused > temp.used[i].total {
108+
temp.used[i].total = monitorused
109+
}
110+
}
111+
return temp, nil
112+
}
113+
114+
func getDeviceUsedMemory(idx int, sharedregion sharedRegionT) (uint64, error) {
115+
var sum uint64
116+
sum = 0
117+
if idx < 0 || idx > 16 {
118+
return 0, errors.New("out of device idx")
119+
}
120+
for _, val := range sharedregion.procs {
121+
sum += val.used[idx].total
122+
}
123+
return sum, nil
124+
}
125+
126+
func mmapcachefile(filename string, nc *nvidiaCollector) error {
127+
var m = &sharedRegionT{}
128+
f, err := os.OpenFile(filename, os.O_RDWR, 0666)
129+
if err != nil {
130+
fmt.Println("openfile error=", err.Error())
131+
return err
132+
}
133+
data, err := syscall.Mmap(int(f.Fd()), 0, int(unsafe.Sizeof(*m)), syscall.PROT_WRITE|syscall.PROT_READ, syscall.MAP_SHARED)
134+
if err != nil {
135+
return err
136+
}
137+
var cachestr *sharedRegionT = *(**sharedRegionT)(unsafe.Pointer(&data))
138+
fmt.Println("sizeof=", unsafe.Sizeof(*m), "cachestr=", cachestr.utilizationSwitch, cachestr.recentKernel)
139+
nc.cudaCache = cachestr
140+
return nil
141+
}
142+
143+
func getvGPUMemoryInfo(nc *nvidiaCollector) (*sharedRegionT, error) {
144+
if len(nc.cudevshrPath) > 0 {
145+
if nc.cudaCache == nil {
146+
mmapcachefile(nc.cudevshrPath, nc)
147+
}
148+
return nc.cudaCache, nil
149+
}
150+
return &sharedRegionT{}, errors.New("not found path")
151+
}

0 commit comments

Comments
 (0)