Skip to content

Commit c569eef

Browse files
author
zhitianli
committed
remove useless noderpc in vGPUmonitor
1 parent c19073c commit c569eef

File tree

8 files changed

+0
-917
lines changed

8 files changed

+0
-917
lines changed

cmd/vGPUmonitor/cudevshr.go

Lines changed: 0 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,6 @@ limitations under the License.
1717
package main
1818

1919
import (
20-
"bytes"
21-
"encoding/binary"
2220
"errors"
2321
"fmt"
2422
"os"
@@ -28,8 +26,6 @@ import (
2826
"golang.org/x/exp/mmap"
2927
)
3028

31-
const maxDevices = 16
32-
3329
type deviceMemory struct {
3430
contextSize uint64
3531
moduleSize uint64
@@ -72,57 +68,13 @@ type sharedRegionT struct {
7268
priority int32
7369
}
7470

75-
type SharedRegionInfoT struct {
76-
pid int32
77-
fd int32
78-
initStatus int16
79-
sharedRegion sharedRegionT
80-
}
81-
8271
type nvidiaCollector struct {
8372
// Exposed for testing
8473
cudevshrPath string
8574
at *mmap.ReaderAt
8675
cudaCache *sharedRegionT
8776
}
8877

89-
func setProcSlot(offset int64, at *mmap.ReaderAt) (shrregProcSlotT, error) {
90-
temp := shrregProcSlotT{}
91-
buff := make([]byte, 4)
92-
at.ReadAt(buff, offset)
93-
bytesbuffer := bytes.NewBuffer(buff)
94-
binary.Read(bytesbuffer, binary.LittleEndian, &temp.pid)
95-
var monitorused uint64
96-
//fmt.Println("pid==", temp.pid, "buff=", buff)
97-
buff = make([]byte, 8)
98-
for i := 0; i < maxDevices; i++ {
99-
at.ReadAt(buff, offset+8+8*int64(i))
100-
bytesbuffer = bytes.NewBuffer(buff)
101-
binary.Read(bytesbuffer, binary.LittleEndian, &temp.used[i])
102-
}
103-
for i := 0; i < maxDevices; i++ {
104-
at.ReadAt(buff, offset+8+8*16+8*int64(i))
105-
bytesbuffer = bytes.NewBuffer(buff)
106-
binary.Read(bytesbuffer, binary.LittleEndian, &monitorused)
107-
if monitorused > temp.used[i].total {
108-
temp.used[i].total = monitorused
109-
}
110-
}
111-
return temp, nil
112-
}
113-
114-
func getDeviceUsedMemory(idx int, sharedregion sharedRegionT) (uint64, error) {
115-
var sum uint64
116-
sum = 0
117-
if idx < 0 || idx > 16 {
118-
return 0, errors.New("out of device idx")
119-
}
120-
for _, val := range sharedregion.procs {
121-
sum += val.used[idx].total
122-
}
123-
return sum, nil
124-
}
125-
12678
func mmapcachefile(filename string, nc *nvidiaCollector) error {
12779
var m = &sharedRegionT{}
12880
f, err := os.OpenFile(filename, os.O_RDWR, 0666)

cmd/vGPUmonitor/feedback.go

Lines changed: 0 additions & 146 deletions
Original file line numberDiff line numberDiff line change
@@ -17,166 +17,20 @@ limitations under the License.
1717
package main
1818

1919
import (
20-
"errors"
21-
"fmt"
22-
"os"
23-
"sort"
24-
"strconv"
25-
"strings"
26-
"sync"
2720
"time"
2821

2922
"github.com/NVIDIA/go-nvml/pkg/nvml"
30-
corev1 "k8s.io/api/core/v1"
3123
"k8s.io/klog/v2"
3224
)
3325

34-
var cgroupDriver int
35-
36-
type hostGPUPid struct {
37-
hostGPUPid int
38-
mtime uint64
39-
}
40-
4126
type UtilizationPerDevice []int
4227

43-
var mutex sync.Mutex
4428
var srPodList map[string]podusage
4529

4630
func init() {
4731
srPodList = make(map[string]podusage)
4832
}
4933

50-
func setcGgroupDriver() int {
51-
// 1 for cgroupfs 2 for systemd
52-
kubeletconfig, err := os.ReadFile("/hostvar/lib/kubelet/config.yaml")
53-
if err != nil {
54-
return 0
55-
}
56-
content := string(kubeletconfig)
57-
pos := strings.LastIndex(content, "cgroupDriver:")
58-
if pos < 0 {
59-
return 0
60-
}
61-
if strings.Contains(content, "systemd") {
62-
return 2
63-
}
64-
if strings.Contains(content, "cgroupfs") {
65-
return 1
66-
}
67-
return 0
68-
}
69-
70-
func getUsedGPUPid() ([]uint, nvml.Return) {
71-
tmp := []nvml.ProcessInfo{}
72-
count, err := nvml.DeviceGetCount()
73-
if err != nvml.SUCCESS {
74-
return []uint{}, err
75-
}
76-
for i := 0; i < count; i++ {
77-
device, err := nvml.DeviceGetHandleByIndex(i)
78-
if err != nvml.SUCCESS {
79-
return []uint{}, err
80-
}
81-
ids, err := device.GetComputeRunningProcesses()
82-
if err != nvml.SUCCESS {
83-
return []uint{}, err
84-
}
85-
tmp = append(tmp, ids...)
86-
}
87-
result := make([]uint, 0)
88-
m := make(map[uint]bool)
89-
for _, v := range tmp {
90-
if _, ok := m[uint(v.Pid)]; !ok {
91-
result = append(result, uint(v.Pid))
92-
m[uint(v.Pid)] = true
93-
}
94-
}
95-
sort.Slice(tmp, func(i, j int) bool { return tmp[i].Pid > tmp[j].Pid })
96-
return result, nvml.SUCCESS
97-
}
98-
99-
func setHostPid(pod corev1.Pod, ctr corev1.ContainerStatus, sr *podusage) error {
100-
var pids []string
101-
mutex.Lock()
102-
defer mutex.Unlock()
103-
104-
if cgroupDriver == 0 {
105-
cgroupDriver = setcGgroupDriver()
106-
}
107-
if cgroupDriver == 0 {
108-
return errors.New("can not identify cgroup driver")
109-
}
110-
usedGPUArray, err := getUsedGPUPid()
111-
if err != nvml.SUCCESS {
112-
return errors.New("get usedGPUID failed, ret:" + nvml.ErrorString(err))
113-
}
114-
if len(usedGPUArray) == 0 {
115-
return nil
116-
}
117-
qos := strings.ToLower(string(pod.Status.QOSClass))
118-
var filename string
119-
if cgroupDriver == 1 {
120-
/* Cgroupfs */
121-
filename = fmt.Sprintf("/sysinfo/fs/cgroup/memory/kubepods/%s/pod%s/%s/tasks", qos, pod.UID, strings.TrimPrefix(ctr.ContainerID, "docker://"))
122-
}
123-
if cgroupDriver == 2 {
124-
/* Systemd */
125-
cgroupuid := strings.ReplaceAll(string(pod.UID), "-", "_")
126-
filename = fmt.Sprintf("/sysinfo/fs/cgroup/systemd/kubepods.slice/kubepods-%s.slice/kubepods-%s-pod%s.slice/docker-%s.scope/tasks", qos, qos, cgroupuid, strings.TrimPrefix(ctr.ContainerID, "docker://"))
127-
}
128-
fmt.Println("filename=", filename)
129-
content, ferr := os.ReadFile(filename)
130-
if ferr != nil {
131-
return ferr
132-
}
133-
pids = strings.Split(string(content), "\n")
134-
hostPidArray := []hostGPUPid{}
135-
for _, val := range pids {
136-
tmp, _ := strconv.Atoi(val)
137-
if tmp != 0 {
138-
var stat os.FileInfo
139-
var err error
140-
if stat, err = os.Lstat(fmt.Sprintf("/proc/%v", tmp)); err != nil {
141-
return err
142-
}
143-
mtime := stat.ModTime().Unix()
144-
hostPidArray = append(hostPidArray, hostGPUPid{
145-
hostGPUPid: tmp,
146-
mtime: uint64(mtime),
147-
})
148-
}
149-
}
150-
usedGPUHostArray := []hostGPUPid{}
151-
for _, val := range usedGPUArray {
152-
for _, hostpid := range hostPidArray {
153-
if uint(hostpid.hostGPUPid) == val {
154-
usedGPUHostArray = append(usedGPUHostArray, hostpid)
155-
}
156-
}
157-
}
158-
//fmt.Println("usedHostGPUArray=", usedGPUHostArray)
159-
sort.Slice(usedGPUHostArray, func(i, j int) bool { return usedGPUHostArray[i].mtime > usedGPUHostArray[j].mtime })
160-
if sr == nil || sr.sr == nil {
161-
return nil
162-
}
163-
for idx, val := range sr.sr.procs {
164-
//fmt.Println("pid=", val.pid)
165-
if val.pid == 0 {
166-
break
167-
}
168-
if idx < len(usedGPUHostArray) {
169-
if val.hostpid == 0 || val.hostpid != int32(usedGPUHostArray[idx].hostGPUPid) {
170-
fmt.Println("Assign host pid to pid instead", usedGPUHostArray[idx].hostGPUPid, val.pid, val.hostpid)
171-
sr.sr.procs[idx].hostpid = int32(usedGPUHostArray[idx].hostGPUPid)
172-
fmt.Println("val=", val.hostpid, sr.sr.procs[idx].hostpid)
173-
}
174-
}
175-
}
176-
return nil
177-
178-
}
179-
18034
func CheckBlocking(utSwitchOn map[string]UtilizationPerDevice, p int, pu podusage) bool {
18135
for _, devuuid := range pu.sr.uuids {
18236
_, ok := utSwitchOn[string(devuuid.uuid[:])]

cmd/vGPUmonitor/main.go

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,9 +24,7 @@ func main() {
2424
if err := ValidateEnvVars(); err != nil {
2525
klog.Fatalf("Failed to validate environment variables: %v", err)
2626
}
27-
cgroupDriver = 0
2827
errchannel := make(chan error)
29-
go serveInfo(errchannel)
3028
go initMetrics()
3129
go watchAndFeedback()
3230
for {

cmd/vGPUmonitor/metrics.go

Lines changed: 0 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@ limitations under the License.
1717
package main
1818

1919
import (
20-
"errors"
2120
"fmt"
2221
"log"
2322
"net/http"
@@ -121,15 +120,6 @@ func (cc ClusterManagerCollector) Describe(ch chan<- *prometheus.Desc) {
121120
//prometheus.DescribeByCollect(cc, ch)
122121
}
123122

124-
func parseidstr(podusage string) (string, string, error) {
125-
tmp := strings.Split(podusage, "_")
126-
if len(tmp) > 1 {
127-
return tmp[0], tmp[1], nil
128-
} else {
129-
return "", "", errors.New("parse error")
130-
}
131-
}
132-
133123
func gettotalusage(usage podusage, vidx int) (deviceMemory, error) {
134124
added := deviceMemory{
135125
bufferSize: 0,
@@ -148,10 +138,6 @@ func gettotalusage(usage podusage, vidx int) (deviceMemory, error) {
148138
return added, nil
149139
}
150140

151-
func getsrlist() map[string]podusage {
152-
return srPodList
153-
}
154-
155141
// Collect first triggers the ReallyExpensiveAssessmentOfTheSystemState. Then it
156142
// creates constant metrics for each host on the fly based on the returned data.
157143
//

0 commit comments

Comments
 (0)