-
Notifications
You must be signed in to change notification settings - Fork 18
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #10 from TymonLee/main_add_vgpu_monitor
add vGPU monitor container
- Loading branch information
Showing
15 changed files
with
1,135 additions
and
32 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
#!/usr/bin/env bash | ||
# Copyright 2024 The HAMi Authors. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
protoc --go_out=. --go_opt=paths=source_relative --go-grpc_out=. --go-grpc_opt=paths=source_relative noderpc/noderpc.proto | ||
go build |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,103 @@ | ||
/* | ||
Copyright 2024 The HAMi Authors. | ||
Licensed under the Apache License, Version 2.0 (the "License"); | ||
you may not use this file except in compliance with the License. | ||
You may obtain a copy of the License at | ||
http://www.apache.org/licenses/LICENSE-2.0 | ||
Unless required by applicable law or agreed to in writing, software | ||
distributed under the License is distributed on an "AS IS" BASIS, | ||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
See the License for the specific language governing permissions and | ||
limitations under the License. | ||
*/ | ||
|
||
package main | ||
|
||
import ( | ||
"errors" | ||
"fmt" | ||
"os" | ||
"syscall" | ||
"unsafe" | ||
|
||
"golang.org/x/exp/mmap" | ||
) | ||
|
||
type deviceMemory struct { | ||
contextSize uint64 | ||
moduleSize uint64 | ||
bufferSize uint64 | ||
offset uint64 | ||
total uint64 | ||
} | ||
|
||
type shrregProcSlotT struct { | ||
pid int32 | ||
hostpid int32 | ||
used [16]deviceMemory | ||
monitorused [16]uint64 | ||
status int32 | ||
} | ||
|
||
type uuid struct { | ||
uuid [96]byte | ||
} | ||
|
||
type semT struct { | ||
sem [32]byte | ||
} | ||
|
||
type sharedRegionT struct { | ||
initializedFlag int32 | ||
smInitFlag int32 | ||
ownerPid uint32 | ||
sem semT | ||
num uint64 | ||
uuids [16]uuid | ||
|
||
limit [16]uint64 | ||
smLimit [16]uint64 | ||
procs [1024]shrregProcSlotT | ||
|
||
procnum int32 | ||
utilizationSwitch int32 | ||
recentKernel int32 | ||
priority int32 | ||
} | ||
|
||
type nvidiaCollector struct { | ||
// Exposed for testing | ||
cudevshrPath string | ||
at *mmap.ReaderAt | ||
cudaCache *sharedRegionT | ||
} | ||
|
||
func mmapcachefile(filename string, nc *nvidiaCollector) error { | ||
var m = &sharedRegionT{} | ||
f, err := os.OpenFile(filename, os.O_RDWR, 0666) | ||
if err != nil { | ||
fmt.Println("openfile error=", err.Error()) | ||
return err | ||
} | ||
data, err := syscall.Mmap(int(f.Fd()), 0, int(unsafe.Sizeof(*m)), syscall.PROT_WRITE|syscall.PROT_READ, syscall.MAP_SHARED) | ||
if err != nil { | ||
return err | ||
} | ||
var cachestr *sharedRegionT = *(**sharedRegionT)(unsafe.Pointer(&data)) | ||
fmt.Println("sizeof=", unsafe.Sizeof(*m), "cachestr=", cachestr.utilizationSwitch, cachestr.recentKernel) | ||
nc.cudaCache = cachestr | ||
return nil | ||
} | ||
|
||
func getvGPUMemoryInfo(nc *nvidiaCollector) (*sharedRegionT, error) { | ||
if len(nc.cudevshrPath) > 0 { | ||
if nc.cudaCache == nil { | ||
mmapcachefile(nc.cudevshrPath, nc) | ||
} | ||
return nc.cudaCache, nil | ||
} | ||
return &sharedRegionT{}, errors.New("not found path") | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,135 @@ | ||
/* | ||
Copyright 2024 The HAMi Authors. | ||
Licensed under the Apache License, Version 2.0 (the "License"); | ||
you may not use this file except in compliance with the License. | ||
You may obtain a copy of the License at | ||
http://www.apache.org/licenses/LICENSE-2.0 | ||
Unless required by applicable law or agreed to in writing, software | ||
distributed under the License is distributed on an "AS IS" BASIS, | ||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
See the License for the specific language governing permissions and | ||
limitations under the License. | ||
*/ | ||
|
||
package main | ||
|
||
import ( | ||
"time" | ||
|
||
"github.com/NVIDIA/go-nvml/pkg/nvml" | ||
"k8s.io/klog/v2" | ||
) | ||
|
||
type UtilizationPerDevice []int | ||
|
||
var srPodList map[string]podusage | ||
|
||
func init() { | ||
srPodList = make(map[string]podusage) | ||
} | ||
|
||
func CheckBlocking(utSwitchOn map[string]UtilizationPerDevice, p int, pu podusage) bool { | ||
for _, devuuid := range pu.sr.uuids { | ||
_, ok := utSwitchOn[string(devuuid.uuid[:])] | ||
if ok { | ||
for i := 0; i < p; i++ { | ||
if utSwitchOn[string(devuuid.uuid[:])][i] > 0 { | ||
return true | ||
} | ||
} | ||
return false | ||
} | ||
} | ||
return false | ||
} | ||
|
||
// Check whether task with higher priority use GPU or there are other tasks with the same priority. | ||
func CheckPriority(utSwitchOn map[string]UtilizationPerDevice, p int, pu podusage) bool { | ||
for _, devuuid := range pu.sr.uuids { | ||
_, ok := utSwitchOn[string(devuuid.uuid[:])] | ||
if ok { | ||
for i := 0; i < p; i++ { | ||
if utSwitchOn[string(devuuid.uuid[:])][i] > 0 { | ||
return true | ||
} | ||
} | ||
if utSwitchOn[string(devuuid.uuid[:])][p] > 1 { | ||
return true | ||
} | ||
} | ||
} | ||
return false | ||
} | ||
|
||
func Observe(srlist *map[string]podusage) error { | ||
utSwitchOn := map[string]UtilizationPerDevice{} | ||
|
||
for idx, val := range *srlist { | ||
if val.sr == nil { | ||
continue | ||
} | ||
if val.sr.recentKernel > 0 { | ||
(*srlist)[idx].sr.recentKernel-- | ||
if (*srlist)[idx].sr.recentKernel > 0 { | ||
for _, devuuid := range val.sr.uuids { | ||
// Null device condition | ||
if devuuid.uuid[0] == 0 { | ||
continue | ||
} | ||
if len(utSwitchOn[string(devuuid.uuid[:])]) == 0 { | ||
utSwitchOn[string(devuuid.uuid[:])] = []int{0, 0} | ||
} | ||
utSwitchOn[string(devuuid.uuid[:])][val.sr.priority]++ | ||
} | ||
} | ||
} | ||
} | ||
for idx, val := range *srlist { | ||
if val.sr == nil { | ||
continue | ||
} | ||
if CheckBlocking(utSwitchOn, int(val.sr.priority), val) { | ||
if (*srlist)[idx].sr.recentKernel >= 0 { | ||
klog.Infof("utSwitchon=%v", utSwitchOn) | ||
klog.Infof("Setting Blocking to on %v", idx) | ||
(*srlist)[idx].sr.recentKernel = -1 | ||
} | ||
} else { | ||
if (*srlist)[idx].sr.recentKernel < 0 { | ||
klog.Infof("utSwitchon=%v", utSwitchOn) | ||
klog.Infof("Setting Blocking to off %v", idx) | ||
(*srlist)[idx].sr.recentKernel = 0 | ||
} | ||
} | ||
if CheckPriority(utSwitchOn, int(val.sr.priority), val) { | ||
if (*srlist)[idx].sr.utilizationSwitch != 1 { | ||
klog.Infof("utSwitchon=%v", utSwitchOn) | ||
klog.Infof("Setting UtilizationSwitch to on %v", idx) | ||
(*srlist)[idx].sr.utilizationSwitch = 1 | ||
} | ||
} else { | ||
if (*srlist)[idx].sr.utilizationSwitch != 0 { | ||
klog.Infof("utSwitchon=%v", utSwitchOn) | ||
klog.Infof("Setting UtilizationSwitch to off %v", idx) | ||
(*srlist)[idx].sr.utilizationSwitch = 0 | ||
} | ||
} | ||
} | ||
return nil | ||
} | ||
|
||
func watchAndFeedback() { | ||
nvml.Init() | ||
for { | ||
time.Sleep(time.Second * 5) | ||
err := monitorPath(srPodList) | ||
if err != nil { | ||
klog.Errorf("monitorPath failed %v", err.Error()) | ||
} | ||
klog.Infof("WatchAndFeedback srPodList=%v", srPodList) | ||
Observe(&srPodList) | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
/* | ||
Copyright 2024 The HAMi Authors. | ||
Licensed under the Apache License, Version 2.0 (the "License"); | ||
you may not use this file except in compliance with the License. | ||
You may obtain a copy of the License at | ||
http://www.apache.org/licenses/LICENSE-2.0 | ||
Unless required by applicable law or agreed to in writing, software | ||
distributed under the License is distributed on an "AS IS" BASIS, | ||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
See the License for the specific language governing permissions and | ||
limitations under the License. | ||
*/ | ||
|
||
package main | ||
|
||
import ( | ||
"k8s.io/klog/v2" | ||
) | ||
|
||
func main() { | ||
if err := ValidateEnvVars(); err != nil { | ||
klog.Fatalf("Failed to validate environment variables: %v", err) | ||
} | ||
errchannel := make(chan error) | ||
go initMetrics() | ||
go watchAndFeedback() | ||
for { | ||
err := <-errchannel | ||
klog.Errorf("failed to serve: %v", err) | ||
} | ||
} |
Oops, something went wrong.