-
Notifications
You must be signed in to change notification settings - Fork 18
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
zhitianli
committed
Aug 8, 2024
1 parent
3f3a5f1
commit 1dd88c9
Showing
16 changed files
with
2,046 additions
and
34 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
#!/usr/bin/env bash | ||
# Copyright 2024 The HAMi Authors. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
protoc --go_out=. --go_opt=paths=source_relative --go-grpc_out=. --go-grpc_opt=paths=source_relative noderpc/noderpc.proto | ||
go build |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,151 @@ | ||
/* | ||
Copyright 2024 The HAMi Authors. | ||
Licensed under the Apache License, Version 2.0 (the "License"); | ||
you may not use this file except in compliance with the License. | ||
You may obtain a copy of the License at | ||
http://www.apache.org/licenses/LICENSE-2.0 | ||
Unless required by applicable law or agreed to in writing, software | ||
distributed under the License is distributed on an "AS IS" BASIS, | ||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
See the License for the specific language governing permissions and | ||
limitations under the License. | ||
*/ | ||
|
||
package main | ||
|
||
import ( | ||
"bytes" | ||
"encoding/binary" | ||
"errors" | ||
"fmt" | ||
"os" | ||
"syscall" | ||
"unsafe" | ||
|
||
"golang.org/x/exp/mmap" | ||
) | ||
|
||
const maxDevices = 16 | ||
|
||
type deviceMemory struct { | ||
contextSize uint64 | ||
moduleSize uint64 | ||
bufferSize uint64 | ||
offset uint64 | ||
total uint64 | ||
} | ||
|
||
type shrregProcSlotT struct { | ||
pid int32 | ||
hostpid int32 | ||
used [16]deviceMemory | ||
monitorused [16]uint64 | ||
status int32 | ||
} | ||
|
||
type uuid struct { | ||
uuid [96]byte | ||
} | ||
|
||
type semT struct { | ||
sem [32]byte | ||
} | ||
|
||
type sharedRegionT struct { | ||
initializedFlag int32 | ||
smInitFlag int32 | ||
ownerPid uint32 | ||
sem semT | ||
num uint64 | ||
uuids [16]uuid | ||
|
||
limit [16]uint64 | ||
smLimit [16]uint64 | ||
procs [1024]shrregProcSlotT | ||
|
||
procnum int32 | ||
utilizationSwitch int32 | ||
recentKernel int32 | ||
priority int32 | ||
} | ||
|
||
type SharedRegionInfoT struct { | ||
pid int32 | ||
fd int32 | ||
initStatus int16 | ||
sharedRegion sharedRegionT | ||
} | ||
|
||
type nvidiaCollector struct { | ||
// Exposed for testing | ||
cudevshrPath string | ||
at *mmap.ReaderAt | ||
cudaCache *sharedRegionT | ||
} | ||
|
||
func setProcSlot(offset int64, at *mmap.ReaderAt) (shrregProcSlotT, error) { | ||
temp := shrregProcSlotT{} | ||
buff := make([]byte, 4) | ||
at.ReadAt(buff, offset) | ||
bytesbuffer := bytes.NewBuffer(buff) | ||
binary.Read(bytesbuffer, binary.LittleEndian, &temp.pid) | ||
var monitorused uint64 | ||
//fmt.Println("pid==", temp.pid, "buff=", buff) | ||
buff = make([]byte, 8) | ||
for i := 0; i < maxDevices; i++ { | ||
at.ReadAt(buff, offset+8+8*int64(i)) | ||
bytesbuffer = bytes.NewBuffer(buff) | ||
binary.Read(bytesbuffer, binary.LittleEndian, &temp.used[i]) | ||
} | ||
for i := 0; i < maxDevices; i++ { | ||
at.ReadAt(buff, offset+8+8*16+8*int64(i)) | ||
bytesbuffer = bytes.NewBuffer(buff) | ||
binary.Read(bytesbuffer, binary.LittleEndian, &monitorused) | ||
if monitorused > temp.used[i].total { | ||
temp.used[i].total = monitorused | ||
} | ||
} | ||
return temp, nil | ||
} | ||
|
||
func getDeviceUsedMemory(idx int, sharedregion sharedRegionT) (uint64, error) { | ||
var sum uint64 | ||
sum = 0 | ||
if idx < 0 || idx > 16 { | ||
return 0, errors.New("out of device idx") | ||
} | ||
for _, val := range sharedregion.procs { | ||
sum += val.used[idx].total | ||
} | ||
return sum, nil | ||
} | ||
|
||
func mmapcachefile(filename string, nc *nvidiaCollector) error { | ||
var m = &sharedRegionT{} | ||
f, err := os.OpenFile(filename, os.O_RDWR, 0666) | ||
if err != nil { | ||
fmt.Println("openfile error=", err.Error()) | ||
return err | ||
} | ||
data, err := syscall.Mmap(int(f.Fd()), 0, int(unsafe.Sizeof(*m)), syscall.PROT_WRITE|syscall.PROT_READ, syscall.MAP_SHARED) | ||
if err != nil { | ||
return err | ||
} | ||
var cachestr *sharedRegionT = *(**sharedRegionT)(unsafe.Pointer(&data)) | ||
fmt.Println("sizeof=", unsafe.Sizeof(*m), "cachestr=", cachestr.utilizationSwitch, cachestr.recentKernel) | ||
nc.cudaCache = cachestr | ||
return nil | ||
} | ||
|
||
func getvGPUMemoryInfo(nc *nvidiaCollector) (*sharedRegionT, error) { | ||
if len(nc.cudevshrPath) > 0 { | ||
if nc.cudaCache == nil { | ||
mmapcachefile(nc.cudevshrPath, nc) | ||
} | ||
return nc.cudaCache, nil | ||
} | ||
return &sharedRegionT{}, errors.New("not found path") | ||
} |
Oops, something went wrong.