diff --git a/README.md b/README.md index cfcd3e4..6e897ae 100644 --- a/README.md +++ b/README.md @@ -188,6 +188,13 @@ volcano-scheduler-metrics records every GPU usage and limitation, visit the foll curl {volcano scheduler cluster ip}:8080/metrics ``` +You can also collect the **GPU utilization**, **GPU memory usage**, **pods' GPU memory limitations** and **pods' GPU memory usage** metrics on nodes by visiting the following address: + +``` +curl {volcano device plugin cluster ip}:9394/metrics +``` +![img](./doc/vgpu_device_plugin_metrics.png) + # Issues and Contributing [Checkout the Contributing document!](CONTRIBUTING.md) diff --git a/cmd/vGPUmonitor/build.sh b/cmd/vGPUmonitor/build.sh new file mode 100644 index 0000000..c6bfa72 --- /dev/null +++ b/cmd/vGPUmonitor/build.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash +# Copyright 2024 The HAMi Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +protoc --go_out=. --go_opt=paths=source_relative --go-grpc_out=. --go-grpc_opt=paths=source_relative noderpc/noderpc.proto +go build diff --git a/cmd/vGPUmonitor/cudevshr.go b/cmd/vGPUmonitor/cudevshr.go new file mode 100644 index 0000000..11d2f5b --- /dev/null +++ b/cmd/vGPUmonitor/cudevshr.go @@ -0,0 +1,103 @@ +/* +Copyright 2024 The HAMi Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package main + +import ( + "errors" + "fmt" + "os" + "syscall" + "unsafe" + + "golang.org/x/exp/mmap" +) + +type deviceMemory struct { + contextSize uint64 + moduleSize uint64 + bufferSize uint64 + offset uint64 + total uint64 +} + +type shrregProcSlotT struct { + pid int32 + hostpid int32 + used [16]deviceMemory + monitorused [16]uint64 + status int32 +} + +type uuid struct { + uuid [96]byte +} + +type semT struct { + sem [32]byte +} + +type sharedRegionT struct { + initializedFlag int32 + smInitFlag int32 + ownerPid uint32 + sem semT + num uint64 + uuids [16]uuid + + limit [16]uint64 + smLimit [16]uint64 + procs [1024]shrregProcSlotT + + procnum int32 + utilizationSwitch int32 + recentKernel int32 + priority int32 +} + +type nvidiaCollector struct { + // Exposed for testing + cudevshrPath string + at *mmap.ReaderAt + cudaCache *sharedRegionT +} + +func mmapcachefile(filename string, nc *nvidiaCollector) error { + var m = &sharedRegionT{} + f, err := os.OpenFile(filename, os.O_RDWR, 0666) + if err != nil { + fmt.Println("openfile error=", err.Error()) + return err + } + data, err := syscall.Mmap(int(f.Fd()), 0, int(unsafe.Sizeof(*m)), syscall.PROT_WRITE|syscall.PROT_READ, syscall.MAP_SHARED) + if err != nil { + return err + } + var cachestr *sharedRegionT = *(**sharedRegionT)(unsafe.Pointer(&data)) + fmt.Println("sizeof=", unsafe.Sizeof(*m), "cachestr=", cachestr.utilizationSwitch, cachestr.recentKernel) + nc.cudaCache = cachestr + return nil +} + +func getvGPUMemoryInfo(nc *nvidiaCollector) (*sharedRegionT, error) { + if len(nc.cudevshrPath) > 0 { + if 
nc.cudaCache == nil { + mmapcachefile(nc.cudevshrPath, nc) + } + return nc.cudaCache, nil + } + return &sharedRegionT{}, errors.New("not found path") +} diff --git a/cmd/vGPUmonitor/feedback.go b/cmd/vGPUmonitor/feedback.go new file mode 100644 index 0000000..9953c50 --- /dev/null +++ b/cmd/vGPUmonitor/feedback.go @@ -0,0 +1,135 @@ +/* +Copyright 2024 The HAMi Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package main + +import ( + "time" + + "github.com/NVIDIA/go-nvml/pkg/nvml" + "k8s.io/klog/v2" +) + +type UtilizationPerDevice []int + +var srPodList map[string]podusage + +func init() { + srPodList = make(map[string]podusage) +} + +func CheckBlocking(utSwitchOn map[string]UtilizationPerDevice, p int, pu podusage) bool { + for _, devuuid := range pu.sr.uuids { + _, ok := utSwitchOn[string(devuuid.uuid[:])] + if ok { + for i := 0; i < p; i++ { + if utSwitchOn[string(devuuid.uuid[:])][i] > 0 { + return true + } + } + return false + } + } + return false +} + +// CheckPriority reports whether a task with higher priority is using the GPU or whether other tasks with the same priority are using it. 
+func CheckPriority(utSwitchOn map[string]UtilizationPerDevice, p int, pu podusage) bool { + for _, devuuid := range pu.sr.uuids { + _, ok := utSwitchOn[string(devuuid.uuid[:])] + if ok { + for i := 0; i < p; i++ { + if utSwitchOn[string(devuuid.uuid[:])][i] > 0 { + return true + } + } + if utSwitchOn[string(devuuid.uuid[:])][p] > 1 { + return true + } + } + } + return false +} + +func Observe(srlist *map[string]podusage) error { + utSwitchOn := map[string]UtilizationPerDevice{} + + for idx, val := range *srlist { + if val.sr == nil { + continue + } + if val.sr.recentKernel > 0 { + (*srlist)[idx].sr.recentKernel-- + if (*srlist)[idx].sr.recentKernel > 0 { + for _, devuuid := range val.sr.uuids { + // Null device condition + if devuuid.uuid[0] == 0 { + continue + } + if len(utSwitchOn[string(devuuid.uuid[:])]) == 0 { + utSwitchOn[string(devuuid.uuid[:])] = []int{0, 0} + } + utSwitchOn[string(devuuid.uuid[:])][val.sr.priority]++ + } + } + } + } + for idx, val := range *srlist { + if val.sr == nil { + continue + } + if CheckBlocking(utSwitchOn, int(val.sr.priority), val) { + if (*srlist)[idx].sr.recentKernel >= 0 { + klog.Infof("utSwitchon=%v", utSwitchOn) + klog.Infof("Setting Blocking to on %v", idx) + (*srlist)[idx].sr.recentKernel = -1 + } + } else { + if (*srlist)[idx].sr.recentKernel < 0 { + klog.Infof("utSwitchon=%v", utSwitchOn) + klog.Infof("Setting Blocking to off %v", idx) + (*srlist)[idx].sr.recentKernel = 0 + } + } + if CheckPriority(utSwitchOn, int(val.sr.priority), val) { + if (*srlist)[idx].sr.utilizationSwitch != 1 { + klog.Infof("utSwitchon=%v", utSwitchOn) + klog.Infof("Setting UtilizationSwitch to on %v", idx) + (*srlist)[idx].sr.utilizationSwitch = 1 + } + } else { + if (*srlist)[idx].sr.utilizationSwitch != 0 { + klog.Infof("utSwitchon=%v", utSwitchOn) + klog.Infof("Setting UtilizationSwitch to off %v", idx) + (*srlist)[idx].sr.utilizationSwitch = 0 + } + } + } + return nil +} + +func watchAndFeedback() { + nvml.Init() + for { + 
time.Sleep(time.Second * 5) + err := monitorPath(srPodList) + if err != nil { + klog.Errorf("monitorPath failed %v", err.Error()) + } + klog.Infof("WatchAndFeedback srPodList=%v", srPodList) + Observe(&srPodList) + } +} diff --git a/cmd/vGPUmonitor/main.go b/cmd/vGPUmonitor/main.go new file mode 100644 index 0000000..14ff97b --- /dev/null +++ b/cmd/vGPUmonitor/main.go @@ -0,0 +1,34 @@ +/* +Copyright 2024 The HAMi Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package main + +import ( + "k8s.io/klog/v2" +) + +func main() { + if err := ValidateEnvVars(); err != nil { + klog.Fatalf("Failed to validate environment variables: %v", err) + } + errchannel := make(chan error) + go initMetrics() + go watchAndFeedback() + for { + err := <-errchannel + klog.Errorf("failed to serve: %v", err) + } +} diff --git a/cmd/vGPUmonitor/metrics.go b/cmd/vGPUmonitor/metrics.go new file mode 100644 index 0000000..2ee882d --- /dev/null +++ b/cmd/vGPUmonitor/metrics.go @@ -0,0 +1,304 @@ +/* +Copyright 2024 The HAMi Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +*/ + +package main + +import ( + "fmt" + "log" + "net/http" + "strings" + "time" + + "github.com/NVIDIA/go-nvml/pkg/nvml" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promhttp" + + "k8s.io/apimachinery/pkg/labels" + "k8s.io/client-go/informers" + "k8s.io/client-go/kubernetes" + listerscorev1 "k8s.io/client-go/listers/core/v1" + "k8s.io/client-go/rest" + "k8s.io/klog/v2" +) + +// ClusterManager is an example for a system that might have been built without +// Prometheus in mind. It models a central manager of jobs running in a +// cluster. Thus, we implement a custom Collector called +// ClusterManagerCollector, which collects information from a ClusterManager +// using its provided methods and turns them into Prometheus Metrics for +// collection. +// +// An additional challenge is that multiple instances of the ClusterManager are +// run within the same binary, each in charge of a different zone. We need to +// make use of wrapping Registerers to be able to register each +// ClusterManagerCollector instance with Prometheus. +type ClusterManager struct { + Zone string + // Contains many more fields not listed in this example. + PodLister listerscorev1.PodLister +} + +// ReallyExpensiveAssessmentOfTheSystemState is a mock for the data gathering a +// real cluster manager would have to do. Since it may actually be really +// expensive, it must only be called once per collection. This implementation, +// obviously, only returns some made-up data. +func (c *ClusterManager) ReallyExpensiveAssessmentOfTheSystemState() ( + oomCountByHost map[string]int, ramUsageByHost map[string]float64, +) { + // Just example fake data. 
+ oomCountByHost = map[string]int{ + "foo.example.org": 42, + "bar.example.org": 2001, + } + ramUsageByHost = map[string]float64{ + "foo.example.org": 6.023e23, + "bar.example.org": 3.14, + } + return +} + +// ClusterManagerCollector implements the Collector interface. +type ClusterManagerCollector struct { + ClusterManager *ClusterManager +} + +// Descriptors used by the ClusterManagerCollector below. +var ( + hostGPUdesc = prometheus.NewDesc( + "HostGPUMemoryUsage", + "GPU device memory usage", + []string{"deviceidx", "deviceuuid"}, nil, + ) + + hostGPUUtilizationdesc = prometheus.NewDesc( + "HostCoreUtilization", + "GPU core utilization", + []string{"deviceidx", "deviceuuid"}, nil, + ) + + ctrvGPUdesc = prometheus.NewDesc( + "vGPU_device_memory_usage_in_bytes", + "vGPU device usage", + []string{"podnamespace", "podname", "ctrname", "vdeviceid", "deviceuuid"}, nil, + ) + + ctrvGPUlimitdesc = prometheus.NewDesc( + "vGPU_device_memory_limit_in_bytes", + "vGPU device limit", + []string{"podnamespace", "podname", "ctrname", "vdeviceid", "deviceuuid"}, nil, + ) + ctrDeviceMemorydesc = prometheus.NewDesc( + "Device_memory_desc_of_container", + "Container device meory description", + []string{"podnamespace", "podname", "ctrname", "vdeviceid", "deviceuuid", "context", "module", "data", "offset"}, nil, + ) + clientset *kubernetes.Clientset +) + +// Describe sends the fixed set of host and container vGPU descriptors to ch +// directly rather than through DescribeByCollect. ctrDeviceMemorydesc is +// emitted by Collect but is not among the descriptors sent here. 
+func (cc ClusterManagerCollector) Describe(ch chan<- *prometheus.Desc) { + ch <- hostGPUdesc + ch <- ctrvGPUdesc + ch <- ctrvGPUlimitdesc + ch <- hostGPUUtilizationdesc + //prometheus.DescribeByCollect(cc, ch) +} + +func gettotalusage(usage podusage, vidx int) (deviceMemory, error) { + added := deviceMemory{ + bufferSize: 0, + contextSize: 0, + moduleSize: 0, + offset: 0, + total: 0, + } + for _, val := range usage.sr.procs { + added.bufferSize += val.used[vidx].bufferSize + added.contextSize += val.used[vidx].contextSize + added.moduleSize += val.used[vidx].moduleSize + added.offset += val.used[vidx].offset + added.total += val.used[vidx].total + } + return added, nil +} + +// Collect refreshes srPodList through monitorPath and, when clientset is set, emits +// host GPU memory/utilization gauges read via NVML and per-container vGPU metrics. +// +// Note that Collect could be called concurrently; srPodList is shared with +// watchAndFeedback and is read here without holding a lock. +func (cc ClusterManagerCollector) Collect(ch chan<- prometheus.Metric) { + klog.Info("Starting to collect metrics for vGPUMonitor") + if srPodList == nil { + srPodList = make(map[string]podusage) + } + if err := monitorPath(srPodList); err != nil { + klog.Error("err=", err.Error()) + } + if clientset != nil { + nvret := nvml.Init() + if nvret != nvml.SUCCESS { + klog.Error("nvml Init err=", nvml.ErrorString(nvret)) + } + devnum, nvret := nvml.DeviceGetCount() + if nvret != nvml.SUCCESS { + klog.Error("nvml GetDeviceCount err=", nvml.ErrorString(nvret)) + } else { + for ii := 0; ii < devnum; ii++ { + hdev, nvret := nvml.DeviceGetHandleByIndex(ii) + if nvret != nvml.SUCCESS { + klog.Error(nvml.ErrorString(nvret)) + } + memoryUsed := 0 + memory, ret := hdev.GetMemoryInfo() + if ret == nvml.SUCCESS { + memoryUsed = int(memory.Used) + } else { + klog.Error("nvml get memory error ret=", ret) + } + + uuid, nvret := hdev.GetUUID() + if nvret != nvml.SUCCESS { + klog.Error(nvml.ErrorString(nvret)) + } 
else { + ch <- prometheus.MustNewConstMetric( + hostGPUdesc, + prometheus.GaugeValue, + float64(memoryUsed), + fmt.Sprint(ii), uuid, + ) + } + util, nvret := hdev.GetUtilizationRates() + if nvret != nvml.SUCCESS { + klog.Error(nvml.ErrorString(nvret)) + } else { + ch <- prometheus.MustNewConstMetric( + hostGPUUtilizationdesc, + prometheus.GaugeValue, + float64(util.Gpu), + fmt.Sprint(ii), uuid, + ) + } + + } + } + + pods, err := cc.ClusterManager.PodLister.List(labels.Everything()) + if err != nil { + klog.Error("failed to list pods with err=", err.Error()) + } + for _, val := range pods { + for sridx := range srPodList { + if srPodList[sridx].sr == nil { + continue + } + podUID := strings.Split(srPodList[sridx].idstr, "_")[0] + ctrName := strings.Split(srPodList[sridx].idstr, "_")[1] + if strings.Compare(string(val.UID), podUID) == 0 { + fmt.Println("Pod matched!", val.Name, val.Namespace, val.Labels) + for _, ctr := range val.Spec.Containers { + if strings.Compare(ctr.Name, ctrName) == 0 { + fmt.Println("container matched", ctr.Name) + //err := setHostPid(val, val.Status.ContainerStatuses[ctridx], &srPodList[sridx]) + //if err != nil { + // fmt.Println("setHostPid filed", err.Error()) + //} + //fmt.Println("sr.list=", srPodList[sridx].sr) + podlabels := make(map[string]string) + for idx, val := range val.Labels { + idxfix := strings.ReplaceAll(idx, "-", "_") + valfix := strings.ReplaceAll(val, "-", "_") + podlabels[idxfix] = valfix + } + for i := 0; i < int(srPodList[sridx].sr.num); i++ { + value, _ := gettotalusage(srPodList[sridx], i) + uuid := string(srPodList[sridx].sr.uuids[i].uuid[:])[0:40] + + //fmt.Println("uuid=", uuid, "length=", len(uuid)) + ch <- prometheus.MustNewConstMetric( + ctrvGPUdesc, + prometheus.GaugeValue, + float64(value.total), + val.Namespace, val.Name, ctrName, fmt.Sprint(i), uuid, /*,string(sr.sr.uuids[i].uuid[:])*/ + ) + ch <- prometheus.MustNewConstMetric( + ctrvGPUlimitdesc, + prometheus.GaugeValue, + 
float64(srPodList[sridx].sr.limit[i]), + val.Namespace, val.Name, ctrName, fmt.Sprint(i), uuid, /*,string(sr.sr.uuids[i].uuid[:])*/ + ) + ch <- prometheus.MustNewConstMetric( + ctrDeviceMemorydesc, + prometheus.CounterValue, + float64(value.total), + val.Namespace, val.Name, ctrName, fmt.Sprint(i), uuid, fmt.Sprint(value.contextSize), fmt.Sprint(value.moduleSize), fmt.Sprint(value.bufferSize), fmt.Sprint(value.offset), + ) + } + } + } + } + } + } + } +} + +// NewClusterManager first creates a Prometheus-ignorant ClusterManager +// instance. Then, it creates a ClusterManagerCollector for the just created +// ClusterManager. Finally, it registers the ClusterManagerCollector with a +// wrapping Registerer that adds the zone as a label. In this way, the metrics +// collected by different ClusterManagerCollectors do not collide. +func NewClusterManager(zone string, reg prometheus.Registerer) *ClusterManager { + c := &ClusterManager{ + Zone: zone, + } + + informerFactory := informers.NewSharedInformerFactoryWithOptions(clientset, time.Hour*1) + c.PodLister = informerFactory.Core().V1().Pods().Lister() + stopCh := make(chan struct{}) + informerFactory.Start(stopCh) + + cc := ClusterManagerCollector{ClusterManager: c} + prometheus.WrapRegistererWith(prometheus.Labels{"zone": zone}, reg).MustRegister(cc) + return c +} + +func initMetrics() { + // Since we are dealing with custom Collector implementations, it might + // be a good idea to try it out with a pedantic registry. + klog.Info("Initializing metrics for vGPUmonitor") + reg := prometheus.NewRegistry() + //reg := prometheus.NewPedanticRegistry() + config, err := rest.InClusterConfig() + if err != nil { + fmt.Println(err.Error()) + return + } + clientset, err = kubernetes.NewForConfig(config) + if err != nil { + fmt.Println(err.Error()) + return + } + + // Construct cluster managers. In real code, we would assign them to + // variables to then do something with them. 
+ NewClusterManager("vGPU", reg) + + http.Handle("/metrics", promhttp.HandlerFor(reg, promhttp.HandlerOpts{})) + log.Fatal(http.ListenAndServe(":9394", nil)) +} diff --git a/cmd/vGPUmonitor/pathmonitor.go b/cmd/vGPUmonitor/pathmonitor.go new file mode 100644 index 0000000..d3a7b6b --- /dev/null +++ b/cmd/vGPUmonitor/pathmonitor.go @@ -0,0 +1,151 @@ +/* +Copyright 2024 The HAMi Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package main + +import ( + "context" + "errors" + "fmt" + "os" + "path/filepath" + "strings" + "sync" + "time" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/klog/v2" +) + +type podusage struct { + idstr string + sr *sharedRegionT +} + +var ( + containerPath string + nodeName string + lock sync.Mutex +) + +func init() { + hookPath, ok := os.LookupEnv("HOOK_PATH") + if ok { + containerPath = filepath.Join(hookPath, "containers") + } + nodeName = os.Getenv("NODE_NAME") +} + +func checkfiles(fpath string) (*sharedRegionT, error) { + klog.Infof("Checking path %s", fpath) + files, err := os.ReadDir(fpath) + if err != nil { + return nil, err + } + if len(files) > 2 { + return nil, errors.New("cache num not matched") + } + if len(files) == 0 { + return nil, nil + } + for _, val := range files { + if strings.Contains(val.Name(), "libvgpu.so") { + continue + } + if !strings.Contains(val.Name(), ".cache") { + continue + } + cachefile := fpath + "/" + val.Name() + nc := nvidiaCollector{ + cudevshrPath: cachefile, + at: 
nil, + } + sr, err := getvGPUMemoryInfo(&nc) + if err != nil { + klog.Errorf("getvGPUMemoryInfo failed: %v", err) + } else { + klog.Infof("getvGPUMemoryInfo success with utilizationSwitch=%d, recentKernel=%d, priority=%d", sr.utilizationSwitch, sr.recentKernel, sr.priority) + return sr, nil + } + } + return nil, nil +} + +func isVaildPod(name string, pods *corev1.PodList) bool { + for _, val := range pods.Items { + if strings.Contains(name, string(val.UID)) { + return true + } + } + return false +} + +func monitorPath(podmap map[string]podusage) error { + lock.Lock() + defer lock.Unlock() + files, err := os.ReadDir(containerPath) + if err != nil { + return err + } + pods, err := clientset.CoreV1().Pods("").List(context.Background(), metav1.ListOptions{ + FieldSelector: fmt.Sprintf("spec.nodeName=%s", nodeName), + }) + if err != nil { + klog.Errorf("Failed to get pods on node %s, error: %v", nodeName, err) + return nil + } + klog.Infof("Found %d pods on node %s", len(pods.Items), nodeName) + + for _, containerFile := range files { + dirname := containerPath + "/" + containerFile.Name() + if info, err1 := os.Stat(dirname); err1 != nil || !isVaildPod(info.Name(), pods) { + if info.ModTime().Add(time.Second * 300).Before(time.Now()) { + klog.Infof("Removing dirname %s in in monitorPath", dirname) + //syscall.Munmap(unsafe.Pointer(podmap[dirname].sr)) + delete(podmap, dirname) + err2 := os.RemoveAll(dirname) + if err2 != nil { + klog.Errorf("Failed to remove dirname: %s , error: %v", dirname, err) + return err2 + } + } + } else { + _, ok := podmap[dirname] + if !ok { + klog.Infof("Adding ctr dirname %s in monitorPath", dirname) + sharedRegion, err2 := checkfiles(dirname) + if err2 != nil { + klog.Errorf("Failed to checkfiles dirname: %s , error: %v", dirname, err) + return err2 + } + if sharedRegion == nil { + klog.Infof("nil shared region for dirname %s in monitorPath", dirname) + continue + } + + klog.Infof("Shared region after checking files: %v", *sharedRegion) + 
podmap[dirname] = podusage{ + idstr: containerFile.Name(), + sr: sharedRegion, + } + } + } + } + + klog.Infof("Monitored path map: %v", podmap) + return nil +} diff --git a/cmd/vGPUmonitor/pathmonitor_test.go b/cmd/vGPUmonitor/pathmonitor_test.go new file mode 100644 index 0000000..0f156da --- /dev/null +++ b/cmd/vGPUmonitor/pathmonitor_test.go @@ -0,0 +1,61 @@ +/* +Copyright 2024 The HAMi Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package main + +import ( + "testing" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +func TestIsVaildPod(t *testing.T) { + pods := &corev1.PodList{ + Items: []corev1.Pod{ + { + ObjectMeta: metav1.ObjectMeta{ + UID: "123", + }, + }, + { + ObjectMeta: metav1.ObjectMeta{ + UID: "456", + }, + }, + }, + } + + cases := []struct { + name string + expected bool + }{ + { + name: "123", + expected: true, + }, + { + name: "789", + expected: false, + }, + } + + for _, c := range cases { + if got := isVaildPod(c.name, pods); got != c.expected { + t.Errorf("isVaildPod(%q) == %v, want %v", c.name, got, c.expected) + } + } +} diff --git a/cmd/vGPUmonitor/testcollector/main.go b/cmd/vGPUmonitor/testcollector/main.go new file mode 100644 index 0000000..bf13427 --- /dev/null +++ b/cmd/vGPUmonitor/testcollector/main.go @@ -0,0 +1,145 @@ +/* +Copyright 2024 The HAMi Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package main + +import ( + "log" + "net/http" + + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promhttp" +) + +// ClusterManager is an example for a system that might have been built without +// Prometheus in mind. It models a central manager of jobs running in a +// cluster. Thus, we implement a custom Collector called +// ClusterManagerCollector, which collects information from a ClusterManager +// using its provided methods and turns them into Prometheus Metrics for +// collection. +// +// An additional challenge is that multiple instances of the ClusterManager are +// run within the same binary, each in charge of a different zone. We need to +// make use of wrapping Registerers to be able to register each +// ClusterManagerCollector instance with Prometheus. +type ClusterManager struct { + Zone string + // Contains many more fields not listed in this example. +} + +// ReallyExpensiveAssessmentOfTheSystemState is a mock for the data gathering a +// real cluster manager would have to do. Since it may actually be really +// expensive, it must only be called once per collection. This implementation, +// obviously, only returns some made-up data. +func (c *ClusterManager) ReallyExpensiveAssessmentOfTheSystemState() ( + oomCountByHost map[string]int, ramUsageByHost map[string]float64, +) { + // Just example fake data. 
+ oomCountByHost = map[string]int{ + "foo.example.org": 42, + "bar.example.org": 2001, + } + ramUsageByHost = map[string]float64{ + "foo.example.org": 6.023e23, + "bar.example.org": 3.14, + } + return +} + +// ClusterManagerCollector implements the Collector interface. +type ClusterManagerCollector struct { + ClusterManager *ClusterManager +} + +// Descriptors used by the ClusterManagerCollector below. +var ( + oomCountDesc = prometheus.NewDesc( + "clustermanager_oom_crashes_total", + "Number of OOM crashes.", + []string{"host"}, nil, + ) + ramUsageDesc = prometheus.NewDesc( + "clustermanager_ram_usage_bytes", + "RAM usage as reported to the cluster manager.", + []string{"host"}, nil, + ) +) + +// Describe is implemented with DescribeByCollect. That's possible because the +// Collect method will always return the same two metrics with the same two +// descriptors. +func (cc ClusterManagerCollector) Describe(ch chan<- *prometheus.Desc) { + prometheus.DescribeByCollect(cc, ch) +} + +// Collect first triggers the ReallyExpensiveAssessmentOfTheSystemState. Then it +// creates constant metrics for each host on the fly based on the returned data. +// +// Note that Collect could be called concurrently, so we depend on +// ReallyExpensiveAssessmentOfTheSystemState to be concurrency-safe. +func (cc ClusterManagerCollector) Collect(ch chan<- prometheus.Metric) { + oomCountByHost, ramUsageByHost := cc.ClusterManager.ReallyExpensiveAssessmentOfTheSystemState() + for host, oomCount := range oomCountByHost { + ch <- prometheus.MustNewConstMetric( + oomCountDesc, + prometheus.CounterValue, + float64(oomCount), + host, + ) + } + for host, ramUsage := range ramUsageByHost { + ch <- prometheus.MustNewConstMetric( + ramUsageDesc, + prometheus.GaugeValue, + ramUsage, + host, + ) + } +} + +// NewClusterManager first creates a Prometheus-ignorant ClusterManager +// instance. Then, it creates a ClusterManagerCollector for the just created +// ClusterManager. 
Finally, it registers the ClusterManagerCollector with a +// wrapping Registerer that adds the zone as a label. In this way, the metrics +// collected by different ClusterManagerCollectors do not collide. +func NewClusterManager(zone string, reg prometheus.Registerer) *ClusterManager { + c := &ClusterManager{ + Zone: zone, + } + cc := ClusterManagerCollector{ClusterManager: c} + prometheus.WrapRegistererWith(prometheus.Labels{"zone": zone}, reg).MustRegister(cc) + return c +} + +func main() { + // Since we are dealing with custom Collector implementations, it might + // be a good idea to try it out with a pedantic registry. + reg := prometheus.NewPedanticRegistry() + + // Construct cluster managers. In real code, we would assign them to + // variables to then do something with them. + NewClusterManager("db", reg) + NewClusterManager("ca", reg) + + // Add the standard process and Go metrics to the custom registry. + reg.MustRegister( + prometheus.NewProcessCollector(prometheus.ProcessCollectorOpts{}), + prometheus.NewGoCollector(), + ) + + http.Handle("/metrics", promhttp.HandlerFor(reg, promhttp.HandlerOpts{})) + log.Fatal(http.ListenAndServe(":8080", nil)) +} diff --git a/cmd/vGPUmonitor/validation.go b/cmd/vGPUmonitor/validation.go new file mode 100644 index 0000000..fbba208 --- /dev/null +++ b/cmd/vGPUmonitor/validation.go @@ -0,0 +1,37 @@ +/* +Copyright 2024 The HAMi Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package main + +import ( + "fmt" + "os" +) + +var requiredEnvVars = map[string]bool{ + "HOOK_PATH": true, + "OTHER_ENV_VAR": false, +} + +func ValidateEnvVars() error { + for envVar, required := range requiredEnvVars { + _, exists := os.LookupEnv(envVar) + if required && !exists { + return fmt.Errorf("required environment variable %s not set", envVar) + } + } + return nil +} diff --git a/doc/vgpu_device_plugin_metrics.png b/doc/vgpu_device_plugin_metrics.png new file mode 100644 index 0000000..6b1fde9 Binary files /dev/null and b/doc/vgpu_device_plugin_metrics.png differ diff --git a/docker/amd64/Dockerfile.vgpu-monitor-ubuntu20.04 b/docker/amd64/Dockerfile.vgpu-monitor-ubuntu20.04 new file mode 100644 index 0000000..c5bd2e8 --- /dev/null +++ b/docker/amd64/Dockerfile.vgpu-monitor-ubuntu20.04 @@ -0,0 +1,20 @@ +FROM ubuntu:20.04 AS builder +RUN apt-get update && apt-get install -y --no-install-recommends \ + g++ \ + ca-certificates \ + wget && \ + rm -rf /var/lib/apt/lists/* +ENV GOLANG_VERSION 1.19.3 +ENV GOPATH /go +ENV PATH $GOPATH/bin:/usr/local/go/bin:$PATH +WORKDIR /go/src/volcano.sh/devices +COPY . . 
+RUN wget -nv https://storage.googleapis.com/golang/go$GOLANG_VERSION.linux-amd64.tar.gz +RUN tar -C /usr/local -xzf go$GOLANG_VERSION.linux-amd64.tar.gz +RUN go env -w GO111MODULE=on +RUN go env -w GOPROXY=https://goproxy.cn,direct +RUN export CGO_LDFLAGS_ALLOW='-Wl,--unresolved-symbols=ignore-in-object-files' && \ + go build -ldflags="-s -w" -o vGPUmonitor ./cmd/vGPUmonitor + +FROM debian:stretch-slim +COPY --from=builder /go/src/volcano.sh/devices/vGPUmonitor /usr/bin/vGPUmonitor diff --git a/go.mod b/go.mod index 007353b..129e741 100644 --- a/go.mod +++ b/go.mod @@ -2,6 +2,31 @@ module volcano.sh/k8s-device-plugin go 1.14 +require ( + github.com/NVIDIA/go-gpuallocator v0.2.3 + github.com/NVIDIA/go-nvml v0.12.4-0 + github.com/NVIDIA/gpu-monitoring-tools v0.0.0-20201109160820-d08ea3cdcce4 + github.com/fsnotify/fsnotify v1.4.9 + github.com/prometheus/client_golang v1.0.0 + github.com/prometheus/common v0.4.1 + github.com/spf13/cobra v0.0.5 + github.com/spf13/viper v1.3.2 + github.com/stretchr/testify v1.9.0 + github.com/urfave/cli/v2 v2.4.0 + golang.org/x/exp v0.0.0-20190312203227-4b39c73a6495 + golang.org/x/net v0.0.0-20200421231249-e086a090c8fd + google.golang.org/grpc v1.32.0 + google.golang.org/protobuf v1.34.2 + k8s.io/api v0.18.2 + k8s.io/apimachinery v0.18.2 + k8s.io/client-go v0.18.2 + k8s.io/klog v1.0.0 + k8s.io/klog/v2 v2.80.1 + k8s.io/kubelet v0.0.0 + k8s.io/kubernetes v1.18.2 + sigs.k8s.io/yaml v1.2.0 +) + replace ( k8s.io/api => k8s.io/api v0.18.2 k8s.io/apiextensions-apiserver => k8s.io/apiextensions-apiserver v0.18.2 @@ -26,25 +51,3 @@ replace ( k8s.io/metrics => k8s.io/metrics v0.18.2 k8s.io/sample-apiserver => k8s.io/sample-apiserver v0.18.2 ) - -require ( - github.com/NVIDIA/go-gpuallocator v0.2.3 - github.com/NVIDIA/gpu-monitoring-tools v0.0.0-20201109160820-d08ea3cdcce4 - github.com/fsnotify/fsnotify v1.4.9 - github.com/mitchellh/gox v1.0.1 // indirect - github.com/prometheus/common v0.4.1 - github.com/spf13/cobra v0.0.5 - 
github.com/spf13/viper v1.3.2 - github.com/stretchr/testify v1.5.1 - github.com/urfave/cli/v2 v2.4.0 - golang.org/x/net v0.0.0-20200421231249-e086a090c8fd - google.golang.org/grpc v1.29.0 - k8s.io/api v0.18.2 - k8s.io/apimachinery v0.18.2 - k8s.io/client-go v0.18.2 - k8s.io/klog v1.0.0 - k8s.io/klog/v2 v2.80.1 - k8s.io/kubelet v0.0.0 - k8s.io/kubernetes v1.18.2 - sigs.k8s.io/yaml v1.2.0 -) diff --git a/go.sum b/go.sum index 8ccac0b..f5eacb1 100644 --- a/go.sum +++ b/go.sum @@ -23,6 +23,8 @@ github.com/Microsoft/go-winio v0.4.14/go.mod h1:qXqCSQ3Xa7+6tgxaGTIe4Kpcdsi+P8jB github.com/Microsoft/hcsshim v0.0.0-20190417211021-672e52e9209d/go.mod h1:Op3hHsoHPAvb6lceZHDtd9OkTew38wNoXnJs8iY7rUg= github.com/NVIDIA/go-gpuallocator v0.2.3 h1:YTXxNpHo71u16DPBWSsHpxV/Eac76ElF5B/rDOM9zqc= github.com/NVIDIA/go-gpuallocator v0.2.3/go.mod h1:cNlWZtJeN15qXGoOzZnOA9yY3CiJrUtUsfGJHFefiDA= +github.com/NVIDIA/go-nvml v0.12.4-0 h1:4tkbB3pT1O77JGr0gQ6uD8FrsUPqP1A/EOEm2wI1TUg= +github.com/NVIDIA/go-nvml v0.12.4-0/go.mod h1:8Llmj+1Rr+9VGGwZuRer5N/aCjxGuR5nPb/9ebBiIEQ= github.com/NVIDIA/gpu-monitoring-tools v0.0.0-20201109160820-d08ea3cdcce4 h1:6KSetbMgb2MieLm34BNJKiEuiP5Tj9Tr94wTipnlYDA= github.com/NVIDIA/gpu-monitoring-tools v0.0.0-20201109160820-d08ea3cdcce4/go.mod h1:l0Cq257MSJMvg9URCXUjc8pgKY2SK1oSvIx6qG0bzzc= github.com/NYTimes/gziphandler v0.0.0-20170623195520-56545f4a5d46/go.mod h1:3wb06e3pkSAbeQ52E9H9iFoQsEEwGN64994WTCIhntQ= @@ -55,6 +57,7 @@ github.com/bazelbuild/buildtools v0.0.0-20190731111112-f720930ceb60/go.mod h1:5J github.com/bazelbuild/buildtools v0.0.0-20190917191645-69366ca98f89/go.mod h1:5JP0TXzWDHXv8qvxRC4InIazwdyDseBDbzESUMKk1yU= github.com/bazelbuild/rules_go v0.0.0-20190719190356-6dae44dc5cab/go.mod h1:MC23Dc/wkXEyk3Wpq6lCqz0ZAYOZDw2DR5y3N1q2i7M= github.com/beorn7/perks v0.0.0-20180321164747-3a771d992973/go.mod h1:Dwedo/Wpr24TaqPxmxbtue+5NUziq4I4S80YR8gNf3Q= +github.com/beorn7/perks v1.0.0 h1:HWo1m869IqiPhD389kmkxeTalrjNbbJTC8LXupb+sl0= github.com/beorn7/perks 
v1.0.0/go.mod h1:KWe93zE9D1o94FZ5RNwFwVgaQK1VOXiVxmqh+CedLV8= github.com/bgentry/speakeasy v0.1.0/go.mod h1:+zsyZBPWlz7T6j88CTgSN5bM796AkVf0kBD4zp0CCIs= github.com/bifurcation/mint v0.0.0-20180715133206-93c51c6ce115/go.mod h1:zVt7zX3K/aDCk9Tj+VM7YymsX66ERvzCJzw8rFCX2JU= @@ -211,6 +214,7 @@ github.com/gogo/protobuf v1.3.1 h1:DqDEcV5aeaTmdFBePNpYsp3FlcVH/2ISVVM9Qf8PSls= github.com/gogo/protobuf v1.3.1/go.mod h1:SlYgWuQ5SjCEi6WLHjHCa1yvBfUnHcTbrrZtXPKa29o= github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0/go.mod h1:E/TSTwGwJL78qG/PmXZO1EjYhfJinVAhrmmHX6Z8B9k= github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q= +github.com/golang/groupcache v0.0.0-20160516000752-02826c3e7903 h1:LbsanbbD6LieFkXbj9YNNBupiGHJgFeLpO0j0Fza1h8= github.com/golang/groupcache v0.0.0-20160516000752-02826c3e7903/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= github.com/golang/mock v1.0.0/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A= github.com/golang/mock v1.1.1/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A= @@ -220,8 +224,9 @@ github.com/golang/protobuf v0.0.0-20161109072736-4bd1920723d7/go.mod h1:6lQm79b+ github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= github.com/golang/protobuf v1.3.2/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= -github.com/golang/protobuf v1.3.3 h1:gyjaxf+svBWX08ZjK86iN9geUJF0H6gp2IRKX6Nf6/I= github.com/golang/protobuf v1.3.3/go.mod h1:vzj43D7+SQXF/4pzW/hwtAqwc6iTitCiVSaWz5lYuqw= +github.com/golang/protobuf v1.5.0 h1:LUVKkCeviFUMKqHa4tXIIij/lbhnMbP7Fn5wKdKkRh4= +github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk= github.com/golangci/check v0.0.0-20180506172741-cfe4005ccda2/go.mod h1:k9Qvh+8juN+UKMCS/3jFtGICgW8O96FVaZsaxdzDkR4= github.com/golangci/dupl v0.0.0-20180902072040-3e9179ac440a/go.mod 
h1:ryS0uhF+x9jgbj/N71xsEqODy9BN81/GonCZiOzirOk= github.com/golangci/errcheck v0.0.0-20181223084120-ef45e06d44b6/go.mod h1:DbHgvLiFKX1Sh2T1w8Q/h4NAI8MHIpzCdnBUDTXU3I0= @@ -246,8 +251,10 @@ github.com/google/btree v0.0.0-20180813153112-4030bb1f1f0c/go.mod h1:lNA+9X1NB3Z github.com/google/btree v1.0.0/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ= github.com/google/cadvisor v0.35.0/go.mod h1:1nql6U13uTHaLYB8rLS5x9IJc2qT6Xd/Tr1sTX6NE48= github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M= -github.com/google/go-cmp v0.3.0 h1:crn/baboCvb5fXaQ0IJ1SGTsTVrWpDsCWC8EGETZijY= github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= +github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.5.5 h1:Khx7svrCpmxxtHBq5j2mp/xVjsi8hQMfNLvJFAlrGgU= +github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-github v17.0.0+incompatible/go.mod h1:zLgOLi98H3fifZn+44m+umXrS52loVEgC2AApnigrVQ= github.com/google/go-querystring v1.0.0/go.mod h1:odCYkC5MyYFN7vkCjXpyrEuKhc/BUO6wN/zVPAxq5ck= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= @@ -257,8 +264,9 @@ github.com/google/martian v2.1.0+incompatible/go.mod h1:9I4somxYTbIHy5NJKHRl3wXi github.com/google/pprof v0.0.0-20181206194817-3ea8567a2e57/go.mod h1:zfwlbNMJ+OItoe0UupaVj+oy1omPYYDuagoSzA8v9mc= github.com/google/renameio v0.1.0/go.mod h1:KWCgfxg9yswjAJkECMjeO8J8rahYeXnNhOm40UhjYkI= github.com/google/uuid v1.0.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= -github.com/google/uuid v1.1.1 h1:Gkbcsh/GbpXz7lPftLA3P6TYMwjCLYm83jiFQZF/3gY= github.com/google/uuid v1.1.1/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= +github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/googleapis/gax-go/v2 
v2.0.4/go.mod h1:0Wqv26UfaUD9n4G6kQubkQ+KchISgw+vpHVxEJEs9eg= github.com/googleapis/gnostic v0.0.0-20170729233727-0c5108395e2d/go.mod h1:sJBsCZ4ayReDTBIg8b9dl28c5xFWyhBTVRp3pOg5EKY= github.com/googleapis/gnostic v0.1.0 h1:rVsPeBmXbYv4If/cumu1AzZPwV58q433hvONV1UEZoI= @@ -281,6 +289,7 @@ github.com/hashicorp/go-version v1.0.0 h1:21MVWPKDphxa7ineQQTrCU5brh7OuVVAzGOCnn github.com/hashicorp/go-version v1.0.0/go.mod h1:fltr4n8CU8Ke44wwGCBoEymUuxUHl09ZGVZPK5anwXA= github.com/hashicorp/golang-lru v0.0.0-20180201235237-0fb14efe8c47/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8= github.com/hashicorp/golang-lru v0.5.0/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8= +github.com/hashicorp/golang-lru v0.5.1 h1:0hERBMJE1eitiLkihrMvRVBYAkpHzc/J3QdDN+dAcgU= github.com/hashicorp/golang-lru v0.5.1/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8= github.com/hashicorp/hcl v0.0.0-20180404174102-ef8a98b0bbce/go.mod h1:oZtUIOe8dh44I2q6ScRibXws4Ajl+d+nod3AaR9vL5w= github.com/hashicorp/hcl v1.0.0 h1:0Anlzjpi4vEasTeNFn2mLJgTSwt0+6sfsiTG8qcWGx4= @@ -352,6 +361,7 @@ github.com/mattn/go-isatty v0.0.9/go.mod h1:YNRxwqDuOph6SZLI9vUUz6OYw3QyUt7WiY2y github.com/mattn/go-runewidth v0.0.2/go.mod h1:LwmH8dsx7+W8Uxz3IHJYH5QSwggIsqBzpuz5H//U1FU= github.com/mattn/go-shellwords v1.0.5/go.mod h1:3xCvwCdWdlDJUrvuMn7Wuy9eWs4pE8vqg+NOMyg4B2o= github.com/mattn/goveralls v0.0.2/go.mod h1:8d1ZMHsd7fW6IRPKQh46F2WRpyib5/X4FOpevwGNQEw= +github.com/matttproud/golang_protobuf_extensions v1.0.1 h1:4hp9jkHxhMHkqkrB3Ix0jegS5sx/RkqARlsWZ6pIwiU= github.com/matttproud/golang_protobuf_extensions v1.0.1/go.mod h1:D8He9yQNgCq6Z5Ld7szi9bcBfOoFv/3dc6xSMkL2PC0= github.com/mesos/mesos-go v0.0.9/go.mod h1:kPYCMQ9gsOXVAle1OsoY4I1+9kPu8GHkf88aV59fDr4= github.com/mholt/certmagic v0.6.2-0.20190624175158-6a42ef9fe8c2/go.mod h1:g4cOPxcjV0oFq3qwpjSA30LReKD8AoIfwAY9VvG35NY= @@ -418,14 +428,17 @@ github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZN github.com/pquerna/cachecontrol 
v0.0.0-20171018203845-0dec1b30a021/go.mod h1:prYjPmNq4d1NPVmpShWobRqXY3q7Vp+80DqgxxUrUIA= github.com/pquerna/ffjson v0.0.0-20180717144149-af8b230fcd20/go.mod h1:YARuvh7BUWHNhzDq2OM5tzR2RiCcN2D7sapiKyCel/M= github.com/prometheus/client_golang v0.9.1/go.mod h1:7SWBe2y4D6OKWSNQJUaRYU/AaXPKyh/dDVn+NZz0KFw= +github.com/prometheus/client_golang v1.0.0 h1:vrDKnkGzuGvhNAL56c7DBz29ZL+KxnoR0x7enabFceM= github.com/prometheus/client_golang v1.0.0/go.mod h1:db9x61etRT2tGnBNRi70OPL5FsnadC4Ky3P0J6CfImo= github.com/prometheus/client_model v0.0.0-20180712105110-5c3871d89910/go.mod h1:MbSGuTsp3dbXC40dX6PRTWyKYBIrTGTE9sqQNg2J8bo= github.com/prometheus/client_model v0.0.0-20190129233127-fd36f4220a90/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= +github.com/prometheus/client_model v0.2.0 h1:uq5h0d+GuxiXLJLNABMgp2qUWDPiLvgCzz2dUR+/W/M= github.com/prometheus/client_model v0.2.0/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= github.com/prometheus/common v0.4.1 h1:K0MGApIoQvMw27RTdJkPbr3JZ7DNbtxQNyi5STVM6Kw= github.com/prometheus/common v0.4.1/go.mod h1:TNfzLD0ON7rHzMJeJkieUDPYmFC7Snx/y86RQel1bk4= github.com/prometheus/procfs v0.0.0-20181005140218-185b4288413d/go.mod h1:c3At6R/oaqEKCNdg8wHV1ftS6bRYblBhIjjI8uT2IGk= +github.com/prometheus/procfs v0.0.2 h1:6LJUbpNm42llc4HRCuvApCSWB/WfhuNo9K98Q9sNGfs= github.com/prometheus/procfs v0.0.2/go.mod h1:TjEm7ze935MbeOT/UhFTIMYKhuLP4wbCsTZCD3I8kEA= github.com/quasilyte/go-consistent v0.0.0-20190521200055-c6f3937de18c/go.mod h1:5STLWrekHfjyYwxBRVRXNOSewLJ3PWfDJd1VyTS21fI= github.com/quobyte/api v0.1.2/go.mod h1:jL7lIHrmqQ7yh05OJ+eEEdHr0u/kmT1Ff9iHd+4H6VI= @@ -487,11 +500,18 @@ github.com/storageos/go-api v0.0.0-20180912212459-343b3eff91fc/go.mod h1:ZrLn+e0 github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.1.1/go.mod 
h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.2.0/go.mod h1:qt09Ya8vawLte6SNmTgCsAVtYtaKzEcn8ATUoHMkEqE= +github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= +github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= +github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA= github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= -github.com/stretchr/testify v1.5.1 h1:nOGnQDM7FYENwehXlg/kFVnos3rEvtKTjRvOWSzb6H4= github.com/stretchr/testify v1.5.1/go.mod h1:5W2xD1RspED5o8YsWQXVCued0rvSQ+mT+I5cxcmMvtA= +github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= +github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= +github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= +github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= github.com/syndtr/gocapability v0.0.0-20180916011248-d98352740cb2/go.mod h1:hkRG7XYTFWNJGYcbNJQlaLq0fg1yr4J4t/NcTQtrfww= github.com/tarm/serial v0.0.0-20180830185346-98f6abe2eb07/go.mod h1:kDXzergiv9cbyO7IOYJZWg1U88JhDg3PB6klq9Hg2pA= github.com/thecodeteam/goscaleio v0.1.0/go.mod h1:68sdkZAsK8bvEwBlbQnlLS+xU+hvLYM/iQ8KXej1AwM= @@ -548,6 +568,7 @@ golang.org/x/exp v0.0.0-20180321215751-8460e604b9de/go.mod h1:CJ0aWSM057203Lf6IL golang.org/x/exp v0.0.0-20180807140117-3d87b88a115f/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190125153040-c74c464bbbf2/go.mod 
h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= +golang.org/x/exp v0.0.0-20190312203227-4b39c73a6495 h1:I6A9Ag9FpEKOjcKrRNjQkPHawoXIhKyTGfvvjFAiiAk= golang.org/x/exp v0.0.0-20190312203227-4b39c73a6495/go.mod h1:ZjyILWgesfNpC6sMxTJOJm9Kp84zZh5NQWvqDGG3Qr8= golang.org/x/image v0.0.0-20180708004352-c73c2afc3b81/go.mod h1:ux5Hcp/YLpHSI86hEcLt0YII63i6oz57MZXIpbrjZUs= golang.org/x/image v0.0.0-20190227222117-0694c2d4d067/go.mod h1:kZ7UVZpmo3dzQBMxlp+ypCbDeSB+sBbTgSJuh5dn5js= @@ -663,6 +684,8 @@ golang.org/x/tools v0.0.0-20190621195816-6e04913cbbac/go.mod h1:/rFqwRUd4F7ZHNgw golang.org/x/tools v0.0.0-20190909030654-5b82db07426d/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20190920225731-5eefd052ad72/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543 h1:E7g+9GITq07hpfrRu66IVDexMakfv52eLZ2CXBWiKr4= +golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= gonum.org/v1/gonum v0.0.0-20180816165407-929014505bf4/go.mod h1:Y+Yx5eoAFn32cQvJDxZx5Dpnq+c3wtXuadVZAcxbbBo= gonum.org/v1/gonum v0.0.0-20190331200053-3d26580ed485/go.mod h1:2ltnJ7xHfj0zHS40VVPYEAAMTa3ZGguvHGBSJeRWqE0= gonum.org/v1/gonum v0.6.2/go.mod h1:9mxDZsDKxgMAuccQkewq682L+0eCu4dCN2yonUJTCLU= @@ -688,8 +711,11 @@ google.golang.org/grpc v1.23.1/go.mod h1:Y5yQAOtifL1yxbo5wqy6BxZv8vAUGQwXBOALyac google.golang.org/grpc v1.25.1/go.mod h1:c3i+UQWmh7LiEpx4sFZnkU36qjEYZ0imhYfXVyQciAY= google.golang.org/grpc v1.26.0/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk= google.golang.org/grpc v1.28.1/go.mod h1:rpkK4SK4GF4Ach/+MFLZUBavHOvF2JJB5uozKKal+60= -google.golang.org/grpc v1.29.0 h1:2pJjwYOdkZ9HlN4sWRYBg9ttH5bCOlsueaM+b/oYjwo= -google.golang.org/grpc v1.29.0/go.mod h1:itym6AZVZYACWQqET3MqgPpjcuV5QH3BxFS3IjizoKk= +google.golang.org/grpc v1.32.0 
h1:zWTV+LMdc3kaiJMSTOFz2UgSBgx8RNQoTGiZu3fR9S0= +google.golang.org/grpc v1.32.0/go.mod h1:N36X2cJ7JwdamYAgDz+s+rVMFjt3numwzf/HckM8pak= +google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw= +google.golang.org/protobuf v1.34.2 h1:6xV6lTsCfpGD21XK49h7MhtcApnLqkfYgPcdHftf6hg= +google.golang.org/protobuf v1.34.2/go.mod h1:qYOHts0dSfpeUzUFpOMr/WGzszTmLH+DiWniOlNbLDw= gopkg.in/airbrake/gobrake.v2 v2.0.9/go.mod h1:/h5ZAUhDkGaJfjzjKLSjv6zCL6O0LLBxU4K+aSYdM/U= gopkg.in/alecthomas/kingpin.v2 v2.2.6 h1:jMFz6MfLP0/4fUyZle81rXUoxOBFi19VUFKVDOQfozc= gopkg.in/alecthomas/kingpin.v2 v2.2.6/go.mod h1:FMv+mEhP44yOT+4EoQTLFTRgOQ1FBLkstjWtayDeSgw= @@ -717,6 +743,9 @@ gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.2.4/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.2.8 h1:obN1ZagJSUGI0Ek/LBmuj4SNLPfIny3KsKFopxRdj10= gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gotest.tools v2.1.0+incompatible/go.mod h1:DsYFclhRJ6vuDpmuTbkuFWG+y2sxOXAzmJt81HFBacw= gotest.tools v2.2.0+incompatible/go.mod h1:DsYFclhRJ6vuDpmuTbkuFWG+y2sxOXAzmJt81HFBacw= gotest.tools/gotestsum v0.3.5/go.mod h1:Mnf3e5FUzXbkCfynWBGOwLssY7gTQgCHObK9tMpAriY= diff --git a/volcano-vgpu-device-plugin.yml b/volcano-vgpu-device-plugin.yml index e37ebad..ccd3124 100644 --- a/volcano-vgpu-device-plugin.yml +++ b/volcano-vgpu-device-plugin.yml @@ -108,13 +108,70 @@ spec: mountPath: /usr/local/vgpu - name: hosttmp mountPath: /tmp + - image: docker.io/projecthami/volcano-vgpu-monitor:v1.9.3 + imagePullPolicy: Always + name: monitor + command: + - /bin/bash + - -c + - vGPUmonitor + env: + - name: NVIDIA_VISIBLE_DEVICES + value: 
"all" + - name: NVIDIA_MIG_MONITOR_DEVICES + value: "all" + - name: HOOK_PATH + value: "/tmp/vgpu" + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + add: ["SYS_ADMIN"] + volumeMounts: + - name: dockers + mountPath: /run/docker + - name: containerds + mountPath: /run/containerd + - name: sysinfo + mountPath: /sysinfo + - name: hostvar + mountPath: /hostvar + - name: hosttmp + mountPath: /tmp volumes: - - name: device-plugin - hostPath: + - hostPath: path: /var/lib/kubelet/device-plugins - - name: lib - hostPath: + type: Directory + name: device-plugin + - hostPath: path: /usr/local/vgpu + type: DirectoryOrCreate + name: lib - name: hosttmp hostPath: path: /tmp + type: DirectoryOrCreate + - name: dockers + hostPath: + path: /run/docker + type: DirectoryOrCreate + - name: containerds + hostPath: + path: /run/containerd + type: DirectoryOrCreate + - name: usrbin + hostPath: + path: /usr/bin + type: Directory + - name: sysinfo + hostPath: + path: /sys + type: Directory + - name: hostvar + hostPath: + path: /var + type: Directory \ No newline at end of file