-
Notifications
You must be signed in to change notification settings - Fork 0
/
nvidia.go
119 lines (105 loc) · 3.43 KB
/
nvidia.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
package main
import (
"bytes"
"encoding/binary"
"fmt"
"log/slog"
"sort"
"github.com/NVIDIA/go-nvml/pkg/nvml"
"go.opentelemetry.io/collector/pdata/pcommon"
"go.opentelemetry.io/collector/pdata/pmetric"
)
// perDeviceState is the per-GPU bookkeeping the producer keeps between
// collections.
type perDeviceState struct {
	// d is the NVML handle used to query this device.
	d nvml.Device
	// lastTimestamp is handed to GetSamples as the "last seen" timestamp and is
	// meant to track the newest sample timestamp collected so far. It starts at
	// zero.
	lastTimestamp uint64
}
// producer collects GPU metrics from the NVML devices discovered by
// NewNvidiaProducer.
type producer struct {
	// devices holds one entry per NVML device, positioned at the device's NVML
	// index.
	devices []perDeviceState
}
// byTs adapts a slice of NVML samples to sort.Interface, ordering the samples
// by ascending TimeStamp.
type byTs []nvml.Sample

// Len reports how many samples are in the slice.
func (s byTs) Len() int {
	return len(s)
}

// Less reports whether the sample at index i was taken before the one at
// index j.
func (s byTs) Less(i, j int) bool {
	earlier, later := s[i].TimeStamp, s[j].TimeStamp
	return earlier < later
}

// Swap exchanges the samples at indices i and j.
func (s byTs) Swap(i, j int) {
	s[j], s[i] = s[i], s[j]
}
// NewNvidiaProducer initializes the NVML library and returns a producer
// holding a handle for every NVML device it reports.
//
// It returns an error if NVML cannot be initialized, if the device count
// cannot be read, or if any device handle cannot be obtained. On the latter
// two failures NVML is shut down again before returning, so a failed call
// does not leave the library initialized.
func NewNvidiaProducer() (*producer, error) {
	if ret := nvml.Init(); ret != nvml.SUCCESS {
		return nil, fmt.Errorf("Failed to initialize NVML library: %v", nvml.ErrorString(ret))
	}

	// NVML is initialized from here on. Undo that on every error return below;
	// the caller gets no producer and therefore has nothing to clean up with.
	ready := false
	defer func() {
		if ready {
			return
		}
		if ret := nvml.Shutdown(); ret != nvml.SUCCESS {
			slog.Warn("Failed to shut down NVML library after setup error", "error", nvml.ErrorString(ret))
		}
	}()

	count, ret := nvml.DeviceGetCount()
	if ret != nvml.SUCCESS {
		return nil, fmt.Errorf("Failed to get count of Nvidia devices: %v", nvml.ErrorString(ret))
	}

	devices := make([]perDeviceState, count)
	for i := 0; i < count; i++ {
		device, ret := nvml.DeviceGetHandleByIndex(i)
		if ret != nvml.SUCCESS {
			return nil, fmt.Errorf("Failed to get handle for Nvidia device %d: %v", i, nvml.ErrorString(ret))
		}
		// lastTimestamp is left at its zero value: nothing has been sampled yet.
		devices[i] = perDeviceState{d: device}
	}

	ready = true
	return &producer{devices: devices}, nil
}
// Produce appends to ms one gauge metric per NVML device that has new GPU
// utilization samples, i.e. samples newer than those collected by the
// previous call.
//
// Every sample becomes one data point, stamped with the sample time and
// tagged with the device's "UUID" and "index" attributes. A device whose
// UUID or samples cannot be read, or whose sample value type is not
// recognized, is logged and skipped without adding a metric. Produce itself
// always returns nil.
func (p *producer) Produce(ms pmetric.MetricSlice) error {
	for i := range p.devices {
		// Work on the slice element itself. Ranging by value would yield a copy,
		// and the lastTimestamp update below would be lost between calls, making
		// every collection re-emit the device's whole sample buffer.
		pds := &p.devices[i]

		uuid, ret := pds.d.GetUUID()
		if ret != nvml.SUCCESS {
			slog.Error("Failed to get device UUID", "index", i, "error", nvml.ErrorString(ret))
			continue
		}
		slog.Debug("Collecting metrics for device", "uuid", uuid, "index", i)

		valueType, utilSamps, ret := pds.d.GetSamples(nvml.GPU_UTILIZATION_SAMPLES, pds.lastTimestamp)
		if ret == nvml.ERROR_NOT_FOUND {
			// NVML reports "no sample entries found" this way; with a persisted
			// lastTimestamp that simply means nothing new since the last call.
			slog.Debug("No new GPU utilization samples for device", "uuid", uuid, "index", i)
			continue
		}
		if ret != nvml.SUCCESS {
			slog.Error("Failed to get GPU utilization for device", "uuid", uuid, "index", i, "error", nvml.ErrorString(ret))
			continue
		}
		if len(utilSamps) == 0 {
			continue
		}

		// SampleValue holds the raw bytes of a C union whose members all start
		// at offset 0, so members narrower than 8 bytes occupy the leading bytes
		// and must be decoded at their own width.
		var setVal func(pmetric.NumberDataPoint, [8]byte)
		switch valueType {
		case nvml.VALUE_TYPE_DOUBLE:
			setVal = func(dp pmetric.NumberDataPoint, val [8]byte) {
				var value float64
				readSampleValue(val[:], &value)
				dp.SetDoubleValue(value)
			}
		case nvml.VALUE_TYPE_UNSIGNED_INT:
			setVal = func(dp pmetric.NumberDataPoint, val [8]byte) {
				var value uint32
				readSampleValue(val[:4], &value)
				dp.SetIntValue(int64(value))
			}
		case nvml.VALUE_TYPE_SIGNED_INT:
			setVal = func(dp pmetric.NumberDataPoint, val [8]byte) {
				var value int32
				readSampleValue(val[:4], &value)
				dp.SetIntValue(int64(value))
			}
		case nvml.VALUE_TYPE_UNSIGNED_LONG, nvml.VALUE_TYPE_UNSIGNED_LONG_LONG, nvml.VALUE_TYPE_SIGNED_LONG_LONG:
			// NOTE(review): treats C "unsigned long" as 8 bytes, which holds on
			// 64-bit Linux — confirm if other targets are ever supported.
			// Unsigned values above math.MaxInt64 wrap to negative here.
			setVal = func(dp pmetric.NumberDataPoint, val [8]byte) {
				var value int64
				readSampleValue(val[:], &value)
				dp.SetIntValue(value)
			}
		default:
			// Includes VALUE_TYPE_COUNT, which is the enum's end marker rather
			// than a real value type.
			slog.Error("Unknown value data type in GPU metrics", "type", valueType)
			continue
		}

		// Create the metric only once we know there is something to put in it,
		// so failed or empty collections leave no hollow metric behind.
		// NOTE(review): the metric is emitted without a name, description or
		// unit — confirm whether that is intended.
		g := ms.AppendEmpty().SetEmptyGauge()

		sort.Sort(byTs(utilSamps))
		for _, samp := range utilSamps {
			pds.lastTimestamp = max(pds.lastTimestamp, samp.TimeStamp)
			dp := g.DataPoints().AppendEmpty()
			setVal(dp, samp.SampleValue)
			// samp.TimeStamp is micros since epoch; pcommon.Timestamp expects
			// nanos since epoch
			dp.SetTimestamp(pcommon.Timestamp(samp.TimeStamp * 1000))
			dp.Attributes().PutStr("UUID", uuid)
			dp.Attributes().PutInt("index", int64(i))
		}
	}
	return nil
}

// readSampleValue decodes raw, in the machine's native byte order, into the
// fixed-size numeric value that out points to.
func readSampleValue(raw []byte, out any) {
	if err := binary.Read(bytes.NewReader(raw), binary.NativeEndian, out); err != nil {
		// justification for panic: this can never happen unless we've made
		// a programming error (raw shorter than *out, or out not fixed-size).
		panic(err)
	}
}