From b4cda95934cfa40f9e9c86ff5446a1a053790d57 Mon Sep 17 00:00:00 2001 From: Mateusz Hurbol Date: Tue, 17 Jun 2025 14:17:20 +0000 Subject: [PATCH] Fix the restart issues with perfdash The perfdash pod have been restarting every few hours due to `concurrent map write`. Added a RWMutex to guard the map access --- perfdash/Dockerfile | 2 +- perfdash/Makefile | 2 +- perfdash/deployment.yaml | 2 +- perfdash/metrics-downloader-helper.go | 47 ++++++++++++++++++-- perfdash/metrics-downloader.go | 2 +- perfdash/parser.go | 28 ++++++------ perfdash/parser_test.go | 63 ++++++++++++++++++++++++--- 7 files changed, 117 insertions(+), 29 deletions(-) diff --git a/perfdash/Dockerfile b/perfdash/Dockerfile index 995b5c153c..c57f47c78e 100644 --- a/perfdash/Dockerfile +++ b/perfdash/Dockerfile @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -ARG GOLANG_VERSION=1.23 +ARG GOLANG_VERSION=1.24 FROM golang:${GOLANG_VERSION} AS builder WORKDIR /workspace diff --git a/perfdash/Makefile b/perfdash/Makefile index 46437b82e8..805ca76a75 100644 --- a/perfdash/Makefile +++ b/perfdash/Makefile @@ -1,7 +1,7 @@ all: push # See deployment.yaml for the version currently running-- bump this ahead before rebuilding! -TAG?=2.54 +TAG?=2.55 REPO?=gcr.io/k8s-staging-perf-tests diff --git a/perfdash/deployment.yaml b/perfdash/deployment.yaml index 609a094db3..16b4098e7b 100644 --- a/perfdash/deployment.yaml +++ b/perfdash/deployment.yaml @@ -16,7 +16,7 @@ spec: spec: containers: - name: perfdash - image: gcr.io/k8s-staging-perf-tests/perfdash:2.54 + image: gcr.io/k8s-staging-perf-tests/perfdash:2.55 command: - /perfdash - --www=true diff --git a/perfdash/metrics-downloader-helper.go b/perfdash/metrics-downloader-helper.go index a7810162cd..da65cdd30e 100644 --- a/perfdash/metrics-downloader-helper.go +++ b/perfdash/metrics-downloader-helper.go @@ -19,18 +19,57 @@ package main import ( "encoding/json" "fmt" - "k8s.io/klog" "net/http" "sort" + "sync" + "k8s.io/klog" "k8s.io/kubernetes/test/e2e/perftype" ) // BuildData contains job name and a map from build number to perf data. type BuildData struct { - Builds map[string][]perftype.DataItem `json:"builds"` - Job string `json:"job"` - Version string `json:"version"` + Builds Builds `json:"builds"` + Job string `json:"job"` + Version string `json:"version"` +} + +// Builds is a structure contains build number to perf data map guarded against concurent modification +type Builds struct { + builds map[string][]perftype.DataItem + mu *sync.RWMutex +} + +func (b *Builds) MarshalJSON() ([]byte, error) { + return json.Marshal(b.builds) +} + +func (b *Builds) UnmarshalJSON(data []byte) error { + b.mu = &sync.RWMutex{} + return json.Unmarshal(data, &b.builds) +} + +// NewBuilds returns a guarded build number to perf data map. +func NewBuilds(builds map[string][]perftype.DataItem) Builds { + if builds == nil { + builds = make(map[string][]perftype.DataItem) + } + return Builds{ + builds: builds, + mu: &sync.RWMutex{}, + } +} + +func (b *Builds) AddBuildData(buildNumber string, data perftype.DataItem) { + b.mu.Lock() + defer b.mu.Unlock() + b.builds[buildNumber] = append(b.builds[buildNumber], data) +} + +func (b *Builds) Builds(buildNumber string) []perftype.DataItem { + b.mu.RLock() + defer b.mu.RUnlock() + return b.builds[buildNumber] } // MetricToBuildData is a map from metric name to BuildData pointer. diff --git a/perfdash/metrics-downloader.go b/perfdash/metrics-downloader.go index d5a577f8db..732f835275 100644 --- a/perfdash/metrics-downloader.go +++ b/perfdash/metrics-downloader.go @@ -224,7 +224,7 @@ func getBuildData(result JobToCategoryData, prefix string, category string, labe result[prefix][category] = make(MetricToBuildData) } if _, found := result[prefix][category][label]; !found { - result[prefix][category][label] = &BuildData{Job: job, Version: "", Builds: map[string][]perftype.DataItem{}} + result[prefix][category][label] = &BuildData{Job: job, Version: "", Builds: NewBuilds(map[string][]perftype.DataItem{})} } return result[prefix][category][label] } diff --git a/perfdash/parser.go b/perfdash/parser.go index 30fc732d46..d58675187d 100644 --- a/perfdash/parser.go +++ b/perfdash/parser.go @@ -73,7 +73,7 @@ func parseContainerRestarts(data []byte, buildNumber int, testResult *BuildData) } res := perftype.DataItem{Unit: "", Labels: map[string]string{"RestartCount": "RestartCount"}, Data: restartsByContainer} - testResult.Builds[build] = append(testResult.Builds[build], res) + testResult.Builds.AddBuildData(build, res) } func parsePerfData(data []byte, buildNumber int, testResult *BuildData) { @@ -89,7 +89,7 @@ func parsePerfData(data []byte, buildNumber int, testResult *BuildData) { if testResult.Version == obj.Version { for i := range obj.DataItems { stripCount(&obj.DataItems[i]) - testResult.Builds[build] = append(testResult.Builds[build], obj.DataItems[i]) + testResult.Builds.AddBuildData(build, obj.DataItems[i]) } } } @@ -142,8 +142,8 @@ func parseResourceUsageData(data []byte, buildNumber int, testResult *BuildData) cpu.Data[percentile] = usage.CPU memory.Data[percentile] = usage.Memory / (1024 * 1024) } - testResult.Builds[build] = append(testResult.Builds[build], cpu) - testResult.Builds[build] = append(testResult.Builds[build], memory) + testResult.Builds.AddBuildData(build, cpu) + testResult.Builds.AddBuildData(build, memory) } } @@ -164,7 +164,7 @@ func parseRequestCountData(data []byte, buildNumber int, testResult *BuildData) continue } stripCount(&obj.DataItems[i]) - testResult.Builds[build] = append(testResult.Builds[build], obj.DataItems[i]) + testResult.Builds.AddBuildData(build, obj.DataItems[i]) } } } @@ -217,7 +217,7 @@ func parseApiserverRequestCount(data []byte, buildNumber int, testResult *BuildD continue } resultMap[key] = &perfData - testResult.Builds[build] = append(testResult.Builds[build], perfData) + testResult.Builds.AddBuildData(build, perfData) } } @@ -245,7 +245,7 @@ func parseApiserverInitEventsCount(data []byte, buildNumber int, testResult *Bui } delete(perfData.Labels, "__name__") perfData.Data["InitEventsCount"] = float64(metric[i].Value) - testResult.Builds[build] = append(testResult.Builds[build], perfData) + testResult.Builds.AddBuildData(build, perfData) } } @@ -296,15 +296,15 @@ func parseSchedulingLatency(testName string) func([]byte, int, *BuildData) { return } preemptionEvaluation := parseOperationLatency(obj.PreemptionEvaluationLatency, testName, "preemption_evaluation") - testResult.Builds[build] = append(testResult.Builds[build], preemptionEvaluation) + testResult.Builds.AddBuildData(build, preemptionEvaluation) e2eScheduling := parseOperationLatency(obj.E2eSchedulingLatency, testName, "e2eScheduling") - testResult.Builds[build] = append(testResult.Builds[build], e2eScheduling) + testResult.Builds.AddBuildData(build, e2eScheduling) scheduling := parseOperationLatency(obj.SchedulingLatency, testName, "scheduling") - testResult.Builds[build] = append(testResult.Builds[build], scheduling) + testResult.Builds.AddBuildData(build, scheduling) for name, metric := range obj.FrameworkExtensionPointDuration { frameworkExtensionPointDuration := parseOperationLatency(metric, testName, name) - testResult.Builds[build] = append(testResult.Builds[build], frameworkExtensionPointDuration) + testResult.Builds.AddBuildData(build, frameworkExtensionPointDuration) } } } @@ -332,7 +332,7 @@ func parseSchedulingThroughputCL(testName string) func([]byte, int, *BuildData) perfData.Data["Perc99"] = obj.Perc99 perfData.Data["Average"] = obj.Average perfData.Data["Max"] = obj.Max - testResult.Builds[build] = append(testResult.Builds[build], perfData) + testResult.Builds.AddBuildData(build, perfData) } } @@ -393,7 +393,7 @@ func parseHistogramMetric(metricName string) func(data []byte, buildNumber int, perfData.Data["<= "+bucket+"s"] = float64(buckerVal) / float64(count) * 100 } } - testResult.Builds[build] = append(testResult.Builds[build], perfData) + testResult.Builds.AddBuildData(build, perfData) } } } @@ -439,5 +439,5 @@ func parseSystemPodMetrics(data []byte, buildNumber int, testResult *BuildData) }, Data: restartCounts, } - testResult.Builds[build] = append(testResult.Builds[build], perfData) + testResult.Builds.AddBuildData(build, perfData) } diff --git a/perfdash/parser_test.go b/perfdash/parser_test.go index e526af0022..45f95de15a 100644 --- a/perfdash/parser_test.go +++ b/perfdash/parser_test.go @@ -17,6 +17,7 @@ limitations under the License. package main import ( + "encoding/json" "reflect" "testing" @@ -37,8 +38,8 @@ func Test_parseSystemPodMetrics(t *testing.T) { name: "same-container-in-two-pods", buildNumber: 123, data: sameContainerInTwoPodsSummary(), - testResult: &BuildData{Job: "", Version: "", Builds: map[string][]perftype.DataItem{}}, - want: &BuildData{Job: "", Version: "", Builds: map[string][]perftype.DataItem{ + testResult: &BuildData{Job: "", Version: "", Builds: NewBuilds(map[string][]perftype.DataItem{})}, + want: &BuildData{Job: "", Version: "", Builds: NewBuilds(map[string][]perftype.DataItem{ "123": { perftype.DataItem{ Data: map[string]float64{ @@ -51,7 +52,8 @@ func Test_parseSystemPodMetrics(t *testing.T) { Unit: "", }, }, - }}, + }), + }, }, } for _, tt := range tests { @@ -122,7 +124,7 @@ func Test_parseContainerRestarts(t *testing.T) { "RestartCount": 4 }]`), want: &BuildData{ - Builds: map[string][]perftype.DataItem{ + Builds: NewBuilds(map[string][]perftype.DataItem{ "123": { { Data: map[string]float64{ @@ -132,16 +134,63 @@ func Test_parseContainerRestarts(t *testing.T) { Labels: map[string]string{"RestartCount": "RestartCount"}, }, }, - }, + }), }, }, } for _, tc := range tests { t.Run(tc.name, func(t *testing.T) { - got := &BuildData{Builds: map[string][]perftype.DataItem{}} + got := &BuildData{Builds: NewBuilds(nil)} parseContainerRestarts(tc.data, 123, got) require.NotNil(t, got.Builds) - assert.ElementsMatch(t, tc.want.Builds["123"], got.Builds["123"]) + assert.ElementsMatch(t, tc.want.Builds.Builds("123"), got.Builds.Builds("123")) + }) + } +} + +func Test_BuildDataToJson(t *testing.T) { + tests := []struct { + name string + buildData *BuildData + want string + }{ + { + name: "ToJsonTest", + buildData: &BuildData{Job: "job", Version: "version", Builds: NewBuilds(map[string][]perftype.DataItem{ + "123": { + perftype.DataItem{ + Data: map[string]float64{ + "c1": 2, + "c2": 1, + }, + Labels: map[string]string{ + "RestartCount": "RestartCount", + }, + Unit: "unit", + }, + }, + }), + }, + want: `{"builds":{"123":[{"data":{"c1":2,"c2":1},"unit":"unit","labels":{"RestartCount":"RestartCount"}}]},"job":"job","version":"version"}`, + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + jsonData, err := json.Marshal(tc.buildData) + if err != nil { + t.Fatalf("failed to marshal data into json %v", err) + } + require.NotNil(t, jsonData) + assert.Equal(t, tc.want, string(jsonData)) + got := &BuildData{} + err = json.Unmarshal(jsonData, got) + if err != nil { + t.Fatalf("failed to unmarshal data into json %v", err) + } + if !reflect.DeepEqual(*got, *tc.buildData) { + t.Errorf("want %v, got %v", *tc.buildData, *got) + } }) } }