From b4cda95934cfa40f9e9c86ff5446a1a053790d57 Mon Sep 17 00:00:00 2001
From: Mateusz Hurbol <hurbol@google.com>
Date: Tue, 17 Jun 2025 14:17:20 +0000
Subject: [PATCH] Fix the restart issues with perfdash

The perfdash pod have been restarting every few hours due to
`concurrent map write`. Added a RWMutex to guard the map access
---
 perfdash/Dockerfile                   |  2 +-
 perfdash/Makefile                     |  2 +-
 perfdash/deployment.yaml              |  2 +-
 perfdash/metrics-downloader-helper.go | 47 ++++++++++++++++++--
 perfdash/metrics-downloader.go        |  2 +-
 perfdash/parser.go                    | 28 ++++++------
 perfdash/parser_test.go               | 63 ++++++++++++++++++++++++---
 7 files changed, 117 insertions(+), 29 deletions(-)

diff --git a/perfdash/Dockerfile b/perfdash/Dockerfile
index 995b5c153c..c57f47c78e 100644
--- a/perfdash/Dockerfile
+++ b/perfdash/Dockerfile
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-ARG GOLANG_VERSION=1.23
+ARG GOLANG_VERSION=1.24
 FROM golang:${GOLANG_VERSION} AS builder
 
 WORKDIR /workspace
diff --git a/perfdash/Makefile b/perfdash/Makefile
index 46437b82e8..805ca76a75 100644
--- a/perfdash/Makefile
+++ b/perfdash/Makefile
@@ -1,7 +1,7 @@
 all: push
 
 # See deployment.yaml for the version currently running-- bump this ahead before rebuilding!
-TAG?=2.54
+TAG?=2.55
 
 REPO?=gcr.io/k8s-staging-perf-tests
 
diff --git a/perfdash/deployment.yaml b/perfdash/deployment.yaml
index 609a094db3..16b4098e7b 100644
--- a/perfdash/deployment.yaml
+++ b/perfdash/deployment.yaml
@@ -16,7 +16,7 @@ spec:
     spec:
       containers:
       - name: perfdash
-        image: gcr.io/k8s-staging-perf-tests/perfdash:2.54
+        image: gcr.io/k8s-staging-perf-tests/perfdash:2.55
         command:
           - /perfdash
           -   --www=true
diff --git a/perfdash/metrics-downloader-helper.go b/perfdash/metrics-downloader-helper.go
index a7810162cd..da65cdd30e 100644
--- a/perfdash/metrics-downloader-helper.go
+++ b/perfdash/metrics-downloader-helper.go
@@ -19,18 +19,57 @@ package main
 import (
 	"encoding/json"
 	"fmt"
-	"k8s.io/klog"
 	"net/http"
 	"sort"
+	"sync"
 
+	"k8s.io/klog"
 	"k8s.io/kubernetes/test/e2e/perftype"
 )
 
 // BuildData contains job name and a map from build number to perf data.
 type BuildData struct {
-	Builds  map[string][]perftype.DataItem `json:"builds"`
-	Job     string                         `json:"job"`
-	Version string                         `json:"version"`
+	Builds  Builds `json:"builds"`
+	Job     string `json:"job"`
+	Version string `json:"version"`
+}
+
+// Builds is a structure contains build number to perf data map guarded against concurent modification
+type Builds struct {
+	builds map[string][]perftype.DataItem
+	mu     *sync.RWMutex
+}
+
+func (b *Builds) MarshalJSON() ([]byte, error) {
+	return json.Marshal(b.builds)
+}
+
+func (b *Builds) UnmarshalJSON(data []byte) error {
+	b.mu = &sync.RWMutex{}
+	return json.Unmarshal(data, &b.builds)
+}
+
+// NewBuilds returns a guarded build number to perf data map.
+func NewBuilds(builds map[string][]perftype.DataItem) Builds {
+	if builds == nil {
+		builds = make(map[string][]perftype.DataItem)
+	}
+	return Builds{
+		builds: builds,
+		mu:     &sync.RWMutex{},
+	}
+}
+
+func (b *Builds) AddBuildData(buildNumber string, data perftype.DataItem) {
+	b.mu.Lock()
+	defer b.mu.Unlock()
+	b.builds[buildNumber] = append(b.builds[buildNumber], data)
+}
+
+func (b *Builds) Builds(buildNumber string) []perftype.DataItem {
+	b.mu.RLock()
+	defer b.mu.RUnlock()
+	return b.builds[buildNumber]
 }
 
 // MetricToBuildData is a map from metric name to BuildData pointer.
diff --git a/perfdash/metrics-downloader.go b/perfdash/metrics-downloader.go
index d5a577f8db..732f835275 100644
--- a/perfdash/metrics-downloader.go
+++ b/perfdash/metrics-downloader.go
@@ -224,7 +224,7 @@ func getBuildData(result JobToCategoryData, prefix string, category string, labe
 		result[prefix][category] = make(MetricToBuildData)
 	}
 	if _, found := result[prefix][category][label]; !found {
-		result[prefix][category][label] = &BuildData{Job: job, Version: "", Builds: map[string][]perftype.DataItem{}}
+		result[prefix][category][label] = &BuildData{Job: job, Version: "", Builds: NewBuilds(map[string][]perftype.DataItem{})}
 	}
 	return result[prefix][category][label]
 }
diff --git a/perfdash/parser.go b/perfdash/parser.go
index 30fc732d46..d58675187d 100644
--- a/perfdash/parser.go
+++ b/perfdash/parser.go
@@ -73,7 +73,7 @@ func parseContainerRestarts(data []byte, buildNumber int, testResult *BuildData)
 	}
 
 	res := perftype.DataItem{Unit: "", Labels: map[string]string{"RestartCount": "RestartCount"}, Data: restartsByContainer}
-	testResult.Builds[build] = append(testResult.Builds[build], res)
+	testResult.Builds.AddBuildData(build, res)
 }
 
 func parsePerfData(data []byte, buildNumber int, testResult *BuildData) {
@@ -89,7 +89,7 @@ func parsePerfData(data []byte, buildNumber int, testResult *BuildData) {
 	if testResult.Version == obj.Version {
 		for i := range obj.DataItems {
 			stripCount(&obj.DataItems[i])
-			testResult.Builds[build] = append(testResult.Builds[build], obj.DataItems[i])
+			testResult.Builds.AddBuildData(build, obj.DataItems[i])
 		}
 	}
 }
@@ -142,8 +142,8 @@ func parseResourceUsageData(data []byte, buildNumber int, testResult *BuildData)
 			cpu.Data[percentile] = usage.CPU
 			memory.Data[percentile] = usage.Memory / (1024 * 1024)
 		}
-		testResult.Builds[build] = append(testResult.Builds[build], cpu)
-		testResult.Builds[build] = append(testResult.Builds[build], memory)
+		testResult.Builds.AddBuildData(build, cpu)
+		testResult.Builds.AddBuildData(build, memory)
 	}
 }
 
@@ -164,7 +164,7 @@ func parseRequestCountData(data []byte, buildNumber int, testResult *BuildData)
 				continue
 			}
 			stripCount(&obj.DataItems[i])
-			testResult.Builds[build] = append(testResult.Builds[build], obj.DataItems[i])
+			testResult.Builds.AddBuildData(build, obj.DataItems[i])
 		}
 	}
 }
@@ -217,7 +217,7 @@ func parseApiserverRequestCount(data []byte, buildNumber int, testResult *BuildD
 			continue
 		}
 		resultMap[key] = &perfData
-		testResult.Builds[build] = append(testResult.Builds[build], perfData)
+		testResult.Builds.AddBuildData(build, perfData)
 	}
 }
 
@@ -245,7 +245,7 @@ func parseApiserverInitEventsCount(data []byte, buildNumber int, testResult *Bui
 		}
 		delete(perfData.Labels, "__name__")
 		perfData.Data["InitEventsCount"] = float64(metric[i].Value)
-		testResult.Builds[build] = append(testResult.Builds[build], perfData)
+		testResult.Builds.AddBuildData(build, perfData)
 	}
 }
 
@@ -296,15 +296,15 @@ func parseSchedulingLatency(testName string) func([]byte, int, *BuildData) {
 			return
 		}
 		preemptionEvaluation := parseOperationLatency(obj.PreemptionEvaluationLatency, testName, "preemption_evaluation")
-		testResult.Builds[build] = append(testResult.Builds[build], preemptionEvaluation)
+		testResult.Builds.AddBuildData(build, preemptionEvaluation)
 		e2eScheduling := parseOperationLatency(obj.E2eSchedulingLatency, testName, "e2eScheduling")
-		testResult.Builds[build] = append(testResult.Builds[build], e2eScheduling)
+		testResult.Builds.AddBuildData(build, e2eScheduling)
 		scheduling := parseOperationLatency(obj.SchedulingLatency, testName, "scheduling")
-		testResult.Builds[build] = append(testResult.Builds[build], scheduling)
+		testResult.Builds.AddBuildData(build, scheduling)
 
 		for name, metric := range obj.FrameworkExtensionPointDuration {
 			frameworkExtensionPointDuration := parseOperationLatency(metric, testName, name)
-			testResult.Builds[build] = append(testResult.Builds[build], frameworkExtensionPointDuration)
+			testResult.Builds.AddBuildData(build, frameworkExtensionPointDuration)
 		}
 	}
 }
@@ -332,7 +332,7 @@ func parseSchedulingThroughputCL(testName string) func([]byte, int, *BuildData)
 		perfData.Data["Perc99"] = obj.Perc99
 		perfData.Data["Average"] = obj.Average
 		perfData.Data["Max"] = obj.Max
-		testResult.Builds[build] = append(testResult.Builds[build], perfData)
+		testResult.Builds.AddBuildData(build, perfData)
 	}
 }
 
@@ -393,7 +393,7 @@ func parseHistogramMetric(metricName string) func(data []byte, buildNumber int,
 					perfData.Data["<= "+bucket+"s"] = float64(buckerVal) / float64(count) * 100
 				}
 			}
-			testResult.Builds[build] = append(testResult.Builds[build], perfData)
+			testResult.Builds.AddBuildData(build, perfData)
 		}
 	}
 }
@@ -439,5 +439,5 @@ func parseSystemPodMetrics(data []byte, buildNumber int, testResult *BuildData)
 		},
 		Data: restartCounts,
 	}
-	testResult.Builds[build] = append(testResult.Builds[build], perfData)
+	testResult.Builds.AddBuildData(build, perfData)
 }
diff --git a/perfdash/parser_test.go b/perfdash/parser_test.go
index e526af0022..45f95de15a 100644
--- a/perfdash/parser_test.go
+++ b/perfdash/parser_test.go
@@ -17,6 +17,7 @@ limitations under the License.
 package main
 
 import (
+	"encoding/json"
 	"reflect"
 	"testing"
 
@@ -37,8 +38,8 @@ func Test_parseSystemPodMetrics(t *testing.T) {
 			name:        "same-container-in-two-pods",
 			buildNumber: 123,
 			data:        sameContainerInTwoPodsSummary(),
-			testResult:  &BuildData{Job: "", Version: "", Builds: map[string][]perftype.DataItem{}},
-			want: &BuildData{Job: "", Version: "", Builds: map[string][]perftype.DataItem{
+			testResult:  &BuildData{Job: "", Version: "", Builds: NewBuilds(map[string][]perftype.DataItem{})},
+			want: &BuildData{Job: "", Version: "", Builds: NewBuilds(map[string][]perftype.DataItem{
 				"123": {
 					perftype.DataItem{
 						Data: map[string]float64{
@@ -51,7 +52,8 @@ func Test_parseSystemPodMetrics(t *testing.T) {
 						Unit: "",
 					},
 				},
-			}},
+			}),
+			},
 		},
 	}
 	for _, tt := range tests {
@@ -122,7 +124,7 @@ func Test_parseContainerRestarts(t *testing.T) {
 					"RestartCount": 4
 				}]`),
 			want: &BuildData{
-				Builds: map[string][]perftype.DataItem{
+				Builds: NewBuilds(map[string][]perftype.DataItem{
 					"123": {
 						{
 							Data: map[string]float64{
@@ -132,16 +134,63 @@ func Test_parseContainerRestarts(t *testing.T) {
 							Labels: map[string]string{"RestartCount": "RestartCount"},
 						},
 					},
-				},
+				}),
 			},
 		},
 	}
 	for _, tc := range tests {
 		t.Run(tc.name, func(t *testing.T) {
-			got := &BuildData{Builds: map[string][]perftype.DataItem{}}
+			got := &BuildData{Builds: NewBuilds(nil)}
 			parseContainerRestarts(tc.data, 123, got)
 			require.NotNil(t, got.Builds)
-			assert.ElementsMatch(t, tc.want.Builds["123"], got.Builds["123"])
+			assert.ElementsMatch(t, tc.want.Builds.Builds("123"), got.Builds.Builds("123"))
+		})
+	}
+}
+
+func Test_BuildDataToJson(t *testing.T) {
+	tests := []struct {
+		name      string
+		buildData *BuildData
+		want      string
+	}{
+		{
+			name: "ToJsonTest",
+			buildData: &BuildData{Job: "job", Version: "version", Builds: NewBuilds(map[string][]perftype.DataItem{
+				"123": {
+					perftype.DataItem{
+						Data: map[string]float64{
+							"c1": 2,
+							"c2": 1,
+						},
+						Labels: map[string]string{
+							"RestartCount": "RestartCount",
+						},
+						Unit: "unit",
+					},
+				},
+			}),
+			},
+			want: `{"builds":{"123":[{"data":{"c1":2,"c2":1},"unit":"unit","labels":{"RestartCount":"RestartCount"}}]},"job":"job","version":"version"}`,
+		},
+	}
+
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			jsonData, err := json.Marshal(tc.buildData)
+			if err != nil {
+				t.Fatalf("failed to marshal data into json %v", err)
+			}
+			require.NotNil(t, jsonData)
+			assert.Equal(t, tc.want, string(jsonData))
+			got := &BuildData{}
+			err = json.Unmarshal(jsonData, got)
+			if err != nil {
+				t.Fatalf("failed to unmarshal data into json %v", err)
+			}
+			if !reflect.DeepEqual(*got, *tc.buildData) {
+				t.Errorf("want %v, got %v", *tc.buildData, *got)
+			}
 		})
 	}
 }