Skip to content

Commit 0d6deaf

Browse files
Merge pull request #9299 from barbacbd/OCPBUGS-45280
OCPBUGS-45280: Allow more time for Service Account Creation
2 parents 04e7a5d + ad61345 commit 0d6deaf

File tree

1 file changed

+20
-5
lines changed
  • pkg/infrastructure/gcp/clusterapi

1 file changed

+20
-5
lines changed

pkg/infrastructure/gcp/clusterapi/iam.go

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ import (
1010
"github.com/sirupsen/logrus"
1111
resourcemanager "google.golang.org/api/cloudresourcemanager/v1"
1212
"google.golang.org/api/googleapi"
13-
iam "google.golang.org/api/iam/v1"
13+
"google.golang.org/api/iam/v1"
1414
"google.golang.org/api/option"
1515
"k8s.io/apimachinery/pkg/util/wait"
1616

@@ -103,7 +103,8 @@ func CreateServiceAccount(ctx context.Context, infraID, projectID, role string)
103103
// AddServiceAccountRoles adds predefined roles for service account.
104104
func AddServiceAccountRoles(ctx context.Context, projectID, serviceAccountID string, roles []string) error {
105105
// Get cloudresourcemanager service
106-
ctx, cancel := context.WithTimeout(ctx, time.Minute*1)
106+
// The context timeout must be greater in time than the exponential backoff below
107+
ctx, cancel := context.WithTimeout(ctx, time.Minute*2)
107108
defer cancel()
108109

109110
ssn, err := gcp.GetSession(ctx)
@@ -117,8 +118,9 @@ func AddServiceAccountRoles(ctx context.Context, projectID, serviceAccountID str
117118

118119
backoff := wait.Backoff{
119120
Duration: 2 * time.Second,
121+
Factor: 2.0,
120122
Jitter: 1.0,
121-
Steps: 5,
123+
Steps: retryCount,
122124
}
123125
// Get and set the policy in a backoff loop.
124126
// If the policy set fails, the policy must be retrieved again via the get before retrying the set.
@@ -135,8 +137,7 @@ func AddServiceAccountRoles(ctx context.Context, projectID, serviceAccountID str
135137

136138
member := fmt.Sprintf("serviceAccount:%s", serviceAccountID)
137139
for _, role := range roles {
138-
err = addMemberToRole(policy, role, member)
139-
if err != nil {
140+
if err := addMemberToRole(policy, role, member); err != nil {
140141
return false, fmt.Errorf("failed to add role %s to %s: %w", role, member, err)
141142
}
142143
}
@@ -147,6 +148,15 @@ func AddServiceAccountRoles(ctx context.Context, projectID, serviceAccountID str
147148
lastErr = err
148149
logrus.Debugf("Concurrent IAM policy changes, restarting read/modify/write")
149150
return false, nil
151+
} else if isBadStatusError(err) {
152+
// Documented here, https://cloud.google.com/iam/docs/retry-strategy, google
153+
// indicates that a service account may be created but not active for up to
154+
// 60 seconds. This behavior was causing a failure here when setting the policy
155+
// resulting in a 400 error from the API. If this error occurs retry with an
156+
// exponential backoff.
157+
lastErr = err
158+
logrus.Debugf("bad request, unexpected error: %s", err.Error())
159+
return false, nil
150160
}
151161
return false, fmt.Errorf("failed to set IAM policy, unexpected error: %w", err)
152162
}
@@ -224,3 +234,8 @@ func isQuotaExceededError(err error) bool {
224234
}
225235
return false
226236
}
237+
238+
func isBadStatusError(err error) bool {
239+
var ae *googleapi.Error
240+
return errors.As(err, &ae) && (ae.Code == http.StatusBadRequest)
241+
}

0 commit comments

Comments
 (0)