@@ -10,7 +10,7 @@ import (
10
10
"github.com/sirupsen/logrus"
11
11
resourcemanager "google.golang.org/api/cloudresourcemanager/v1"
12
12
"google.golang.org/api/googleapi"
13
- iam "google.golang.org/api/iam/v1"
13
+ "google.golang.org/api/iam/v1"
14
14
"google.golang.org/api/option"
15
15
"k8s.io/apimachinery/pkg/util/wait"
16
16
@@ -103,7 +103,8 @@ func CreateServiceAccount(ctx context.Context, infraID, projectID, role string)
103
103
// AddServiceAccountRoles adds predefined roles for service account.
104
104
func AddServiceAccountRoles (ctx context.Context , projectID , serviceAccountID string , roles []string ) error {
105
105
// Get cloudresourcemanager service
106
- ctx , cancel := context .WithTimeout (ctx , time .Minute * 1 )
106
+ // The context timeout must be longer than the total duration of the exponential backoff below.
107
+ ctx , cancel := context .WithTimeout (ctx , time .Minute * 2 )
107
108
defer cancel ()
108
109
109
110
ssn , err := gcp .GetSession (ctx )
@@ -117,8 +118,9 @@ func AddServiceAccountRoles(ctx context.Context, projectID, serviceAccountID str
117
118
118
119
backoff := wait.Backoff {
119
120
Duration : 2 * time .Second ,
121
+ Factor : 2.0 ,
120
122
Jitter : 1.0 ,
121
- Steps : 5 ,
123
+ Steps : retryCount ,
122
124
}
123
125
// Get and set the policy in a backoff loop.
124
126
// If the policy set fails, the policy must be retrieved again via the get before retrying the set.
@@ -135,8 +137,7 @@ func AddServiceAccountRoles(ctx context.Context, projectID, serviceAccountID str
135
137
136
138
member := fmt .Sprintf ("serviceAccount:%s" , serviceAccountID )
137
139
for _ , role := range roles {
138
- err = addMemberToRole (policy , role , member )
139
- if err != nil {
140
+ if err := addMemberToRole (policy , role , member ); err != nil {
140
141
return false , fmt .Errorf ("failed to add role %s to %s: %w" , role , member , err )
141
142
}
142
143
}
@@ -147,6 +148,15 @@ func AddServiceAccountRoles(ctx context.Context, projectID, serviceAccountID str
147
148
lastErr = err
148
149
logrus .Debugf ("Concurrent IAM policy changes, restarting read/modify/write" )
149
150
return false , nil
151
+ } else if isBadStatusError (err ) {
152
+ // As documented at https://cloud.google.com/iam/docs/retry-strategy, Google
153
+ // indicates that a service account may be created but not active for up to
154
+ // 60 seconds. This behavior was causing a failure here when setting the policy
155
+ // resulting in a 400 error from the API. If this error occurs retry with an
156
+ // exponential backoff.
157
+ lastErr = err
158
+ logrus .Debugf ("bad request, unexpected error: %s" , err .Error ())
159
+ return false , nil
150
160
}
151
161
return false , fmt .Errorf ("failed to set IAM policy, unexpected error: %w" , err )
152
162
}
@@ -224,3 +234,8 @@ func isQuotaExceededError(err error) bool {
224
234
}
225
235
return false
226
236
}
237
+
238
+ func isBadStatusError (err error ) bool {
239
+ var ae * googleapi.Error
240
+ return errors .As (err , & ae ) && (ae .Code == http .StatusBadRequest )
241
+ }
0 commit comments