Skip to content

Commit 6a19287

Browse files
beautifulentropyaarongable
authored andcommitted
grpc: Enable client-side health_v1 health checking (letsencrypt#8254)
- Configure all gRPC clients to check the overall serving status of each endpoint via the `grpc_health_v1` service. - Configure all gRPC servers to expose the `grpc_health_v1` service to any client permitted to access one of the server’s services. - Modify long-running, deep health checks to set and transition the overall (empty string) health status of the gRPC server in addition to the specific service they were configured for. Fixes letsencrypt#8227 (cherry picked from commit 1bfc318)
1 parent 5ad5f85 commit 6a19287

File tree

4 files changed

+46
-13
lines changed

4 files changed

+46
-13
lines changed

cmd/config.go

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -465,7 +465,7 @@ type GRPCServerConfig struct {
465465
// These service names must match the service names advertised by gRPC itself,
466466
// which are identical to the names set in our gRPC .proto files prefixed by
467467
// the package names set in those files (e.g. "ca.CertificateAuthority").
468-
Services map[string]GRPCServiceConfig `json:"services" validate:"required,dive,required"`
468+
Services map[string]*GRPCServiceConfig `json:"services" validate:"required,dive,required"`
469469
// MaxConnectionAge specifies how long a connection may live before the server sends a GoAway to the
470470
// client. Because gRPC connections re-resolve DNS after a connection close,
471471
// this controls how long it takes before a client learns about changes to its
@@ -476,10 +476,10 @@ type GRPCServerConfig struct {
476476

477477
// GRPCServiceConfig contains the information needed to configure a gRPC service.
478478
type GRPCServiceConfig struct {
479-
// PerServiceClientNames is a map of gRPC service names to client certificate
480-
// SANs. The upstream listening server will reject connections from clients
481-
// which do not appear in this list, and the server interceptor will reject
482-
// RPC calls for this service from clients which are not listed here.
479+
// ClientNames is the list of accepted gRPC client certificate SANs.
480+
// Connections from clients not in this list will be rejected by the
481+
// upstream listener, and RPCs from unlisted clients will be denied by the
482+
// server interceptor.
483483
ClientNames []string `json:"clientNames" validate:"min=1,dive,hostname,required"`
484484
}
485485

grpc/client.go

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,11 +14,13 @@ import (
1414
"github.com/letsencrypt/boulder/cmd"
1515
bcreds "github.com/letsencrypt/boulder/grpc/creds"
1616

17-
// 'grpc/health' is imported for its init function, which causes clients to
18-
// rely on the Health Service for load-balancing.
1917
// 'grpc/internal/resolver/dns' is imported for its init function, which
2018
// registers the SRV resolver.
2119
"google.golang.org/grpc/balancer/roundrobin"
20+
21+
// 'grpc/health' is imported for its init function, which causes clients to
22+
// rely on the Health Service for load-balancing as long as a
23+
// "healthCheckConfig" is specified in the gRPC service config.
2224
_ "google.golang.org/grpc/health"
2325

2426
_ "github.com/letsencrypt/boulder/grpc/internal/resolver/dns"
@@ -61,7 +63,21 @@ func ClientSetup(c *cmd.GRPCClientConfig, tlsConfig *tls.Config, statsRegistry p
6163
creds := bcreds.NewClientCredentials(tlsConfig.RootCAs, tlsConfig.Certificates, hostOverride)
6264
return grpc.NewClient(
6365
target,
64-
grpc.WithDefaultServiceConfig(fmt.Sprintf(`{"loadBalancingConfig": [{"%s":{}}]}`, roundrobin.Name)),
66+
grpc.WithDefaultServiceConfig(
67+
fmt.Sprintf(
68+
// By setting the service name to an empty string in
69+
// healthCheckConfig, we're instructing the gRPC client to query
70+
// the overall health status of each server. The grpc-go health
71+
// server, as constructed by health.NewServer(), unconditionally
72+
// sets the overall service (e.g. "") status to SERVING. If a
73+
// specific service name were set, the server would need to
74+
// explicitly transition that service to SERVING; otherwise,
75+
// clients would receive a NOT_FOUND status and the connection
76+
// would be marked as unhealthy (TRANSIENT_FAILURE).
77+
`{"healthCheckConfig": {"serviceName": ""},"loadBalancingConfig": [{"%s":{}}]}`,
78+
roundrobin.Name,
79+
),
80+
),
6581
grpc.WithTransportCredentials(creds),
6682
grpc.WithChainUnaryInterceptor(unaryInterceptors...),
6783
grpc.WithChainStreamInterceptor(streamInterceptors...),

grpc/server.go

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import (
66
"errors"
77
"fmt"
88
"net"
9+
"slices"
910
"strings"
1011
"time"
1112

@@ -123,12 +124,21 @@ func (sb *serverBuilder) Build(tlsConfig *tls.Config, statsRegistry prometheus.R
123124
// This is the names which are allowlisted at the server level, plus the union
124125
// of all names which are allowlisted for any individual service.
125126
acceptedSANs := make(map[string]struct{})
127+
var acceptedSANsSlice []string
126128
for _, service := range sb.cfg.Services {
127129
for _, name := range service.ClientNames {
128130
acceptedSANs[name] = struct{}{}
131+
if !slices.Contains(acceptedSANsSlice, name) {
132+
acceptedSANsSlice = append(acceptedSANsSlice, name)
133+
}
129134
}
130135
}
131136

137+
// Ensure that the health service has the same ClientNames as the other
138+
// services, so that health checks can be performed by clients which are
139+
// allowed to connect to the server.
140+
sb.cfg.Services[healthpb.Health_ServiceDesc.ServiceName].ClientNames = acceptedSANsSlice
141+
132142
creds, err := bcreds.NewServerCredentials(tlsConfig, acceptedSANs)
133143
if err != nil {
134144
return nil, err
@@ -224,8 +234,12 @@ func (sb *serverBuilder) Build(tlsConfig *tls.Config, statsRegistry prometheus.R
224234

225235
// initLongRunningCheck initializes a goroutine which will periodically check
226236
// the health of the provided service and update the health server accordingly.
237+
//
238+
// TODO(#8255): Remove the service parameter and instead rely on transitioning
239+
// the overall health of the server (e.g. "") instead of individual services.
227240
func (sb *serverBuilder) initLongRunningCheck(shutdownCtx context.Context, service string, checkImpl func(context.Context) error) {
228241
// Set the initial health status for the service.
242+
sb.healthSrv.SetServingStatus("", healthpb.HealthCheckResponse_NOT_SERVING)
229243
sb.healthSrv.SetServingStatus(service, healthpb.HealthCheckResponse_NOT_SERVING)
230244

231245
// check is a helper function that checks the health of the service and, if
@@ -249,10 +263,13 @@ func (sb *serverBuilder) initLongRunningCheck(shutdownCtx context.Context, servi
249263
}
250264

251265
if next != healthpb.HealthCheckResponse_SERVING {
266+
sb.logger.Errf("transitioning overall health from %q to %q, due to: %s", last, next, err)
252267
sb.logger.Errf("transitioning health of %q from %q to %q, due to: %s", service, last, next, err)
253268
} else {
269+
sb.logger.Infof("transitioning overall health from %q to %q", last, next)
254270
sb.logger.Infof("transitioning health of %q from %q to %q", service, last, next)
255271
}
272+
sb.healthSrv.SetServingStatus("", next)
256273
sb.healthSrv.SetServingStatus(service, next)
257274
return next
258275
}

grpc/server_test.go

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ import (
1111
"google.golang.org/grpc/health"
1212
)
1313

14-
func Test_serverBuilder_initLongRunningCheck(t *testing.T) {
14+
func TestServerBuilderInitLongRunningCheck(t *testing.T) {
1515
t.Parallel()
1616
hs := health.NewServer()
1717
mockLogger := blog.NewMock()
@@ -41,8 +41,8 @@ func Test_serverBuilder_initLongRunningCheck(t *testing.T) {
4141
// - ~100ms 3rd check failed, SERVING to NOT_SERVING
4242
serving := mockLogger.GetAllMatching(".*\"NOT_SERVING\" to \"SERVING\"")
4343
notServing := mockLogger.GetAllMatching((".*\"SERVING\" to \"NOT_SERVING\""))
44-
test.Assert(t, len(serving) == 1, "expected one serving log line")
45-
test.Assert(t, len(notServing) == 1, "expected one not serving log line")
44+
test.Assert(t, len(serving) == 2, "expected two serving log lines")
45+
test.Assert(t, len(notServing) == 2, "expected two not serving log lines")
4646

4747
mockLogger.Clear()
4848

@@ -67,6 +67,6 @@ func Test_serverBuilder_initLongRunningCheck(t *testing.T) {
6767
// - ~100ms 3rd check passed, NOT_SERVING to SERVING
6868
serving = mockLogger.GetAllMatching(".*\"NOT_SERVING\" to \"SERVING\"")
6969
notServing = mockLogger.GetAllMatching((".*\"SERVING\" to \"NOT_SERVING\""))
70-
test.Assert(t, len(serving) == 2, "expected two serving log lines")
71-
test.Assert(t, len(notServing) == 1, "expected one not serving log line")
70+
test.Assert(t, len(serving) == 4, "expected four serving log lines")
71+
test.Assert(t, len(notServing) == 2, "expected two not serving log lines")
7272
}

0 commit comments

Comments
 (0)