From 82de062203ddb875d1d057514ef6715cf3f692fb Mon Sep 17 00:00:00 2001 From: Arbin Date: Wed, 26 Nov 2025 12:31:31 +0800 Subject: [PATCH 01/37] aws_msk_iam: optimize MSK IAM authentication and credential management Signed-off-by: Arbin --- src/aws/flb_aws_msk_iam.c | 342 ++++++++++++++++++++++++++++++-------- 1 file changed, 274 insertions(+), 68 deletions(-) diff --git a/src/aws/flb_aws_msk_iam.c b/src/aws/flb_aws_msk_iam.c index cf8af7d0cc8..42be1466c9c 100644 --- a/src/aws/flb_aws_msk_iam.c +++ b/src/aws/flb_aws_msk_iam.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -37,11 +38,18 @@ #include #include -/* Lightweight config - NO persistent AWS provider */ +/* Lightweight config with credential caching to prevent principal changes */ struct flb_aws_msk_iam { - struct flb_config *flb_config; /* For creating AWS provider on-demand */ + struct flb_config *flb_config; flb_sds_t region; flb_sds_t cluster_arn; + struct flb_tls *cred_tls; /* TLS instance for AWS credentials (STS) */ + struct flb_aws_provider *provider; /* AWS credentials provider (created once, reused) */ + + /* Credential caching to maintain consistent principal during re-authentication */ + struct flb_aws_credentials *cached_creds; /* Cached AWS credentials */ + time_t creds_expiration; /* Credential expiration time */ + pthread_mutex_t creds_lock; /* Thread-safe access to cached credentials */ }; /* Utility functions - same as before */ @@ -162,12 +170,153 @@ static char *extract_region(const char *arn) return out; } -/* Stateless payload generator - creates AWS provider on demand */ -static flb_sds_t build_msk_iam_payload(struct flb_aws_msk_iam *config, - const char *host) +/* + * Duplicate AWS credentials structure + * Returns NULL on failure + */ +static struct flb_aws_credentials* duplicate_credentials(struct flb_aws_credentials *src) { - struct flb_aws_provider *temp_provider = NULL; + struct flb_aws_credentials *dst; + + if (!src) { + return NULL; + } + + dst = flb_calloc(1, sizeof(struct flb_aws_credentials)); + if (!dst) { + return NULL; + } + + if (src->access_key_id) { + dst->access_key_id = flb_sds_create(src->access_key_id); + if (!dst->access_key_id) { + flb_free(dst); + return NULL; + } + } + + if (src->secret_access_key) { + dst->secret_access_key = flb_sds_create(src->secret_access_key); + if (!dst->secret_access_key) { + if (dst->access_key_id) { + flb_sds_destroy(dst->access_key_id); + } + flb_free(dst); + return NULL; + } + } + + if (src->session_token) { + dst->session_token = flb_sds_create(src->session_token); + if (!dst->session_token) { + if (dst->access_key_id) { + flb_sds_destroy(dst->access_key_id); + } + if (dst->secret_access_key) { + flb_sds_destroy(dst->secret_access_key); + } + flb_free(dst); + return NULL; + } + } + + return dst; +} + +/* + * Get cached credentials or refresh if expired + * This function ensures the same AWS temporary credentials (with the same session ID) + * are reused across multiple token refreshes, preventing "principal change" errors. + * + * Returns a COPY of credentials that the caller must destroy. + * Returns NULL on failure. 
+ */ +static struct flb_aws_credentials* get_cached_or_refresh_credentials( + struct flb_aws_msk_iam *config, time_t *expiration) +{ + time_t now; struct flb_aws_credentials *creds = NULL; + struct flb_aws_credentials *creds_copy = NULL; + int needs_refresh = FLB_FALSE; + + now = time(NULL); + + pthread_mutex_lock(&config->creds_lock); + + /* Check if cached credentials are still valid */ + if (config->cached_creds && + config->creds_expiration > now + FLB_AWS_REFRESH_WINDOW) { + /* Credentials are still valid, return a copy */ + creds_copy = duplicate_credentials(config->cached_creds); + if (expiration) { + *expiration = config->creds_expiration; + } + pthread_mutex_unlock(&config->creds_lock); + + if (creds_copy) { + flb_info("[aws_msk_iam] reusing cached AWS credentials (valid until %ld, %ld seconds remaining)", + config->creds_expiration, config->creds_expiration - now); + } + return creds_copy; + } + + needs_refresh = FLB_TRUE; + pthread_mutex_unlock(&config->creds_lock); + + /* Credentials expired or don't exist, need to refresh */ + if (needs_refresh) { + flb_info("[aws_msk_iam] AWS credentials expired or not cached, fetching new credentials"); + + /* Get new credentials using the long-lived provider */ + creds = config->provider->provider_vtable->get_credentials(config->provider); + if (!creds) { + flb_error("[aws_msk_iam] failed to get AWS credentials from provider"); + return NULL; + } + + /* Update cache with new credentials */ + pthread_mutex_lock(&config->creds_lock); + + if (config->cached_creds) { + flb_aws_credentials_destroy(config->cached_creds); + config->cached_creds = NULL; + } + + config->cached_creds = duplicate_credentials(creds); + if (!config->cached_creds) { + pthread_mutex_unlock(&config->creds_lock); + flb_error("[aws_msk_iam] failed to cache credentials"); + flb_aws_credentials_destroy(creds); + return NULL; + } + + /* + * Set expiration time. AWS temporary credentials typically last 1 hour. + * We use a conservative estimate if we can't determine the exact expiration. 
+ */ + config->creds_expiration = now + 3600; /* Default: 1 hour */ + + if (expiration) { + *expiration = config->creds_expiration; + } + + pthread_mutex_unlock(&config->creds_lock); + + flb_info("[aws_msk_iam] successfully cached new AWS credentials (valid until %ld, %ld seconds remaining)", + config->creds_expiration, config->creds_expiration - now); + + /* Return the credentials (caller owns them) */ + return creds; + } + + return NULL; +} + +/* Payload generator using cached credentials to maintain consistent principal */ +static flb_sds_t build_msk_iam_payload_with_creds(struct flb_aws_msk_iam *config, + const char *host, + struct flb_aws_credentials *creds) +{ flb_sds_t payload = NULL; int encode_result; char *p; @@ -214,37 +363,17 @@ static flb_sds_t build_msk_iam_payload(struct flb_aws_msk_iam *config, return NULL; } - flb_info("[aws_msk_iam] build_msk_iam_payload: generating payload for host: %s, region: %s", + flb_info("[aws_msk_iam] build_msk_iam_payload_with_creds: generating payload for host: %s, region: %s", host, config->region); - /* Create AWS provider on-demand */ - temp_provider = flb_standard_chain_provider_create(config->flb_config, NULL, - config->region, NULL, NULL, - flb_aws_client_generator(), - NULL); - if (!temp_provider) { - flb_error("[aws_msk_iam] build_msk_iam_payload: failed to create AWS credentials provider"); - return NULL; - } - - if (temp_provider->provider_vtable->init(temp_provider) != 0) { - flb_error("[aws_msk_iam] build_msk_iam_payload: failed to initialize AWS credentials provider"); - flb_aws_provider_destroy(temp_provider); - return NULL; - } - - /* Get credentials */ - creds = temp_provider->provider_vtable->get_credentials(temp_provider); + /* Validate credentials */ if (!creds) { - flb_error("[aws_msk_iam] build_msk_iam_payload: failed to get credentials"); - flb_aws_provider_destroy(temp_provider); + flb_error("[aws_msk_iam] build_msk_iam_payload_with_creds: credentials are NULL"); return NULL; } if (!creds->access_key_id || !creds->secret_access_key) { - flb_error("[aws_msk_iam] build_msk_iam_payload: incomplete credentials"); - flb_aws_credentials_destroy(creds); - flb_aws_provider_destroy(temp_provider); + flb_error("[aws_msk_iam] build_msk_iam_payload_with_creds: incomplete credentials"); return NULL; } @@ -547,12 +676,6 @@ static flb_sds_t build_msk_iam_payload(struct flb_aws_msk_iam *config, if (session_token_enc) { flb_sds_destroy(session_token_enc); } - if (creds) { - flb_aws_credentials_destroy(creds); - } - if (temp_provider) { - flb_aws_provider_destroy(temp_provider); - } return payload; @@ -594,18 +717,12 @@ static flb_sds_t build_msk_iam_payload(struct flb_aws_msk_iam *config, if (session_token_enc) { flb_sds_destroy(session_token_enc); } - if (creds) { - flb_aws_credentials_destroy(creds); - } - if (temp_provider) { - flb_aws_provider_destroy(temp_provider); - } return NULL; } -/* Stateless callback - creates AWS provider on-demand for each refresh */ +/* OAuth token refresh callback with credential caching */ static void oauthbearer_token_refresh_cb(rd_kafka_t *rk, const char *oauthbearer_config, void *opaque) @@ -622,7 +739,6 @@ static void oauthbearer_token_refresh_cb(rd_kafka_t *rk, struct flb_aws_msk_iam *config; struct flb_aws_credentials *creds = NULL; struct flb_kafka_opaque *kafka_opaque; - struct flb_aws_provider *temp_provider = NULL; (void) oauthbearer_config; kafka_opaque = (struct flb_kafka_opaque *) opaque; @@ -644,45 +760,50 @@ static void oauthbearer_token_refresh_cb(rd_kafka_t *rk, return; } - /* Determine 
host endpoint */ + /* + * Use MSK generic endpoint for IAM authentication. + * AWS MSK IAM supports both cluster-specific and generic regional endpoints. + * Generic endpoints are recommended as they work across all brokers in the region. + */ if (config->cluster_arn) { arn_len = strlen(config->cluster_arn); suffix_len = strlen(s3_suffix); if (arn_len >= suffix_len && strcmp(config->cluster_arn + arn_len - suffix_len, s3_suffix) == 0) { snprintf(host, sizeof(host), "kafka-serverless.%s.amazonaws.com", config->region); - flb_info("[aws_msk_iam] MSK Serverless cluster, using generic endpoint: %s", host); + flb_debug("[aws_msk_iam] using MSK Serverless generic endpoint: %s", host); } else { snprintf(host, sizeof(host), "kafka.%s.amazonaws.com", config->region); - flb_info("[aws_msk_iam] Regular MSK cluster, using generic endpoint: %s", host); + flb_debug("[aws_msk_iam] using MSK generic endpoint: %s", host); } } else { snprintf(host, sizeof(host), "kafka.%s.amazonaws.com", config->region); - flb_info("[aws_msk_iam] Regular MSK cluster, using generic endpoint: %s", host); + flb_debug("[aws_msk_iam] using MSK generic endpoint: %s", host); } flb_info("[aws_msk_iam] requesting MSK IAM payload for region: %s, host: %s", config->region, host); - /* Generate payload using stateless function - creates and destroys AWS provider internally */ - payload = build_msk_iam_payload(config, host); + /* + * CRITICAL FIX: Use cached credentials to maintain consistent principal + * This prevents "Cannot change principals during re-authentication" errors + */ + creds = get_cached_or_refresh_credentials(config, NULL); + if (!creds) { + flb_error("[aws_msk_iam] failed to get AWS credentials (cached or refreshed)"); + rd_kafka_oauthbearer_set_token_failure(rk, "credential retrieval failed"); + return; + } + + /* Generate payload using cached credentials */ + payload = build_msk_iam_payload_with_creds(config, host, creds); if (!payload) { flb_error("[aws_msk_iam] failed to generate MSK IAM payload"); + flb_aws_credentials_destroy(creds); rd_kafka_oauthbearer_set_token_failure(rk, "payload generation failed"); return; } - /* Get credentials for principal (create temporary provider just for this) */ - temp_provider = flb_standard_chain_provider_create(config->flb_config, NULL, - config->region, NULL, NULL, - flb_aws_client_generator(), - NULL); - if (temp_provider) { - if (temp_provider->provider_vtable->init(temp_provider) == 0) { - creds = temp_provider->provider_vtable->get_credentials(temp_provider); - } - } - now = time(NULL); md_lifetime_ms = (now + 900) * 1000; @@ -703,14 +824,10 @@ static void oauthbearer_token_refresh_cb(rd_kafka_t *rk, flb_info("[aws_msk_iam] OAuth bearer token successfully set"); } - /* Clean up everything immediately - no memory leaks possible! 
*/ + /* Clean up - credentials and payload */ if (creds) { flb_aws_credentials_destroy(creds); } - if (temp_provider) { - flb_aws_provider_destroy(temp_provider); - } - if (payload) { flb_sds_destroy(payload); } @@ -771,6 +888,74 @@ struct flb_aws_msk_iam *flb_aws_msk_iam_register_oauth_cb(struct flb_config *con flb_info("[aws_msk_iam] extracted region: %s", ctx->region); + /* Create TLS instance for AWS credentials (STS) - CRITICAL FIX */ + ctx->cred_tls = flb_tls_create(FLB_TLS_CLIENT_MODE, + FLB_TRUE, + FLB_LOG_DEBUG, + NULL, /* vhost */ + NULL, /* ca_path */ + NULL, /* ca_file */ + NULL, /* crt_file */ + NULL, /* key_file */ + NULL); /* key_passwd */ + if (!ctx->cred_tls) { + flb_error("[aws_msk_iam] failed to create TLS instance for AWS credentials"); + flb_sds_destroy(ctx->region); + flb_sds_destroy(ctx->cluster_arn); + flb_free(ctx); + return NULL; + } + + flb_info("[aws_msk_iam] TLS instance created for AWS credentials"); + + /* Initialize credential caching fields */ + ctx->cached_creds = NULL; + ctx->creds_expiration = 0; + + /* Initialize mutex for thread-safe credential access */ + if (pthread_mutex_init(&ctx->creds_lock, NULL) != 0) { + flb_error("[aws_msk_iam] failed to initialize credentials mutex"); + flb_tls_destroy(ctx->cred_tls); + flb_sds_destroy(ctx->region); + flb_sds_destroy(ctx->cluster_arn); + flb_free(ctx); + return NULL; + } + + flb_info("[aws_msk_iam] Credential cache initialized with mutex protection"); + + /* Create AWS provider once - will be reused for credential refresh */ + ctx->provider = flb_standard_chain_provider_create(config, + ctx->cred_tls, + ctx->region, + NULL, /* sts_endpoint */ + NULL, /* proxy */ + flb_aws_client_generator(), + NULL); /* profile */ + if (!ctx->provider) { + flb_error("[aws_msk_iam] failed to create AWS credentials provider"); + pthread_mutex_destroy(&ctx->creds_lock); + flb_tls_destroy(ctx->cred_tls); + flb_sds_destroy(ctx->region); + flb_sds_destroy(ctx->cluster_arn); + flb_free(ctx); + return NULL; + } + + /* Initialize provider */ + if (ctx->provider->provider_vtable->init(ctx->provider) != 0) { + flb_error("[aws_msk_iam] failed to initialize AWS credentials provider"); + flb_aws_provider_destroy(ctx->provider); + pthread_mutex_destroy(&ctx->creds_lock); + flb_tls_destroy(ctx->cred_tls); + flb_sds_destroy(ctx->region); + flb_sds_destroy(ctx->cluster_arn); + flb_free(ctx); + return NULL; + } + + flb_info("[aws_msk_iam] AWS credentials provider created and initialized successfully"); + /* Set the callback and opaque */ rd_kafka_conf_set_oauthbearer_token_refresh_cb(kconf, oauthbearer_token_refresh_cb); flb_kafka_opaque_set(opaque, NULL, ctx); @@ -781,7 +966,7 @@ struct flb_aws_msk_iam *flb_aws_msk_iam_register_oauth_cb(struct flb_config *con return ctx; } -/* Simple destroy - just config cleanup, no AWS provider to leak! */ +/* Destroy MSK IAM config - includes cached credentials cleanup */ void flb_aws_msk_iam_destroy(struct flb_aws_msk_iam *ctx) { if (!ctx) { @@ -790,7 +975,26 @@ void flb_aws_msk_iam_destroy(struct flb_aws_msk_iam *ctx) flb_info("[aws_msk_iam] destroying MSK IAM config"); - /* NO AWS provider to destroy! 
*/ + /* Clean up cached credentials */ + if (ctx->cached_creds) { + flb_aws_credentials_destroy(ctx->cached_creds); + ctx->cached_creds = NULL; + } + + /* Destroy AWS provider */ + if (ctx->provider) { + flb_aws_provider_destroy(ctx->provider); + } + + /* Destroy mutex */ + pthread_mutex_destroy(&ctx->creds_lock); + + /* Clean up TLS instance */ + if (ctx->cred_tls) { + flb_tls_destroy(ctx->cred_tls); + } + + /* Clean up other resources */ if (ctx->region) { flb_sds_destroy(ctx->region); } @@ -798,4 +1002,6 @@ void flb_aws_msk_iam_destroy(struct flb_aws_msk_iam *ctx) flb_sds_destroy(ctx->cluster_arn); } flb_free(ctx); + + flb_info("[aws_msk_iam] MSK IAM config destroyed, cached credentials and provider cleared"); } From 219fd905ec2d9ac593ad29baa46fd984adb55a6e Mon Sep 17 00:00:00 2001 From: Arbin Date: Wed, 26 Nov 2025 06:28:16 +0000 Subject: [PATCH 02/37] aws_msk_iam: optimize MSK IAM authentication and credential management Signed-off-by: Arbin --- src/aws/flb_aws_msk_iam.c | 198 +++----------------------------------- 1 file changed, 12 insertions(+), 186 deletions(-) diff --git a/src/aws/flb_aws_msk_iam.c b/src/aws/flb_aws_msk_iam.c index 42be1466c9c..fb2bfec0931 100644 --- a/src/aws/flb_aws_msk_iam.c +++ b/src/aws/flb_aws_msk_iam.c @@ -38,18 +38,13 @@ #include #include -/* Lightweight config with credential caching to prevent principal changes */ +/* Lightweight config - provider manages credential caching and refresh internally */ struct flb_aws_msk_iam { struct flb_config *flb_config; flb_sds_t region; flb_sds_t cluster_arn; struct flb_tls *cred_tls; /* TLS instance for AWS credentials (STS) */ struct flb_aws_provider *provider; /* AWS credentials provider (created once, reused) */ - - /* Credential caching to maintain consistent principal during re-authentication */ - struct flb_aws_credentials *cached_creds; /* Cached AWS credentials */ - time_t creds_expiration; /* Credential expiration time */ - pthread_mutex_t creds_lock; /* Thread-safe access to cached credentials */ }; /* Utility functions - same as before */ @@ -170,150 +165,8 @@ static char *extract_region(const char *arn) return out; } -/* - * Duplicate AWS credentials structure - * Returns NULL on failure - */ -static struct flb_aws_credentials* duplicate_credentials(struct flb_aws_credentials *src) -{ - struct flb_aws_credentials *dst; - - if (!src) { - return NULL; - } - - dst = flb_calloc(1, sizeof(struct flb_aws_credentials)); - if (!dst) { - return NULL; - } - - if (src->access_key_id) { - dst->access_key_id = flb_sds_create(src->access_key_id); - if (!dst->access_key_id) { - flb_free(dst); - return NULL; - } - } - - if (src->secret_access_key) { - dst->secret_access_key = flb_sds_create(src->secret_access_key); - if (!dst->secret_access_key) { - if (dst->access_key_id) { - flb_sds_destroy(dst->access_key_id); - } - flb_free(dst); - return NULL; - } - } - - if (src->session_token) { - dst->session_token = flb_sds_create(src->session_token); - if (!dst->session_token) { - if (dst->access_key_id) { - flb_sds_destroy(dst->access_key_id); - } - if (dst->secret_access_key) { - flb_sds_destroy(dst->secret_access_key); - } - flb_free(dst); - return NULL; - } - } - - return dst; -} - -/* - * Get cached credentials or refresh if expired - * This function ensures the same AWS temporary credentials (with the same session ID) - * are reused across multiple token refreshes, preventing "principal change" errors. - * - * Returns a COPY of credentials that the caller must destroy. - * Returns NULL on failure. 
- */ -static struct flb_aws_credentials* get_cached_or_refresh_credentials( - struct flb_aws_msk_iam *config, time_t *expiration) -{ - time_t now; - struct flb_aws_credentials *creds = NULL; - struct flb_aws_credentials *creds_copy = NULL; - int needs_refresh = FLB_FALSE; - - now = time(NULL); - - pthread_mutex_lock(&config->creds_lock); - - /* Check if cached credentials are still valid */ - if (config->cached_creds && - config->creds_expiration > now + FLB_AWS_REFRESH_WINDOW) { - /* Credentials are still valid, return a copy */ - creds_copy = duplicate_credentials(config->cached_creds); - if (expiration) { - *expiration = config->creds_expiration; - } - pthread_mutex_unlock(&config->creds_lock); - - if (creds_copy) { - flb_info("[aws_msk_iam] reusing cached AWS credentials (valid until %ld, %ld seconds remaining)", - config->creds_expiration, config->creds_expiration - now); - } - return creds_copy; - } - - needs_refresh = FLB_TRUE; - pthread_mutex_unlock(&config->creds_lock); - - /* Credentials expired or don't exist, need to refresh */ - if (needs_refresh) { - flb_info("[aws_msk_iam] AWS credentials expired or not cached, fetching new credentials"); - - /* Get new credentials using the long-lived provider */ - creds = config->provider->provider_vtable->get_credentials(config->provider); - if (!creds) { - flb_error("[aws_msk_iam] failed to get AWS credentials from provider"); - return NULL; - } - - /* Update cache with new credentials */ - pthread_mutex_lock(&config->creds_lock); - - if (config->cached_creds) { - flb_aws_credentials_destroy(config->cached_creds); - config->cached_creds = NULL; - } - - config->cached_creds = duplicate_credentials(creds); - if (!config->cached_creds) { - pthread_mutex_unlock(&config->creds_lock); - flb_error("[aws_msk_iam] failed to cache credentials"); - flb_aws_credentials_destroy(creds); - return NULL; - } - - /* - * Set expiration time. AWS temporary credentials typically last 1 hour. - * We use a conservative estimate if we can't determine the exact expiration. - */ - config->creds_expiration = now + 3600; /* Default: 1 hour */ - - if (expiration) { - *expiration = config->creds_expiration; - } - - pthread_mutex_unlock(&config->creds_lock); - - flb_info("[aws_msk_iam] successfully cached new AWS credentials (valid until %ld, %ld seconds remaining)", - config->creds_expiration, config->creds_expiration - now); - - /* Return the credentials (caller owns them) */ - return creds; - } - - return NULL; -} - -/* Payload generator using cached credentials to maintain consistent principal */ -static flb_sds_t build_msk_iam_payload_with_creds(struct flb_aws_msk_iam *config, +/* Payload generator - builds MSK IAM authentication payload */ +static flb_sds_t build_msk_iam_payload(struct flb_aws_msk_iam *config, const char *host, struct flb_aws_credentials *creds) { @@ -785,18 +638,18 @@ static void oauthbearer_token_refresh_cb(rd_kafka_t *rk, flb_info("[aws_msk_iam] requesting MSK IAM payload for region: %s, host: %s", config->region, host); /* - * CRITICAL FIX: Use cached credentials to maintain consistent principal - * This prevents "Cannot change principals during re-authentication" errors + * Get credentials from provider. The provider handles caching and expiration internally. + * The provider automatically manages credential refresh when needed. 
*/ - creds = get_cached_or_refresh_credentials(config, NULL); + creds = config->provider->provider_vtable->get_credentials(config->provider); if (!creds) { - flb_error("[aws_msk_iam] failed to get AWS credentials (cached or refreshed)"); + flb_error("[aws_msk_iam] failed to get AWS credentials from provider"); rd_kafka_oauthbearer_set_token_failure(rk, "credential retrieval failed"); return; } - /* Generate payload using cached credentials */ - payload = build_msk_iam_payload_with_creds(config, host, creds); + /* Generate payload using credentials from provider */ + payload = build_msk_iam_payload(config, host, creds); if (!payload) { flb_error("[aws_msk_iam] failed to generate MSK IAM payload"); flb_aws_credentials_destroy(creds); @@ -908,22 +761,6 @@ struct flb_aws_msk_iam *flb_aws_msk_iam_register_oauth_cb(struct flb_config *con flb_info("[aws_msk_iam] TLS instance created for AWS credentials"); - /* Initialize credential caching fields */ - ctx->cached_creds = NULL; - ctx->creds_expiration = 0; - - /* Initialize mutex for thread-safe credential access */ - if (pthread_mutex_init(&ctx->creds_lock, NULL) != 0) { - flb_error("[aws_msk_iam] failed to initialize credentials mutex"); - flb_tls_destroy(ctx->cred_tls); - flb_sds_destroy(ctx->region); - flb_sds_destroy(ctx->cluster_arn); - flb_free(ctx); - return NULL; - } - - flb_info("[aws_msk_iam] Credential cache initialized with mutex protection"); - /* Create AWS provider once - will be reused for credential refresh */ ctx->provider = flb_standard_chain_provider_create(config, ctx->cred_tls, @@ -934,7 +771,6 @@ struct flb_aws_msk_iam *flb_aws_msk_iam_register_oauth_cb(struct flb_config *con NULL); /* profile */ if (!ctx->provider) { flb_error("[aws_msk_iam] failed to create AWS credentials provider"); - pthread_mutex_destroy(&ctx->creds_lock); flb_tls_destroy(ctx->cred_tls); flb_sds_destroy(ctx->region); flb_sds_destroy(ctx->cluster_arn); @@ -946,7 +782,6 @@ struct flb_aws_msk_iam *flb_aws_msk_iam_register_oauth_cb(struct flb_config *con if (ctx->provider->provider_vtable->init(ctx->provider) != 0) { flb_error("[aws_msk_iam] failed to initialize AWS credentials provider"); flb_aws_provider_destroy(ctx->provider); - pthread_mutex_destroy(&ctx->creds_lock); flb_tls_destroy(ctx->cred_tls); flb_sds_destroy(ctx->region); flb_sds_destroy(ctx->cluster_arn); @@ -966,7 +801,7 @@ struct flb_aws_msk_iam *flb_aws_msk_iam_register_oauth_cb(struct flb_config *con return ctx; } -/* Destroy MSK IAM config - includes cached credentials cleanup */ +/* Destroy MSK IAM config */ void flb_aws_msk_iam_destroy(struct flb_aws_msk_iam *ctx) { if (!ctx) { @@ -975,20 +810,11 @@ void flb_aws_msk_iam_destroy(struct flb_aws_msk_iam *ctx) flb_info("[aws_msk_iam] destroying MSK IAM config"); - /* Clean up cached credentials */ - if (ctx->cached_creds) { - flb_aws_credentials_destroy(ctx->cached_creds); - ctx->cached_creds = NULL; - } - - /* Destroy AWS provider */ + /* Destroy AWS provider (provider manages its own credential caching) */ if (ctx->provider) { flb_aws_provider_destroy(ctx->provider); } - /* Destroy mutex */ - pthread_mutex_destroy(&ctx->creds_lock); - /* Clean up TLS instance */ if (ctx->cred_tls) { flb_tls_destroy(ctx->cred_tls); @@ -1003,5 +829,5 @@ void flb_aws_msk_iam_destroy(struct flb_aws_msk_iam *ctx) } flb_free(ctx); - flb_info("[aws_msk_iam] MSK IAM config destroyed, cached credentials and provider cleared"); + flb_info("[aws_msk_iam] MSK IAM config destroyed"); } From e75e4e6e71bf7dc2597452efe738f349f50a55c3 Mon Sep 17 00:00:00 2001 From: Arbin 
Date: Wed, 26 Nov 2025 16:50:55 +0800 Subject: [PATCH 03/37] aws_msk_iam: optimize MSK IAM authentication and credential management Signed-off-by: Arbin --- src/aws/flb_aws_msk_iam.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/aws/flb_aws_msk_iam.c b/src/aws/flb_aws_msk_iam.c index fb2bfec0931..f8db0c2ecee 100644 --- a/src/aws/flb_aws_msk_iam.c +++ b/src/aws/flb_aws_msk_iam.c @@ -663,12 +663,16 @@ static void oauthbearer_token_refresh_cb(rd_kafka_t *rk, err = rd_kafka_oauthbearer_set_token(rk, payload, md_lifetime_ms, - creds ? creds->access_key_id : "unknown", + creds->access_key_id, NULL, 0, errstr, sizeof(errstr)); + /* Destroy credentials immediately after use (standard pattern) */ + flb_aws_credentials_destroy(creds); + creds = NULL; + if (err != RD_KAFKA_RESP_ERR_NO_ERROR) { flb_error("[aws_msk_iam] failed to set OAuth bearer token: %s", errstr); rd_kafka_oauthbearer_set_token_failure(rk, errstr); @@ -677,10 +681,7 @@ static void oauthbearer_token_refresh_cb(rd_kafka_t *rk, flb_info("[aws_msk_iam] OAuth bearer token successfully set"); } - /* Clean up - credentials and payload */ - if (creds) { - flb_aws_credentials_destroy(creds); - } + /* Clean up - payload only (creds already destroyed) */ if (payload) { flb_sds_destroy(payload); } From c9d51a0212fe107308ce4a4447d4dcc616dfd4fd Mon Sep 17 00:00:00 2001 From: Arbin Date: Wed, 26 Nov 2025 09:31:09 +0000 Subject: [PATCH 04/37] aws_msk_iam: initialize AWS provider in sync mode for MSK IAM - Switch provider to sync mode before initialization to prevent hanging - Initialize provider with sync mode (required before event loop is available) - Switch back to async mode after successful initialization - Follows pattern used by other AWS credential providers This fixes potential credential initialization failures in IRSA/EKS deployments where HTTP requests during init would hang without the event loop. 
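For reference, a minimal sketch of the sync/init/async sequence this commit introduces (vtable entry points taken from the flb_aws_provider usage in the hunk below; surrounding error cleanup abbreviated):

    /* Force a synchronous first credential fetch: at registration time the
     * event loop is not running yet, so an async HTTP request would hang. */
    ctx->provider->provider_vtable->sync(ctx->provider);

    if (ctx->provider->provider_vtable->init(ctx->provider) != 0) {
        flb_error("[aws_msk_iam] failed to initialize AWS credentials provider");
        /* caller destroys the provider, TLS instance and context here */
        return NULL;
    }

    /* Switch back to async mode for normal operation on the event loop */
    ctx->provider->provider_vtable->async(ctx->provider);
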
Signed-off-by: Arbin --- src/aws/flb_aws_msk_iam.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/aws/flb_aws_msk_iam.c b/src/aws/flb_aws_msk_iam.c index f8db0c2ecee..eb4d1cc0b0a 100644 --- a/src/aws/flb_aws_msk_iam.c +++ b/src/aws/flb_aws_msk_iam.c @@ -779,7 +779,8 @@ struct flb_aws_msk_iam *flb_aws_msk_iam_register_oauth_cb(struct flb_config *con return NULL; } - /* Initialize provider */ + /* Initialize provider in sync mode (required before event loop is available) */ + ctx->provider->provider_vtable->sync(ctx->provider); if (ctx->provider->provider_vtable->init(ctx->provider) != 0) { flb_error("[aws_msk_iam] failed to initialize AWS credentials provider"); flb_aws_provider_destroy(ctx->provider); @@ -789,6 +790,8 @@ struct flb_aws_msk_iam *flb_aws_msk_iam_register_oauth_cb(struct flb_config *con flb_free(ctx); return NULL; } + /* Switch back to async mode */ + ctx->provider->provider_vtable->async(ctx->provider); flb_info("[aws_msk_iam] AWS credentials provider created and initialized successfully"); From 26551daf9ad9d93e1f23b0262c568ed99f8ac7dc Mon Sep 17 00:00:00 2001 From: Arbin Date: Thu, 27 Nov 2025 12:20:03 +0800 Subject: [PATCH 05/37] aws_msk_iam: force credential refresh in provider refresh functions - Add force refresh logic to EC2, STS, and EKS credential providers - Set next_refresh to 0 in refresh functions to ensure immediate credential update - Fixes MSK IAM authentication failures after ~1 hour due to stale credentials - Aligns with AWS SDK behavior where refresh() means force refresh This resolves the issue where OAuth token refresh (every ~15 minutes) would not actually refresh AWS credentials until next_refresh time was reached (typically 1 hour later), causing MSK connection failures with 'Access denied' errors. The fix ensures that every OAuth callback will fetch fresh credentials from AWS, matching the behavior of official AWS SDKs (Python, Java). 
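A sketch of the force-refresh pattern applied to each provider (EC2 shown; the struct type for 'implementation' is assumed from the surrounding file, and the exact per-provider hunks follow below):

    int refresh_fn_ec2(struct flb_aws_provider *provider) {
        int ret = -1;
        struct flb_aws_provider_ec2 *implementation = provider->implementation;

        flb_debug("[aws_credentials] Refresh called on the EC2 IMDS provider");

        /* Force refresh: mark cached credentials as expired so the fetch
         * below always goes to IMDS instead of returning the cache */
        implementation->next_refresh = 0;

        if (try_lock_provider(provider)) {
            ret = get_creds_ec2(implementation);
            unlock_provider(provider);
        }
        return ret;
    }
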
Signed-off-by: Arbin --- src/aws/flb_aws_credentials_ec2.c | 4 ++++ src/aws/flb_aws_credentials_profile.c | 3 +-- src/aws/flb_aws_credentials_sts.c | 7 +++++++ src/aws/flb_aws_msk_iam.c | 21 ++++++++++++++------- 4 files changed, 26 insertions(+), 9 deletions(-) diff --git a/src/aws/flb_aws_credentials_ec2.c b/src/aws/flb_aws_credentials_ec2.c index 2722e26d223..e56dc467fbd 100644 --- a/src/aws/flb_aws_credentials_ec2.c +++ b/src/aws/flb_aws_credentials_ec2.c @@ -130,6 +130,10 @@ int refresh_fn_ec2(struct flb_aws_provider *provider) { int ret = -1; flb_debug("[aws_credentials] Refresh called on the EC2 IMDS provider"); + + /* Force credential refresh by marking as expired */ + implementation->next_refresh = 0; + if (try_lock_provider(provider)) { ret = get_creds_ec2(implementation); unlock_provider(provider); diff --git a/src/aws/flb_aws_credentials_profile.c b/src/aws/flb_aws_credentials_profile.c index 48cb9299572..7ad7099ff45 100644 --- a/src/aws/flb_aws_credentials_profile.c +++ b/src/aws/flb_aws_credentials_profile.c @@ -663,8 +663,7 @@ static int get_shared_credentials(char* credentials_path, if (flb_read_file(credentials_path, &buf, &size) < 0) { if (errno == ENOENT) { - AWS_CREDS_ERROR_OR_DEBUG(debug_only, "Shared credentials file %s does not exist", - credentials_path); + AWS_CREDS_DEBUG("Shared credentials file %s does not exist", credentials_path); } else { flb_errno(); AWS_CREDS_ERROR_OR_DEBUG(debug_only, "Could not read shared credentials file %s", diff --git a/src/aws/flb_aws_credentials_sts.c b/src/aws/flb_aws_credentials_sts.c index 554fac20353..5fbac774cf7 100644 --- a/src/aws/flb_aws_credentials_sts.c +++ b/src/aws/flb_aws_credentials_sts.c @@ -175,6 +175,9 @@ int refresh_fn_sts(struct flb_aws_provider *provider) { struct flb_aws_provider_sts *implementation = provider->implementation; flb_debug("[aws_credentials] Refresh called on the STS provider"); + + /* Force credential refresh by marking as expired */ + implementation->next_refresh = 0; if (try_lock_provider(provider)) { ret = sts_assume_role_request(implementation->sts_client, @@ -480,6 +483,10 @@ int refresh_fn_eks(struct flb_aws_provider *provider) { struct flb_aws_provider_eks *implementation = provider->implementation; flb_debug("[aws_credentials] Refresh called on the EKS provider"); + + /* Force credential refresh by marking as expired */ + implementation->next_refresh = 0; + if (try_lock_provider(provider)) { ret = assume_with_web_identity(implementation); unlock_provider(provider); diff --git a/src/aws/flb_aws_msk_iam.c b/src/aws/flb_aws_msk_iam.c index eb4d1cc0b0a..cc65093dacd 100644 --- a/src/aws/flb_aws_msk_iam.c +++ b/src/aws/flb_aws_msk_iam.c @@ -216,17 +216,17 @@ static flb_sds_t build_msk_iam_payload(struct flb_aws_msk_iam *config, return NULL; } - flb_info("[aws_msk_iam] build_msk_iam_payload_with_creds: generating payload for host: %s, region: %s", - host, config->region); + flb_debug("[aws_msk_iam] build_msk_iam_payload: generating payload for host: %s, region: %s", + host, config->region); /* Validate credentials */ if (!creds) { - flb_error("[aws_msk_iam] build_msk_iam_payload_with_creds: credentials are NULL"); + flb_error("[aws_msk_iam] build_msk_iam_payload: credentials are NULL"); return NULL; } if (!creds->access_key_id || !creds->secret_access_key) { - flb_error("[aws_msk_iam] build_msk_iam_payload_with_creds: incomplete credentials"); + flb_error("[aws_msk_iam] build_msk_iam_payload: incomplete credentials"); return NULL; } @@ -635,12 +635,19 @@ static void 
oauthbearer_token_refresh_cb(rd_kafka_t *rk, flb_debug("[aws_msk_iam] using MSK generic endpoint: %s", host); } - flb_info("[aws_msk_iam] requesting MSK IAM payload for region: %s, host: %s", config->region, host); + flb_debug("[aws_msk_iam] requesting MSK IAM payload for region: %s, host: %s", config->region, host); /* - * Get credentials from provider. The provider handles caching and expiration internally. - * The provider automatically manages credential refresh when needed. + * Refresh credentials before generating OAuth token. + * This is necessary because provider's passive refresh only triggers when + * get_credentials is called and detects expiration. However, OAuth tokens + * are refreshed every ~15 minutes while IAM credentials expire after ~1 hour. + * If OAuth callbacks are spaced far apart, the passive refresh may not trigger + * before credentials expire, causing authentication failures. */ + config->provider->provider_vtable->refresh(config->provider); + + /* Get credentials from provider */ creds = config->provider->provider_vtable->get_credentials(config->provider); if (!creds) { flb_error("[aws_msk_iam] failed to get AWS credentials from provider"); From bf5d9f2fdb5d92c95b0dcbb12135302ccb197986 Mon Sep 17 00:00:00 2001 From: Arbin Date: Thu, 27 Nov 2025 12:33:50 +0800 Subject: [PATCH 06/37] aws_msk_iam: Minor leak on empty_payload_hex when canonical request build fails Signed-off-by: Arbin --- src/aws/flb_aws_msk_iam.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/aws/flb_aws_msk_iam.c b/src/aws/flb_aws_msk_iam.c index cc65093dacd..7da72441ab5 100644 --- a/src/aws/flb_aws_msk_iam.c +++ b/src/aws/flb_aws_msk_iam.c @@ -570,6 +570,9 @@ static flb_sds_t build_msk_iam_payload(struct flb_aws_msk_iam *config, if (session_token_enc) { flb_sds_destroy(session_token_enc); } + if (empty_payload_hex) { + flb_sds_destroy(empty_payload_hex); + } return NULL; } From 86999b50bad80dccf6d0de79e7f1716918560863 Mon Sep 17 00:00:00 2001 From: Arbin Date: Thu, 27 Nov 2025 12:46:00 +0800 Subject: [PATCH 07/37] aws_msk_iam: optimize MSK IAM authentication and credential management Signed-off-by: Arbin --- src/aws/flb_aws_msk_iam.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/aws/flb_aws_msk_iam.c b/src/aws/flb_aws_msk_iam.c index 7da72441ab5..38910bb3515 100644 --- a/src/aws/flb_aws_msk_iam.c +++ b/src/aws/flb_aws_msk_iam.c @@ -648,7 +648,10 @@ static void oauthbearer_token_refresh_cb(rd_kafka_t *rk, * If OAuth callbacks are spaced far apart, the passive refresh may not trigger * before credentials expire, causing authentication failures. 
*/ - config->provider->provider_vtable->refresh(config->provider); + int rc = config->provider->provider_vtable->refresh(config->provider); + if (rc < 0) { + flb_warn("[aws_msk_iam] AWS provider refresh() failed (rc=%d), continuing to get_credentials()", rc); + } /* Get credentials from provider */ creds = config->provider->provider_vtable->get_credentials(config->provider); @@ -829,7 +832,7 @@ void flb_aws_msk_iam_destroy(struct flb_aws_msk_iam *ctx) flb_aws_provider_destroy(ctx->provider); } - /* Clean up TLS instance */ + /* Clean up TLS instance - caller owns TLS lifecycle with flb_standard_chain_provider_create */ if (ctx->cred_tls) { flb_tls_destroy(ctx->cred_tls); } From 5bc78d16ce24a84dbfb18dde9f8e7e9d51154f9d Mon Sep 17 00:00:00 2001 From: Arbin Date: Thu, 27 Nov 2025 13:51:29 +0800 Subject: [PATCH 08/37] aws_msk_iam: AWS MSK IAM authentication failures caused by stale credentials Signed-off-by: Arbin --- src/aws/flb_aws_credentials_ec2.c | 11 ++++++++--- src/aws/flb_aws_credentials_http.c | 8 ++++++++ src/aws/flb_aws_credentials_sts.c | 22 ++++++++++++++++------ 3 files changed, 32 insertions(+), 9 deletions(-) diff --git a/src/aws/flb_aws_credentials_ec2.c b/src/aws/flb_aws_credentials_ec2.c index e56dc467fbd..d4ca79befc4 100644 --- a/src/aws/flb_aws_credentials_ec2.c +++ b/src/aws/flb_aws_credentials_ec2.c @@ -131,10 +131,15 @@ int refresh_fn_ec2(struct flb_aws_provider *provider) { flb_debug("[aws_credentials] Refresh called on the EC2 IMDS provider"); - /* Force credential refresh by marking as expired */ - implementation->next_refresh = 0; - if (try_lock_provider(provider)) { + /* Force credential refresh by clearing cache and setting expired time */ + if (implementation->creds) { + flb_aws_credentials_destroy(implementation->creds); + implementation->creds = NULL; + } + /* Set to 1 (epoch start) to trigger immediate refresh via time check */ + implementation->next_refresh = 1; + ret = get_creds_ec2(implementation); unlock_provider(provider); } diff --git a/src/aws/flb_aws_credentials_http.c b/src/aws/flb_aws_credentials_http.c index 8ba78b788fd..b7da7f0d2d9 100644 --- a/src/aws/flb_aws_credentials_http.c +++ b/src/aws/flb_aws_credentials_http.c @@ -158,6 +158,14 @@ int refresh_fn_http(struct flb_aws_provider *provider) { flb_debug("[aws_credentials] Refresh called on the http provider"); if (try_lock_provider(provider)) { + /* Force credential refresh by clearing cache and setting expired time */ + if (implementation->creds) { + flb_aws_credentials_destroy(implementation->creds); + implementation->creds = NULL; + } + /* Set to 1 (epoch start) to trigger immediate refresh via time check */ + implementation->next_refresh = 1; + ret = http_credentials_request(implementation); unlock_provider(provider); } diff --git a/src/aws/flb_aws_credentials_sts.c b/src/aws/flb_aws_credentials_sts.c index 5fbac774cf7..ec130762cdc 100644 --- a/src/aws/flb_aws_credentials_sts.c +++ b/src/aws/flb_aws_credentials_sts.c @@ -176,10 +176,15 @@ int refresh_fn_sts(struct flb_aws_provider *provider) { flb_debug("[aws_credentials] Refresh called on the STS provider"); - /* Force credential refresh by marking as expired */ - implementation->next_refresh = 0; - if (try_lock_provider(provider)) { + /* Force credential refresh by clearing cache and setting expired time */ + if (implementation->creds) { + flb_aws_credentials_destroy(implementation->creds); + implementation->creds = NULL; + } + /* Set to 1 (epoch start) to trigger immediate refresh via time check */ + implementation->next_refresh = 1; + ret 
= sts_assume_role_request(implementation->sts_client, &implementation->creds, implementation->uri, &implementation->next_refresh); @@ -484,10 +489,15 @@ int refresh_fn_eks(struct flb_aws_provider *provider) { flb_debug("[aws_credentials] Refresh called on the EKS provider"); - /* Force credential refresh by marking as expired */ - implementation->next_refresh = 0; - if (try_lock_provider(provider)) { + /* Force credential refresh by clearing cache and setting expired time */ + if (implementation->creds) { + flb_aws_credentials_destroy(implementation->creds); + implementation->creds = NULL; + } + /* Set to 1 (epoch start) to trigger immediate refresh via time check */ + implementation->next_refresh = 1; + ret = assume_with_web_identity(implementation); unlock_provider(provider); } From 7b30c74b402345a9294337db25f7fe6c454f3d6f Mon Sep 17 00:00:00 2001 From: Arbin Date: Thu, 27 Nov 2025 14:11:19 +0800 Subject: [PATCH 09/37] aws_msk_iam: optimize MSK IAM authentication and credential management Signed-off-by: Arbin --- src/aws/flb_aws_credentials_ec2.c | 5 ----- src/aws/flb_aws_credentials_http.c | 5 ----- src/aws/flb_aws_credentials_sts.c | 10 ---------- 3 files changed, 20 deletions(-) diff --git a/src/aws/flb_aws_credentials_ec2.c b/src/aws/flb_aws_credentials_ec2.c index d4ca79befc4..1d3ad695b8c 100644 --- a/src/aws/flb_aws_credentials_ec2.c +++ b/src/aws/flb_aws_credentials_ec2.c @@ -132,11 +132,6 @@ int refresh_fn_ec2(struct flb_aws_provider *provider) { flb_debug("[aws_credentials] Refresh called on the EC2 IMDS provider"); if (try_lock_provider(provider)) { - /* Force credential refresh by clearing cache and setting expired time */ - if (implementation->creds) { - flb_aws_credentials_destroy(implementation->creds); - implementation->creds = NULL; - } /* Set to 1 (epoch start) to trigger immediate refresh via time check */ implementation->next_refresh = 1; diff --git a/src/aws/flb_aws_credentials_http.c b/src/aws/flb_aws_credentials_http.c index b7da7f0d2d9..a4ceeca2c74 100644 --- a/src/aws/flb_aws_credentials_http.c +++ b/src/aws/flb_aws_credentials_http.c @@ -158,11 +158,6 @@ int refresh_fn_http(struct flb_aws_provider *provider) { flb_debug("[aws_credentials] Refresh called on the http provider"); if (try_lock_provider(provider)) { - /* Force credential refresh by clearing cache and setting expired time */ - if (implementation->creds) { - flb_aws_credentials_destroy(implementation->creds); - implementation->creds = NULL; - } /* Set to 1 (epoch start) to trigger immediate refresh via time check */ implementation->next_refresh = 1; diff --git a/src/aws/flb_aws_credentials_sts.c b/src/aws/flb_aws_credentials_sts.c index ec130762cdc..7546adfcc94 100644 --- a/src/aws/flb_aws_credentials_sts.c +++ b/src/aws/flb_aws_credentials_sts.c @@ -177,11 +177,6 @@ int refresh_fn_sts(struct flb_aws_provider *provider) { flb_debug("[aws_credentials] Refresh called on the STS provider"); if (try_lock_provider(provider)) { - /* Force credential refresh by clearing cache and setting expired time */ - if (implementation->creds) { - flb_aws_credentials_destroy(implementation->creds); - implementation->creds = NULL; - } /* Set to 1 (epoch start) to trigger immediate refresh via time check */ implementation->next_refresh = 1; @@ -490,11 +485,6 @@ int refresh_fn_eks(struct flb_aws_provider *provider) { flb_debug("[aws_credentials] Refresh called on the EKS provider"); if (try_lock_provider(provider)) { - /* Force credential refresh by clearing cache and setting expired time */ - if (implementation->creds) { - 
flb_aws_credentials_destroy(implementation->creds); - implementation->creds = NULL; - } /* Set to 1 (epoch start) to trigger immediate refresh via time check */ implementation->next_refresh = 1; From 59f143fa8c5129a3c269a684e0735e393ce8f6e6 Mon Sep 17 00:00:00 2001 From: Arbin Date: Fri, 28 Nov 2025 08:41:03 +0800 Subject: [PATCH 10/37] aws_msk_iam: fix auth failures on low traffic and missing TLS Signed-off-by: Arbin --- src/aws/flb_aws_credentials_ec2.c | 3 - src/aws/flb_aws_credentials_http.c | 3 - src/aws/flb_aws_credentials_sts.c | 6 - src/aws/flb_aws_msk_iam.c | 294 ++++++++--------------------- 4 files changed, 74 insertions(+), 232 deletions(-) diff --git a/src/aws/flb_aws_credentials_ec2.c b/src/aws/flb_aws_credentials_ec2.c index 1d3ad695b8c..9aa1444f1fb 100644 --- a/src/aws/flb_aws_credentials_ec2.c +++ b/src/aws/flb_aws_credentials_ec2.c @@ -132,9 +132,6 @@ int refresh_fn_ec2(struct flb_aws_provider *provider) { flb_debug("[aws_credentials] Refresh called on the EC2 IMDS provider"); if (try_lock_provider(provider)) { - /* Set to 1 (epoch start) to trigger immediate refresh via time check */ - implementation->next_refresh = 1; - ret = get_creds_ec2(implementation); unlock_provider(provider); } diff --git a/src/aws/flb_aws_credentials_http.c b/src/aws/flb_aws_credentials_http.c index a4ceeca2c74..8ba78b788fd 100644 --- a/src/aws/flb_aws_credentials_http.c +++ b/src/aws/flb_aws_credentials_http.c @@ -158,9 +158,6 @@ int refresh_fn_http(struct flb_aws_provider *provider) { flb_debug("[aws_credentials] Refresh called on the http provider"); if (try_lock_provider(provider)) { - /* Set to 1 (epoch start) to trigger immediate refresh via time check */ - implementation->next_refresh = 1; - ret = http_credentials_request(implementation); unlock_provider(provider); } diff --git a/src/aws/flb_aws_credentials_sts.c b/src/aws/flb_aws_credentials_sts.c index 7546adfcc94..155a41d3998 100644 --- a/src/aws/flb_aws_credentials_sts.c +++ b/src/aws/flb_aws_credentials_sts.c @@ -177,9 +177,6 @@ int refresh_fn_sts(struct flb_aws_provider *provider) { flb_debug("[aws_credentials] Refresh called on the STS provider"); if (try_lock_provider(provider)) { - /* Set to 1 (epoch start) to trigger immediate refresh via time check */ - implementation->next_refresh = 1; - ret = sts_assume_role_request(implementation->sts_client, &implementation->creds, implementation->uri, &implementation->next_refresh); @@ -485,9 +482,6 @@ int refresh_fn_eks(struct flb_aws_provider *provider) { flb_debug("[aws_credentials] Refresh called on the EKS provider"); if (try_lock_provider(provider)) { - /* Set to 1 (epoch start) to trigger immediate refresh via time check */ - implementation->next_refresh = 1; - ret = assume_with_web_identity(implementation); unlock_provider(provider); } diff --git a/src/aws/flb_aws_msk_iam.c b/src/aws/flb_aws_msk_iam.c index 38910bb3515..e6358d32175 100644 --- a/src/aws/flb_aws_msk_iam.c +++ b/src/aws/flb_aws_msk_iam.c @@ -38,16 +38,22 @@ #include #include -/* Lightweight config - provider manages credential caching and refresh internally */ +/* + * Fixed token lifetime of 3 minutes. + * This short lifetime ensures that idle Kafka connections (e.g., low-traffic inputs) + * will quickly detect token expiration when new data arrives and trigger a refresh callback, + * preventing "Access denied" errors from using expired tokens on idle connections. 
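+ *
+ * Rough timing example (assumed, illustrative numbers): tokens last 3 minutes
+ * while the underlying IAM session credentials typically last ~60 minutes.
+ * An idle connection that wakes 40 minutes later finds its token expired,
+ * librdkafka fires the refresh callback, and the still-valid credentials
+ * can immediately sign a replacement token.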
+ */ +#define MSK_IAM_TOKEN_LIFETIME_SECONDS 180 + struct flb_aws_msk_iam { struct flb_config *flb_config; flb_sds_t region; flb_sds_t cluster_arn; - struct flb_tls *cred_tls; /* TLS instance for AWS credentials (STS) */ - struct flb_aws_provider *provider; /* AWS credentials provider (created once, reused) */ + struct flb_tls *cred_tls; + struct flb_aws_provider *provider; }; -/* Utility functions - same as before */ static int to_encode(char c) { if ((c >= '0' && c <= '9') || @@ -167,8 +173,8 @@ static char *extract_region(const char *arn) /* Payload generator - builds MSK IAM authentication payload */ static flb_sds_t build_msk_iam_payload(struct flb_aws_msk_iam *config, - const char *host, - struct flb_aws_credentials *creds) + const char *host, + struct flb_aws_credentials *creds) { flb_sds_t payload = NULL; int encode_result; @@ -207,26 +213,17 @@ static flb_sds_t build_msk_iam_payload(struct flb_aws_msk_iam *config, /* Validate inputs */ if (!config || !config->region || flb_sds_len(config->region) == 0) { - flb_error("[aws_msk_iam] build_msk_iam_payload: region is not set or invalid"); + flb_error("[aws_msk_iam] region is not set or invalid"); return NULL; } if (!host || strlen(host) == 0) { - flb_error("[aws_msk_iam] build_msk_iam_payload: host is required"); - return NULL; - } - - flb_debug("[aws_msk_iam] build_msk_iam_payload: generating payload for host: %s, region: %s", - host, config->region); - - /* Validate credentials */ - if (!creds) { - flb_error("[aws_msk_iam] build_msk_iam_payload: credentials are NULL"); + flb_error("[aws_msk_iam] host is required"); return NULL; } - if (!creds->access_key_id || !creds->secret_access_key) { - flb_error("[aws_msk_iam] build_msk_iam_payload: incomplete credentials"); + if (!creds || !creds->access_key_id || !creds->secret_access_key) { + flb_error("[aws_msk_iam] invalid or incomplete credentials"); return NULL; } @@ -251,19 +248,17 @@ static flb_sds_t build_msk_iam_payload(struct flb_aws_msk_iam *config, goto error; } - /* CRITICAL: Encode the action parameter */ action_enc = uri_encode_params("kafka-cluster:Connect", 21); if (!action_enc) { goto error; } - /* Build canonical query string with ACTION parameter first (alphabetical order) */ + /* Build canonical query string */ query = flb_sds_create_size(8192); if (!query) { goto error; } - /* note: Action must be FIRST in alphabetical order */ query = flb_sds_printf(&query, "Action=%s&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=%s" "&X-Amz-Date=%s&X-Amz-Expires=900", @@ -272,27 +267,23 @@ static flb_sds_t build_msk_iam_payload(struct flb_aws_msk_iam *config, goto error; } - /* Add session token if present (before SignedHeaders alphabetically) */ + /* Add session token if present */ if (creds->session_token && flb_sds_len(creds->session_token) > 0) { session_token_enc = uri_encode_params(creds->session_token, flb_sds_len(creds->session_token)); if (!session_token_enc) { - flb_error("[aws_msk_iam] build_msk_iam_payload: failed to encode session token"); goto error; } tmp = flb_sds_printf(&query, "&X-Amz-Security-Token=%s", session_token_enc); if (!tmp) { - flb_error("[aws_msk_iam] build_msk_iam_payload: failed to append session token to query"); goto error; } query = tmp; } - /* Add SignedHeaders LAST (alphabetically after Security-Token) */ tmp = flb_sds_printf(&query, "&X-Amz-SignedHeaders=host"); if (!tmp) { - flb_error("[aws_msk_iam] build_msk_iam_payload: failed to append SignedHeaders"); goto error; } query = tmp; @@ -303,10 +294,8 @@ static flb_sds_t 
build_msk_iam_payload(struct flb_aws_msk_iam *config, goto error; } - /* CRITICAL: MSK IAM canonical request format - use SHA256 of empty string, not UNSIGNED-PAYLOAD */ if (flb_hash_simple(FLB_HASH_SHA256, (unsigned char *) "", 0, empty_payload_hash, sizeof(empty_payload_hash)) != FLB_CRYPTO_SUCCESS) { - flb_error("[aws_msk_iam] build_msk_iam_payload: failed to hash empty payload"); goto error; } @@ -320,17 +309,15 @@ static flb_sds_t build_msk_iam_payload(struct flb_aws_msk_iam *config, query, host, empty_payload_hex); flb_sds_destroy(empty_payload_hex); - empty_payload_hex = NULL; /* Prevent double-free */ + empty_payload_hex = NULL; if (!canonical) { - flb_error("[aws_msk_iam] build_msk_iam_payload: failed to build canonical request"); goto error; } - /* Hash canonical request immediately */ + /* Hash canonical request */ if (flb_hash_simple(FLB_HASH_SHA256, (unsigned char *) canonical, flb_sds_len(canonical), sha256_buf, sizeof(sha256_buf)) != FLB_CRYPTO_SUCCESS) { - flb_error("[aws_msk_iam] build_msk_iam_payload: failed to hash canonical request"); goto error; } @@ -366,34 +353,28 @@ static flb_sds_t build_msk_iam_payload(struct flb_aws_msk_iam *config, len = strlen(datestamp); if (hmac_sha256_sign(key_date, (unsigned char *) key, flb_sds_len(key), (unsigned char *) datestamp, len) != 0) { - flb_error("[aws_msk_iam] build_msk_iam_payload: failed to sign date"); goto error; } - /* Clean up key immediately after use - prevent double-free */ flb_sds_destroy(key); key = NULL; len = strlen(config->region); if (hmac_sha256_sign(key_region, key_date, 32, (unsigned char *) config->region, len) != 0) { - flb_error("[aws_msk_iam] build_msk_iam_payload: failed to sign region"); goto error; } if (hmac_sha256_sign(key_service, key_region, 32, (unsigned char *) "kafka-cluster", 13) != 0) { - flb_error("[aws_msk_iam] build_msk_iam_payload: failed to sign service"); goto error; } if (hmac_sha256_sign(key_signing, key_service, 32, (unsigned char *) "aws4_request", 12) != 0) { - flb_error("[aws_msk_iam] build_msk_iam_payload: failed to create signing key"); goto error; } if (hmac_sha256_sign(sig, key_signing, 32, (unsigned char *) string_to_sign, flb_sds_len(string_to_sign)) != 0) { - flb_error("[aws_msk_iam] build_msk_iam_payload: failed to sign request"); goto error; } @@ -402,85 +383,28 @@ static flb_sds_t build_msk_iam_payload(struct flb_aws_msk_iam *config, goto error; } - /* Append signature to query */ tmp = flb_sds_printf(&query, "&X-Amz-Signature=%s", hexsig); if (!tmp) { goto error; } query = tmp; - /* Build the complete presigned URL */ - presigned_url = flb_sds_create_size(16384); - if (!presigned_url) { - goto error; - } - - presigned_url = flb_sds_printf(&presigned_url, "https://%s/?%s", host, query); - if (!presigned_url) { - goto error; - } - - /* Base64 URL encode the presigned URL */ - url_len = flb_sds_len(presigned_url); - encoded_len = ((url_len + 2) / 3) * 4 + 1; /* Base64 encoding size + null terminator */ - - payload = flb_sds_create_size(encoded_len); - if (!payload) { - goto error; - } - - encode_result = flb_base64_encode((unsigned char*) payload, encoded_len, &actual_encoded_len, - (const unsigned char*) presigned_url, url_len); - if (encode_result == -1) { - flb_error("[aws_msk_iam] build_msk_iam_payload: failed to base64 encode URL"); - goto error; - } - flb_sds_len_set(payload, actual_encoded_len); - - /* Convert to Base64 URL encoding (replace + with -, / with _, remove padding =) */ - p = payload; - while (*p) { - if (*p == '+') { - *p = '-'; - } - else if (*p == '/') 
{ - *p = '_'; - } - p++; - } - - /* Remove padding */ - len = flb_sds_len(payload); - while (len > 0 && payload[len-1] == '=') { - len--; - } - flb_sds_len_set(payload, len); - payload[len] = '\0'; - - /* Build the complete presigned URL */ - flb_sds_destroy(presigned_url); + /* Build complete presigned URL */ presigned_url = flb_sds_create_size(16384); if (!presigned_url) { goto error; } - presigned_url = flb_sds_printf(&presigned_url, "https://%s/?%s", host, query); + presigned_url = flb_sds_printf(&presigned_url, "https://%s/?%s&User-Agent=fluent-bit-msk-iam", + host, query); if (!presigned_url) { goto error; } - /* Add User-Agent parameter to the signed URL (like Go implementation) */ - tmp = flb_sds_printf(&presigned_url, "&User-Agent=fluent-bit-msk-iam"); - if (!tmp) { - goto error; - } - presigned_url = tmp; - - /* Base64 URL encode the presigned URL (RawURLEncoding - no padding like Go) */ + /* Base64 URL encode */ url_len = flb_sds_len(presigned_url); - encoded_len = ((url_len + 2) / 3) * 4 + 1; /* Base64 encoding size + null terminator */ + encoded_len = ((url_len + 2) / 3) * 4 + 1; - flb_sds_destroy(payload); payload = flb_sds_create_size(encoded_len); if (!payload) { goto error; @@ -489,14 +413,12 @@ static flb_sds_t build_msk_iam_payload(struct flb_aws_msk_iam *config, encode_result = flb_base64_encode((unsigned char*) payload, encoded_len, &actual_encoded_len, (const unsigned char *) presigned_url, url_len); if (encode_result == -1) { - flb_error("[aws_msk_iam] build_msk_iam_payload: failed to base64 encode URL"); goto error; } - /* Update the SDS length to match actual encoded length */ flb_sds_len_set(payload, actual_encoded_len); - /* Convert to Base64 URL encoding AND remove padding (RawURLEncoding like Go) */ + /* Convert to Base64 URL encoding and remove padding */ p = payload; while (*p) { if (*p == '+') { @@ -508,7 +430,6 @@ static flb_sds_t build_msk_iam_payload(struct flb_aws_msk_iam *config, p++; } - /* Remove ALL padding (RawURLEncoding) */ final_len = flb_sds_len(payload); while (final_len > 0 && payload[final_len-1] == '=') { final_len--; @@ -516,7 +437,7 @@ static flb_sds_t build_msk_iam_payload(struct flb_aws_msk_iam *config, flb_sds_len_set(payload, final_len); payload[final_len] = '\0'; - /* Clean up before successful return */ + /* Clean up */ flb_sds_destroy(credential); flb_sds_destroy(credential_enc); flb_sds_destroy(canonical); @@ -533,52 +454,24 @@ static flb_sds_t build_msk_iam_payload(struct flb_aws_msk_iam *config, return payload; error: - /* Clean up everything - check for NULL to prevent double-free */ - if (credential) { - flb_sds_destroy(credential); - } - if (credential_enc) { - flb_sds_destroy(credential_enc); - } - if (canonical) { - flb_sds_destroy(canonical); - } - if (hexhash) { - flb_sds_destroy(hexhash); - } - if (string_to_sign) { - flb_sds_destroy(string_to_sign); - } - if (hexsig) { - flb_sds_destroy(hexsig); - } - if (query) { - flb_sds_destroy(query); - } - if (action_enc) { - flb_sds_destroy(action_enc); - } - if (presigned_url) { - flb_sds_destroy(presigned_url); - } - if (key) { /* Only destroy if not already destroyed */ - flb_sds_destroy(key); - } - if (payload) { - flb_sds_destroy(payload); - } - if (session_token_enc) { - flb_sds_destroy(session_token_enc); - } - if (empty_payload_hex) { - flb_sds_destroy(empty_payload_hex); - } + if (credential) flb_sds_destroy(credential); + if (credential_enc) flb_sds_destroy(credential_enc); + if (canonical) flb_sds_destroy(canonical); + if (hexhash) flb_sds_destroy(hexhash); + if 
(string_to_sign) flb_sds_destroy(string_to_sign); + if (hexsig) flb_sds_destroy(hexsig); + if (query) flb_sds_destroy(query); + if (action_enc) flb_sds_destroy(action_enc); + if (presigned_url) flb_sds_destroy(presigned_url); + if (key) flb_sds_destroy(key); + if (payload) flb_sds_destroy(payload); + if (session_token_enc) flb_sds_destroy(session_token_enc); + if (empty_payload_hex) flb_sds_destroy(empty_payload_hex); return NULL; } - -/* OAuth token refresh callback with credential caching */ +/* OAuth token refresh callback */ static void oauthbearer_token_refresh_cb(rd_kafka_t *rk, const char *oauthbearer_config, void *opaque) @@ -587,7 +480,7 @@ static void oauthbearer_token_refresh_cb(rd_kafka_t *rk, flb_sds_t payload = NULL; rd_kafka_resp_err_t err; char errstr[512]; - int64_t now; + time_t now; int64_t md_lifetime_ms; const char *s3_suffix = "-s3"; size_t arn_len; @@ -599,61 +492,44 @@ static void oauthbearer_token_refresh_cb(rd_kafka_t *rk, kafka_opaque = (struct flb_kafka_opaque *) opaque; if (!kafka_opaque || !kafka_opaque->msk_iam_ctx) { - flb_error("[aws_msk_iam] oauthbearer_token_refresh_cb: invalid opaque context"); + flb_error("[aws_msk_iam] invalid opaque context"); rd_kafka_oauthbearer_set_token_failure(rk, "invalid context"); return; } - flb_debug("[aws_msk_iam] running OAuth bearer token refresh callback"); - - /* get the msk_iam config (not persistent context!) */ config = kafka_opaque->msk_iam_ctx; - /* validate region (mandatory) */ if (!config->region || flb_sds_len(config->region) == 0) { - flb_error("[aws_msk_iam] region is not set or invalid"); + flb_error("[aws_msk_iam] region is not set"); rd_kafka_oauthbearer_set_token_failure(rk, "region not set"); return; } - /* - * Use MSK generic endpoint for IAM authentication. - * AWS MSK IAM supports both cluster-specific and generic regional endpoints. - * Generic endpoints are recommended as they work across all brokers in the region. - */ + /* Determine MSK endpoint */ if (config->cluster_arn) { arn_len = strlen(config->cluster_arn); suffix_len = strlen(s3_suffix); if (arn_len >= suffix_len && strcmp(config->cluster_arn + arn_len - suffix_len, s3_suffix) == 0) { snprintf(host, sizeof(host), "kafka-serverless.%s.amazonaws.com", config->region); - flb_debug("[aws_msk_iam] using MSK Serverless generic endpoint: %s", host); } else { snprintf(host, sizeof(host), "kafka.%s.amazonaws.com", config->region); - flb_debug("[aws_msk_iam] using MSK generic endpoint: %s", host); } } else { snprintf(host, sizeof(host), "kafka.%s.amazonaws.com", config->region); - flb_debug("[aws_msk_iam] using MSK generic endpoint: %s", host); } - flb_debug("[aws_msk_iam] requesting MSK IAM payload for region: %s, host: %s", config->region, host); + flb_debug("[aws_msk_iam] OAuth token refresh callback triggered"); - /* - * Refresh credentials before generating OAuth token. - * This is necessary because provider's passive refresh only triggers when - * get_credentials is called and detects expiration. However, OAuth tokens - * are refreshed every ~15 minutes while IAM credentials expire after ~1 hour. - * If OAuth callbacks are spaced far apart, the passive refresh may not trigger - * before credentials expire, causing authentication failures. 
- */ - int rc = config->provider->provider_vtable->refresh(config->provider); - if (rc < 0) { - flb_warn("[aws_msk_iam] AWS provider refresh() failed (rc=%d), continuing to get_credentials()", rc); + /* Refresh credentials */ + if (config->provider->provider_vtable->refresh(config->provider) < 0) { + flb_warn("[aws_msk_iam] credential refresh failed, will retry on next callback"); + rd_kafka_oauthbearer_set_token_failure(rk, "credential refresh failed"); + return; } - /* Get credentials from provider */ + /* Get credentials */ creds = config->provider->provider_vtable->get_credentials(config->provider); if (!creds) { flb_error("[aws_msk_iam] failed to get AWS credentials from provider"); @@ -661,7 +537,7 @@ static void oauthbearer_token_refresh_cb(rd_kafka_t *rk, return; } - /* Generate payload using credentials from provider */ + /* Generate payload */ payload = build_msk_iam_payload(config, host, creds); if (!payload) { flb_error("[aws_msk_iam] failed to generate MSK IAM payload"); @@ -670,8 +546,16 @@ static void oauthbearer_token_refresh_cb(rd_kafka_t *rk, return; } + /* + * Set OAuth token with fixed 3-minute lifetime. + * librdkafka will trigger a refresh callback before the token expires. + * For idle connections, the refresh may be delayed until new data arrives, + * at which point librdkafka detects the expired token and triggers the callback. + * The short 3-minute lifetime ensures credentials (typically 60 minutes) are still + * valid when the callback is eventually triggered, allowing successful token regeneration. + */ now = time(NULL); - md_lifetime_ms = (now + 900) * 1000; + md_lifetime_ms = (now + MSK_IAM_TOKEN_LIFETIME_SECONDS) * 1000; err = rd_kafka_oauthbearer_set_token(rk, payload, @@ -682,25 +566,23 @@ static void oauthbearer_token_refresh_cb(rd_kafka_t *rk, errstr, sizeof(errstr)); - /* Destroy credentials immediately after use (standard pattern) */ flb_aws_credentials_destroy(creds); - creds = NULL; if (err != RD_KAFKA_RESP_ERR_NO_ERROR) { flb_error("[aws_msk_iam] failed to set OAuth bearer token: %s", errstr); rd_kafka_oauthbearer_set_token_failure(rk, errstr); } else { - flb_info("[aws_msk_iam] OAuth bearer token successfully set"); + flb_info("[aws_msk_iam] OAuth bearer token successfully set with %d second lifetime", + MSK_IAM_TOKEN_LIFETIME_SECONDS); } - /* Clean up - payload only (creds already destroyed) */ if (payload) { flb_sds_destroy(payload); } } -/* Register callback with lightweight config - keeps your current interface */ +/* Register OAuth callback */ struct flb_aws_msk_iam *flb_aws_msk_iam_register_oauth_cb(struct flb_config *config, rd_kafka_conf_t *kconf, const char *cluster_arn, @@ -709,26 +591,21 @@ struct flb_aws_msk_iam *flb_aws_msk_iam_register_oauth_cb(struct flb_config *con struct flb_aws_msk_iam *ctx; char *region_str; - flb_info("[aws_msk_iam] registering OAuth callback with cluster ARN: %s", cluster_arn); - if (!cluster_arn) { flb_error("[aws_msk_iam] cluster ARN is required"); return NULL; } - /* Allocate lightweight config - NO AWS provider! 
*/ ctx = flb_calloc(1, sizeof(struct flb_aws_msk_iam)); if (!ctx) { flb_errno(); return NULL; } - /* Store the flb_config for on-demand provider creation */ ctx->flb_config = config; ctx->cluster_arn = flb_sds_create(cluster_arn); if (!ctx->cluster_arn) { - flb_error("[aws_msk_iam] failed to create cluster ARN string"); flb_free(ctx); return NULL; } @@ -736,7 +613,7 @@ struct flb_aws_msk_iam *flb_aws_msk_iam_register_oauth_cb(struct flb_config *con /* Extract region */ region_str = extract_region(cluster_arn); if (!region_str || strlen(region_str) == 0) { - flb_error("[aws_msk_iam] failed to extract region from cluster ARN: %s", cluster_arn); + flb_error("[aws_msk_iam] failed to extract region from ARN"); flb_sds_destroy(ctx->cluster_arn); flb_free(ctx); if (region_str) flb_free(region_str); @@ -747,42 +624,31 @@ struct flb_aws_msk_iam *flb_aws_msk_iam_register_oauth_cb(struct flb_config *con flb_free(region_str); if (!ctx->region) { - flb_error("[aws_msk_iam] failed to create region string"); flb_sds_destroy(ctx->cluster_arn); flb_free(ctx); return NULL; } - flb_info("[aws_msk_iam] extracted region: %s", ctx->region); - - /* Create TLS instance for AWS credentials (STS) - CRITICAL FIX */ + /* Create TLS instance */ ctx->cred_tls = flb_tls_create(FLB_TLS_CLIENT_MODE, FLB_TRUE, FLB_LOG_DEBUG, - NULL, /* vhost */ - NULL, /* ca_path */ - NULL, /* ca_file */ - NULL, /* crt_file */ - NULL, /* key_file */ - NULL); /* key_passwd */ + NULL, NULL, NULL, NULL, NULL, NULL); if (!ctx->cred_tls) { - flb_error("[aws_msk_iam] failed to create TLS instance for AWS credentials"); + flb_error("[aws_msk_iam] failed to create TLS instance"); flb_sds_destroy(ctx->region); flb_sds_destroy(ctx->cluster_arn); flb_free(ctx); return NULL; } - flb_info("[aws_msk_iam] TLS instance created for AWS credentials"); - - /* Create AWS provider once - will be reused for credential refresh */ + /* Create AWS provider */ ctx->provider = flb_standard_chain_provider_create(config, ctx->cred_tls, ctx->region, - NULL, /* sts_endpoint */ - NULL, /* proxy */ + NULL, NULL, flb_aws_client_generator(), - NULL); /* profile */ + NULL); if (!ctx->provider) { flb_error("[aws_msk_iam] failed to create AWS credentials provider"); flb_tls_destroy(ctx->cred_tls); @@ -792,7 +658,7 @@ struct flb_aws_msk_iam *flb_aws_msk_iam_register_oauth_cb(struct flb_config *con return NULL; } - /* Initialize provider in sync mode (required before event loop is available) */ + /* Initialize provider */ ctx->provider->provider_vtable->sync(ctx->provider); if (ctx->provider->provider_vtable->init(ctx->provider) != 0) { flb_error("[aws_msk_iam] failed to initialize AWS credentials provider"); @@ -803,18 +669,13 @@ struct flb_aws_msk_iam *flb_aws_msk_iam_register_oauth_cb(struct flb_config *con flb_free(ctx); return NULL; } - /* Switch back to async mode */ ctx->provider->provider_vtable->async(ctx->provider); - flb_info("[aws_msk_iam] AWS credentials provider created and initialized successfully"); - - /* Set the callback and opaque */ + /* Register callback */ rd_kafka_conf_set_oauthbearer_token_refresh_cb(kconf, oauthbearer_token_refresh_cb); flb_kafka_opaque_set(opaque, NULL, ctx); rd_kafka_conf_set_opaque(kconf, opaque); - flb_info("[aws_msk_iam] OAuth callback registered successfully"); - return ctx; } @@ -825,19 +686,14 @@ void flb_aws_msk_iam_destroy(struct flb_aws_msk_iam *ctx) return; } - flb_info("[aws_msk_iam] destroying MSK IAM config"); - - /* Destroy AWS provider (provider manages its own credential caching) */ if (ctx->provider) { 
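         /* the provider also owns any credentials it has cached; destroying it releases them */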
flb_aws_provider_destroy(ctx->provider); } - /* Clean up TLS instance - caller owns TLS lifecycle with flb_standard_chain_provider_create */ if (ctx->cred_tls) { flb_tls_destroy(ctx->cred_tls); } - /* Clean up other resources */ if (ctx->region) { flb_sds_destroy(ctx->region); } @@ -845,6 +701,4 @@ void flb_aws_msk_iam_destroy(struct flb_aws_msk_iam *ctx) flb_sds_destroy(ctx->cluster_arn); } flb_free(ctx); - - flb_info("[aws_msk_iam] MSK IAM config destroyed"); } From c5039c85b542c717a3a82caf13bc8cecbe61ccf1 Mon Sep 17 00:00:00 2001 From: Arbin Date: Fri, 28 Nov 2025 09:08:39 +0800 Subject: [PATCH 11/37] =?UTF-8?q?aws=5Fmsk=5Fiam:=20=20Fix=20potential=20o?= =?UTF-8?q?verflow=20in=20md=5Flifetime=5Fms=20on=2032=E2=80=91bit=20time?= =?UTF-8?q?=5Ft?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Arbin --- src/aws/flb_aws_msk_iam.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/aws/flb_aws_msk_iam.c b/src/aws/flb_aws_msk_iam.c index e6358d32175..cf220892079 100644 --- a/src/aws/flb_aws_msk_iam.c +++ b/src/aws/flb_aws_msk_iam.c @@ -555,7 +555,7 @@ static void oauthbearer_token_refresh_cb(rd_kafka_t *rk, * valid when the callback is eventually triggered, allowing successful token regeneration. */ now = time(NULL); - md_lifetime_ms = (now + MSK_IAM_TOKEN_LIFETIME_SECONDS) * 1000; + md_lifetime_ms = ((int64_t)now + MSK_IAM_TOKEN_LIFETIME_SECONDS) * 1000; err = rd_kafka_oauthbearer_set_token(rk, payload, From 2f6d7fb2dc2d0c193d00d1a4900172c38d6eb959 Mon Sep 17 00:00:00 2001 From: Arbin Date: Fri, 28 Nov 2025 17:18:01 +0800 Subject: [PATCH 12/37] aws_msk_iam: fix OAuth token expiration and add TLS support Signed-off-by: Arbin --- plugins/out_kafka/kafka_config.c | 22 ++++++++++++++++++++++ src/aws/flb_aws_msk_iam.c | 18 +++++++----------- 2 files changed, 29 insertions(+), 11 deletions(-) diff --git a/plugins/out_kafka/kafka_config.c b/plugins/out_kafka/kafka_config.c index b4bb9be6acf..b5eb12ace15 100644 --- a/plugins/out_kafka/kafka_config.c +++ b/plugins/out_kafka/kafka_config.c @@ -243,6 +243,28 @@ struct flb_out_kafka *flb_out_kafka_create(struct flb_output_instance *ins, return NULL; } +#ifdef FLB_HAVE_AWS_MSK_IAM + /* + * Enable SASL background callbacks for MSK IAM to ensure OAuth tokens + * are refreshed automatically even on idle connections. + * This eliminates the need for the application to call rd_kafka_poll() + * regularly for token refresh to occur. + */ + if (ctx->msk_iam) { + rd_kafka_error_t *error; + error = rd_kafka_sasl_background_callbacks_enable(ctx->kafka.rk); + if (error) { + flb_plg_warn(ctx->ins, "failed to enable SASL background callbacks: %s", + rd_kafka_error_string(error)); + rd_kafka_error_destroy(error); + } + else { + flb_plg_info(ctx->ins, "MSK IAM: SASL background callbacks enabled, " + "OAuth tokens will be refreshed automatically in background thread"); + } + } +#endif + #ifdef FLB_HAVE_AVRO_ENCODER /* Config AVRO */ tmp = flb_output_get_property("schema_str", ins); diff --git a/src/aws/flb_aws_msk_iam.c b/src/aws/flb_aws_msk_iam.c index cf220892079..c90c3c468d2 100644 --- a/src/aws/flb_aws_msk_iam.c +++ b/src/aws/flb_aws_msk_iam.c @@ -39,12 +39,10 @@ #include /* - * Fixed token lifetime of 3 minutes. - * This short lifetime ensures that idle Kafka connections (e.g., low-traffic inputs) - * will quickly detect token expiration when new data arrives and trigger a refresh callback, - * preventing "Access denied" errors from using expired tokens on idle connections. 
+ * OAuth token lifetime of 5 minutes (industry standard). + * Matches AWS Go SDK and Kafka Connect implementations. */ -#define MSK_IAM_TOKEN_LIFETIME_SECONDS 180 +#define MSK_IAM_TOKEN_LIFETIME_SECONDS 300 struct flb_aws_msk_iam { struct flb_config *flb_config; @@ -547,12 +545,10 @@ static void oauthbearer_token_refresh_cb(rd_kafka_t *rk, } /* - * Set OAuth token with fixed 3-minute lifetime. - * librdkafka will trigger a refresh callback before the token expires. - * For idle connections, the refresh may be delayed until new data arrives, - * at which point librdkafka detects the expired token and triggers the callback. - * The short 3-minute lifetime ensures credentials (typically 60 minutes) are still - * valid when the callback is eventually triggered, allowing successful token regeneration. + * Set OAuth token with fixed 5-minute lifetime (AWS industry standard). + * librdkafka's background thread will automatically trigger a refresh callback + * at 80% of the token's lifetime (4 minutes) to ensure the token never expires, + * even on completely idle connections. */ now = time(NULL); md_lifetime_ms = ((int64_t)now + MSK_IAM_TOKEN_LIFETIME_SECONDS) * 1000; From 922d8e7130fb2db13bbbf3b90eff4dbf24cbee79 Mon Sep 17 00:00:00 2001 From: Arbin Date: Fri, 28 Nov 2025 17:31:42 +0800 Subject: [PATCH 13/37] aws_msk_iam: fix OAuth token expiration and add TLS support Signed-off-by: Arbin --- plugins/out_kafka/kafka_config.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/plugins/out_kafka/kafka_config.c b/plugins/out_kafka/kafka_config.c index b5eb12ace15..d4052889f7e 100644 --- a/plugins/out_kafka/kafka_config.c +++ b/plugins/out_kafka/kafka_config.c @@ -214,6 +214,13 @@ struct flb_out_kafka *flb_out_kafka_create(struct flb_output_instance *ins, if (ctx->aws_msk_iam && ctx->aws_msk_iam_cluster_arn && ctx->sasl_mechanism && strcasecmp(ctx->sasl_mechanism, "OAUTHBEARER") == 0) { + /* + * Enable SASL queue for background callbacks BEFORE registering OAuth callback. + * This allows librdkafka to handle OAuth token refresh in a background thread, + * which is essential for idle connections where rd_kafka_poll() is not called. + */ + rd_kafka_conf_enable_sasl_queue(ctx->conf, 1); + ctx->msk_iam = flb_aws_msk_iam_register_oauth_cb(config, ctx->conf, ctx->aws_msk_iam_cluster_arn, @@ -247,21 +254,19 @@ struct flb_out_kafka *flb_out_kafka_create(struct flb_output_instance *ins, /* * Enable SASL background callbacks for MSK IAM to ensure OAuth tokens * are refreshed automatically even on idle connections. - * This eliminates the need for the application to call rd_kafka_poll() - * regularly for token refresh to occur. 
*/ if (ctx->msk_iam) { rd_kafka_error_t *error; error = rd_kafka_sasl_background_callbacks_enable(ctx->kafka.rk); if (error) { - flb_plg_warn(ctx->ins, "failed to enable SASL background callbacks: %s", + flb_plg_error(ctx->ins, "failed to enable SASL background callbacks: %s", rd_kafka_error_string(error)); rd_kafka_error_destroy(error); + flb_out_kafka_destroy(ctx); + return NULL; } - else { - flb_plg_info(ctx->ins, "MSK IAM: SASL background callbacks enabled, " - "OAuth tokens will be refreshed automatically in background thread"); - } + flb_plg_info(ctx->ins, "MSK IAM: SASL background callbacks enabled, " + "OAuth tokens will be refreshed automatically in background thread"); } #endif From c86563513b9ffc050e5502be7d2d6b5a41490aee Mon Sep 17 00:00:00 2001 From: Arbin Date: Fri, 28 Nov 2025 17:35:14 +0800 Subject: [PATCH 14/37] aws_msk_iam: fix OAuth token expiration and add TLS support Signed-off-by: Arbin --- plugins/out_kafka/kafka_config.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/plugins/out_kafka/kafka_config.c b/plugins/out_kafka/kafka_config.c index d4052889f7e..dca5bf958c9 100644 --- a/plugins/out_kafka/kafka_config.c +++ b/plugins/out_kafka/kafka_config.c @@ -259,14 +259,15 @@ struct flb_out_kafka *flb_out_kafka_create(struct flb_output_instance *ins, rd_kafka_error_t *error; error = rd_kafka_sasl_background_callbacks_enable(ctx->kafka.rk); if (error) { - flb_plg_error(ctx->ins, "failed to enable SASL background callbacks: %s", + flb_plg_warn(ctx->ins, "failed to enable SASL background callbacks: %s. " + "OAuth tokens may not refresh on idle connections.", rd_kafka_error_string(error)); rd_kafka_error_destroy(error); - flb_out_kafka_destroy(ctx); - return NULL; } - flb_plg_info(ctx->ins, "MSK IAM: SASL background callbacks enabled, " - "OAuth tokens will be refreshed automatically in background thread"); + else { + flb_plg_info(ctx->ins, "MSK IAM: SASL background callbacks enabled, " + "OAuth tokens will be refreshed automatically in background thread"); + } } #endif From 210accd546b9a72c5505c013511221689df3ef11 Mon Sep 17 00:00:00 2001 From: Arbin Date: Fri, 28 Nov 2025 18:56:58 +0800 Subject: [PATCH 15/37] aws_msk_iam: Fix AWS MSK IAM remove cluster_arn dependency Signed-off-by: Arbin --- include/fluent-bit/aws/flb_aws_msk_iam.h | 1 - plugins/in_kafka/in_kafka.c | 102 +++++++-------- plugins/in_kafka/in_kafka.h | 2 - plugins/out_kafka/kafka.c | 13 -- plugins/out_kafka/kafka_config.c | 98 +++++++-------- plugins/out_kafka/kafka_config.h | 3 - src/aws/flb_aws_msk_iam.c | 153 ++++++++++++++--------- 7 files changed, 185 insertions(+), 187 deletions(-) diff --git a/include/fluent-bit/aws/flb_aws_msk_iam.h b/include/fluent-bit/aws/flb_aws_msk_iam.h index df0ea258557..127d03dbdf4 100644 --- a/include/fluent-bit/aws/flb_aws_msk_iam.h +++ b/include/fluent-bit/aws/flb_aws_msk_iam.h @@ -40,7 +40,6 @@ struct flb_msk_iam_cb { */ struct flb_aws_msk_iam *flb_aws_msk_iam_register_oauth_cb(struct flb_config *config, rd_kafka_conf_t *kconf, - const char *cluster_arn, struct flb_kafka_opaque *opaque); void flb_aws_msk_iam_destroy(struct flb_aws_msk_iam *ctx); diff --git a/plugins/in_kafka/in_kafka.c b/plugins/in_kafka/in_kafka.c index e07d7970a7c..9ae003f91a7 100644 --- a/plugins/in_kafka/in_kafka.c +++ b/plugins/in_kafka/in_kafka.c @@ -268,40 +268,30 @@ static int in_kafka_init(struct flb_input_instance *ins, return -1; } + /* Retrieve SASL mechanism if configured */ + conf = flb_input_get_property("rdkafka.sasl.mechanism", ins); + if (conf) { + 
ctx->sasl_mechanism = flb_sds_create(conf); + flb_plg_info(ins, "SASL mechanism configured: %s", ctx->sasl_mechanism); + #ifdef FLB_HAVE_AWS_MSK_IAM - /* - * When MSK IAM auth is enabled, default the required - * security settings so users don't need to specify them. - */ - if (ctx->aws_msk_iam && ctx->aws_msk_iam_cluster_arn) { - conf = flb_input_get_property("rdkafka.security.protocol", ins); - if (!conf) { - flb_input_set_property(ins, "rdkafka.security.protocol", "SASL_SSL"); - } - - conf = flb_input_get_property("rdkafka.sasl.mechanism", ins); - if (!conf) { + /* Check if using aws_msk_iam as SASL mechanism */ + if (strcasecmp(conf, "aws_msk_iam") == 0) { + /* Set SASL mechanism to OAUTHBEARER for librdkafka */ flb_input_set_property(ins, "rdkafka.sasl.mechanism", "OAUTHBEARER"); + flb_sds_destroy(ctx->sasl_mechanism); ctx->sasl_mechanism = flb_sds_create("OAUTHBEARER"); + + /* Ensure security protocol is set */ + conf = flb_input_get_property("rdkafka.security.protocol", ins); + if (!conf) { + flb_input_set_property(ins, "rdkafka.security.protocol", "SASL_SSL"); + } + + flb_plg_info(ins, "AWS MSK IAM authentication enabled via rdkafka.sasl.mechanism"); } - else { - ctx->sasl_mechanism = flb_sds_create(conf); - flb_plg_info(ins, "SASL mechanism configured: %s", ctx->sasl_mechanism); - } - } - else { #endif - - /* Retrieve SASL mechanism if configured */ - conf = flb_input_get_property("rdkafka.sasl.mechanism", ins); - if (conf) { - ctx->sasl_mechanism = flb_sds_create(conf); - flb_plg_info(ins, "SASL mechanism configured: %s", ctx->sasl_mechanism); - } - -#ifdef FLB_HAVE_AWS_MSK_IAM } -#endif kafka_conf = flb_kafka_conf_create(&ctx->kafka, &ins->properties, 1); if (!kafka_conf) { @@ -348,28 +338,33 @@ static int in_kafka_init(struct flb_input_instance *ins, flb_plg_error(ins, "failed to create kafka opaque context"); goto init_error; } - flb_kafka_opaque_set(ctx->opaque, ctx, NULL); + flb_kafka_opaque_set(ctx->opaque, ctx, &ctx->kafka); rd_kafka_conf_set_opaque(kafka_conf, ctx->opaque); #ifdef FLB_HAVE_AWS_MSK_IAM - if (ctx->aws_msk_iam && ctx->aws_msk_iam_cluster_arn && ctx->sasl_mechanism && - strcasecmp(ctx->sasl_mechanism, "OAUTHBEARER") == 0) { - flb_plg_info(ins, "registering MSK IAM authentication with cluster ARN: %s", - ctx->aws_msk_iam_cluster_arn); - ctx->msk_iam = flb_aws_msk_iam_register_oauth_cb(config, - kafka_conf, - ctx->aws_msk_iam_cluster_arn, - ctx->opaque); - if (!ctx->msk_iam) { - flb_plg_error(ins, "failed to setup MSK IAM authentication"); - } - else { - res = rd_kafka_conf_set(kafka_conf, "sasl.oauthbearer.config", - "principal=admin", errstr, sizeof(errstr)); - if (res != RD_KAFKA_CONF_OK) { - flb_plg_error(ins, - "failed to set sasl.oauthbearer.config: %s", - errstr); + if (ctx->sasl_mechanism && strcasecmp(ctx->sasl_mechanism, "OAUTHBEARER") == 0) { + /* Check if brokers are configured for MSK IAM */ + conf = flb_input_get_property("brokers", ins); + if (conf && (strstr(conf, ".kafka.") || strstr(conf, ".kafka-serverless.")) && + strstr(conf, ".amazonaws.com")) { + + /* Register MSK IAM OAuth callback - extract region from broker address */ + flb_plg_info(ins, "registering AWS MSK IAM authentication (region auto-extracted from broker)"); + ctx->msk_iam = flb_aws_msk_iam_register_oauth_cb(config, + kafka_conf, + ctx->opaque); + + if (!ctx->msk_iam) { + flb_plg_error(ins, "failed to setup MSK IAM authentication"); + } + else { + res = rd_kafka_conf_set(kafka_conf, "sasl.oauthbearer.config", + "principal=admin", errstr, sizeof(errstr)); + if (res != 
RD_KAFKA_CONF_OK) { + flb_plg_error(ins, + "failed to set sasl.oauthbearer.config: %s", + errstr); + } } } } @@ -571,19 +566,6 @@ static struct flb_config_map config_map[] = { "Rely on kafka auto-commit and commit messages in batches" }, -#ifdef FLB_HAVE_AWS_MSK_IAM - { - FLB_CONFIG_MAP_STR, "aws_msk_iam_cluster_arn", (char *)NULL, - 0, FLB_TRUE, offsetof(struct flb_in_kafka_config, aws_msk_iam_cluster_arn), - "ARN of the MSK cluster when using AWS IAM authentication" - }, - { - FLB_CONFIG_MAP_BOOL, "aws_msk_iam", "false", - 0, FLB_TRUE, offsetof(struct flb_in_kafka_config, aws_msk_iam), - "Enable AWS MSK IAM authentication" - }, -#endif - /* EOF */ {0} }; diff --git a/plugins/in_kafka/in_kafka.h b/plugins/in_kafka/in_kafka.h index 096cf1c561b..4792ae5b947 100644 --- a/plugins/in_kafka/in_kafka.h +++ b/plugins/in_kafka/in_kafka.h @@ -55,12 +55,10 @@ struct flb_in_kafka_config { struct flb_kafka_opaque *opaque; #ifdef FLB_HAVE_AWS_MSK_IAM - flb_sds_t aws_msk_iam_cluster_arn; struct flb_aws_msk_iam *msk_iam; #endif /* SASL mechanism configured in rdkafka.sasl.mechanism */ - int aws_msk_iam; flb_sds_t sasl_mechanism; }; diff --git a/plugins/out_kafka/kafka.c b/plugins/out_kafka/kafka.c index dadd4725f74..b6ff6f45307 100644 --- a/plugins/out_kafka/kafka.c +++ b/plugins/out_kafka/kafka.c @@ -678,19 +678,6 @@ static struct flb_config_map config_map[] = { "that key will be sent to Kafka." }, -#ifdef FLB_HAVE_AWS_MSK_IAM - { - FLB_CONFIG_MAP_STR, "aws_msk_iam_cluster_arn", NULL, - 0, FLB_TRUE, offsetof(struct flb_out_kafka, aws_msk_iam_cluster_arn), - "ARN of the MSK cluster when using AWS IAM authentication" - }, - { - FLB_CONFIG_MAP_BOOL, "aws_msk_iam", "false", - 0, FLB_TRUE, offsetof(struct flb_out_kafka, aws_msk_iam), - "Enable AWS MSK IAM authentication" - }, -#endif - /* EOF */ {0} }; diff --git a/plugins/out_kafka/kafka_config.c b/plugins/out_kafka/kafka_config.c index dca5bf958c9..2c9f6885734 100644 --- a/plugins/out_kafka/kafka_config.c +++ b/plugins/out_kafka/kafka_config.c @@ -58,37 +58,30 @@ struct flb_out_kafka *flb_out_kafka_create(struct flb_output_instance *ins, return NULL; } + /* Retrieve SASL mechanism if configured */ + tmp = flb_output_get_property("rdkafka.sasl.mechanism", ins); + if (tmp) { + ctx->sasl_mechanism = flb_sds_create(tmp); + flb_plg_info(ins, "SASL mechanism configured: %s", ctx->sasl_mechanism); + #ifdef FLB_HAVE_AWS_MSK_IAM - /* - * When MSK IAM auth is enabled, default the required - * security settings so users don't need to specify them. 
- */ - if (ctx->aws_msk_iam && ctx->aws_msk_iam_cluster_arn) { - tmp = flb_output_get_property("rdkafka.security.protocol", ins); - if (!tmp) { - flb_output_set_property(ins, "rdkafka.security.protocol", "SASL_SSL"); - } - - tmp = flb_output_get_property("rdkafka.sasl.mechanism", ins); - if (!tmp) { + /* Check if using aws_msk_iam as SASL mechanism */ + if (strcasecmp(tmp, "aws_msk_iam") == 0) { + /* Set SASL mechanism to OAUTHBEARER for librdkafka */ flb_output_set_property(ins, "rdkafka.sasl.mechanism", "OAUTHBEARER"); + flb_sds_destroy(ctx->sasl_mechanism); ctx->sasl_mechanism = flb_sds_create("OAUTHBEARER"); + + /* Ensure security protocol is set */ + tmp = flb_output_get_property("rdkafka.security.protocol", ins); + if (!tmp) { + flb_output_set_property(ins, "rdkafka.security.protocol", "SASL_SSL"); + } + + flb_plg_info(ins, "AWS MSK IAM authentication enabled via rdkafka.sasl.mechanism"); } - else { - ctx->sasl_mechanism = flb_sds_create(tmp); - } - } - else { #endif - /* Retrieve SASL mechanism if configured */ - tmp = flb_output_get_property("rdkafka.sasl.mechanism", ins); - if (tmp) { - ctx->sasl_mechanism = flb_sds_create(tmp); - } - -#ifdef FLB_HAVE_AWS_MSK_IAM } -#endif /* rdkafka config context */ ctx->conf = flb_kafka_conf_create(&ctx->kafka, &ins->properties, 0); @@ -207,34 +200,39 @@ struct flb_out_kafka *flb_out_kafka_create(struct flb_output_instance *ins, } /* store the plugin context so callbacks can log properly */ - flb_kafka_opaque_set(ctx->opaque, ctx, NULL); + flb_kafka_opaque_set(ctx->opaque, ctx, &ctx->kafka); rd_kafka_conf_set_opaque(ctx->conf, ctx->opaque); #ifdef FLB_HAVE_AWS_MSK_IAM - if (ctx->aws_msk_iam && ctx->aws_msk_iam_cluster_arn && ctx->sasl_mechanism && - strcasecmp(ctx->sasl_mechanism, "OAUTHBEARER") == 0) { - - /* - * Enable SASL queue for background callbacks BEFORE registering OAuth callback. - * This allows librdkafka to handle OAuth token refresh in a background thread, - * which is essential for idle connections where rd_kafka_poll() is not called. - */ - rd_kafka_conf_enable_sasl_queue(ctx->conf, 1); - - ctx->msk_iam = flb_aws_msk_iam_register_oauth_cb(config, - ctx->conf, - ctx->aws_msk_iam_cluster_arn, - ctx->opaque); - if (!ctx->msk_iam) { - flb_plg_error(ctx->ins, "failed to setup MSK IAM authentication"); - } - else { - res = rd_kafka_conf_set(ctx->conf, "sasl.oauthbearer.config", - "principal=admin", errstr, sizeof(errstr)); - if (res != RD_KAFKA_CONF_OK) { - flb_plg_error(ctx->ins, - "failed to set sasl.oauthbearer.config: %s", - errstr); + if (ctx->sasl_mechanism && strcasecmp(ctx->sasl_mechanism, "OAUTHBEARER") == 0) { + /* Check if brokers are configured for MSK IAM */ + tmp = flb_output_get_property("brokers", ins); + if (tmp && (strstr(tmp, ".kafka.") || strstr(tmp, ".kafka-serverless.")) && + strstr(tmp, ".amazonaws.com")) { + + /* + * Enable SASL queue for background callbacks BEFORE registering OAuth callback. + * This allows librdkafka to handle OAuth token refresh in a background thread, + * which is essential for idle connections where rd_kafka_poll() is not called. 
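+             * librdkafka schedules that background refresh at roughly 80% of the
+             * token lifetime, so a 5-minute token is renewed about every 4 minutes.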
+ */ + rd_kafka_conf_enable_sasl_queue(ctx->conf, 1); + + /* Register MSK IAM OAuth callback - extract region from broker address */ + flb_plg_info(ins, "registering AWS MSK IAM authentication (region auto-extracted from broker)"); + ctx->msk_iam = flb_aws_msk_iam_register_oauth_cb(config, + ctx->conf, + ctx->opaque); + if (!ctx->msk_iam) { + flb_plg_error(ctx->ins, "failed to setup MSK IAM authentication"); + } + else { + res = rd_kafka_conf_set(ctx->conf, "sasl.oauthbearer.config", + "principal=admin", errstr, sizeof(errstr)); + if (res != RD_KAFKA_CONF_OK) { + flb_plg_error(ctx->ins, + "failed to set sasl.oauthbearer.config: %s", + errstr); + } } } } diff --git a/plugins/out_kafka/kafka_config.h b/plugins/out_kafka/kafka_config.h index e1ebc04e65c..9133113bcc0 100644 --- a/plugins/out_kafka/kafka_config.h +++ b/plugins/out_kafka/kafka_config.h @@ -126,12 +126,9 @@ struct flb_out_kafka { #endif #ifdef FLB_HAVE_AWS_MSK_IAM - flb_sds_t aws_msk_iam_cluster_arn; struct flb_aws_msk_iam *msk_iam; #endif - int aws_msk_iam; - struct flb_kafka_opaque *opaque; /* SASL mechanism configured in rdkafka.sasl.mechanism */ diff --git a/src/aws/flb_aws_msk_iam.c b/src/aws/flb_aws_msk_iam.c index c90c3c468d2..2aae98bb502 100644 --- a/src/aws/flb_aws_msk_iam.c +++ b/src/aws/flb_aws_msk_iam.c @@ -47,7 +47,7 @@ struct flb_aws_msk_iam { struct flb_config *flb_config; flb_sds_t region; - flb_sds_t cluster_arn; + int is_serverless; /* Flag to indicate if this is MSK Serverless */ struct flb_tls *cred_tls; struct flb_aws_provider *provider; }; @@ -132,40 +132,58 @@ static int hmac_sha256_sign(unsigned char out[32], return 0; } -static char *extract_region(const char *arn) +/* Extract region from MSK broker address + * MSK Standard format: b-1.example.c1.kafka..amazonaws.com:port + * MSK Serverless format: boot-.c.kafka-serverless..amazonaws.com:port + */ +static flb_sds_t extract_region_from_broker(const char *broker) { const char *p; - const char *r; + const char *start; + const char *end; size_t len; - char *out; - - /* arn:partition:service:region:... 
*/ - p = strchr(arn, ':'); - if (!p) { + flb_sds_t out; + + if (!broker || strlen(broker) == 0) { return NULL; } - p = strchr(p + 1, ':'); + + /* Find .amazonaws.com */ + p = strstr(broker, ".amazonaws.com"); if (!p) { return NULL; } - p = strchr(p + 1, ':'); - if (!p) { + + /* Region is between the last dot before .amazonaws.com and .amazonaws.com + * Example: ...kafka.us-east-1.amazonaws.com + * or ...kafka-serverless.us-east-1.amazonaws.com + */ + end = p; /* Points to .amazonaws.com */ + + /* Find the start of region by going backwards to find the previous dot */ + start = end - 1; + while (start > broker && *start != '.') { + start--; + } + + if (*start == '.') { + start++; /* Skip the dot */ + } + + if (start >= end) { return NULL; } - - r = p + 1; - p = strchr(r, ':'); - if (!p) { + + len = end - start; + if (len == 0 || len > 64) { /* Sanity check on region length (relaxed to 64 chars) */ return NULL; } - len = p - r; - out = flb_malloc(len + 1); + + out = flb_sds_create_len(start, len); if (!out) { return NULL; } - memcpy(out, r, len); - out[len] = '\0'; - + return out; } @@ -480,9 +498,6 @@ static void oauthbearer_token_refresh_cb(rd_kafka_t *rk, char errstr[512]; time_t now; int64_t md_lifetime_ms; - const char *s3_suffix = "-s3"; - size_t arn_len; - size_t suffix_len; struct flb_aws_msk_iam *config; struct flb_aws_credentials *creds = NULL; struct flb_kafka_opaque *kafka_opaque; @@ -503,16 +518,9 @@ static void oauthbearer_token_refresh_cb(rd_kafka_t *rk, return; } - /* Determine MSK endpoint */ - if (config->cluster_arn) { - arn_len = strlen(config->cluster_arn); - suffix_len = strlen(s3_suffix); - if (arn_len >= suffix_len && strcmp(config->cluster_arn + arn_len - suffix_len, s3_suffix) == 0) { - snprintf(host, sizeof(host), "kafka-serverless.%s.amazonaws.com", config->region); - } - else { - snprintf(host, sizeof(host), "kafka.%s.amazonaws.com", config->region); - } + /* Determine MSK endpoint based on cluster type */ + if (config->is_serverless) { + snprintf(host, sizeof(host), "kafka-serverless.%s.amazonaws.com", config->region); } else { snprintf(host, sizeof(host), "kafka.%s.amazonaws.com", config->region); @@ -581,16 +589,13 @@ static void oauthbearer_token_refresh_cb(rd_kafka_t *rk, /* Register OAuth callback */ struct flb_aws_msk_iam *flb_aws_msk_iam_register_oauth_cb(struct flb_config *config, rd_kafka_conf_t *kconf, - const char *cluster_arn, struct flb_kafka_opaque *opaque) { struct flb_aws_msk_iam *ctx; - char *region_str; - - if (!cluster_arn) { - flb_error("[aws_msk_iam] cluster ARN is required"); - return NULL; - } + flb_sds_t region_str = NULL; + struct flb_kafka *kafka_ctx; + char *first_broker = NULL; + char *comma; ctx = flb_calloc(1, sizeof(struct flb_aws_msk_iam)); if (!ctx) { @@ -600,27 +605,65 @@ struct flb_aws_msk_iam *flb_aws_msk_iam_register_oauth_cb(struct flb_config *con ctx->flb_config = config; - ctx->cluster_arn = flb_sds_create(cluster_arn); - if (!ctx->cluster_arn) { + /* Extract region from broker address */ + if (!opaque || !opaque->kafka_ctx) { + flb_error("[aws_msk_iam] unable to access kafka context for broker-based region extraction"); flb_free(ctx); return NULL; } - - /* Extract region */ - region_str = extract_region(cluster_arn); - if (!region_str || strlen(region_str) == 0) { - flb_error("[aws_msk_iam] failed to extract region from ARN"); - flb_sds_destroy(ctx->cluster_arn); + + kafka_ctx = opaque->kafka_ctx; + if (!kafka_ctx->brokers || flb_sds_len(kafka_ctx->brokers) == 0) { + flb_error("[aws_msk_iam] brokers configuration is 
required for region extraction"); + flb_free(ctx); + return NULL; + } + + /* Extract first broker from comma-separated list */ + first_broker = flb_strdup(kafka_ctx->brokers); + if (!first_broker) { + flb_error("[aws_msk_iam] failed to allocate memory for broker parsing"); + flb_free(ctx); + return NULL; + } + + comma = strchr(first_broker, ','); + if (comma) { + *comma = '\0'; /* Terminate at first comma */ + } + + /* Detect if this is MSK Serverless by checking broker address + * Serverless broker contains .kafka-serverless. in the hostname + * Standard broker contains .kafka. (but not .kafka-serverless.) + */ + if (strstr(first_broker, ".kafka-serverless.")) { + ctx->is_serverless = 1; + flb_info("[aws_msk_iam] detected MSK Serverless cluster"); + } + else { + ctx->is_serverless = 0; + } + + /* Extract region from broker address */ + region_str = extract_region_from_broker(first_broker); + flb_free(first_broker); + + if (!region_str || flb_sds_len(region_str) == 0) { + flb_error("[aws_msk_iam] failed to extract region from broker address: %s", + kafka_ctx->brokers); flb_free(ctx); - if (region_str) flb_free(region_str); + if (region_str) { + flb_sds_destroy(region_str); + } return NULL; } + + flb_info("[aws_msk_iam] extracted region '%s' from broker address%s", + region_str, ctx->is_serverless ? " (Serverless)" : ""); - ctx->region = flb_sds_create(region_str); - flb_free(region_str); + ctx->region = region_str; if (!ctx->region) { - flb_sds_destroy(ctx->cluster_arn); flb_free(ctx); return NULL; } @@ -633,7 +676,6 @@ struct flb_aws_msk_iam *flb_aws_msk_iam_register_oauth_cb(struct flb_config *con if (!ctx->cred_tls) { flb_error("[aws_msk_iam] failed to create TLS instance"); flb_sds_destroy(ctx->region); - flb_sds_destroy(ctx->cluster_arn); flb_free(ctx); return NULL; } @@ -649,7 +691,6 @@ struct flb_aws_msk_iam *flb_aws_msk_iam_register_oauth_cb(struct flb_config *con flb_error("[aws_msk_iam] failed to create AWS credentials provider"); flb_tls_destroy(ctx->cred_tls); flb_sds_destroy(ctx->region); - flb_sds_destroy(ctx->cluster_arn); flb_free(ctx); return NULL; } @@ -661,7 +702,6 @@ struct flb_aws_msk_iam *flb_aws_msk_iam_register_oauth_cb(struct flb_config *con flb_aws_provider_destroy(ctx->provider); flb_tls_destroy(ctx->cred_tls); flb_sds_destroy(ctx->region); - flb_sds_destroy(ctx->cluster_arn); flb_free(ctx); return NULL; } @@ -693,8 +733,5 @@ void flb_aws_msk_iam_destroy(struct flb_aws_msk_iam *ctx) if (ctx->region) { flb_sds_destroy(ctx->region); } - if (ctx->cluster_arn) { - flb_sds_destroy(ctx->cluster_arn); - } flb_free(ctx); } From 70280b6f1c58f09208de03ceb8549576116ce86a Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 3 Dec 2025 06:03:00 +0000 Subject: [PATCH 16/37] workflows: bump actions/checkout from 4 to 6 Bumps [actions/checkout](https://github.com/actions/checkout) from 4 to 6. - [Release notes](https://github.com/actions/checkout/releases) - [Changelog](https://github.com/actions/checkout/blob/main/CHANGELOG.md) - [Commits](https://github.com/actions/checkout/compare/v4...v6) --- updated-dependencies: - dependency-name: actions/checkout dependency-version: '6' dependency-type: direct:production update-type: version-update:semver-major ... 
Signed-off-by: dependabot[bot] --- .github/workflows/commit-lint.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/commit-lint.yaml b/.github/workflows/commit-lint.yaml index 75b8b79cb3b..4ab654f9fdd 100644 --- a/.github/workflows/commit-lint.yaml +++ b/.github/workflows/commit-lint.yaml @@ -13,7 +13,7 @@ jobs: steps: - name: Checkout code - uses: actions/checkout@v4 + uses: actions/checkout@v6 with: fetch-depth: 50 # needed to see ancestor commits From 7e9845f17eb6ea90524bf66a407297a5ce307870 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 3 Dec 2025 06:03:08 +0000 Subject: [PATCH 17/37] workflows: bump actions/setup-python from 5 to 6 Bumps [actions/setup-python](https://github.com/actions/setup-python) from 5 to 6. - [Release notes](https://github.com/actions/setup-python/releases) - [Commits](https://github.com/actions/setup-python/compare/v5...v6) --- updated-dependencies: - dependency-name: actions/setup-python dependency-version: '6' dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] --- .github/workflows/commit-lint.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/commit-lint.yaml b/.github/workflows/commit-lint.yaml index 4ab654f9fdd..e3100ab99da 100644 --- a/.github/workflows/commit-lint.yaml +++ b/.github/workflows/commit-lint.yaml @@ -23,7 +23,7 @@ jobs: git fetch origin ${{ github.event.pull_request.base.ref }}:origin/${{ github.event.pull_request.base.ref }} - name: Set up Python - uses: actions/setup-python@v5 + uses: actions/setup-python@v6 with: python-version: '3.10' From 4f228782c34be4cce1f8d9d1886a0318042ea5db Mon Sep 17 00:00:00 2001 From: Marcus Sorensen Date: Fri, 14 Nov 2025 20:56:58 -0700 Subject: [PATCH 18/37] dockerfile: Docker image to support large page sizes Signed-off-by: Marcus Sorensen --- dockerfiles/Dockerfile.largepage | 277 +++++++++++++++++++++++++++++++ 1 file changed, 277 insertions(+) create mode 100644 dockerfiles/Dockerfile.largepage diff --git a/dockerfiles/Dockerfile.largepage b/dockerfiles/Dockerfile.largepage new file mode 100644 index 00000000000..daef9ba16cb --- /dev/null +++ b/dockerfiles/Dockerfile.largepage @@ -0,0 +1,277 @@ +# syntax=docker/dockerfile:1 +# check=skip=InvalidBaseImagePlatform + +# To use this container you may need to do the following: +# https://askubuntu.com/a/1369504 +# sudo add-apt-repository ppa:jacob/virtualisation #(for Ubuntu 20.04) +# sudo apt-get update && sudo apt-get install qemu qemu-user qemu-user-static +# https://stackoverflow.com/a/60667468 +# docker run --rm --privileged multiarch/qemu-user-static --reset -p yes +# docker buildx rm builder +# docker buildx create --name builder --use +# docker buildx inspect --bootstrap +# docker buildx build --platform "linux/amd64,linux/arm64,linux/arm/v7,linux/s390x" -f ./dockerfiles/Dockerfile.multiarch --build-arg FLB_TARBALL=https://github.com/fluent/fluent-bit/archive/v1.8.11.tar.gz ./dockerfiles/ + +# Set this to the current release version: it gets done so as part of the release. 
+ARG RELEASE_VERSION=4.2.1 + +# For multi-arch builds - assumption is running on an AMD64 host +FROM multiarch/qemu-user-static:x86_64-arm AS qemu-arm32 +FROM multiarch/qemu-user-static:x86_64-aarch64 AS qemu-arm64 + +FROM debian:bookworm-slim AS builder-base + +COPY --from=qemu-arm32 /usr/bin/qemu-arm-static /usr/bin/ +COPY --from=qemu-arm64 /usr/bin/qemu-aarch64-static /usr/bin/ + +ARG FLB_NIGHTLY_BUILD +ENV FLB_NIGHTLY_BUILD=$FLB_NIGHTLY_BUILD + +ARG FLB_CHUNK_TRACE=On +ENV FLB_CHUNK_TRACE=${FLB_CHUNK_TRACE} + +RUN mkdir -p /fluent-bit/bin /fluent-bit/etc /fluent-bit/log + +ENV DEBIAN_FRONTEND=noninteractive + +# hadolint ignore=DL3008 +RUN echo "deb http://deb.debian.org/debian bookworm-backports main" >> /etc/apt/sources.list && \ + apt-get update && \ + apt-get install -y --no-install-recommends \ + build-essential \ + curl \ + ca-certificates \ + git \ + make \ + tar \ + libssl-dev \ + libcurl4-openssl-dev \ + libsasl2-dev \ + pkg-config \ + libsystemd-dev/bookworm-backports \ + zlib1g-dev \ + libpq-dev \ + postgresql-server-dev-all \ + flex \ + bison \ + libyaml-dev \ + && apt-get satisfy -y cmake "cmake (<< 4.0)" \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +# Must be run from root of repo +WORKDIR /src/fluent-bit/ +COPY . ./ + +# We split the builder setup out so people can target it or use as a base image without doing a full build. +FROM builder-base AS builder +WORKDIR /src/fluent-bit/build/ + +# Required to be set to ARMV7 for that target +ARG WAMR_BUILD_TARGET +ARG EXTRA_CMAKE_FLAGS +ENV EXTRA_CMAKE_FLAGS=${EXTRA_CMAKE_FLAGS} + +# Enable jemalloc large page support via CMake option introduced in 5ca1c93 +ARG FLB_JEMALLOC_OPTIONS="--with-lg-page=16 --with-lg-quantum=3" +ENV FLB_JEMALLOC_OPTIONS=${FLB_JEMALLOC_OPTIONS} + +# We do not want word splitting for EXTRA_CMAKE_FLAGS in case multiple are defined +# hadolint ignore=SC2086 +RUN [ -n "${WAMR_BUILD_TARGET:-}" ] && EXTRA_CMAKE_FLAGS="$EXTRA_CMAKE_FLAGS -DWAMR_BUILD_TARGET=$WAMR_BUILD_TARGET"; \ + cmake -DFLB_SIMD=On \ + -DFLB_RELEASE=On \ + -DFLB_JEMALLOC=On \ + -DFLB_TLS=On \ + -DFLB_SHARED_LIB=Off \ + -DFLB_EXAMPLES=Off \ + -DFLB_HTTP_SERVER=On \ + -DFLB_IN_EXEC=Off \ + -DFLB_IN_SYSTEMD=On \ + -DFLB_OUT_KAFKA=On \ + -DFLB_OUT_PGSQL=On \ + -DFLB_NIGHTLY_BUILD="$FLB_NIGHTLY_BUILD" \ + -DFLB_LOG_NO_CONTROL_CHARS=On \ + -DFLB_CHUNK_TRACE="$FLB_CHUNK_TRACE" \ + -DFLB_JEMALLOC_OPTIONS="$FLB_JEMALLOC_OPTIONS" \ + $EXTRA_CMAKE_FLAGS \ + .. + +ARG CFLAGS="-v" +ENV CFLAGS=${CFLAGS} + +RUN make -j "$(getconf _NPROCESSORS_ONLN)" +RUN install bin/fluent-bit /fluent-bit/bin/ + +# Configuration files +COPY conf/fluent-bit.conf \ + conf/parsers.conf \ + conf/parsers_ambassador.conf \ + conf/parsers_java.conf \ + conf/parsers_extra.conf \ + conf/parsers_openstack.conf \ + conf/parsers_cinder.conf \ + conf/plugins.conf \ + /fluent-bit/etc/ + +# Generate schema and include as part of the container image +RUN /fluent-bit/bin/fluent-bit -J > /fluent-bit/etc/schema.json + +# Simple example of how to properly extract packages for reuse in distroless +# Taken from: https://github.com/GoogleContainerTools/distroless/issues/863 +FROM debian:bookworm-slim AS deb-extractor +COPY --from=qemu-arm32 /usr/bin/qemu-arm-static /usr/bin/ +COPY --from=qemu-arm64 /usr/bin/qemu-aarch64-static /usr/bin/ + +# We download all debs locally then extract them into a directory we can use as the root for distroless. +# We also include some extra handling for the status files that some tooling uses for scanning, etc. 
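+# Each package's control file is written out under /var/lib/dpkg/status.d/ by the
+# extraction loop below; that per-package layout is what distroless-aware image
+# scanners read when inventorying an image that has no dpkg database of its own.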
+WORKDIR /tmp +SHELL ["/bin/bash", "-o", "pipefail", "-c"] +RUN echo "deb http://deb.debian.org/debian bookworm-backports main" >> /etc/apt/sources.list && \ + apt-get update && \ + apt-get download \ + libssl3 \ + libcurl4 \ + libnghttp2-14 \ + librtmp1 \ + libssh2-1 \ + libpsl5 \ + libbrotli1 \ + libsasl2-2 \ + pkg-config \ + libpq5 \ + libsystemd0/bookworm-backports \ + zlib1g \ + ca-certificates \ + libatomic1 \ + libgcrypt20 \ + libzstd1 \ + liblz4-1 \ + libgssapi-krb5-2 \ + libldap-2.5 \ + libgpg-error0 \ + libkrb5-3 \ + libk5crypto3 \ + libcom-err2 \ + libkrb5support0 \ + libgnutls30 \ + libkeyutils1 \ + libp11-kit0 \ + libidn2-0 \ + libunistring2 \ + libtasn1-6 \ + libnettle8 \ + libhogweed6 \ + libgmp10 \ + libffi8 \ + liblzma5 \ + libyaml-0-2 \ + libcap2 \ + && \ + mkdir -p /dpkg/var/lib/dpkg/status.d/ && \ + for deb in *.deb; do \ + package_name=$(dpkg-deb -I "${deb}" | awk '/^ Package: .*$/ {print $2}'); \ + echo "Processing: ${package_name}"; \ + dpkg --ctrl-tarfile "$deb" | tar -Oxf - ./control > "/dpkg/var/lib/dpkg/status.d/${package_name}"; \ + dpkg --extract "$deb" /dpkg || exit 10; \ + done + +# Remove unnecessary files extracted from deb packages like man pages and docs etc. +RUN find /dpkg/ -type d -empty -delete && \ + rm -r /dpkg/usr/share/doc/ + +# We want latest at time of build +# hadolint ignore=DL3006 +FROM gcr.io/distroless/cc-debian12 AS production +ARG RELEASE_VERSION +ENV FLUENT_BIT_VERSION=${RELEASE_VERSION} +LABEL description="Fluent Bit multi-architecture container image" \ + vendor="Fluent Organization" \ + version="${RELEASE_VERSION}" \ + author="Eduardo Silva " \ + org.opencontainers.image.description="Fluent Bit container image" \ + org.opencontainers.image.title="Fluent Bit" \ + org.opencontainers.image.licenses="Apache-2.0" \ + org.opencontainers.image.vendor="Fluent Organization" \ + org.opencontainers.image.version="${RELEASE_VERSION}" \ + org.opencontainers.image.source="https://github.com/fluent/fluent-bit" \ + org.opencontainers.image.documentation="https://docs.fluentbit.io/" \ + org.opencontainers.image.authors="Eduardo Silva " + +# Copy the libraries from the extractor stage into root +COPY --from=deb-extractor /dpkg / + +# Copy certificates +COPY --from=builder /etc/ssl/certs /etc/ssl/certs + +# Finally the binaries as most likely to change +COPY --from=builder /fluent-bit /fluent-bit + +EXPOSE 2020 + +# Entry point +ENTRYPOINT [ "/fluent-bit/bin/fluent-bit" ] +CMD ["/fluent-bit/bin/fluent-bit", "-c", "/fluent-bit/etc/fluent-bit.conf"] + +FROM debian:bookworm-slim AS debug +ARG RELEASE_VERSION +ENV FLUENT_BIT_VERSION=${RELEASE_VERSION} +LABEL description="Fluent Bit multi-architecture debug container image" \ + vendor="Fluent Organization" \ + version="${RELEASE_VERSION}-debug" \ + author="Eduardo Silva " \ + org.opencontainers.image.description="Fluent Bit debug container image" \ + org.opencontainers.image.title="Fluent Bit Debug" \ + org.opencontainers.image.licenses="Apache-2.0" \ + org.opencontainers.image.vendor="Fluent Organization" \ + org.opencontainers.image.version="${RELEASE_VERSION}-debug" \ + org.opencontainers.image.source="https://github.com/fluent/fluent-bit" \ + org.opencontainers.image.documentation="https://docs.fluentbit.io/" \ + org.opencontainers.image.authors="Eduardo Silva " + +COPY --from=qemu-arm32 /usr/bin/qemu-arm-static /usr/bin/ +COPY --from=qemu-arm64 /usr/bin/qemu-aarch64-static /usr/bin/ +ENV DEBIAN_FRONTEND=noninteractive + +# hadolint ignore=DL3008 +RUN echo "deb http://deb.debian.org/debian 
bookworm-backports main" >> /etc/apt/sources.list && \ + apt-get update && \ + apt-get install -y --no-install-recommends \ + libssl3 \ + libcurl4 \ + libnghttp2-14 \ + librtmp1 \ + libssh2-1 \ + libpsl5 \ + libbrotli1 \ + libsasl2-2 \ + pkg-config \ + libpq5 \ + libsystemd0/bookworm-backports \ + zlib1g \ + ca-certificates \ + libatomic1 \ + libgcrypt20 \ + libyaml-0-2 \ + bash gdb valgrind build-essential \ + git bash-completion vim tmux jq \ + dnsutils iputils-ping iputils-arping iputils-tracepath iputils-clockdiff \ + tcpdump curl nmap tcpflow iftop \ + net-tools mtr netcat-openbsd bridge-utils iperf ngrep \ + openssl \ + htop atop strace iotop sysstat ncdu logrotate hdparm pciutils psmisc tree pv \ + make tar flex bison \ + libssl-dev libsasl2-dev libsystemd-dev/bookworm-backports zlib1g-dev libpq-dev libyaml-dev postgresql-server-dev-all \ + && apt-get satisfy -y cmake "cmake (<< 4.0)" \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +RUN rm -f /usr/bin/qemu-*-static +COPY --from=builder /fluent-bit /fluent-bit + +EXPOSE 2020 + +# No entry point so we can just shell in +CMD ["/fluent-bit/bin/fluent-bit", "-c", "/fluent-bit/etc/fluent-bit.conf"] + + From e800300b0f1e490dfcbbc89a714f48775e7a97cd Mon Sep 17 00:00:00 2001 From: Marcus Sorensen Date: Mon, 17 Nov 2025 11:20:11 -0700 Subject: [PATCH 19/37] dockerfile: allow customization of FLB_JEMALLOC_OPTIONS Signed-off-by: Marcus Sorensen --- dockerfiles/Dockerfile | 5 + dockerfiles/Dockerfile.largepage | 277 ------------------------------- 2 files changed, 5 insertions(+), 277 deletions(-) delete mode 100644 dockerfiles/Dockerfile.largepage diff --git a/dockerfiles/Dockerfile b/dockerfiles/Dockerfile index 33a832888c9..4d19e1877b0 100644 --- a/dockerfiles/Dockerfile +++ b/dockerfiles/Dockerfile @@ -72,6 +72,10 @@ ARG WAMR_BUILD_TARGET ARG EXTRA_CMAKE_FLAGS ENV EXTRA_CMAKE_FLAGS=${EXTRA_CMAKE_FLAGS} +# Optional: jemalloc configure flags (e.g., page size). Leave unset to keep defaults. +ARG FLB_JEMALLOC_OPTIONS +ENV FLB_JEMALLOC_OPTIONS=${FLB_JEMALLOC_OPTIONS} + # We do not want word splitting for EXTRA_CMAKE_FLAGS in case multiple are defined # hadolint ignore=SC2086 RUN [ -n "${WAMR_BUILD_TARGET:-}" ] && EXTRA_CMAKE_FLAGS="$EXTRA_CMAKE_FLAGS -DWAMR_BUILD_TARGET=$WAMR_BUILD_TARGET"; \ @@ -89,6 +93,7 @@ RUN [ -n "${WAMR_BUILD_TARGET:-}" ] && EXTRA_CMAKE_FLAGS="$EXTRA_CMAKE_FLAGS -DW -DFLB_NIGHTLY_BUILD="$FLB_NIGHTLY_BUILD" \ -DFLB_LOG_NO_CONTROL_CHARS=On \ -DFLB_CHUNK_TRACE="$FLB_CHUNK_TRACE" \ + -DFLB_JEMALLOC_OPTIONS="$FLB_JEMALLOC_OPTIONS" \ $EXTRA_CMAKE_FLAGS \ .. 
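A note on choosing a value for FLB_JEMALLOC_OPTIONS: jemalloc fixes its page size at
configure time, so a binary built with the usual lg-page of 12 (4 KiB) can refuse to
start on kernels that use 16 KiB or 64 KiB pages. The following is a minimal,
standalone sketch (illustrative only, not part of the build) that prints the host
page size and the matching --with-lg-page exponent:

    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
        /* the page size is a power of two; lg-page is its base-2 logarithm:
         * 4096 -> 12, 16384 -> 14, 65536 -> 16 */
        long page = sysconf(_SC_PAGESIZE);
        long p;
        int lg = 0;

        for (p = page; p > 1; p >>= 1) {
            lg++;
        }
        printf("page size: %ld bytes (--with-lg-page=%d)\n", page, lg);
        return 0;
    }

The deleted Dockerfile.largepage below defaulted to "--with-lg-page=16
--with-lg-quantum=3" (64 KiB pages); the main Dockerfile now leaves
FLB_JEMALLOC_OPTIONS unset, so jemalloc's own defaults apply unless a build
argument overrides them to match the deployment target.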
diff --git a/dockerfiles/Dockerfile.largepage b/dockerfiles/Dockerfile.largepage deleted file mode 100644 index daef9ba16cb..00000000000 --- a/dockerfiles/Dockerfile.largepage +++ /dev/null @@ -1,277 +0,0 @@ -# syntax=docker/dockerfile:1 -# check=skip=InvalidBaseImagePlatform - -# To use this container you may need to do the following: -# https://askubuntu.com/a/1369504 -# sudo add-apt-repository ppa:jacob/virtualisation #(for Ubuntu 20.04) -# sudo apt-get update && sudo apt-get install qemu qemu-user qemu-user-static -# https://stackoverflow.com/a/60667468 -# docker run --rm --privileged multiarch/qemu-user-static --reset -p yes -# docker buildx rm builder -# docker buildx create --name builder --use -# docker buildx inspect --bootstrap -# docker buildx build --platform "linux/amd64,linux/arm64,linux/arm/v7,linux/s390x" -f ./dockerfiles/Dockerfile.multiarch --build-arg FLB_TARBALL=https://github.com/fluent/fluent-bit/archive/v1.8.11.tar.gz ./dockerfiles/ - -# Set this to the current release version: it gets done so as part of the release. -ARG RELEASE_VERSION=4.2.1 - -# For multi-arch builds - assumption is running on an AMD64 host -FROM multiarch/qemu-user-static:x86_64-arm AS qemu-arm32 -FROM multiarch/qemu-user-static:x86_64-aarch64 AS qemu-arm64 - -FROM debian:bookworm-slim AS builder-base - -COPY --from=qemu-arm32 /usr/bin/qemu-arm-static /usr/bin/ -COPY --from=qemu-arm64 /usr/bin/qemu-aarch64-static /usr/bin/ - -ARG FLB_NIGHTLY_BUILD -ENV FLB_NIGHTLY_BUILD=$FLB_NIGHTLY_BUILD - -ARG FLB_CHUNK_TRACE=On -ENV FLB_CHUNK_TRACE=${FLB_CHUNK_TRACE} - -RUN mkdir -p /fluent-bit/bin /fluent-bit/etc /fluent-bit/log - -ENV DEBIAN_FRONTEND=noninteractive - -# hadolint ignore=DL3008 -RUN echo "deb http://deb.debian.org/debian bookworm-backports main" >> /etc/apt/sources.list && \ - apt-get update && \ - apt-get install -y --no-install-recommends \ - build-essential \ - curl \ - ca-certificates \ - git \ - make \ - tar \ - libssl-dev \ - libcurl4-openssl-dev \ - libsasl2-dev \ - pkg-config \ - libsystemd-dev/bookworm-backports \ - zlib1g-dev \ - libpq-dev \ - postgresql-server-dev-all \ - flex \ - bison \ - libyaml-dev \ - && apt-get satisfy -y cmake "cmake (<< 4.0)" \ - && apt-get clean \ - && rm -rf /var/lib/apt/lists/* - -# Must be run from root of repo -WORKDIR /src/fluent-bit/ -COPY . ./ - -# We split the builder setup out so people can target it or use as a base image without doing a full build. -FROM builder-base AS builder -WORKDIR /src/fluent-bit/build/ - -# Required to be set to ARMV7 for that target -ARG WAMR_BUILD_TARGET -ARG EXTRA_CMAKE_FLAGS -ENV EXTRA_CMAKE_FLAGS=${EXTRA_CMAKE_FLAGS} - -# Enable jemalloc large page support via CMake option introduced in 5ca1c93 -ARG FLB_JEMALLOC_OPTIONS="--with-lg-page=16 --with-lg-quantum=3" -ENV FLB_JEMALLOC_OPTIONS=${FLB_JEMALLOC_OPTIONS} - -# We do not want word splitting for EXTRA_CMAKE_FLAGS in case multiple are defined -# hadolint ignore=SC2086 -RUN [ -n "${WAMR_BUILD_TARGET:-}" ] && EXTRA_CMAKE_FLAGS="$EXTRA_CMAKE_FLAGS -DWAMR_BUILD_TARGET=$WAMR_BUILD_TARGET"; \ - cmake -DFLB_SIMD=On \ - -DFLB_RELEASE=On \ - -DFLB_JEMALLOC=On \ - -DFLB_TLS=On \ - -DFLB_SHARED_LIB=Off \ - -DFLB_EXAMPLES=Off \ - -DFLB_HTTP_SERVER=On \ - -DFLB_IN_EXEC=Off \ - -DFLB_IN_SYSTEMD=On \ - -DFLB_OUT_KAFKA=On \ - -DFLB_OUT_PGSQL=On \ - -DFLB_NIGHTLY_BUILD="$FLB_NIGHTLY_BUILD" \ - -DFLB_LOG_NO_CONTROL_CHARS=On \ - -DFLB_CHUNK_TRACE="$FLB_CHUNK_TRACE" \ - -DFLB_JEMALLOC_OPTIONS="$FLB_JEMALLOC_OPTIONS" \ - $EXTRA_CMAKE_FLAGS \ - .. 
- -ARG CFLAGS="-v" -ENV CFLAGS=${CFLAGS} - -RUN make -j "$(getconf _NPROCESSORS_ONLN)" -RUN install bin/fluent-bit /fluent-bit/bin/ - -# Configuration files -COPY conf/fluent-bit.conf \ - conf/parsers.conf \ - conf/parsers_ambassador.conf \ - conf/parsers_java.conf \ - conf/parsers_extra.conf \ - conf/parsers_openstack.conf \ - conf/parsers_cinder.conf \ - conf/plugins.conf \ - /fluent-bit/etc/ - -# Generate schema and include as part of the container image -RUN /fluent-bit/bin/fluent-bit -J > /fluent-bit/etc/schema.json - -# Simple example of how to properly extract packages for reuse in distroless -# Taken from: https://github.com/GoogleContainerTools/distroless/issues/863 -FROM debian:bookworm-slim AS deb-extractor -COPY --from=qemu-arm32 /usr/bin/qemu-arm-static /usr/bin/ -COPY --from=qemu-arm64 /usr/bin/qemu-aarch64-static /usr/bin/ - -# We download all debs locally then extract them into a directory we can use as the root for distroless. -# We also include some extra handling for the status files that some tooling uses for scanning, etc. -WORKDIR /tmp -SHELL ["/bin/bash", "-o", "pipefail", "-c"] -RUN echo "deb http://deb.debian.org/debian bookworm-backports main" >> /etc/apt/sources.list && \ - apt-get update && \ - apt-get download \ - libssl3 \ - libcurl4 \ - libnghttp2-14 \ - librtmp1 \ - libssh2-1 \ - libpsl5 \ - libbrotli1 \ - libsasl2-2 \ - pkg-config \ - libpq5 \ - libsystemd0/bookworm-backports \ - zlib1g \ - ca-certificates \ - libatomic1 \ - libgcrypt20 \ - libzstd1 \ - liblz4-1 \ - libgssapi-krb5-2 \ - libldap-2.5 \ - libgpg-error0 \ - libkrb5-3 \ - libk5crypto3 \ - libcom-err2 \ - libkrb5support0 \ - libgnutls30 \ - libkeyutils1 \ - libp11-kit0 \ - libidn2-0 \ - libunistring2 \ - libtasn1-6 \ - libnettle8 \ - libhogweed6 \ - libgmp10 \ - libffi8 \ - liblzma5 \ - libyaml-0-2 \ - libcap2 \ - && \ - mkdir -p /dpkg/var/lib/dpkg/status.d/ && \ - for deb in *.deb; do \ - package_name=$(dpkg-deb -I "${deb}" | awk '/^ Package: .*$/ {print $2}'); \ - echo "Processing: ${package_name}"; \ - dpkg --ctrl-tarfile "$deb" | tar -Oxf - ./control > "/dpkg/var/lib/dpkg/status.d/${package_name}"; \ - dpkg --extract "$deb" /dpkg || exit 10; \ - done - -# Remove unnecessary files extracted from deb packages like man pages and docs etc. 
-RUN find /dpkg/ -type d -empty -delete && \ - rm -r /dpkg/usr/share/doc/ - -# We want latest at time of build -# hadolint ignore=DL3006 -FROM gcr.io/distroless/cc-debian12 AS production -ARG RELEASE_VERSION -ENV FLUENT_BIT_VERSION=${RELEASE_VERSION} -LABEL description="Fluent Bit multi-architecture container image" \ - vendor="Fluent Organization" \ - version="${RELEASE_VERSION}" \ - author="Eduardo Silva " \ - org.opencontainers.image.description="Fluent Bit container image" \ - org.opencontainers.image.title="Fluent Bit" \ - org.opencontainers.image.licenses="Apache-2.0" \ - org.opencontainers.image.vendor="Fluent Organization" \ - org.opencontainers.image.version="${RELEASE_VERSION}" \ - org.opencontainers.image.source="https://github.com/fluent/fluent-bit" \ - org.opencontainers.image.documentation="https://docs.fluentbit.io/" \ - org.opencontainers.image.authors="Eduardo Silva " - -# Copy the libraries from the extractor stage into root -COPY --from=deb-extractor /dpkg / - -# Copy certificates -COPY --from=builder /etc/ssl/certs /etc/ssl/certs - -# Finally the binaries as most likely to change -COPY --from=builder /fluent-bit /fluent-bit - -EXPOSE 2020 - -# Entry point -ENTRYPOINT [ "/fluent-bit/bin/fluent-bit" ] -CMD ["/fluent-bit/bin/fluent-bit", "-c", "/fluent-bit/etc/fluent-bit.conf"] - -FROM debian:bookworm-slim AS debug -ARG RELEASE_VERSION -ENV FLUENT_BIT_VERSION=${RELEASE_VERSION} -LABEL description="Fluent Bit multi-architecture debug container image" \ - vendor="Fluent Organization" \ - version="${RELEASE_VERSION}-debug" \ - author="Eduardo Silva " \ - org.opencontainers.image.description="Fluent Bit debug container image" \ - org.opencontainers.image.title="Fluent Bit Debug" \ - org.opencontainers.image.licenses="Apache-2.0" \ - org.opencontainers.image.vendor="Fluent Organization" \ - org.opencontainers.image.version="${RELEASE_VERSION}-debug" \ - org.opencontainers.image.source="https://github.com/fluent/fluent-bit" \ - org.opencontainers.image.documentation="https://docs.fluentbit.io/" \ - org.opencontainers.image.authors="Eduardo Silva " - -COPY --from=qemu-arm32 /usr/bin/qemu-arm-static /usr/bin/ -COPY --from=qemu-arm64 /usr/bin/qemu-aarch64-static /usr/bin/ -ENV DEBIAN_FRONTEND=noninteractive - -# hadolint ignore=DL3008 -RUN echo "deb http://deb.debian.org/debian bookworm-backports main" >> /etc/apt/sources.list && \ - apt-get update && \ - apt-get install -y --no-install-recommends \ - libssl3 \ - libcurl4 \ - libnghttp2-14 \ - librtmp1 \ - libssh2-1 \ - libpsl5 \ - libbrotli1 \ - libsasl2-2 \ - pkg-config \ - libpq5 \ - libsystemd0/bookworm-backports \ - zlib1g \ - ca-certificates \ - libatomic1 \ - libgcrypt20 \ - libyaml-0-2 \ - bash gdb valgrind build-essential \ - git bash-completion vim tmux jq \ - dnsutils iputils-ping iputils-arping iputils-tracepath iputils-clockdiff \ - tcpdump curl nmap tcpflow iftop \ - net-tools mtr netcat-openbsd bridge-utils iperf ngrep \ - openssl \ - htop atop strace iotop sysstat ncdu logrotate hdparm pciutils psmisc tree pv \ - make tar flex bison \ - libssl-dev libsasl2-dev libsystemd-dev/bookworm-backports zlib1g-dev libpq-dev libyaml-dev postgresql-server-dev-all \ - && apt-get satisfy -y cmake "cmake (<< 4.0)" \ - && apt-get clean \ - && rm -rf /var/lib/apt/lists/* - -RUN rm -f /usr/bin/qemu-*-static -COPY --from=builder /fluent-bit /fluent-bit - -EXPOSE 2020 - -# No entry point so we can just shell in -CMD ["/fluent-bit/bin/fluent-bit", "-c", "/fluent-bit/etc/fluent-bit.conf"] - - From 420eb671fd5b9087ab2a3899320b0dc01758ff71 
Mon Sep 17 00:00:00 2001 From: Patrick Stephens Date: Thu, 9 Oct 2025 10:07:08 +0100 Subject: [PATCH 20/37] dockerfile: update to Debian Trixie Signed-off-by: Patrick Stephens --- dockerfiles/Dockerfile | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/dockerfiles/Dockerfile b/dockerfiles/Dockerfile index 4d19e1877b0..8e8f16e8bc5 100644 --- a/dockerfiles/Dockerfile +++ b/dockerfiles/Dockerfile @@ -19,7 +19,7 @@ ARG RELEASE_VERSION=4.2.1 FROM multiarch/qemu-user-static:x86_64-arm AS qemu-arm32 FROM multiarch/qemu-user-static:x86_64-aarch64 AS qemu-arm64 -FROM debian:bookworm-slim AS builder-base +FROM debian:trixie-slim AS builder-base COPY --from=qemu-arm32 /usr/bin/qemu-arm-static /usr/bin/ COPY --from=qemu-arm64 /usr/bin/qemu-aarch64-static /usr/bin/ @@ -35,7 +35,7 @@ RUN mkdir -p /fluent-bit/bin /fluent-bit/etc /fluent-bit/log ENV DEBIAN_FRONTEND=noninteractive # hadolint ignore=DL3008 -RUN echo "deb http://deb.debian.org/debian bookworm-backports main" >> /etc/apt/sources.list && \ +RUN echo "deb http://deb.debian.org/debian trixie-backports main" >> /etc/apt/sources.list && \ apt-get update && \ apt-get install -y --no-install-recommends \ build-essential \ @@ -48,7 +48,7 @@ RUN echo "deb http://deb.debian.org/debian bookworm-backports main" >> /etc/apt/ libcurl4-openssl-dev \ libsasl2-dev \ pkg-config \ - libsystemd-dev/bookworm-backports \ + libsystemd-dev/trixie-backports \ zlib1g-dev \ libpq-dev \ postgresql-server-dev-all \ @@ -119,7 +119,7 @@ RUN /fluent-bit/bin/fluent-bit -J > /fluent-bit/etc/schema.json # Simple example of how to properly extract packages for reuse in distroless # Taken from: https://github.com/GoogleContainerTools/distroless/issues/863 -FROM debian:bookworm-slim AS deb-extractor +FROM debian:trixie-slim AS deb-extractor COPY --from=qemu-arm32 /usr/bin/qemu-arm-static /usr/bin/ COPY --from=qemu-arm64 /usr/bin/qemu-aarch64-static /usr/bin/ @@ -127,7 +127,7 @@ COPY --from=qemu-arm64 /usr/bin/qemu-aarch64-static /usr/bin/ # We also include some extra handling for the status files that some tooling uses for scanning, etc. 
WORKDIR /tmp SHELL ["/bin/bash", "-o", "pipefail", "-c"] -RUN echo "deb http://deb.debian.org/debian bookworm-backports main" >> /etc/apt/sources.list && \ +RUN echo "deb http://deb.debian.org/debian trixie-backports main" >> /etc/apt/sources.list && \ apt-get update && \ apt-get download \ libssl3 \ @@ -140,7 +140,7 @@ RUN echo "deb http://deb.debian.org/debian bookworm-backports main" >> /etc/apt/ libsasl2-2 \ pkg-config \ libpq5 \ - libsystemd0/bookworm-backports \ + libsystemd0/trixie-backports \ zlib1g \ ca-certificates \ libatomic1 \ @@ -182,7 +182,7 @@ RUN find /dpkg/ -type d -empty -delete && \ # We want latest at time of build # hadolint ignore=DL3006 -FROM gcr.io/distroless/cc-debian12 AS production +FROM gcr.io/distroless/cc-debian13 AS production ARG RELEASE_VERSION ENV FLUENT_BIT_VERSION=${RELEASE_VERSION} LABEL description="Fluent Bit multi-architecture container image" \ @@ -213,7 +213,7 @@ EXPOSE 2020 ENTRYPOINT [ "/fluent-bit/bin/fluent-bit" ] CMD ["/fluent-bit/bin/fluent-bit", "-c", "/fluent-bit/etc/fluent-bit.conf"] -FROM debian:bookworm-slim AS debug +FROM debian:trixie-slim AS debug ARG RELEASE_VERSION ENV FLUENT_BIT_VERSION=${RELEASE_VERSION} LABEL description="Fluent Bit multi-architecture debug container image" \ @@ -234,7 +234,7 @@ COPY --from=qemu-arm64 /usr/bin/qemu-aarch64-static /usr/bin/ ENV DEBIAN_FRONTEND=noninteractive # hadolint ignore=DL3008 -RUN echo "deb http://deb.debian.org/debian bookworm-backports main" >> /etc/apt/sources.list && \ +RUN echo "deb http://deb.debian.org/debian trixie-backports main" >> /etc/apt/sources.list && \ apt-get update && \ apt-get install -y --no-install-recommends \ libssl3 \ @@ -247,7 +247,7 @@ RUN echo "deb http://deb.debian.org/debian bookworm-backports main" >> /etc/apt/ libsasl2-2 \ pkg-config \ libpq5 \ - libsystemd0/bookworm-backports \ + libsystemd0/trixie-backports \ zlib1g \ ca-certificates \ libatomic1 \ @@ -261,7 +261,7 @@ RUN echo "deb http://deb.debian.org/debian bookworm-backports main" >> /etc/apt/ openssl \ htop atop strace iotop sysstat ncdu logrotate hdparm pciutils psmisc tree pv \ make tar flex bison \ - libssl-dev libsasl2-dev libsystemd-dev/bookworm-backports zlib1g-dev libpq-dev libyaml-dev postgresql-server-dev-all \ + libssl-dev libsasl2-dev libsystemd-dev/trixie-backports zlib1g-dev libpq-dev libyaml-dev postgresql-server-dev-all \ && apt-get satisfy -y cmake "cmake (<< 4.0)" \ && apt-get clean \ && rm -rf /var/lib/apt/lists/* From b0e4bbe1cd1fdd7bdd607cf0a039e0ed6be6efee Mon Sep 17 00:00:00 2001 From: Patrick Stephens Date: Thu, 9 Oct 2025 10:13:17 +0100 Subject: [PATCH 21/37] dockerfile: install systemd libs from normal repo Signed-off-by: Patrick Stephens --- dockerfiles/Dockerfile | 45 +++++++++++++++++++++--------------------- 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/dockerfiles/Dockerfile b/dockerfiles/Dockerfile index 8e8f16e8bc5..f0cf7267f75 100644 --- a/dockerfiles/Dockerfile +++ b/dockerfiles/Dockerfile @@ -35,8 +35,7 @@ RUN mkdir -p /fluent-bit/bin /fluent-bit/etc /fluent-bit/log ENV DEBIAN_FRONTEND=noninteractive # hadolint ignore=DL3008 -RUN echo "deb http://deb.debian.org/debian trixie-backports main" >> /etc/apt/sources.list && \ - apt-get update && \ +RUN apt-get update && \ apt-get install -y --no-install-recommends \ build-essential \ curl \ @@ -48,7 +47,7 @@ RUN echo "deb http://deb.debian.org/debian trixie-backports main" >> /etc/apt/so libcurl4-openssl-dev \ libsasl2-dev \ pkg-config \ - libsystemd-dev/trixie-backports \ + libsystemd-dev \ zlib1g-dev \ 
libpq-dev \ postgresql-server-dev-all \ @@ -127,20 +126,20 @@ COPY --from=qemu-arm64 /usr/bin/qemu-aarch64-static /usr/bin/ # We also include some extra handling for the status files that some tooling uses for scanning, etc. WORKDIR /tmp SHELL ["/bin/bash", "-o", "pipefail", "-c"] -RUN echo "deb http://deb.debian.org/debian trixie-backports main" >> /etc/apt/sources.list && \ - apt-get update && \ +RUN apt-get update && \ apt-get download \ - libssl3 \ - libcurl4 \ + libssl3t64 \ + libcurl4t64 \ libnghttp2-14 \ + libnghttp3-9 \ librtmp1 \ - libssh2-1 \ - libpsl5 \ + libssh2-1t64 \ + libpsl5t64 \ libbrotli1 \ libsasl2-2 \ pkg-config \ libpq5 \ - libsystemd0/trixie-backports \ + libsystemd0 \ zlib1g \ ca-certificates \ libatomic1 \ @@ -154,19 +153,20 @@ RUN echo "deb http://deb.debian.org/debian trixie-backports main" >> /etc/apt/so libk5crypto3 \ libcom-err2 \ libkrb5support0 \ - libgnutls30 \ + libgnutls30t64 \ libkeyutils1 \ libp11-kit0 \ libidn2-0 \ - libunistring2 \ + libunistring5 \ libtasn1-6 \ - libnettle8 \ - libhogweed6 \ + libnettle8t64 \ + libhogweed6t64 \ libgmp10 \ libffi8 \ liblzma5 \ libyaml-0-2 \ libcap2 \ + libldap2 \ && \ mkdir -p /dpkg/var/lib/dpkg/status.d/ && \ for deb in *.deb; do \ @@ -234,25 +234,26 @@ COPY --from=qemu-arm64 /usr/bin/qemu-aarch64-static /usr/bin/ ENV DEBIAN_FRONTEND=noninteractive # hadolint ignore=DL3008 -RUN echo "deb http://deb.debian.org/debian trixie-backports main" >> /etc/apt/sources.list && \ - apt-get update && \ +RUN apt-get update && \ apt-get install -y --no-install-recommends \ - libssl3 \ - libcurl4 \ + libssl3t64 \ + libcurl4t64 \ libnghttp2-14 \ + libnghttp3-9 \ librtmp1 \ - libssh2-1 \ - libpsl5 \ + libssh2-1t64 \ + libpsl5t64 \ libbrotli1 \ libsasl2-2 \ pkg-config \ libpq5 \ - libsystemd0/trixie-backports \ + libsystemd0 \ zlib1g \ ca-certificates \ libatomic1 \ libgcrypt20 \ libyaml-0-2 \ + libldap2 \ bash gdb valgrind build-essential \ git bash-completion vim tmux jq \ dnsutils iputils-ping iputils-arping iputils-tracepath iputils-clockdiff \ @@ -261,7 +262,7 @@ RUN echo "deb http://deb.debian.org/debian trixie-backports main" >> /etc/apt/so openssl \ htop atop strace iotop sysstat ncdu logrotate hdparm pciutils psmisc tree pv \ make tar flex bison \ - libssl-dev libsasl2-dev libsystemd-dev/trixie-backports zlib1g-dev libpq-dev libyaml-dev postgresql-server-dev-all \ + libssl-dev libsasl2-dev libsystemd-dev zlib1g-dev libpq-dev libyaml-dev postgresql-server-dev-all \ && apt-get satisfy -y cmake "cmake (<< 4.0)" \ && apt-get clean \ && rm -rf /var/lib/apt/lists/* From e73dd18a5e0b6372301c5e866be7b0f477d6afc7 Mon Sep 17 00:00:00 2001 From: "Eric D. Schabell" Date: Fri, 28 Nov 2025 17:31:01 +0100 Subject: [PATCH 22/37] in_elasticsearch: fix missing http config parameter description (#11221) Signed-off-by: Eric D. Schabell --- plugins/in_elasticsearch/in_elasticsearch.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/plugins/in_elasticsearch/in_elasticsearch.c b/plugins/in_elasticsearch/in_elasticsearch.c index db09aa281ea..34daecc13c1 100644 --- a/plugins/in_elasticsearch/in_elasticsearch.c +++ b/plugins/in_elasticsearch/in_elasticsearch.c @@ -237,7 +237,7 @@ static struct flb_config_map config_map[] = { { FLB_CONFIG_MAP_BOOL, "http2", "true", 0, FLB_TRUE, offsetof(struct flb_in_elasticsearch, enable_http2), - NULL + "Enable HTTP/2 support" }, { From 6bc2b792142e6bee4234eaa33e0b3da260c79ae6 Mon Sep 17 00:00:00 2001 From: "Eric D. 
Schabell" Date: Mon, 1 Dec 2025 12:25:10 +0100 Subject: [PATCH 23/37] in_exec_wasi: fix config key typo 'bool' -> 'oneshot' The config_map entry for the oneshot option was incorrectly using 'bool' as the configuration key instead of 'oneshot'. This made the configuration inconsistent with the regular in_exec plugin and confusing for users. Signed-off-by: Eric D. Schabell --- plugins/in_exec_wasi/in_exec_wasi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/plugins/in_exec_wasi/in_exec_wasi.c b/plugins/in_exec_wasi/in_exec_wasi.c index d10f763e176..14ac06acdf1 100644 --- a/plugins/in_exec_wasi/in_exec_wasi.c +++ b/plugins/in_exec_wasi/in_exec_wasi.c @@ -453,7 +453,7 @@ static struct flb_config_map config_map[] = { "Set the buffer size" }, { - FLB_CONFIG_MAP_BOOL, "bool", "false", + FLB_CONFIG_MAP_BOOL, "oneshot", "false", 0, FLB_TRUE, offsetof(struct flb_exec_wasi, oneshot), "execute the command only once" }, From 42def2794c6048580a17ac784625ade4c6bcca65 Mon Sep 17 00:00:00 2001 From: "Eric D. Schabell" Date: Tue, 2 Dec 2025 09:28:32 +0100 Subject: [PATCH 24/37] in_forward: improve configuration parameter descriptions - shared_key: clarify it's for secure forward authentication - self_hostname: explain it's used in handshake for secure forward auth - unix_perm: add trailing period for consistency - empty_shared_key: clarify it enables empty string as shared key Signed-off-by: Eric D. Schabell --- plugins/in_forward/fw.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/plugins/in_forward/fw.c b/plugins/in_forward/fw.c index d33b36fbadc..504f1c7aed2 100644 --- a/plugins/in_forward/fw.c +++ b/plugins/in_forward/fw.c @@ -481,12 +481,12 @@ static struct flb_config_map config_map[] = { { FLB_CONFIG_MAP_STR, "shared_key", NULL, 0, FLB_TRUE, offsetof(struct flb_in_fw_config, shared_key), - "Shared key for authentication" + "Shared key for secure forward authentication." }, { FLB_CONFIG_MAP_STR, "self_hostname", NULL, 0, FLB_FALSE, 0, - "Hostname" + "Hostname used in the handshake process for secure forward authentication." }, { FLB_CONFIG_MAP_STR, "security.users", NULL, @@ -501,7 +501,7 @@ static struct flb_config_map config_map[] = { { FLB_CONFIG_MAP_STR, "unix_perm", (char *)NULL, 0, FLB_TRUE, offsetof(struct flb_in_fw_config, unix_perm_str), - "Set the permissions for the UNIX socket" + "Set the permissions for the UNIX socket." }, { FLB_CONFIG_MAP_SIZE, "buffer_chunk_size", FLB_IN_FW_CHUNK_SIZE, @@ -516,7 +516,7 @@ static struct flb_config_map config_map[] = { { FLB_CONFIG_MAP_BOOL, "empty_shared_key", "false", 0, FLB_TRUE, offsetof(struct flb_in_fw_config, empty_shared_key), - "Set an empty shared key for authentication" + "Enable an empty string as the shared key for authentication." 
    },
    {0}
};

From 4248c111116232222494bb453c0f090d388b9964 Mon Sep 17 00:00:00 2001
From: Hiroshi Hatake
Date: Thu, 4 Dec 2025 12:37:12 +0900
Subject: [PATCH 25/37] github: scripts: commit_linter: Extend the capability of prefix inference

Signed-off-by: Hiroshi Hatake
---
 .github/scripts/commit_prefix_check.py | 97 ++++++++++++++++++--------
 1 file changed, 68 insertions(+), 29 deletions(-)

diff --git a/.github/scripts/commit_prefix_check.py b/.github/scripts/commit_prefix_check.py
index 338d00dea9f..6043891a0cf 100644
--- a/.github/scripts/commit_prefix_check.py
+++ b/.github/scripts/commit_prefix_check.py
@@ -25,30 +25,64 @@
 # ------------------------------------------------
-# Identify expected prefix dynamically from file paths
+# Identify expected prefixes dynamically from file paths
 # ------------------------------------------------
 def infer_prefix_from_paths(paths):
+    """
+    Returns:
+    - prefixes: a set of allowed prefixes (including build:)
+    - build_optional: True when commit subject does not need to be build:
+      (i.e., when any real component — lib/tests/plugins/src — is touched)
+    """
     prefixes = set()
+    component_prefixes = set()
+    build_seen = False
+
+    for raw in paths:
+        # Normalize path separators (Windows compatibility)
+        p = raw.replace(os.sep, "/")
+        basename = os.path.basename(p)
+
+        # ----- Any CMakeLists.txt → build: candidate -----
+        if basename == "CMakeLists.txt":
+            build_seen = True
+
+        # ----- lib/ → lib: -----
+        if p.startswith("lib/"):
+            component_prefixes.add("lib:")
+
+        # ----- tests/ → tests: -----
+        if p.startswith("tests/"):
+            component_prefixes.add("tests:")

-    for p in paths:
+        # ----- plugins/<plugin>/ → <plugin>: -----
         if p.startswith("plugins/"):
             parts = p.split("/")
-            prefix = parts[1]
-            prefixes.add(f"{prefix}:")
-            continue
+            if len(parts) > 1:
+                component_prefixes.add(f"{parts[1]}:")

+        # ----- src/ → flb_xxx.* → xxx: OR src/<dir>/ → <dir>: -----
         if p.startswith("src/"):
             filename = os.path.basename(p)
             if filename.startswith("flb_"):
                 core = filename[4:].split(".")[0]
-                prefixes.add(f"{core}:")
-                continue
+                component_prefixes.add(f"{core}:")
+            else:
+                parts = p.split("/")
+                if len(parts) > 1:
+                    component_prefixes.add(f"{parts[1]}:")

-            directory = p.split("/")[1]
-            prefixes.add(f"{directory}:")
-            continue
+    # prefixes = component prefixes + build: if needed
+    prefixes |= component_prefixes
+    if build_seen:
+        prefixes.add("build:")

-    return prefixes
+    # build_optional:
+    #   True if ANY real component (lib/tests/plugins/src) was modified.
+    #   False only when modifying build system files alone.
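+    # Quick illustration of the contract above (illustrative values only):
+    #   infer_prefix_from_paths(["plugins/out_s3/s3.c", "CMakeLists.txt"])
+    #       -> ({"out_s3:", "build:"}, True)    # a real component was touched
+    #   infer_prefix_from_paths(["CMakeLists.txt"])
+    #       -> ({"build:"}, False)              # build-only change, build: required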
+ build_optional = len(component_prefixes) > 0 + + return prefixes, build_optional # ------------------------------------------------ @@ -84,27 +118,27 @@ def detect_bad_squash(body): # ------------------------------------------------ -# Validate commit per test expectations +# Validate commit based on expected behavior and test rules # ------------------------------------------------ def validate_commit(commit): msg = commit.message.strip() first_line, *rest = msg.split("\n") body = "\n".join(rest) - # Subject must have prefix + # Subject must start with a prefix subject_prefix_match = PREFIX_RE.match(first_line) if not subject_prefix_match: return False, f"Missing prefix in commit subject: '{first_line}'" subject_prefix = subject_prefix_match.group() - # detect_bad_squash must run but - # validate_commit IGNORE bad-squash reason if it was "multiple sign-offs" + # Run squash detection (but ignore multi-signoff errors) bad_squash, reason = detect_bad_squash(body) # If bad squash was caused by prefix lines in body → FAIL # If list of prefix lines in body → FAIL if bad_squash: + # Prefix-like lines are always fatal if "subject-like prefix" in reason: return False, f"Bad squash detected: {reason}" @@ -113,7 +147,7 @@ def validate_commit(commit): # validate_commit ignores multi signoff warnings. pass - # Subject length + # Subject length check if len(first_line) > 80: return False, f"Commit subject too long (>80 chars): '{first_line}'" @@ -122,30 +156,35 @@ def validate_commit(commit): if signoff_count == 0: return False, "Missing Signed-off-by line" - # Determine expected prefix + # Determine expected prefixes + build option flag files = commit.stats.files.keys() - expected = infer_prefix_from_paths(files) + expected, build_optional = infer_prefix_from_paths(files) - # Docs/CI changes + # When no prefix can be inferred (docs/tools), allow anything if len(expected) == 0: return True, "" - # *** TEST EXPECTATION *** - # For mixed components, DO NOT return custom message. - # Instead: same error shape as wrong-prefix case. 
- if len(expected) > 1: - # Always fail when multiple components are touched (even if prefix matches one) + expected_lower = {p.lower() for p in expected} + subj_lower = subject_prefix.lower() + + # Subject prefix must be one of the expected ones + if subj_lower not in expected_lower: + expected_list = sorted(expected) + expected_str = ", ".join(expected_list) return False, ( f"Subject prefix '{subject_prefix}' does not match files changed.\n" - f"Expected one of: {', '.join(sorted(expected))}" + f"Expected one of: {expected_str}" ) - # Normal prefix mismatch (case-insensitive comparison) - only_expected = next(iter(expected)) - if subject_prefix.lower() != only_expected.lower(): + + return False, f"Commit subject too long (>80 chars): '{first_line}'" + + # If build is NOT optional and build: exists among expected, + # then subject MUST be build: + if not build_optional and "build:" in expected_lower and subj_lower != "build:": return False, ( f"Subject prefix '{subject_prefix}' does not match files changed.\n" - f"Expected one of: {only_expected}" + f"Expected one of: build:" ) return True, "" From d6590a03ba2d7160921883acffc1829cc84709e3 Mon Sep 17 00:00:00 2001 From: Hiroshi Hatake Date: Thu, 4 Dec 2025 13:12:16 +0900 Subject: [PATCH 26/37] github: scripts: commit_linter: Fix failing test cases Signed-off-by: Hiroshi Hatake --- .github/scripts/commit_prefix_check.py | 27 +++++- .github/scripts/tests/test_commit_lint.py | 108 +++++++++------------- 2 files changed, 71 insertions(+), 64 deletions(-) diff --git a/.github/scripts/commit_prefix_check.py b/.github/scripts/commit_prefix_check.py index 6043891a0cf..fffba6d87ae 100644 --- a/.github/scripts/commit_prefix_check.py +++ b/.github/scripts/commit_prefix_check.py @@ -167,6 +167,31 @@ def validate_commit(commit): expected_lower = {p.lower() for p in expected} subj_lower = subject_prefix.lower() + # ------------------------------------------------ + # Multiple-component detection + # ------------------------------------------------ + # Treat pure build-related prefixes ("build:", "CMakeLists.txt:") as non-components. + # Additionally, allow lib: to act as an umbrella for lib subcomponents + # (e.g., ripser:, ripser_wrapper:) when subject prefix is lib:. + non_build_prefixes = { + p + for p in expected_lower + if p not in ("build:", "cmakelists.txt:") + } + + # Prefixes that are allowed to cover multiple subcomponents + umbrella_prefixes = {"lib:"} + + # If more than one non-build prefix is inferred AND the subject is not an umbrella + # prefix, require split commits. 
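+    # Illustrative outcomes of this check (example values, not from real commits):
+    #   non_build_prefixes = {"in_tail:", "router:"}, subject "in_tail:"
+    #       -> rejected below: the commit must be split per component
+    #   non_build_prefixes = {"out_s3:"} (expected may also hold "build:")
+    #       -> passes here; the subject is then matched against expected
+    #   subject "lib:" spanning several lib subcomponents
+    #       -> treated as an umbrella prefix, not forced to split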
+ if len(non_build_prefixes) > 1 and subj_lower not in umbrella_prefixes: + expected_list = sorted(expected) + expected_str = ", ".join(expected_list) + return False, ( + f"Subject prefix '{subject_prefix}' does not match files changed.\n" + f"Expected one of: {expected_str}" + ) + # Subject prefix must be one of the expected ones if subj_lower not in expected_lower: expected_list = sorted(expected) @@ -177,8 +202,6 @@ def validate_commit(commit): ) - return False, f"Commit subject too long (>80 chars): '{first_line}'" - # If build is NOT optional and build: exists among expected, # then subject MUST be build: if not build_optional and "build:" in expected_lower and subj_lower != "build:": diff --git a/.github/scripts/tests/test_commit_lint.py b/.github/scripts/tests/test_commit_lint.py index 438b17c1101..575a7f814b2 100644 --- a/.github/scripts/tests/test_commit_lint.py +++ b/.github/scripts/tests/test_commit_lint.py @@ -34,39 +34,37 @@ def test_infer_prefix_plugin(): When a file is in plugins//, the prefix should be :. This is the most common case for Fluent Bit commits modifying plugins. """ - assert infer_prefix_from_paths(["plugins/out_s3/s3.c"]) == {"out_s3:"} + prefixes, build_optional = infer_prefix_from_paths(["plugins/out_s3/s3.c"]) + assert prefixes == {"out_s3:"} + assert build_optional is True def test_infer_prefix_core_file(): """ Test that core source files with flb_ prefix correctly infer the component name. - - Files like src/flb_router.c should produce prefix "router:" by stripping - the "flb_" prefix and file extension. This handles core library components. """ - assert infer_prefix_from_paths(["src/flb_router.c"]) == {"router:"} + prefixes, build_optional = infer_prefix_from_paths(["src/flb_router.c"]) + assert prefixes == {"router:"} + assert build_optional is True def test_infer_prefix_new_core_file(): """ Test that core files with longer names and numbers are handled correctly. - - Ensures the prefix inference works for files like flb_super_router2.c, - extracting "super_router2:" correctly. This validates the regex handles - underscores and numbers in component names. """ - assert infer_prefix_from_paths(["src/flb_super_router2.c"]) == {"super_router2:"} + prefixes, build_optional = infer_prefix_from_paths(["src/flb_super_router2.c"]) + assert prefixes == {"super_router2:"} + assert build_optional is True def test_infer_multiple_prefixes(): """ Test that multiple files from different components produce multiple prefixes. - - When a commit touches files from different components (e.g., a plugin and - a core file), the inference should return all relevant prefixes. This helps - detect commits that mix multiple subsystems, which should be split. """ - assert infer_prefix_from_paths([ + prefixes, build_optional = infer_prefix_from_paths([ "plugins/in_tail/tail.c", "src/flb_router.c" - ]) == {"in_tail:", "router:"} + ]) + assert prefixes == {"in_tail:", "router:"} + # At least one real component touched → build is optional + assert build_optional is True # ----------------------------------------------------------- @@ -251,12 +249,8 @@ def test_error_bad_squash_detected(): def test_error_multiple_prefixes_inferred_from_files(): """ - Test that commits touching multiple components are rejected. - - When a commit modifies files from different components (e.g., both a plugin - and core code), it should be split into separate commits. This keeps - commits focused and makes reviews easier. The error message should list - all expected prefixes. 
+ Commits touching multiple non-build components are rejected and must be + split into separate commits, even if the subject matches one component. """ commit = make_commit( "in_tail: update handler\n\nSigned-off-by: User", @@ -264,7 +258,8 @@ def test_error_multiple_prefixes_inferred_from_files(): ) ok, msg = validate_commit(commit) assert ok is False - assert "Expected one of:" in msg + assert "does not match files changed" in msg + # ----------------------------------------------------------- @@ -295,77 +290,66 @@ def test_docs_or_ci_changes_allowed(): def test_infer_prefix_empty_file_list(): """ Test that an empty file list returns an empty prefix set. - - Edge case: when no files are provided, the function should return - an empty set rather than raising an error. This handles degenerate cases. """ - assert infer_prefix_from_paths([]) == set() + prefixes, build_optional = infer_prefix_from_paths([]) + assert prefixes == set() + # No components, no CMakeLists → build not optional + assert build_optional is False def test_infer_prefix_src_subdirectory(): """ Test prefix inference for files in src/ subdirectories. - - Files in src/ subdirectories (like src/stream_processor/stream.c) that - don't have the flb_ prefix should use the subdirectory name as the prefix. - This handles organized core code that's not in the root src/ directory. """ - assert infer_prefix_from_paths(["src/stream_processor/stream.c"]) == {"stream_processor:"} + prefixes, build_optional = infer_prefix_from_paths(["src/stream_processor/stream.c"]) + assert prefixes == {"stream_processor:"} + assert build_optional is True def test_infer_prefix_unknown_paths(): """ Test that files outside plugins/ and src/ don't generate prefixes. - - Files in unknown locations (not plugins/ or src/) should not generate - any prefix. This allows commits with only documentation, CI, or other - non-code files to use generic prefixes. """ - assert infer_prefix_from_paths(["random/file.c"]) == set() + prefixes, build_optional = infer_prefix_from_paths(["random/file.c"]) + assert prefixes == set() + assert build_optional is False def test_infer_prefix_multiple_same_plugin(): """ Test that multiple files from the same plugin yield a single prefix. - - When a commit modifies multiple files within the same plugin directory - (e.g., .c, .h, and config files), they should all produce the same prefix. - This ensures commits modifying a plugin's internal structure are valid. """ - assert infer_prefix_from_paths([ + prefixes, build_optional = infer_prefix_from_paths([ "plugins/out_s3/s3.c", "plugins/out_s3/s3.h", "plugins/out_s3/config.c" - ]) == {"out_s3:"} + ]) + assert prefixes == {"out_s3:"} + assert build_optional is True def test_infer_prefix_plugin_with_underscores(): """ Test that plugin names with underscores are handled correctly. - - Plugin names can contain underscores (e.g., out_http). The prefix inference - should preserve these underscores in the generated prefix. """ - assert infer_prefix_from_paths(["plugins/out_http/http.c"]) == {"out_http:"} + prefixes, build_optional = infer_prefix_from_paths(["plugins/out_http/http.c"]) + assert prefixes == {"out_http:"} + assert build_optional is True def test_infer_prefix_core_file_with_numbers(): """ Test that core file names with numbers are handled correctly. - - Core files like flb_http2.c should produce "http2:" (not "http2.c:"). - This validates that numbers in component names are preserved correctly. 
""" - assert infer_prefix_from_paths(["src/flb_http2.c"]) == {"http2:"} + prefixes, build_optional = infer_prefix_from_paths(["src/flb_http2.c"]) + assert prefixes == {"http2:"} + assert build_optional is True def test_infer_prefix_mixed_known_unknown(): """ Test prefix inference with a mix of known and unknown file paths. - - When a commit contains both files that generate prefixes (plugins/, src/) - and files that don't (docs/, random files), only the known paths should - contribute to the prefix set. Unknown paths are ignored. """ - result = infer_prefix_from_paths([ + prefixes, build_optional = infer_prefix_from_paths([ "plugins/in_tail/tail.c", "random/file.txt" ]) - assert result == {"in_tail:"} + assert prefixes == {"in_tail:"} + assert build_optional is True # ----------------------------------------------------------- @@ -620,12 +604,12 @@ def test_valid_config_file_changes(): def test_error_multiple_prefixes_one_matches(): """ - Test that commits touching multiple components fail even if prefix matches one. + When a commit touches multiple different components (e.g., a plugin and a + core subsystem), the linter requires the commit to be split, even if the + subject prefix matches one of those components. - When a commit modifies files from different components, it should be rejected - even if the commit prefix matches one of the components. The error message - should list all expected prefixes to help the developer split the commit. - This enforces the principle of one logical change per commit. + In this case, both 'in_tail:' and 'router:' are valid inferred prefixes, + so the linter must reject the commit and report all expected prefixes. """ commit = make_commit( "in_tail: update\n\nSigned-off-by: User", From 492b655d88bdf905ade754dbbc43c5e1e443d760 Mon Sep 17 00:00:00 2001 From: Eduardo Silva Date: Fri, 5 Dec 2025 09:48:21 -0600 Subject: [PATCH 27/37] build: prevent the toolchain from emitting an executable stack Signed-off-by: Eduardo Silva --- CMakeLists.txt | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 05fac0673d3..e586dec2891 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -83,6 +83,14 @@ if (MSVC) add_compile_options(/MT) else() set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall") + if (CMAKE_SYSTEM_NAME STREQUAL "Linux") + # Prevent the toolchain from emitting an executable stack on Linux targets, + # which triggers kernel warnings (e.g. "started with executable stack") and + # weakens security hardening. The linker flag is not supported on macOS. + add_compile_options(-Wa,--noexecstack) + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,-z,noexecstack") + set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,-z,noexecstack") + endif() # The following flags are to enhance security, but it may impact performance, # we disable them by default. if (FLB_WASM_STACK_PROTECT) From 1b83e5cc2c70e2bd06e1878a8bdd4472324c12c8 Mon Sep 17 00:00:00 2001 From: Eduardo Silva Date: Thu, 4 Dec 2025 17:18:10 -0600 Subject: [PATCH 28/37] in_forward: fix segfault and double-free in trace path handling - Incomplete error check: only checked ret == -1, but ctr_decode_msgpack_create() can return other error codes. When ctr is NULL on error, this caused NULL pointer dereference. - Double-free: called ctr_decode_msgpack_destroy() after successful flb_input_trace_append(), but that function takes ownership and destroys the context internally. 
Signed-off-by: Eduardo Silva
---
 plugins/in_forward/fw_prot.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/plugins/in_forward/fw_prot.c b/plugins/in_forward/fw_prot.c
index 3fb40918c1c..03de8191166 100644
--- a/plugins/in_forward/fw_prot.c
+++ b/plugins/in_forward/fw_prot.c
@@ -1146,8 +1146,8 @@ static int append_log(struct flb_input_instance *ins, struct fw_conn *conn,
     else if (event_type == FLB_EVENT_TYPE_TRACES) {
         off = 0;
         ret = ctr_decode_msgpack_create(&ctr, (char *) data, len, &off);
-        if (ret == -1) {
-            flb_error("could not decode trace message. ret=%d", ret);
+        if (ret != CTR_DECODE_MSGPACK_SUCCESS) {
+            flb_plg_error(ins, "could not decode trace message. ret=%d", ret);
             return -1;
         }

@@ -1159,7 +1159,7 @@ static int append_log(struct flb_input_instance *ins, struct fw_conn *conn,
             ctr_decode_msgpack_destroy(ctr);
             return -1;
         }
-        ctr_decode_msgpack_destroy(ctr);
+        /* Note: flb_input_trace_append takes ownership of ctr and destroys it on success */
     }

     return 0;

From dd40effe8a4c56e1708b4c19210db15b2d9b7e63 Mon Sep 17 00:00:00 2001
From: Pierre-Yves Rofes <3604235+piwai@users.noreply.github.com>
Date: Thu, 4 Dec 2025 13:51:09 +0100
Subject: [PATCH 29/37] in_node_exporter_metrics: Increase buffer size to read /proc/stat correctly

The "intr" entry of /proc/stat can be longer than 512 characters; reading
it into a buffer that is too small generates parse errors that leave the
CPU metrics stalled.

Signed-off-by: Pierre-Yves Rofes <3604235+piwai@users.noreply.github.com>
---
 plugins/in_node_exporter_metrics/ne_utils.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/plugins/in_node_exporter_metrics/ne_utils.c b/plugins/in_node_exporter_metrics/ne_utils.c
index 0d2a6574512..1bfe74251e3 100644
--- a/plugins/in_node_exporter_metrics/ne_utils.c
+++ b/plugins/in_node_exporter_metrics/ne_utils.c
@@ -153,7 +153,7 @@ int ne_utils_file_read_lines(const char *mount, const char *path, struct mk_list
     int len;
     int ret;
     FILE *f;
-    char line[512];
+    char line[2048];
     char real_path[2048];

     mk_list_init(list);

From 2ba282f9d756ae6e0d4d3cde578913e9bee8cd31 Mon Sep 17 00:00:00 2001
From: Arbin
Date: Fri, 28 Nov 2025 22:56:32 +0800
Subject: [PATCH 30/37] aws_msk_iam: remove cluster_arn dependency

Signed-off-by: Arbin
---
 src/aws/flb_aws_msk_iam.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/aws/flb_aws_msk_iam.c b/src/aws/flb_aws_msk_iam.c
index 2aae98bb502..0b877f0c5ab 100644
--- a/src/aws/flb_aws_msk_iam.c
+++ b/src/aws/flb_aws_msk_iam.c
@@ -606,13 +606,13 @@ struct flb_aws_msk_iam *flb_aws_msk_iam_register_oauth_cb(struct flb_config *con
     ctx->flb_config = config;

     /* Extract region from broker address */
-    if (!opaque || !opaque->kafka_ctx) {
+    if (!opaque || !opaque->ptr) {
         flb_error("[aws_msk_iam] unable to access kafka context for broker-based region extraction");
         flb_free(ctx);
         return NULL;
     }

-    kafka_ctx = opaque->kafka_ctx;
+    kafka_ctx = (struct flb_kafka *) opaque->ptr;
     if (!kafka_ctx->brokers || flb_sds_len(kafka_ctx->brokers) == 0) {
         flb_error("[aws_msk_iam] brokers configuration is required for region extraction");
         flb_free(ctx);
         return NULL;
     }

From 1441be5aea3ab0298228203e62d9b1b697b986dc Mon Sep 17 00:00:00 2001
From: Arbin
Date: Fri, 28 Nov 2025 23:46:24 +0800
Subject: [PATCH 31/37] aws_msk_iam,in_kafka,out_kafka: enable AWS MSK IAM authentication

Signed-off-by: Arbin
---
 plugins/in_kafka/in_kafka.c      | 40 +++++++++++++++++++++++++++++-
 plugins/in_kafka/in_kafka.h      |  1 +
 plugins/out_kafka/kafka_config.c | 36 +++++++++++++++++-----------
 plugins/out_kafka/kafka_config.h |  1 +
4 files changed, 63 insertions(+), 15 deletions(-) diff --git a/plugins/in_kafka/in_kafka.c b/plugins/in_kafka/in_kafka.c index 9ae003f91a7..a4f638562ff 100644 --- a/plugins/in_kafka/in_kafka.c +++ b/plugins/in_kafka/in_kafka.c @@ -277,6 +277,9 @@ static int in_kafka_init(struct flb_input_instance *ins, #ifdef FLB_HAVE_AWS_MSK_IAM /* Check if using aws_msk_iam as SASL mechanism */ if (strcasecmp(conf, "aws_msk_iam") == 0) { + /* Mark that user explicitly requested AWS MSK IAM */ + ctx->aws_msk_iam = FLB_TRUE; + /* Set SASL mechanism to OAUTHBEARER for librdkafka */ flb_input_set_property(ins, "rdkafka.sasl.mechanism", "OAUTHBEARER"); flb_sds_destroy(ctx->sasl_mechanism); @@ -341,8 +344,21 @@ static int in_kafka_init(struct flb_input_instance *ins, flb_kafka_opaque_set(ctx->opaque, ctx, &ctx->kafka); rd_kafka_conf_set_opaque(kafka_conf, ctx->opaque); -#ifdef FLB_HAVE_AWS_MSK_IAM + /* + * Enable SASL queue for all OAUTHBEARER configurations. + * This allows librdkafka to handle OAuth token refresh in a background thread, + * which is essential for idle connections or when poll intervals are large. + * This benefits all OAUTHBEARER methods: AWS IAM, OIDC, custom OAuth, etc. + */ if (ctx->sasl_mechanism && strcasecmp(ctx->sasl_mechanism, "OAUTHBEARER") == 0) { + rd_kafka_conf_enable_sasl_queue(kafka_conf, 1); + flb_plg_debug(ins, "SASL queue enabled for OAUTHBEARER mechanism"); + } + +#ifdef FLB_HAVE_AWS_MSK_IAM + /* Only register MSK IAM if user explicitly requested it via rdkafka.sasl.mechanism=aws_msk_iam */ + if (ctx->aws_msk_iam && ctx->sasl_mechanism && + strcasecmp(ctx->sasl_mechanism, "OAUTHBEARER") == 0) { /* Check if brokers are configured for MSK IAM */ conf = flb_input_get_property("brokers", ins); if (conf && (strstr(conf, ".kafka.") || strstr(conf, ".kafka-serverless.")) && @@ -378,6 +394,28 @@ static int in_kafka_init(struct flb_input_instance *ins, goto init_error; } + /* + * Enable SASL background callbacks for all OAUTHBEARER configurations. + * This ensures OAuth tokens are refreshed automatically even when: + * - Poll intervals are large + * - Topics have no messages + * - Collector is paused + * This benefits all OAUTHBEARER methods: AWS IAM, OIDC, custom OAuth, etc. + */ + if (ctx->sasl_mechanism && strcasecmp(ctx->sasl_mechanism, "OAUTHBEARER") == 0) { + rd_kafka_error_t *error; + error = rd_kafka_sasl_background_callbacks_enable(ctx->kafka.rk); + if (error) { + flb_plg_warn(ins, "failed to enable SASL background callbacks: %s. 
" + "OAuth tokens may not refresh during idle periods.", + rd_kafka_error_string(error)); + rd_kafka_error_destroy(error); + } + else { + flb_plg_info(ins, "OAUTHBEARER: SASL background callbacks enabled"); + } + } + /* Trigger initial token refresh for OAUTHBEARER */ rd_kafka_poll(ctx->kafka.rk, 0); diff --git a/plugins/in_kafka/in_kafka.h b/plugins/in_kafka/in_kafka.h index 4792ae5b947..8319b08ec82 100644 --- a/plugins/in_kafka/in_kafka.h +++ b/plugins/in_kafka/in_kafka.h @@ -56,6 +56,7 @@ struct flb_in_kafka_config { #ifdef FLB_HAVE_AWS_MSK_IAM struct flb_aws_msk_iam *msk_iam; + int aws_msk_iam; /* Flag to indicate user explicitly requested AWS MSK IAM */ #endif /* SASL mechanism configured in rdkafka.sasl.mechanism */ diff --git a/plugins/out_kafka/kafka_config.c b/plugins/out_kafka/kafka_config.c index 2c9f6885734..37792a78d91 100644 --- a/plugins/out_kafka/kafka_config.c +++ b/plugins/out_kafka/kafka_config.c @@ -67,6 +67,9 @@ struct flb_out_kafka *flb_out_kafka_create(struct flb_output_instance *ins, #ifdef FLB_HAVE_AWS_MSK_IAM /* Check if using aws_msk_iam as SASL mechanism */ if (strcasecmp(tmp, "aws_msk_iam") == 0) { + /* Mark that user explicitly requested AWS MSK IAM */ + ctx->aws_msk_iam = FLB_TRUE; + /* Set SASL mechanism to OAUTHBEARER for librdkafka */ flb_output_set_property(ins, "rdkafka.sasl.mechanism", "OAUTHBEARER"); flb_sds_destroy(ctx->sasl_mechanism); @@ -203,20 +206,26 @@ struct flb_out_kafka *flb_out_kafka_create(struct flb_output_instance *ins, flb_kafka_opaque_set(ctx->opaque, ctx, &ctx->kafka); rd_kafka_conf_set_opaque(ctx->conf, ctx->opaque); -#ifdef FLB_HAVE_AWS_MSK_IAM + /* + * Enable SASL queue for all OAUTHBEARER configurations. + * This allows librdkafka to handle OAuth token refresh in a background thread, + * which is essential for idle connections where rd_kafka_poll() is not called. + * This benefits all OAUTHBEARER methods: AWS IAM, OIDC, custom OAuth, etc. + */ if (ctx->sasl_mechanism && strcasecmp(ctx->sasl_mechanism, "OAUTHBEARER") == 0) { + rd_kafka_conf_enable_sasl_queue(ctx->conf, 1); + flb_plg_debug(ins, "SASL queue enabled for OAUTHBEARER mechanism"); + } + +#ifdef FLB_HAVE_AWS_MSK_IAM + /* Only register MSK IAM if user explicitly requested it via rdkafka.sasl.mechanism=aws_msk_iam */ + if (ctx->aws_msk_iam && ctx->sasl_mechanism && + strcasecmp(ctx->sasl_mechanism, "OAUTHBEARER") == 0) { /* Check if brokers are configured for MSK IAM */ tmp = flb_output_get_property("brokers", ins); if (tmp && (strstr(tmp, ".kafka.") || strstr(tmp, ".kafka-serverless.")) && strstr(tmp, ".amazonaws.com")) { - /* - * Enable SASL queue for background callbacks BEFORE registering OAuth callback. - * This allows librdkafka to handle OAuth token refresh in a background thread, - * which is essential for idle connections where rd_kafka_poll() is not called. - */ - rd_kafka_conf_enable_sasl_queue(ctx->conf, 1); - /* Register MSK IAM OAuth callback - extract region from broker address */ flb_plg_info(ins, "registering AWS MSK IAM authentication (region auto-extracted from broker)"); ctx->msk_iam = flb_aws_msk_iam_register_oauth_cb(config, @@ -248,12 +257,12 @@ struct flb_out_kafka *flb_out_kafka_create(struct flb_output_instance *ins, return NULL; } -#ifdef FLB_HAVE_AWS_MSK_IAM /* - * Enable SASL background callbacks for MSK IAM to ensure OAuth tokens - * are refreshed automatically even on idle connections. + * Enable SASL background callbacks for all OAUTHBEARER configurations. + * This ensures OAuth tokens are refreshed automatically even on idle connections. 
+ * This benefits all OAUTHBEARER methods: AWS IAM, OIDC, custom OAuth, etc. */ - if (ctx->msk_iam) { + if (ctx->sasl_mechanism && strcasecmp(ctx->sasl_mechanism, "OAUTHBEARER") == 0) { rd_kafka_error_t *error; error = rd_kafka_sasl_background_callbacks_enable(ctx->kafka.rk); if (error) { @@ -263,11 +272,10 @@ struct flb_out_kafka *flb_out_kafka_create(struct flb_output_instance *ins, rd_kafka_error_destroy(error); } else { - flb_plg_info(ctx->ins, "MSK IAM: SASL background callbacks enabled, " + flb_plg_info(ctx->ins, "OAUTHBEARER: SASL background callbacks enabled, " "OAuth tokens will be refreshed automatically in background thread"); } } -#endif #ifdef FLB_HAVE_AVRO_ENCODER /* Config AVRO */ diff --git a/plugins/out_kafka/kafka_config.h b/plugins/out_kafka/kafka_config.h index 9133113bcc0..57bd6ae92f7 100644 --- a/plugins/out_kafka/kafka_config.h +++ b/plugins/out_kafka/kafka_config.h @@ -127,6 +127,7 @@ struct flb_out_kafka { #ifdef FLB_HAVE_AWS_MSK_IAM struct flb_aws_msk_iam *msk_iam; + int aws_msk_iam; /* Flag to indicate user explicitly requested AWS MSK IAM */ #endif struct flb_kafka_opaque *opaque; From 5956f87a246036d54ac512e1f3486848fdc70661 Mon Sep 17 00:00:00 2001 From: Arbin Date: Sat, 29 Nov 2025 00:23:49 +0800 Subject: [PATCH 32/37] aws_msk_iam,in_kafka,out_kafka: enable AWS MSK IAM authentication Signed-off-by: Arbin --- plugins/in_kafka/in_kafka.c | 4 ++-- plugins/out_kafka/kafka_config.c | 7 +++---- src/aws/flb_aws_msk_iam.c | 4 ++-- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/plugins/in_kafka/in_kafka.c b/plugins/in_kafka/in_kafka.c index a4f638562ff..0411ccf053a 100644 --- a/plugins/in_kafka/in_kafka.c +++ b/plugins/in_kafka/in_kafka.c @@ -365,13 +365,13 @@ static int in_kafka_init(struct flb_input_instance *ins, strstr(conf, ".amazonaws.com")) { /* Register MSK IAM OAuth callback - extract region from broker address */ - flb_plg_info(ins, "registering AWS MSK IAM authentication (region auto-extracted from broker)"); + flb_plg_info(ins, "registering AWS MSK IAM authentication OAuth callback"); ctx->msk_iam = flb_aws_msk_iam_register_oauth_cb(config, kafka_conf, ctx->opaque); if (!ctx->msk_iam) { - flb_plg_error(ins, "failed to setup MSK IAM authentication"); + flb_plg_error(ins, "failed to setup MSK IAM authentication OAuth callback"); } else { res = rd_kafka_conf_set(kafka_conf, "sasl.oauthbearer.config", diff --git a/plugins/out_kafka/kafka_config.c b/plugins/out_kafka/kafka_config.c index 37792a78d91..9a4a598af6f 100644 --- a/plugins/out_kafka/kafka_config.c +++ b/plugins/out_kafka/kafka_config.c @@ -227,12 +227,12 @@ struct flb_out_kafka *flb_out_kafka_create(struct flb_output_instance *ins, strstr(tmp, ".amazonaws.com")) { /* Register MSK IAM OAuth callback - extract region from broker address */ - flb_plg_info(ins, "registering AWS MSK IAM authentication (region auto-extracted from broker)"); + flb_plg_info(ins, "registering AWS MSK IAM authentication OAuth callback"); ctx->msk_iam = flb_aws_msk_iam_register_oauth_cb(config, ctx->conf, ctx->opaque); if (!ctx->msk_iam) { - flb_plg_error(ctx->ins, "failed to setup MSK IAM authentication"); + flb_plg_error(ctx->ins, "failed to setup MSK IAM authentication OAuth callback"); } else { res = rd_kafka_conf_set(ctx->conf, "sasl.oauthbearer.config", @@ -272,8 +272,7 @@ struct flb_out_kafka *flb_out_kafka_create(struct flb_output_instance *ins, rd_kafka_error_destroy(error); } else { - flb_plg_info(ctx->ins, "OAUTHBEARER: SASL background callbacks enabled, " - "OAuth tokens will be refreshed automatically 
in background thread"); + flb_plg_info(ctx->ins, "OAUTHBEARER: SASL background callbacks enabled"); } } diff --git a/src/aws/flb_aws_msk_iam.c b/src/aws/flb_aws_msk_iam.c index 0b877f0c5ab..1dffaf97c2d 100644 --- a/src/aws/flb_aws_msk_iam.c +++ b/src/aws/flb_aws_msk_iam.c @@ -606,13 +606,13 @@ struct flb_aws_msk_iam *flb_aws_msk_iam_register_oauth_cb(struct flb_config *con ctx->flb_config = config; /* Extract region from broker address */ - if (!opaque || !opaque->ptr) { + if (!opaque || !opaque->msk_iam_ctx) { flb_error("[aws_msk_iam] unable to access kafka context for broker-based region extraction"); flb_free(ctx); return NULL; } - kafka_ctx = (struct flb_kafka *) opaque->ptr; + kafka_ctx = (struct flb_kafka *) opaque->msk_iam_ctx; if (!kafka_ctx->brokers || flb_sds_len(kafka_ctx->brokers) == 0) { flb_error("[aws_msk_iam] brokers configuration is required for region extraction"); flb_free(ctx); From 86e366f99203ca0de8998dce3393046efc11dd27 Mon Sep 17 00:00:00 2001 From: Arbin Date: Sat, 29 Nov 2025 01:11:02 +0800 Subject: [PATCH 33/37] aws_msk_iam: fix use strlen for non-SDS buffer Signed-off-by: Arbin --- src/aws/flb_aws_msk_iam.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/aws/flb_aws_msk_iam.c b/src/aws/flb_aws_msk_iam.c index 1dffaf97c2d..d4802b43709 100644 --- a/src/aws/flb_aws_msk_iam.c +++ b/src/aws/flb_aws_msk_iam.c @@ -613,7 +613,7 @@ struct flb_aws_msk_iam *flb_aws_msk_iam_register_oauth_cb(struct flb_config *con } kafka_ctx = (struct flb_kafka *) opaque->msk_iam_ctx; - if (!kafka_ctx->brokers || flb_sds_len(kafka_ctx->brokers) == 0) { + if (!kafka_ctx->brokers || strlen(kafka_ctx->brokers) == 0) { flb_error("[aws_msk_iam] brokers configuration is required for region extraction"); flb_free(ctx); return NULL; From 854b3b06368fb86e923f34a4fddd8e1da5bb18be Mon Sep 17 00:00:00 2001 From: Arbin Date: Sat, 29 Nov 2025 01:55:03 +0800 Subject: [PATCH 34/37] aws_msk_iam: fix type confusion race in OAuth callback registration Signed-off-by: Arbin --- plugins/in_kafka/in_kafka.c | 12 +++-- plugins/out_kafka/kafka_config.c | 25 +++++++---- src/aws/flb_aws_msk_iam.c | 76 ++++++++++++++++---------------- src/flb_kafka.c | 2 +- 4 files changed, 65 insertions(+), 50 deletions(-) diff --git a/plugins/in_kafka/in_kafka.c b/plugins/in_kafka/in_kafka.c index 0411ccf053a..f356dfb8f24 100644 --- a/plugins/in_kafka/in_kafka.c +++ b/plugins/in_kafka/in_kafka.c @@ -372,6 +372,7 @@ static int in_kafka_init(struct flb_input_instance *ins, if (!ctx->msk_iam) { flb_plg_error(ins, "failed to setup MSK IAM authentication OAuth callback"); + goto init_error; } else { res = rd_kafka_conf_set(kafka_conf, "sasl.oauthbearer.config", @@ -387,6 +388,8 @@ static int in_kafka_init(struct flb_input_instance *ins, #endif ctx->kafka.rk = rd_kafka_new(RD_KAFKA_CONSUMER, kafka_conf, errstr, sizeof(errstr)); + /* rd_kafka_new takes ownership of kafka_conf regardless of success/failure */ + kafka_conf = NULL; /* Create Kafka consumer handle */ if (!ctx->kafka.rk) { @@ -482,15 +485,16 @@ static int in_kafka_init(struct flb_input_instance *ins, } if (ctx->kafka.rk) { rd_kafka_consumer_close(ctx->kafka.rk); + /* rd_kafka_destroy also destroys the conf that was passed to rd_kafka_new */ rd_kafka_destroy(ctx->kafka.rk); } - if (ctx->opaque) { - flb_kafka_opaque_destroy(ctx->opaque); - } else if (kafka_conf) { - /* conf is already destroyed when rd_kafka is initialized */ + /* If rd_kafka was never created, we need to destroy conf manually */ rd_kafka_conf_destroy(kafka_conf); } + if (ctx->opaque) { + 
flb_kafka_opaque_destroy(ctx->opaque); + } flb_sds_destroy(ctx->sasl_mechanism); flb_free(ctx); diff --git a/plugins/out_kafka/kafka_config.c b/plugins/out_kafka/kafka_config.c index 9a4a598af6f..c59f56629ff 100644 --- a/plugins/out_kafka/kafka_config.c +++ b/plugins/out_kafka/kafka_config.c @@ -233,15 +233,16 @@ struct flb_out_kafka *flb_out_kafka_create(struct flb_output_instance *ins, ctx->opaque); if (!ctx->msk_iam) { flb_plg_error(ctx->ins, "failed to setup MSK IAM authentication OAuth callback"); + flb_out_kafka_destroy(ctx); + return NULL; } - else { - res = rd_kafka_conf_set(ctx->conf, "sasl.oauthbearer.config", - "principal=admin", errstr, sizeof(errstr)); - if (res != RD_KAFKA_CONF_OK) { - flb_plg_error(ctx->ins, - "failed to set sasl.oauthbearer.config: %s", - errstr); - } + + res = rd_kafka_conf_set(ctx->conf, "sasl.oauthbearer.config", + "principal=admin", errstr, sizeof(errstr)); + if (res != RD_KAFKA_CONF_OK) { + flb_plg_error(ctx->ins, + "failed to set sasl.oauthbearer.config: %s", + errstr); } } } @@ -250,6 +251,9 @@ struct flb_out_kafka *flb_out_kafka_create(struct flb_output_instance *ins, /* Kafka Producer */ ctx->kafka.rk = rd_kafka_new(RD_KAFKA_PRODUCER, ctx->conf, errstr, sizeof(errstr)); + /* rd_kafka_new takes ownership of conf regardless of success/failure */ + ctx->conf = NULL; + if (!ctx->kafka.rk) { flb_plg_error(ctx->ins, "failed to create producer: %s", errstr); @@ -334,8 +338,13 @@ int flb_out_kafka_destroy(struct flb_out_kafka *ctx) flb_kafka_topic_destroy_all(ctx); if (ctx->kafka.rk) { + /* rd_kafka_destroy also destroys the conf that was passed to rd_kafka_new */ rd_kafka_destroy(ctx->kafka.rk); } + else if (ctx->conf) { + /* If rd_kafka was never created, we need to destroy conf manually */ + rd_kafka_conf_destroy(ctx->conf); + } if (ctx->opaque) { flb_kafka_opaque_destroy(ctx->opaque); diff --git a/src/aws/flb_aws_msk_iam.c b/src/aws/flb_aws_msk_iam.c index d4802b43709..6f8a7ec671a 100644 --- a/src/aws/flb_aws_msk_iam.c +++ b/src/aws/flb_aws_msk_iam.c @@ -175,7 +175,7 @@ static flb_sds_t extract_region_from_broker(const char *broker) } len = end - start; - if (len == 0 || len > 64) { /* Sanity check on region length (relaxed to 64 chars) */ + if (len == 0 || len > 32) { /* Sanity check on region length (AWS regions are typically <= 20 chars) */ return NULL; } @@ -597,25 +597,19 @@ struct flb_aws_msk_iam *flb_aws_msk_iam_register_oauth_cb(struct flb_config *con char *first_broker = NULL; char *comma; - ctx = flb_calloc(1, sizeof(struct flb_aws_msk_iam)); - if (!ctx) { - flb_errno(); - return NULL; - } - - ctx->flb_config = config; - - /* Extract region from broker address */ + /* + * Extract region from broker address before allocating context. + * The caller must set opaque->msk_iam_ctx to point to struct flb_kafka + * before calling this function, which we use to extract the broker address. 
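+     * Example (hypothetical broker name): "b-1.demo.a1b2c3.c2.kafka.us-east-1.amazonaws.com"
+     * yields region "us-east-1" via extract_region_from_broker().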
+ */ if (!opaque || !opaque->msk_iam_ctx) { flb_error("[aws_msk_iam] unable to access kafka context for broker-based region extraction"); - flb_free(ctx); return NULL; } kafka_ctx = (struct flb_kafka *) opaque->msk_iam_ctx; if (!kafka_ctx->brokers || strlen(kafka_ctx->brokers) == 0) { flb_error("[aws_msk_iam] brokers configuration is required for region extraction"); - flb_free(ctx); return NULL; } @@ -623,7 +617,6 @@ struct flb_aws_msk_iam *flb_aws_msk_iam_register_oauth_cb(struct flb_config *con first_broker = flb_strdup(kafka_ctx->brokers); if (!first_broker) { flb_error("[aws_msk_iam] failed to allocate memory for broker parsing"); - flb_free(ctx); return NULL; } @@ -632,46 +625,49 @@ struct flb_aws_msk_iam *flb_aws_msk_iam_register_oauth_cb(struct flb_config *con *comma = '\0'; /* Terminate at first comma */ } - /* Detect if this is MSK Serverless by checking broker address - * Serverless broker contains .kafka-serverless. in the hostname - * Standard broker contains .kafka. (but not .kafka-serverless.) - */ - if (strstr(first_broker, ".kafka-serverless.")) { - ctx->is_serverless = 1; - flb_info("[aws_msk_iam] detected MSK Serverless cluster"); - } - else { - ctx->is_serverless = 0; - } - /* Extract region from broker address */ region_str = extract_region_from_broker(first_broker); - flb_free(first_broker); - if (!region_str || flb_sds_len(region_str) == 0) { flb_error("[aws_msk_iam] failed to extract region from broker address: %s", kafka_ctx->brokers); - flb_free(ctx); + flb_free(first_broker); if (region_str) { flb_sds_destroy(region_str); } return NULL; } - flb_info("[aws_msk_iam] extracted region '%s' from broker address%s", - region_str, ctx->is_serverless ? " (Serverless)" : ""); + /* Detect if this is MSK Serverless by checking broker address */ + ctx = flb_calloc(1, sizeof(struct flb_aws_msk_iam)); + if (!ctx) { + flb_errno(); + flb_free(first_broker); + flb_sds_destroy(region_str); + return NULL; + } + ctx->flb_config = config; ctx->region = region_str; - - if (!ctx->region) { - flb_free(ctx); - return NULL; + + /* Detect cluster type (Standard vs Serverless) */ + if (strstr(first_broker, ".kafka-serverless.")) { + ctx->is_serverless = 1; + flb_info("[aws_msk_iam] detected MSK Serverless cluster"); } + else { + ctx->is_serverless = 0; + } + + flb_free(first_broker); + first_broker = NULL; + + flb_info("[aws_msk_iam] extracted region '%s' from broker address%s", + region_str, ctx->is_serverless ? " (Serverless)" : ""); /* Create TLS instance */ ctx->cred_tls = flb_tls_create(FLB_TLS_CLIENT_MODE, FLB_TRUE, - FLB_LOG_DEBUG, + 0, /* TLS debug off by default */ NULL, NULL, NULL, NULL, NULL, NULL); if (!ctx->cred_tls) { flb_error("[aws_msk_iam] failed to create TLS instance"); @@ -707,10 +703,16 @@ struct flb_aws_msk_iam *flb_aws_msk_iam_register_oauth_cb(struct flb_config *con } ctx->provider->provider_vtable->async(ctx->provider); - /* Register callback */ - rd_kafka_conf_set_oauthbearer_token_refresh_cb(kconf, oauthbearer_token_refresh_cb); + /* + * CRITICAL: Set the correct context type in opaque BEFORE registering the callback. + * This eliminates the race condition where the callback could be triggered with + * the wrong context type during the initialization window. 
+ */ flb_kafka_opaque_set(opaque, NULL, ctx); rd_kafka_conf_set_opaque(kconf, opaque); + + /* Now safe to register callback - opaque->msk_iam_ctx is already the correct type */ + rd_kafka_conf_set_oauthbearer_token_refresh_cb(kconf, oauthbearer_token_refresh_cb); return ctx; } diff --git a/src/flb_kafka.c b/src/flb_kafka.c index 316c9ba9719..6a76c0dca33 100644 --- a/src/flb_kafka.c +++ b/src/flb_kafka.c @@ -95,7 +95,7 @@ rd_kafka_conf_t *flb_kafka_conf_create(struct flb_kafka *kafka, err: if (kafka_cfg) { - flb_free(kafka_cfg); + rd_kafka_conf_destroy(kafka_cfg); } return NULL; } From 8be1297b683fc42d0bdd0efc24282486e0c7c236 Mon Sep 17 00:00:00 2001 From: Arbin Date: Sat, 29 Nov 2025 02:27:09 +0800 Subject: [PATCH 35/37] aws_msk_iam: fix critical concurrency and memory issues Signed-off-by: Arbin --- include/fluent-bit/aws/flb_aws_msk_iam.h | 8 +++- plugins/in_kafka/in_kafka.c | 27 +++++++---- plugins/out_kafka/kafka_config.c | 20 ++++---- src/aws/flb_aws_msk_iam.c | 61 +++++++++++++++++------- 4 files changed, 81 insertions(+), 35 deletions(-) diff --git a/include/fluent-bit/aws/flb_aws_msk_iam.h b/include/fluent-bit/aws/flb_aws_msk_iam.h index 127d03dbdf4..b745fa03d35 100644 --- a/include/fluent-bit/aws/flb_aws_msk_iam.h +++ b/include/fluent-bit/aws/flb_aws_msk_iam.h @@ -36,11 +36,17 @@ struct flb_msk_iam_cb { /* * Register the oauthbearer refresh callback for MSK IAM authentication. + * Parameters: + * - config: Fluent Bit configuration + * - kconf: rdkafka configuration + * - opaque: Kafka opaque context (will be set with MSK IAM context) + * - brokers: Comma-separated list of broker addresses (used to extract AWS region) * Returns context pointer on success or NULL on failure. */ struct flb_aws_msk_iam *flb_aws_msk_iam_register_oauth_cb(struct flb_config *config, rd_kafka_conf_t *kconf, - struct flb_kafka_opaque *opaque); + struct flb_kafka_opaque *opaque, + const char *brokers); void flb_aws_msk_iam_destroy(struct flb_aws_msk_iam *ctx); #endif diff --git a/plugins/in_kafka/in_kafka.c b/plugins/in_kafka/in_kafka.c index f356dfb8f24..612d3b37259 100644 --- a/plugins/in_kafka/in_kafka.c +++ b/plugins/in_kafka/in_kafka.c @@ -341,7 +341,7 @@ static int in_kafka_init(struct flb_input_instance *ins, flb_plg_error(ins, "failed to create kafka opaque context"); goto init_error; } - flb_kafka_opaque_set(ctx->opaque, ctx, &ctx->kafka); + flb_kafka_opaque_set(ctx->opaque, ctx, NULL); rd_kafka_conf_set_opaque(kafka_conf, ctx->opaque); /* @@ -360,15 +360,16 @@ static int in_kafka_init(struct flb_input_instance *ins, if (ctx->aws_msk_iam && ctx->sasl_mechanism && strcasecmp(ctx->sasl_mechanism, "OAUTHBEARER") == 0) { /* Check if brokers are configured for MSK IAM */ - conf = flb_input_get_property("brokers", ins); - if (conf && (strstr(conf, ".kafka.") || strstr(conf, ".kafka-serverless.")) && - strstr(conf, ".amazonaws.com")) { + if (ctx->kafka.brokers && + (strstr(ctx->kafka.brokers, ".kafka.") || strstr(ctx->kafka.brokers, ".kafka-serverless.")) && + strstr(ctx->kafka.brokers, ".amazonaws.com")) { - /* Register MSK IAM OAuth callback - extract region from broker address */ + /* Register MSK IAM OAuth callback - pass brokers string directly */ flb_plg_info(ins, "registering AWS MSK IAM authentication OAuth callback"); ctx->msk_iam = flb_aws_msk_iam_register_oauth_cb(config, kafka_conf, - ctx->opaque); + ctx->opaque, + ctx->kafka.brokers); if (!ctx->msk_iam) { flb_plg_error(ins, "failed to setup MSK IAM authentication OAuth callback"); @@ -388,15 +389,18 @@ static int in_kafka_init(struct 
flb_input_instance *ins, #endif ctx->kafka.rk = rd_kafka_new(RD_KAFKA_CONSUMER, kafka_conf, errstr, sizeof(errstr)); - /* rd_kafka_new takes ownership of kafka_conf regardless of success/failure */ - kafka_conf = NULL; /* Create Kafka consumer handle */ if (!ctx->kafka.rk) { flb_plg_error(ins, "Failed to create new consumer: %s", errstr); + /* rd_kafka_new() did NOT take ownership on failure; kafka_conf is + * still valid and will be destroyed by init_error cleanup path. */ goto init_error; } + /* rd_kafka_new() takes ownership of kafka_conf on success */ + kafka_conf = NULL; + /* * Enable SASL background callbacks for all OAUTHBEARER configurations. * This ensures OAuth tokens are refreshed automatically even when: @@ -495,6 +499,13 @@ static int in_kafka_init(struct flb_input_instance *ins, if (ctx->opaque) { flb_kafka_opaque_destroy(ctx->opaque); } + +#ifdef FLB_HAVE_AWS_MSK_IAM + if (ctx->msk_iam) { + flb_aws_msk_iam_destroy(ctx->msk_iam); + } +#endif + flb_sds_destroy(ctx->sasl_mechanism); flb_free(ctx); diff --git a/plugins/out_kafka/kafka_config.c b/plugins/out_kafka/kafka_config.c index c59f56629ff..287e61c7ba4 100644 --- a/plugins/out_kafka/kafka_config.c +++ b/plugins/out_kafka/kafka_config.c @@ -203,7 +203,7 @@ struct flb_out_kafka *flb_out_kafka_create(struct flb_output_instance *ins, } /* store the plugin context so callbacks can log properly */ - flb_kafka_opaque_set(ctx->opaque, ctx, &ctx->kafka); + flb_kafka_opaque_set(ctx->opaque, ctx, NULL); rd_kafka_conf_set_opaque(ctx->conf, ctx->opaque); /* @@ -222,15 +222,16 @@ struct flb_out_kafka *flb_out_kafka_create(struct flb_output_instance *ins, if (ctx->aws_msk_iam && ctx->sasl_mechanism && strcasecmp(ctx->sasl_mechanism, "OAUTHBEARER") == 0) { /* Check if brokers are configured for MSK IAM */ - tmp = flb_output_get_property("brokers", ins); - if (tmp && (strstr(tmp, ".kafka.") || strstr(tmp, ".kafka-serverless.")) && - strstr(tmp, ".amazonaws.com")) { + if (ctx->kafka.brokers && + (strstr(ctx->kafka.brokers, ".kafka.") || strstr(ctx->kafka.brokers, ".kafka-serverless.")) && + strstr(ctx->kafka.brokers, ".amazonaws.com")) { - /* Register MSK IAM OAuth callback - extract region from broker address */ + /* Register MSK IAM OAuth callback - pass brokers string directly */ flb_plg_info(ins, "registering AWS MSK IAM authentication OAuth callback"); ctx->msk_iam = flb_aws_msk_iam_register_oauth_cb(config, ctx->conf, - ctx->opaque); + ctx->opaque, + ctx->kafka.brokers); if (!ctx->msk_iam) { flb_plg_error(ctx->ins, "failed to setup MSK IAM authentication OAuth callback"); flb_out_kafka_destroy(ctx); @@ -251,16 +252,19 @@ struct flb_out_kafka *flb_out_kafka_create(struct flb_output_instance *ins, /* Kafka Producer */ ctx->kafka.rk = rd_kafka_new(RD_KAFKA_PRODUCER, ctx->conf, errstr, sizeof(errstr)); - /* rd_kafka_new takes ownership of conf regardless of success/failure */ - ctx->conf = NULL; if (!ctx->kafka.rk) { flb_plg_error(ctx->ins, "failed to create producer: %s", errstr); + /* rd_kafka_new() did NOT take ownership on failure; ctx->conf is + * still valid and will be destroyed by flb_out_kafka_destroy(). */ flb_out_kafka_destroy(ctx); return NULL; } + /* rd_kafka_new() takes ownership of ctx->conf on success */ + ctx->conf = NULL; + /* * Enable SASL background callbacks for all OAUTHBEARER configurations. * This ensures OAuth tokens are refreshed automatically even on idle connections. 
diff --git a/src/aws/flb_aws_msk_iam.c b/src/aws/flb_aws_msk_iam.c
index 6f8a7ec671a..f7839dff731 100644
--- a/src/aws/flb_aws_msk_iam.c
+++ b/src/aws/flb_aws_msk_iam.c
@@ -37,6 +37,7 @@
 #include 
 #include 
 #include 
+#include <pthread.h>
 
 /*
  * OAuth token lifetime of 5 minutes (industry standard).
@@ -50,6 +51,7 @@ struct flb_aws_msk_iam {
     int is_serverless;          /* Flag to indicate if this is MSK Serverless */
     struct flb_tls *cred_tls;
    struct flb_aws_provider *provider;
+    pthread_mutex_t lock;       /* Protects credential provider access from concurrent threads */
 };
 
 static int to_encode(char c)
@@ -528,8 +530,19 @@ static void oauthbearer_token_refresh_cb(rd_kafka_t *rk,
 
     flb_debug("[aws_msk_iam] OAuth token refresh callback triggered");
 
+    /*
+     * CRITICAL CONCURRENCY FIX:
+     * Lock the credential provider to prevent race conditions.
+     * The librdkafka refresh callback executes in its internal thread context,
+     * while Fluent Bit may access the same provider from other threads.
+     * Without synchronization, concurrent refresh/get_credentials calls can
+     * corrupt provider state and cause authentication failures.
+     */
+    pthread_mutex_lock(&config->lock);
+
     /* Refresh credentials */
     if (config->provider->provider_vtable->refresh(config->provider) < 0) {
+        pthread_mutex_unlock(&config->lock);
         flb_warn("[aws_msk_iam] credential refresh failed, will retry on next callback");
         rd_kafka_oauthbearer_set_token_failure(rk, "credential refresh failed");
         return;
@@ -538,11 +551,15 @@ static void oauthbearer_token_refresh_cb(rd_kafka_t *rk,
     /* Get credentials */
     creds = config->provider->provider_vtable->get_credentials(config->provider);
     if (!creds) {
+        pthread_mutex_unlock(&config->lock);
         flb_error("[aws_msk_iam] failed to get AWS credentials from provider");
         rd_kafka_oauthbearer_set_token_failure(rk, "credential retrieval failed");
         return;
     }
 
+    /* Unlock immediately after getting credentials - no need to hold lock during payload generation */
+    pthread_mutex_unlock(&config->lock);
+
     /* Generate payload */
     payload = build_msk_iam_payload(config, host, creds);
     if (!payload) {
@@ -589,32 +606,27 @@ static void oauthbearer_token_refresh_cb(rd_kafka_t *rk,
 /* Register OAuth callback */
 struct flb_aws_msk_iam *flb_aws_msk_iam_register_oauth_cb(struct flb_config *config,
                                                           rd_kafka_conf_t *kconf,
-                                                          struct flb_kafka_opaque *opaque)
+                                                          struct flb_kafka_opaque *opaque,
+                                                          const char *brokers)
 {
     struct flb_aws_msk_iam *ctx;
     flb_sds_t region_str = NULL;
-    struct flb_kafka *kafka_ctx;
     char *first_broker = NULL;
     char *comma;
 
-    /*
-     * Extract region from broker address before allocating context.
-     * The caller must set opaque->msk_iam_ctx to point to struct flb_kafka
-     * before calling this function, which we use to extract the broker address.
-     */
-    if (!opaque || !opaque->msk_iam_ctx) {
-        flb_error("[aws_msk_iam] unable to access kafka context for broker-based region extraction");
+    /* Validate inputs */
+    if (!opaque) {
+        flb_error("[aws_msk_iam] opaque context is required");
         return NULL;
     }
-
-    kafka_ctx = (struct flb_kafka *) opaque->msk_iam_ctx;
-    if (!kafka_ctx->brokers || strlen(kafka_ctx->brokers) == 0) {
+
+    if (!brokers || strlen(brokers) == 0) {
         flb_error("[aws_msk_iam] brokers configuration is required for region extraction");
         return NULL;
     }
 
     /* Extract first broker from comma-separated list */
-    first_broker = flb_strdup(kafka_ctx->brokers);
+    first_broker = flb_strdup(brokers);
     if (!first_broker) {
         flb_error("[aws_msk_iam] failed to allocate memory for broker parsing");
         return NULL;
@@ -629,7 +641,7 @@ struct flb_aws_msk_iam *flb_aws_msk_iam_register_oauth_cb(struct flb_config *con
     region_str = extract_region_from_broker(first_broker);
     if (!region_str || flb_sds_len(region_str) == 0) {
         flb_error("[aws_msk_iam] failed to extract region from broker address: %s",
-                  kafka_ctx->brokers);
+                  brokers);
         flb_free(first_broker);
         if (region_str) {
             flb_sds_destroy(region_str);
@@ -703,15 +715,24 @@ struct flb_aws_msk_iam *flb_aws_msk_iam_register_oauth_cb(struct flb_config *con
     }
     ctx->provider->provider_vtable->async(ctx->provider);
 
+    /* Initialize mutex to protect credential provider access from concurrent threads */
+    if (pthread_mutex_init(&ctx->lock, NULL) != 0) {
+        flb_error("[aws_msk_iam] failed to initialize credential provider mutex");
+        flb_aws_provider_destroy(ctx->provider);
+        flb_tls_destroy(ctx->cred_tls);
+        flb_sds_destroy(ctx->region);
+        flb_free(ctx);
+        return NULL;
+    }
+
     /*
-     * CRITICAL: Set the correct context type in opaque BEFORE registering the callback.
-     * This eliminates the race condition where the callback could be triggered with
-     * the wrong context type during the initialization window.
+     * Set MSK IAM context in opaque - now opaque->msk_iam_ctx only holds
+     * struct flb_aws_msk_iam * throughout its lifetime, eliminating type confusion.
      */
     flb_kafka_opaque_set(opaque, NULL, ctx);
     rd_kafka_conf_set_opaque(kconf, opaque);
 
-    /* Now safe to register callback - opaque->msk_iam_ctx is already the correct type */
+    /* Register OAuth token refresh callback */
     rd_kafka_conf_set_oauthbearer_token_refresh_cb(kconf, oauthbearer_token_refresh_cb);
 
     return ctx;
@@ -735,5 +756,9 @@ void flb_aws_msk_iam_destroy(struct flb_aws_msk_iam *ctx)
     if (ctx->region) {
         flb_sds_destroy(ctx->region);
     }
+
+    /* Destroy the credential provider mutex */
+    pthread_mutex_destroy(&ctx->lock);
+
     flb_free(ctx);
 }
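The locking discipline this patch introduces is narrow by design: the mutex guards only the shared provider calls and is released before any signing work. A condensed sketch of the callback's critical section (fetch_credentials_locked is an illustrative helper using the context and vtable from this patch, not a function it adds):

    #include <pthread.h>

    static struct flb_aws_credentials *fetch_credentials_locked(struct flb_aws_msk_iam *ctx)
    {
        struct flb_aws_credentials *creds;

        pthread_mutex_lock(&ctx->lock);
        if (ctx->provider->provider_vtable->refresh(ctx->provider) < 0) {
            pthread_mutex_unlock(&ctx->lock);
            return NULL;   /* caller reports the failure to librdkafka */
        }
        creds = ctx->provider->provider_vtable->get_credentials(ctx->provider);
        pthread_mutex_unlock(&ctx->lock);   /* drop the lock before payload generation */
        return creds;
    }

Holding the lock across only refresh() and get_credentials() prevents concurrent callbacks from serializing the comparatively expensive SigV4 payload generation that follows.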
From c8ffbf40f3017b22827a7671302a27f98c295fba Mon Sep 17 00:00:00 2001
From: Arbin
Date: Sat, 29 Nov 2025 02:56:38 +0800
Subject: [PATCH 36/37] aws_msk_iam: improve log clarity for cluster detection and token refresh

Signed-off-by: Arbin
---
 src/aws/flb_aws_msk_iam.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/aws/flb_aws_msk_iam.c b/src/aws/flb_aws_msk_iam.c
index f7839dff731..ca1a8cf25d2 100644
--- a/src/aws/flb_aws_msk_iam.c
+++ b/src/aws/flb_aws_msk_iam.c
@@ -594,8 +594,7 @@ static void oauthbearer_token_refresh_cb(rd_kafka_t *rk,
         rd_kafka_oauthbearer_set_token_failure(rk, errstr);
     }
     else {
-        flb_info("[aws_msk_iam] OAuth bearer token successfully set with %d second lifetime",
-                 MSK_IAM_TOKEN_LIFETIME_SECONDS);
+        flb_info("[aws_msk_iam] OAuth bearer token refreshed");
     }
 
     if (payload) {
@@ -673,8 +672,9 @@ struct flb_aws_msk_iam *flb_aws_msk_iam_register_oauth_cb(struct flb_config *con
     flb_free(first_broker);
     first_broker = NULL;
 
-    flb_info("[aws_msk_iam] extracted region '%s' from broker address%s",
-             region_str, ctx->is_serverless ? " (Serverless)" : "");
+    flb_info("[aws_msk_iam] detected %s MSK cluster, region: %s",
+             ctx->is_serverless ? "Serverless" : "Standard",
+             region_str);
 
     /* Create TLS instance */
     ctx->cred_tls = flb_tls_create(FLB_TLS_CLIENT_MODE,

From a64f819eca61fa30337249ae494f52218ab02fa4 Mon Sep 17 00:00:00 2001
From: Arbin
Date: Sat, 29 Nov 2025 18:13:10 +0800
Subject: [PATCH 37/37] aws_msk_iam: support VPC endpoint

Signed-off-by: Arbin
---
 src/aws/flb_aws_msk_iam.c | 34 ++++++++++++++++++++++++++++------
 1 file changed, 28 insertions(+), 6 deletions(-)

diff --git a/src/aws/flb_aws_msk_iam.c b/src/aws/flb_aws_msk_iam.c
index ca1a8cf25d2..4be3ea7e261 100644
--- a/src/aws/flb_aws_msk_iam.c
+++ b/src/aws/flb_aws_msk_iam.c
@@ -135,14 +135,17 @@ static int hmac_sha256_sign(unsigned char out[32],
 }
 
 /* Extract region from MSK broker address
- * MSK Standard format: b-1.example.c1.kafka.<region>.amazonaws.com:port
- * MSK Serverless format: boot-<id>.c<n>.kafka-serverless.<region>.amazonaws.com:port
+ * Supported formats:
+ * - MSK Standard: b-1.example.c1.kafka.<region>.amazonaws.com:port
+ * - MSK Serverless: boot-<id>.c<n>.kafka-serverless.<region>.amazonaws.com:port
+ * - VPC Endpoint: vpce-<id>.kafka.<region>.vpce.amazonaws.com:port
  */
 static flb_sds_t extract_region_from_broker(const char *broker)
 {
     const char *p;
     const char *start;
     const char *end;
+    const char *port_pos;
     size_t len;
     flb_sds_t out;
 
     if (!broker) {
         return NULL;
     }
 
+    /* Remove port if present (e.g., :9098) */
+    port_pos = strchr(broker, ':');
+    if (port_pos) {
+        len = port_pos - broker;
+    } else {
+        len = strlen(broker);
+    }
+
     /* Find .amazonaws.com */
     p = strstr(broker, ".amazonaws.com");
-    if (!p) {
+    if (!p || p >= broker + len) {
         return NULL;
     }
 
     /* Region is between the last dot before .amazonaws.com and .amazonaws.com
-     * Example: ...kafka.us-east-1.amazonaws.com
-     * or ...kafka-serverless.us-east-1.amazonaws.com
+     * Handle VPC endpoints (vpce-xxx.kafka.region.vpce.amazonaws.com)
+     * Example formats:
+     *   Standard:     ...kafka.us-east-1.amazonaws.com
+     *   Serverless:   ...kafka-serverless.us-east-1.amazonaws.com
+     *   VPC Endpoint: ...kafka.us-east-1.vpce.amazonaws.com
      */
     end = p;   /* Points to .amazonaws.com */
 
+    /* Check for VPC endpoint format: .vpce.amazonaws.com */
+    if (p >= broker + 5 && strncmp(p - 5, ".vpce", 5) == 0) {
+        /* For VPC endpoints, region ends at .vpce */
+        end = p - 5;
+    }
+
     /* Find the start of region by going backwards to find the previous dot */
     start = end - 1;
     while (start > broker && *start != '.') {
@@ -177,7 +197,9 @@ static flb_sds_t extract_region_from_broker(const char *broker)
     }
 
     len = end - start;
-    if (len == 0 || len > 32) { /* Sanity check on region length (AWS regions are typically <= 20 chars) */
+
+    /* Sanity check on region length (AWS regions are typically 9-20 chars) */
+    if (len == 0 || len > 32) {
         return NULL;
     }
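The resulting parsing rule is easy to exercise in isolation: strip the port, locate ".amazonaws.com", step back over a trailing ".vpce" label when present, and take the dotted label immediately before it as the region. A standalone sketch under those assumptions (a simplified reimplementation for illustration, not the patched function itself):

    #include <stdio.h>
    #include <string.h>

    static int region_of(const char *broker, char *out, size_t out_size)
    {
        const char *colon = strchr(broker, ':');
        size_t len = colon ? (size_t) (colon - broker) : strlen(broker);
        const char *p;
        const char *end;
        const char *start;

        /* The marker must sit inside the host part, before any port */
        p = strstr(broker, ".amazonaws.com");
        if (!p || p >= broker + len) {
            return -1;
        }
        end = p;
        if (p - broker >= 5 && strncmp(p - 5, ".vpce", 5) == 0) {
            end = p - 5;   /* VPC endpoint: region ends at ".vpce" */
        }
        /* Walk back to the dot that opens the region label */
        start = end - 1;
        while (start > broker && *start != '.') {
            start--;
        }
        if (*start == '.') {
            start++;
        }
        if (end - start == 0 || (size_t) (end - start) >= out_size) {
            return -1;
        }
        memcpy(out, start, end - start);
        out[end - start] = '\0';
        return 0;
    }

    int main(void)
    {
        char region[33];

        if (region_of("b-1.demo.c1.kafka.us-east-1.amazonaws.com:9098", region, sizeof(region)) == 0) {
            printf("%s\n", region);   /* us-east-1 */
        }
        if (region_of("vpce-0ab1.kafka.eu-west-2.vpce.amazonaws.com:9098", region, sizeof(region)) == 0) {
            printf("%s\n", region);   /* eu-west-2 */
        }
        return 0;
    }

The same backward scan covers all three broker shapes because in each one the region is the last dotted label before either ".amazonaws.com" or ".vpce.amazonaws.com".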