From e0d0edb44e353b9bee517782a70b7762d5576dfb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Wed, 2 Apr 2025 15:33:07 +0200 Subject: [PATCH 1/6] conditioner: make text encoders optional for SD3.x --- conditioner.hpp | 181 +++++++++++++++++++++++++++++++++++++----------- 1 file changed, 142 insertions(+), 39 deletions(-) diff --git a/conditioner.hpp b/conditioner.hpp index b1dc76983..7d637d643 100644 --- a/conditioner.hpp +++ b/conditioner.hpp @@ -658,38 +658,108 @@ struct SD3CLIPEmbedder : public Conditioner { std::shared_ptr clip_l; std::shared_ptr clip_g; std::shared_ptr t5; + bool use_clip_l = false; + bool use_clip_g = false; + bool use_t5 = false; SD3CLIPEmbedder(ggml_backend_t backend, bool offload_params_to_cpu, const String2GGMLType& tensor_types = {}) : clip_g_tokenizer(0) { - clip_l = std::make_shared(backend, offload_params_to_cpu, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, false); - clip_g = std::make_shared(backend, offload_params_to_cpu, tensor_types, "text_encoders.clip_g.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false); - t5 = std::make_shared(backend, offload_params_to_cpu, tensor_types, "text_encoders.t5xxl.transformer"); + if (clip_skip <= 0) { + clip_skip = 2; + } + + for (auto pair : tensor_types) { + if (pair.first.find("text_encoders.clip_l") != std::string::npos) { + use_clip_l = true; + } else if (pair.first.find("text_encoders.clip_g") != std::string::npos) { + use_clip_g = true; + } else if (pair.first.find("text_encoders.t5xxl") != std::string::npos) { + use_t5 = true; + } + } + if (!use_clip_l && !use_clip_g && !use_t5) { + LOG_WARN("IMPORTANT NOTICE: No text encoders provided, cannot process prompts!"); + return; + } + if (use_clip_l) { + clip_l = std::make_shared(backend, offload_params_to_cpu, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, clip_skip, false); + } else { + LOG_WARN("clip_l text encoder not found! 
Prompt adherence might be degraded."); + } + if (use_clip_g) { + clip_g = std::make_shared(backend, offload_params_to_cpu, tensor_types, "text_encoders.clip_g.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, clip_skip, false); + } else { + LOG_WARN("clip_g text encoder not found! Prompt adherence might be degraded."); + } + if (use_t5) { + t5 = std::make_shared(backend, offload_params_to_cpu, tensor_types, "text_encoders.t5xxl.transformer"); + } else { + LOG_WARN("t5xxl text encoder not found! Prompt adherence might be degraded."); + } + set_clip_skip(clip_skip); + } + + void set_clip_skip(int clip_skip) { + if (clip_skip <= 0) { + clip_skip = 2; + } + if (use_clip_l) { + clip_l->set_clip_skip(clip_skip); + } + if (use_clip_g) { + clip_g->set_clip_skip(clip_skip); + } } void get_param_tensors(std::map& tensors) { - clip_l->get_param_tensors(tensors, "text_encoders.clip_l.transformer.text_model"); - clip_g->get_param_tensors(tensors, "text_encoders.clip_g.transformer.text_model"); - t5->get_param_tensors(tensors, "text_encoders.t5xxl.transformer"); + if (use_clip_l) { + clip_l->get_param_tensors(tensors, "text_encoders.clip_l.transformer.text_model"); + } + if (use_clip_g) { + clip_g->get_param_tensors(tensors, "text_encoders.clip_g.transformer.text_model"); + } + if (use_t5) { + t5->get_param_tensors(tensors, "text_encoders.t5xxl.transformer"); + } } void alloc_params_buffer() { - clip_l->alloc_params_buffer(); - clip_g->alloc_params_buffer(); - t5->alloc_params_buffer(); + if (use_clip_l) { + clip_l->alloc_params_buffer(); + } + if (use_clip_g) { + clip_g->alloc_params_buffer(); + } + if (use_t5) { + t5->alloc_params_buffer(); + } } void free_params_buffer() { - clip_l->free_params_buffer(); - clip_g->free_params_buffer(); - t5->free_params_buffer(); + if (use_clip_l) { + clip_l->free_params_buffer(); + } + if (use_clip_g) { + clip_g->free_params_buffer(); + } + if (use_t5) { + t5->free_params_buffer(); + } } size_t get_params_buffer_size() { - size_t buffer_size = 
clip_l->get_params_buffer_size(); - buffer_size += clip_g->get_params_buffer_size(); - buffer_size += t5->get_params_buffer_size(); + size_t buffer_size = 0; + if (use_clip_l) { + buffer_size += clip_l->get_params_buffer_size(); + } + if (use_clip_g) { + buffer_size += clip_g->get_params_buffer_size(); + } + if (use_t5) { + buffer_size += t5->get_params_buffer_size(); + } return buffer_size; } @@ -721,23 +791,32 @@ struct SD3CLIPEmbedder : public Conditioner { for (const auto& item : parsed_attention) { const std::string& curr_text = item.first; float curr_weight = item.second; - - std::vector curr_tokens = clip_l_tokenizer.encode(curr_text, on_new_token_cb); - clip_l_tokens.insert(clip_l_tokens.end(), curr_tokens.begin(), curr_tokens.end()); - clip_l_weights.insert(clip_l_weights.end(), curr_tokens.size(), curr_weight); - - curr_tokens = clip_g_tokenizer.encode(curr_text, on_new_token_cb); - clip_g_tokens.insert(clip_g_tokens.end(), curr_tokens.begin(), curr_tokens.end()); - clip_g_weights.insert(clip_g_weights.end(), curr_tokens.size(), curr_weight); - - curr_tokens = t5_tokenizer.Encode(curr_text, true); - t5_tokens.insert(t5_tokens.end(), curr_tokens.begin(), curr_tokens.end()); - t5_weights.insert(t5_weights.end(), curr_tokens.size(), curr_weight); + if (use_clip_l) { + std::vector curr_tokens = clip_l_tokenizer.encode(curr_text, on_new_token_cb); + clip_l_tokens.insert(clip_l_tokens.end(), curr_tokens.begin(), curr_tokens.end()); + clip_l_weights.insert(clip_l_weights.end(), curr_tokens.size(), curr_weight); + } + if (use_clip_g) { + std::vector curr_tokens = clip_g_tokenizer.encode(curr_text, on_new_token_cb); + clip_g_tokens.insert(clip_g_tokens.end(), curr_tokens.begin(), curr_tokens.end()); + clip_g_weights.insert(clip_g_weights.end(), curr_tokens.size(), curr_weight); + } + if (use_t5) { + std::vector curr_tokens = t5_tokenizer.Encode(curr_text, true); + t5_tokens.insert(t5_tokens.end(), curr_tokens.begin(), curr_tokens.end()); + 
t5_weights.insert(t5_weights.end(), curr_tokens.size(), curr_weight); + } } - clip_l_tokenizer.pad_tokens(clip_l_tokens, clip_l_weights, max_length, padding); - clip_g_tokenizer.pad_tokens(clip_g_tokens, clip_g_weights, max_length, padding); - t5_tokenizer.pad_tokens(t5_tokens, t5_weights, NULL, max_length, padding); + if (use_clip_l) { + clip_l_tokenizer.pad_tokens(clip_l_tokens, clip_l_weights, max_length, padding); + } + if (use_clip_g) { + clip_g_tokenizer.pad_tokens(clip_g_tokens, clip_g_weights, max_length, padding); + } + if (use_t5) { + t5_tokenizer.pad_tokens(t5_tokens, t5_weights, NULL, max_length, padding); + } // for (int i = 0; i < clip_l_tokens.size(); i++) { // std::cout << clip_l_tokens[i] << ":" << clip_l_weights[i] << ", "; @@ -785,10 +864,10 @@ struct SD3CLIPEmbedder : public Conditioner { std::vector hidden_states_vec; size_t chunk_len = 77; - size_t chunk_count = clip_l_tokens.size() / chunk_len; + size_t chunk_count = std::max(std::max(clip_l_tokens.size(), clip_g_tokens.size()), t5_tokens.size()) / chunk_len; for (int chunk_idx = 0; chunk_idx < chunk_count; chunk_idx++) { // clip_l - { + if (use_clip_l) { std::vector chunk_tokens(clip_l_tokens.begin() + chunk_idx * chunk_len, clip_l_tokens.begin() + (chunk_idx + 1) * chunk_len); std::vector chunk_weights(clip_l_weights.begin() + chunk_idx * chunk_len, @@ -835,10 +914,17 @@ struct SD3CLIPEmbedder : public Conditioner { &pooled_l, work_ctx); } + } else { + chunk_hidden_states_l = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, 768, chunk_len); + ggml_set_f32(chunk_hidden_states_l, 0.f); + if (chunk_idx == 0) { + pooled_l = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 768); + ggml_set_f32(pooled_l, 0.f); + } } // clip_g - { + if (use_clip_g) { std::vector chunk_tokens(clip_g_tokens.begin() + chunk_idx * chunk_len, clip_g_tokens.begin() + (chunk_idx + 1) * chunk_len); std::vector chunk_weights(clip_g_weights.begin() + chunk_idx * chunk_len, @@ -886,10 +972,17 @@ struct SD3CLIPEmbedder : public 
Conditioner { &pooled_g, work_ctx); } + } else { + chunk_hidden_states_g = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, 1280, chunk_len); + ggml_set_f32(chunk_hidden_states_g, 0.f); + if (chunk_idx == 0) { + pooled_g = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 1280); + ggml_set_f32(pooled_g, 0.f); + } } // t5 - { + if (use_t5) { std::vector chunk_tokens(t5_tokens.begin() + chunk_idx * chunk_len, t5_tokens.begin() + (chunk_idx + 1) * chunk_len); std::vector chunk_weights(t5_weights.begin() + chunk_idx * chunk_len, @@ -917,6 +1010,8 @@ struct SD3CLIPEmbedder : public Conditioner { float new_mean = ggml_tensor_mean(tensor); ggml_tensor_scale(tensor, (original_mean / new_mean)); } + } else { + chunk_hidden_states_t5 = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, 4096, 0); } auto chunk_hidden_states_lg_pad = ggml_new_tensor_3d(work_ctx, @@ -959,11 +1054,19 @@ struct SD3CLIPEmbedder : public Conditioner { ((float*)chunk_hidden_states->data) + ggml_nelements(chunk_hidden_states)); } - hidden_states = vector_to_ggml_tensor(work_ctx, hidden_states_vec); - hidden_states = ggml_reshape_2d(work_ctx, - hidden_states, - chunk_hidden_states->ne[0], - ggml_nelements(hidden_states) / chunk_hidden_states->ne[0]); + if (hidden_states_vec.size() > 0) { + hidden_states = vector_to_ggml_tensor(work_ctx, hidden_states_vec); + hidden_states = ggml_reshape_2d(work_ctx, + hidden_states, + chunk_hidden_states->ne[0], + ggml_nelements(hidden_states) / chunk_hidden_states->ne[0]); + } else { + hidden_states = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, 4096, 0); + } + if (pooled == NULL) { + pooled = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 2048); + ggml_set_f32(pooled, 0.f); + } return SDCondition(hidden_states, pooled, NULL); } From c61cfecb8c7547735b030db844ee3d1d64457c76 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Wed, 2 Apr 2025 16:14:47 +0200 Subject: [PATCH 2/6] conditioner: make text encoders optional for Flux --- conditioner.hpp | 141 
++++++++++++++++++++++++++++--------------- stable-diffusion.cpp | 1 + 2 files changed, 92 insertions(+), 50 deletions(-) diff --git a/conditioner.hpp b/conditioner.hpp index 7d637d643..2ae098479 100644 --- a/conditioner.hpp +++ b/conditioner.hpp @@ -1107,31 +1107,52 @@ struct FluxCLIPEmbedder : public Conditioner { std::shared_ptr t5; size_t chunk_len = 256; + bool use_clip_l = false; + bool use_t5 = false; + FluxCLIPEmbedder(ggml_backend_t backend, bool offload_params_to_cpu, const String2GGMLType& tensor_types = {}) { - clip_l = std::make_shared(backend, offload_params_to_cpu, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, true); - t5 = std::make_shared(backend, offload_params_to_cpu, tensor_types, "text_encoders.t5xxl.transformer"); + clip_l = std::make_shared(backend, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, true); + t5 = std::make_shared(backend, tensor_types, "text_encoders.t5xxl.transformer"); } + void get_param_tensors(std::map& tensors) { - clip_l->get_param_tensors(tensors, "text_encoders.clip_l.transformer.text_model"); - t5->get_param_tensors(tensors, "text_encoders.t5xxl.transformer"); + if (use_clip_l) { + clip_l->get_param_tensors(tensors, "text_encoders.clip_l.transformer.text_model"); + } + if (use_t5) { + t5->get_param_tensors(tensors, "text_encoders.t5xxl.transformer"); + } } void alloc_params_buffer() { - clip_l->alloc_params_buffer(); - t5->alloc_params_buffer(); + if (use_clip_l) { + clip_l->alloc_params_buffer(); + } + if (use_t5) { + t5->alloc_params_buffer(); + } } void free_params_buffer() { - clip_l->free_params_buffer(); - t5->free_params_buffer(); + if (use_clip_l) { + clip_l->free_params_buffer(); + } + if (use_t5) { + t5->free_params_buffer(); + } } size_t get_params_buffer_size() { - size_t buffer_size = clip_l->get_params_buffer_size(); - buffer_size += t5->get_params_buffer_size(); + size_t buffer_size = 0; + if (use_clip_l) { + buffer_size += 
clip_l->get_params_buffer_size(); + } + if (use_t5) { + buffer_size += t5->get_params_buffer_size(); + } return buffer_size; } @@ -1161,18 +1182,23 @@ struct FluxCLIPEmbedder : public Conditioner { for (const auto& item : parsed_attention) { const std::string& curr_text = item.first; float curr_weight = item.second; - - std::vector curr_tokens = clip_l_tokenizer.encode(curr_text, on_new_token_cb); - clip_l_tokens.insert(clip_l_tokens.end(), curr_tokens.begin(), curr_tokens.end()); - clip_l_weights.insert(clip_l_weights.end(), curr_tokens.size(), curr_weight); - - curr_tokens = t5_tokenizer.Encode(curr_text, true); - t5_tokens.insert(t5_tokens.end(), curr_tokens.begin(), curr_tokens.end()); - t5_weights.insert(t5_weights.end(), curr_tokens.size(), curr_weight); + if (use_clip_l) { + std::vector curr_tokens = clip_l_tokenizer.encode(curr_text, on_new_token_cb); + clip_l_tokens.insert(clip_l_tokens.end(), curr_tokens.begin(), curr_tokens.end()); + clip_l_weights.insert(clip_l_weights.end(), curr_tokens.size(), curr_weight); + } + if (use_t5) { + std::vector curr_tokens = t5_tokenizer.Encode(curr_text, true); + t5_tokens.insert(t5_tokens.end(), curr_tokens.begin(), curr_tokens.end()); + t5_weights.insert(t5_weights.end(), curr_tokens.size(), curr_weight); + } + } + if (use_clip_l) { + clip_l_tokenizer.pad_tokens(clip_l_tokens, clip_l_weights, 77, padding); + } + if (use_t5) { + t5_tokenizer.pad_tokens(t5_tokens, t5_weights, NULL, max_length, padding); } - - clip_l_tokenizer.pad_tokens(clip_l_tokens, clip_l_weights, 77, padding); - t5_tokenizer.pad_tokens(t5_tokens, t5_weights, NULL, max_length, padding); // for (int i = 0; i < clip_l_tokens.size(); i++) { // std::cout << clip_l_tokens[i] << ":" << clip_l_weights[i] << ", "; @@ -1207,35 +1233,37 @@ struct FluxCLIPEmbedder : public Conditioner { struct ggml_tensor* pooled = NULL; // [768,] std::vector hidden_states_vec; - size_t chunk_count = t5_tokens.size() / chunk_len; + size_t chunk_count = 
std::max(clip_l_tokens.size() > 0 ? chunk_len : 0, t5_tokens.size()) / chunk_len; for (int chunk_idx = 0; chunk_idx < chunk_count; chunk_idx++) { // clip_l if (chunk_idx == 0) { - size_t chunk_len_l = 77; - std::vector chunk_tokens(clip_l_tokens.begin(), - clip_l_tokens.begin() + chunk_len_l); - std::vector chunk_weights(clip_l_weights.begin(), - clip_l_weights.begin() + chunk_len_l); + if (use_clip_l) { + size_t chunk_len_l = 77; + std::vector chunk_tokens(clip_l_tokens.begin(), + clip_l_tokens.begin() + chunk_len_l); + std::vector chunk_weights(clip_l_weights.begin(), + clip_l_weights.begin() + chunk_len_l); - auto input_ids = vector_to_ggml_tensor_i32(work_ctx, chunk_tokens); - size_t max_token_idx = 0; + auto input_ids = vector_to_ggml_tensor_i32(work_ctx, chunk_tokens); + size_t max_token_idx = 0; - auto it = std::find(chunk_tokens.begin(), chunk_tokens.end(), clip_l_tokenizer.EOS_TOKEN_ID); - max_token_idx = std::min(std::distance(chunk_tokens.begin(), it), chunk_tokens.size() - 1); + auto it = std::find(chunk_tokens.begin(), chunk_tokens.end(), clip_l_tokenizer.EOS_TOKEN_ID); + max_token_idx = std::min(std::distance(chunk_tokens.begin(), it), chunk_tokens.size() - 1); - clip_l->compute(n_threads, - input_ids, - 0, - NULL, - max_token_idx, - true, - clip_skip, - &pooled, - work_ctx); + clip_l->compute(n_threads, + input_ids, + 0, + NULL, + max_token_idx, + true, + clip_skip, + &pooled, + work_ctx); + } } // t5 - { + if (use_t5) { std::vector chunk_tokens(t5_tokens.begin() + chunk_idx * chunk_len, t5_tokens.begin() + (chunk_idx + 1) * chunk_len); std::vector chunk_weights(t5_weights.begin() + chunk_idx * chunk_len, @@ -1263,8 +1291,12 @@ struct FluxCLIPEmbedder : public Conditioner { float new_mean = ggml_tensor_mean(tensor); ggml_tensor_scale(tensor, (original_mean / new_mean)); } + } else { + chunk_hidden_states = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, 4096, chunk_len); + ggml_set_f32(chunk_hidden_states, 0.f); } + int64_t t1 = ggml_time_ms(); 
LOG_DEBUG("computing condition graph completed, taking %" PRId64 " ms", t1 - t0); if (zero_out_masked) { @@ -1273,17 +1305,26 @@ struct FluxCLIPEmbedder : public Conditioner { vec[i] = 0; } } - + hidden_states_vec.insert(hidden_states_vec.end(), - (float*)chunk_hidden_states->data, - ((float*)chunk_hidden_states->data) + ggml_nelements(chunk_hidden_states)); + (float*)chunk_hidden_states->data, + ((float*)chunk_hidden_states->data) + ggml_nelements(chunk_hidden_states)); + } + + if (hidden_states_vec.size() > 0) { + hidden_states = vector_to_ggml_tensor(work_ctx, hidden_states_vec); + hidden_states = ggml_reshape_2d(work_ctx, + hidden_states, + chunk_hidden_states->ne[0], + ggml_nelements(hidden_states) / chunk_hidden_states->ne[0]); + } else { + hidden_states = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, 4096, 256); + ggml_set_f32(hidden_states, 0.f); + } + if (pooled == NULL) { + pooled = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 768); + ggml_set_f32(pooled, 0.f); } - - hidden_states = vector_to_ggml_tensor(work_ctx, hidden_states_vec); - hidden_states = ggml_reshape_2d(work_ctx, - hidden_states, - chunk_hidden_states->ne[0], - ggml_nelements(hidden_states) / chunk_hidden_states->ne[0]); return SDCondition(hidden_states, pooled, NULL); } diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index ff064bb87..1bb897ddd 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -326,6 +326,7 @@ class StableDiffusionGGML { clip_backend = backend; bool use_t5xxl = false; if (sd_version_is_dit(version)) { + // TODO: check if t5 is actually loaded? 
use_t5xxl = true; } if (!clip_on_cpu && !ggml_backend_is_cpu(backend) && use_t5xxl) { From 250e60f6b96539dcc434f94733f816e409793c14 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Wed, 2 Apr 2025 18:27:28 +0200 Subject: [PATCH 3/6] only force clip on cpu if t5 is actually used --- stable-diffusion.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 1bb897ddd..3121da21b 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -326,8 +326,12 @@ class StableDiffusionGGML { clip_backend = backend; bool use_t5xxl = false; if (sd_version_is_dit(version)) { - // TODO: check if t5 is actually loaded? - use_t5xxl = true; + for (auto pair : model_loader.tensor_storages_types) { + if (pair.first.find("text_encoders.t5xxl") != std::string::npos) { + use_t5xxl = true; + break; + } + } } if (!clip_on_cpu && !ggml_backend_is_cpu(backend) && use_t5xxl) { LOG_WARN( From 7a1623fef3def0fa4e9fa5b7b04cd2209230a3b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Thu, 10 Jul 2025 18:31:08 +0200 Subject: [PATCH 4/6] conditioner: make t5 optional for chroma --- conditioner.hpp | 208 ++++++++++++++++++++++++++++-------------------- 1 file changed, 121 insertions(+), 87 deletions(-) diff --git a/conditioner.hpp b/conditioner.hpp index 2ae098479..f55a964e6 100644 --- a/conditioner.hpp +++ b/conditioner.hpp @@ -666,9 +666,6 @@ struct SD3CLIPEmbedder : public Conditioner { bool offload_params_to_cpu, const String2GGMLType& tensor_types = {}) : clip_g_tokenizer(0) { - if (clip_skip <= 0) { - clip_skip = 2; - } for (auto pair : tensor_types) { if (pair.first.find("text_encoders.clip_l") != std::string::npos) { @@ -684,12 +681,12 @@ struct SD3CLIPEmbedder : public Conditioner { return; } if (use_clip_l) { - clip_l = std::make_shared(backend, offload_params_to_cpu, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, clip_skip, 
false); + clip_l = std::make_shared(backend, offload_params_to_cpu, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, false); } else { LOG_WARN("clip_l text encoder not found! Prompt adherence might be degraded."); } if (use_clip_g) { - clip_g = std::make_shared(backend, offload_params_to_cpu, tensor_types, "text_encoders.clip_g.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, clip_skip, false); + clip_g = std::make_shared(backend, offload_params_to_cpu, tensor_types, "text_encoders.clip_g.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false); } else { LOG_WARN("clip_g text encoder not found! Prompt adherence might be degraded."); } @@ -698,19 +695,6 @@ struct SD3CLIPEmbedder : public Conditioner { } else { LOG_WARN("t5xxl text encoder not found! Prompt adherence might be degraded."); } - set_clip_skip(clip_skip); - } - - void set_clip_skip(int clip_skip) { - if (clip_skip <= 0) { - clip_skip = 2; - } - if (use_clip_l) { - clip_l->set_clip_skip(clip_skip); - } - if (use_clip_g) { - clip_g->set_clip_skip(clip_skip); - } } void get_param_tensors(std::map& tensors) { @@ -1113,10 +1097,30 @@ struct FluxCLIPEmbedder : public Conditioner { FluxCLIPEmbedder(ggml_backend_t backend, bool offload_params_to_cpu, const String2GGMLType& tensor_types = {}) { - clip_l = std::make_shared(backend, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, true); - t5 = std::make_shared(backend, tensor_types, "text_encoders.t5xxl.transformer"); - } + for (auto pair : tensor_types) { + if (pair.first.find("text_encoders.clip_l") != std::string::npos) { + use_clip_l = true; + } else if (pair.first.find("text_encoders.t5xxl") != std::string::npos) { + use_t5 = true; + } + } + + if (!use_clip_l && !use_t5) { + LOG_WARN("IMPORTANT NOTICE: No text encoders provided, cannot process prompts!"); + return; + } + if (use_clip_l) { + clip_l = std::make_shared(backend, offload_params_to_cpu, tensor_types, 
"text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, true); + } else { + LOG_WARN("clip_l text encoder not found! Prompt adherence might be degraded."); + } + if (use_t5) { + t5 = std::make_shared(backend, offload_params_to_cpu, tensor_types, "text_encoders.t5xxl.transformer"); + } else { + LOG_WARN("t5xxl text encoder not found! Prompt adherence might be degraded."); + } + } void get_param_tensors(std::map& tensors) { if (use_clip_l) { @@ -1296,7 +1300,6 @@ struct FluxCLIPEmbedder : public Conditioner { ggml_set_f32(chunk_hidden_states, 0.f); } - int64_t t1 = ggml_time_ms(); LOG_DEBUG("computing condition graph completed, taking %" PRId64 " ms", t1 - t0); if (zero_out_masked) { @@ -1305,12 +1308,12 @@ struct FluxCLIPEmbedder : public Conditioner { vec[i] = 0; } } - + hidden_states_vec.insert(hidden_states_vec.end(), - (float*)chunk_hidden_states->data, - ((float*)chunk_hidden_states->data) + ggml_nelements(chunk_hidden_states)); + (float*)chunk_hidden_states->data, + ((float*)chunk_hidden_states->data) + ggml_nelements(chunk_hidden_states)); } - + if (hidden_states_vec.size() > 0) { hidden_states = vector_to_ggml_tensor(work_ctx, hidden_states_vec); hidden_states = ggml_reshape_2d(work_ctx, @@ -1364,7 +1367,8 @@ struct T5CLIPEmbedder : public Conditioner { size_t chunk_len = 512; bool use_mask = false; int mask_pad = 1; - bool is_umt5 = false; + bool use_t5 = false; + T5CLIPEmbedder(ggml_backend_t backend, bool offload_params_to_cpu, @@ -1373,26 +1377,43 @@ struct T5CLIPEmbedder : public Conditioner { int mask_pad = 1, bool is_umt5 = false) : use_mask(use_mask), mask_pad(mask_pad), t5_tokenizer(is_umt5) { - t5 = std::make_shared(backend, offload_params_to_cpu, tensor_types, "text_encoders.t5xxl.transformer", is_umt5); + for (auto pair : tensor_types) { + if (pair.first.find("text_encoders.t5xxl") != std::string::npos) { + use_t5 = true; + } + } + + if (!use_t5) { + LOG_WARN("IMPORTANT NOTICE: No text encoders provided, cannot process prompts!"); + 
return; + } else { + t5 = std::make_shared(backend, offload_params_to_cpu, tensor_types, "text_encoders.t5xxl.transformer", is_umt5); + } } void get_param_tensors(std::map& tensors) { - t5->get_param_tensors(tensors, "text_encoders.t5xxl.transformer"); + if (use_t5) { + t5->get_param_tensors(tensors, "text_encoders.t5xxl.transformer"); + } } void alloc_params_buffer() { - t5->alloc_params_buffer(); + if (use_t5) { + t5->alloc_params_buffer(); + } } void free_params_buffer() { - t5->free_params_buffer(); + if (use_t5) { + t5->free_params_buffer(); + } } size_t get_params_buffer_size() { size_t buffer_size = 0; - - buffer_size += t5->get_params_buffer_size(); - + if (use_t5) { + buffer_size += t5->get_params_buffer_size(); + } return buffer_size; } @@ -1418,17 +1439,18 @@ struct T5CLIPEmbedder : public Conditioner { std::vector t5_tokens; std::vector t5_weights; std::vector t5_mask; - for (const auto& item : parsed_attention) { - const std::string& curr_text = item.first; - float curr_weight = item.second; - - std::vector curr_tokens = t5_tokenizer.Encode(curr_text, true); - t5_tokens.insert(t5_tokens.end(), curr_tokens.begin(), curr_tokens.end()); - t5_weights.insert(t5_weights.end(), curr_tokens.size(), curr_weight); - } + if (use_t5) { + for (const auto& item : parsed_attention) { + const std::string& curr_text = item.first; + float curr_weight = item.second; - t5_tokenizer.pad_tokens(t5_tokens, t5_weights, &t5_mask, max_length, padding); + std::vector curr_tokens = t5_tokenizer.Encode(curr_text, true); + t5_tokens.insert(t5_tokens.end(), curr_tokens.begin(), curr_tokens.end()); + t5_weights.insert(t5_weights.end(), curr_tokens.size(), curr_weight); + } + t5_tokenizer.pad_tokens(t5_tokens, t5_weights, &t5_mask, max_length, padding); + } return {t5_tokens, t5_weights, t5_mask}; } @@ -1465,66 +1487,78 @@ struct T5CLIPEmbedder : public Conditioner { std::vector hidden_states_vec; size_t chunk_count = t5_tokens.size() / chunk_len; - for (int chunk_idx = 0; chunk_idx < 
chunk_count; chunk_idx++) { // t5 - std::vector chunk_tokens(t5_tokens.begin() + chunk_idx * chunk_len, - t5_tokens.begin() + (chunk_idx + 1) * chunk_len); - std::vector chunk_weights(t5_weights.begin() + chunk_idx * chunk_len, - t5_weights.begin() + (chunk_idx + 1) * chunk_len); - std::vector chunk_mask(t5_attn_mask_vec.begin() + chunk_idx * chunk_len, - t5_attn_mask_vec.begin() + (chunk_idx + 1) * chunk_len); - - auto input_ids = vector_to_ggml_tensor_i32(work_ctx, chunk_tokens); - auto t5_attn_mask_chunk = use_mask ? vector_to_ggml_tensor(work_ctx, chunk_mask) : NULL; - - t5->compute(n_threads, - input_ids, - t5_attn_mask_chunk, - &chunk_hidden_states, - work_ctx); - { - auto tensor = chunk_hidden_states; - float original_mean = ggml_tensor_mean(tensor); - for (int i2 = 0; i2 < tensor->ne[2]; i2++) { - for (int i1 = 0; i1 < tensor->ne[1]; i1++) { - for (int i0 = 0; i0 < tensor->ne[0]; i0++) { - float value = ggml_tensor_get_f32(tensor, i0, i1, i2); - value *= chunk_weights[i1]; - ggml_tensor_set_f32(tensor, value, i0, i1, i2); + + if (use_t5) { + std::vector chunk_tokens(t5_tokens.begin() + chunk_idx * chunk_len, + t5_tokens.begin() + (chunk_idx + 1) * chunk_len); + std::vector chunk_weights(t5_weights.begin() + chunk_idx * chunk_len, + t5_weights.begin() + (chunk_idx + 1) * chunk_len); + std::vector chunk_mask(t5_attn_mask_vec.begin() + chunk_idx * chunk_len, + t5_attn_mask_vec.begin() + (chunk_idx + 1) * chunk_len); + + auto input_ids = vector_to_ggml_tensor_i32(work_ctx, chunk_tokens); + auto t5_attn_mask_chunk = use_mask ? 
vector_to_ggml_tensor(work_ctx, chunk_mask) : NULL; + t5->compute(n_threads, + input_ids, + t5_attn_mask_chunk, + &chunk_hidden_states, + work_ctx); + { + auto tensor = chunk_hidden_states; + float original_mean = ggml_tensor_mean(tensor); + for (int i2 = 0; i2 < tensor->ne[2]; i2++) { + for (int i1 = 0; i1 < tensor->ne[1]; i1++) { + for (int i0 = 0; i0 < tensor->ne[0]; i0++) { + float value = ggml_tensor_get_f32(tensor, i0, i1, i2); + value *= chunk_weights[i1]; + ggml_tensor_set_f32(tensor, value, i0, i1, i2); + } } } + float new_mean = ggml_tensor_mean(tensor); + ggml_tensor_scale(tensor, (original_mean / new_mean)); } - float new_mean = ggml_tensor_mean(tensor); - ggml_tensor_scale(tensor, (original_mean / new_mean)); - } - - int64_t t1 = ggml_time_ms(); - LOG_DEBUG("computing condition graph completed, taking %" PRId64 " ms", t1 - t0); - if (zero_out_masked) { - auto tensor = chunk_hidden_states; - for (int i2 = 0; i2 < tensor->ne[2]; i2++) { - for (int i1 = 0; i1 < tensor->ne[1]; i1++) { - for (int i0 = 0; i0 < tensor->ne[0]; i0++) { - if (chunk_mask[i1] < 0.f) { - ggml_tensor_set_f32(tensor, 0.f, i0, i1, i2); + if (zero_out_masked) { + auto tensor = chunk_hidden_states; + for (int i2 = 0; i2 < tensor->ne[2]; i2++) { + for (int i1 = 0; i1 < tensor->ne[1]; i1++) { + for (int i0 = 0; i0 < tensor->ne[0]; i0++) { + if (chunk_mask[i1] < 0.f) { + ggml_tensor_set_f32(tensor, 0.f, i0, i1, i2); + } } } } } + } else { + chunk_hidden_states = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, 4096, chunk_len); + ggml_set_f32(chunk_hidden_states, 0.f); + t5_attn_mask = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, chunk_len); + ggml_set_f32(t5_attn_mask, -HUGE_VALF); } + int64_t t1 = ggml_time_ms(); + LOG_DEBUG("computing condition graph completed, taking %" PRId64 " ms", t1 - t0); + hidden_states_vec.insert(hidden_states_vec.end(), (float*)chunk_hidden_states->data, ((float*)chunk_hidden_states->data) + ggml_nelements(chunk_hidden_states)); } - 
GGML_ASSERT(hidden_states_vec.size() > 0); - hidden_states = vector_to_ggml_tensor(work_ctx, hidden_states_vec); - hidden_states = ggml_reshape_2d(work_ctx, - hidden_states, - chunk_hidden_states->ne[0], - ggml_nelements(hidden_states) / chunk_hidden_states->ne[0]); + if (hidden_states_vec.size() > 0) { + hidden_states = vector_to_ggml_tensor(work_ctx, hidden_states_vec); + hidden_states = ggml_reshape_2d(work_ctx, + hidden_states, + chunk_hidden_states->ne[0], + ggml_nelements(hidden_states) / chunk_hidden_states->ne[0]); + } else { + hidden_states = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, 4096, chunk_len); + ggml_set_f32(hidden_states, 0.f); + t5_attn_mask = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, chunk_len); + ggml_set_f32(t5_attn_mask, -HUGE_VALF); + } modify_mask_to_attend_padding(t5_attn_mask, ggml_nelements(t5_attn_mask), mask_pad); From 9a2ef28a8a68e75a0243e7c58d06f09b107f9afc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Thu, 10 Jul 2025 18:36:21 +0200 Subject: [PATCH 5/6] Add todo --- conditioner.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/conditioner.hpp b/conditioner.hpp index f55a964e6..13bfb36b5 100644 --- a/conditioner.hpp +++ b/conditioner.hpp @@ -1554,6 +1554,7 @@ struct T5CLIPEmbedder : public Conditioner { chunk_hidden_states->ne[0], ggml_nelements(hidden_states) / chunk_hidden_states->ne[0]); } else { + // TODO: maybe precompute embeddings for token and fill with that instead? 
(proper uncond) hidden_states = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, 4096, chunk_len); ggml_set_f32(hidden_states, 0.f); t5_attn_mask = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, chunk_len); From 85e38abde6dbb962f37cd136e5dd29305e8595c4 Mon Sep 17 00:00:00 2001 From: leejet Date: Sat, 25 Oct 2025 21:59:10 +0800 Subject: [PATCH 6/6] fix SD3CLIPEmbedder::get_learned_condition_common --- conditioner.hpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/conditioner.hpp b/conditioner.hpp index c9ad99270..e4b475cee 100644 --- a/conditioner.hpp +++ b/conditioner.hpp @@ -998,7 +998,8 @@ struct SD3CLIPEmbedder : public Conditioner { ggml_tensor_scale(tensor, (original_mean / new_mean)); } } else { - chunk_hidden_states_t5 = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, 4096, 0); + chunk_hidden_states_t5 = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, 4096, chunk_len); + ggml_set_f32(chunk_hidden_states_t5, 0.f); } auto chunk_hidden_states_lg_pad = ggml_new_tensor_3d(work_ctx, @@ -1182,6 +1183,7 @@ struct FluxCLIPEmbedder : public Conditioner { t5_weights.insert(t5_weights.end(), curr_tokens.size(), curr_weight); } } + if (clip_l) { clip_l_tokenizer.pad_tokens(clip_l_tokens, clip_l_weights, 77, padding); }