@@ -106,6 +106,7 @@ static kcpp_params * kcpp_data = nullptr;
 static int max_context_limit_at_load = 0;
 static int n_past = 0;
 static int debugmode = 0; // -1 = hide all, 0 = normal, 1 = showall
+static bool quiet = false;
 static std::vector<gpt_vocab::id> last_n_tokens;
 static std::vector<gpt_vocab::id> current_context_tokens;
 static size_t mem_per_token = 0;
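Note on semantics: `debugmode` is tri-state (-1 hides all output, 0 is normal, 1 shows all), and the new `quiet` flag suppresses per-request debug output on top of that. A minimal self-contained sketch of the gate this patch applies inline at every debug print site (the `debug_log` helper is hypothetical, not part of the patch):

```cpp
#include <cstdio>

static int  debugmode = 0;   // -1 = hide all, 0 = normal, 1 = showall
static bool quiet     = false;

// Hypothetical helper showing the combined gate used throughout the patch:
// debug output requires debugmode==1 AND the current request not being quiet.
static void debug_log(const char * msg)
{
    if (debugmode == 1 && !quiet) { printf("%s", msg); }
}

int main()
{
    debugmode = 1;
    debug_log("printed: debug on, not quiet\n");
    quiet = true;
    debug_log("suppressed: quiet wins even in debug mode\n");
    return 0;
}
```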
@@ -930,12 +931,12 @@ void sample_xtc(llama_token_data_array * candidates, float xtc_threshold, float
 
     if (last_idx>1) // if there are 2 or more viable candidates
     {
-        if (debugmode==1) {
+        if (debugmode==1 && !quiet) {
             printf("XTC penalties [");
         }
         // then remove all other tokens above threshold EXCEPT the least likely one
         for (size_t i = 0; i < last_idx - 1; ++i) {
-            if (debugmode==1)
+            if (debugmode==1 && !quiet)
             {
                 gpt_vocab::id token = candidates->data[i].id;
                 std::string tokenizedstr = FileFormatTokenizeID(token, file_format);
@@ -944,7 +945,7 @@ void sample_xtc(llama_token_data_array * candidates, float xtc_threshold, float
             }
             candidates->data[i].logit -= 999.0f; // infinity gets wonky results downstream, this hack works well enough
         }
-        if (debugmode==1) {
+        if (debugmode==1 && !quiet) {
            printf("]\n");
         }
         candidates->sorted = false;
@@ -1133,7 +1134,7 @@ void sample_dry(int n_ctx, int penalty_range, float penalty_multiplier, float pe
         max_exponent = FLOAT_MAX_LOG / std::log(penalty_base);
     }
 
-    if (debugmode==1 && !dry_max_token_repeat.empty()) {
+    if (debugmode==1 && !quiet && !dry_max_token_repeat.empty()) {
        printf("DRY penalties [");
     }
     size_t count = 0;
@@ -1144,7 +1145,7 @@ void sample_dry(int n_ctx, int penalty_range, float penalty_multiplier, float pe
             repeat_exp = max_exponent;
         }
         float penalty = penalty_multiplier * pow(penalty_base, repeat_exp);
-        if (debugmode==1)
+        if (debugmode==1 && !quiet)
         {
             std::string tokenizedstr = FileFormatTokenizeID(token, file_format);
             ::utreplace(tokenizedstr, "\n", "\\n");
@@ -1157,7 +1158,7 @@ void sample_dry(int n_ctx, int penalty_range, float penalty_multiplier, float pe
     {
         candidates->sorted = false;
     }
-    if (debugmode==1 && !dry_max_token_repeat.empty()) {
+    if (debugmode==1 && !quiet && !dry_max_token_repeat.empty()) {
         printf("]\n");
     }
 }
@@ -1688,7 +1689,7 @@ static void load_grammar(const std::string & gammarstr)
         printf("\nIgnored invalid grammar sampler.");
         return;
     }
-    if (debugmode==1)
+    if (debugmode==1 && !quiet)
    {
         parsed_grammar.print(stderr);
     }
@@ -1831,7 +1832,7 @@ static float CalcGradientAIRopeFreqBase(float original_rope_base, int n_ctx_trai
     float chi_ctx_value = (n_ctx_desired * ctx_multiplier) / 6.28318;
     float gradient_ai_rope_freq_base_value = powf(original_rope_base, log10f(chi_ctx_value) / log10f(chi_ctx_train_value));
 
-    if (debugmode==1)
+    if (debugmode==1 && !quiet)
     {
         printf("Trained max context length (value:%.d).\n", n_ctx_train);
         printf("Desired context length (value:%.d).\n", n_ctx_desired);
@@ -1848,7 +1849,7 @@ static float CalcGradientAIRopeFreqBase(float original_rope_base, int n_ctx_trai
     {
         float extended_rope_positive_offset_value = 1 + ((log10f(chi_ctx_value) - log10f(chi_ctx_train_value)) / ((log10f(chi_ctx_value) * log10f(chi_ctx_train_value)) - (log10f(chi_ctx_value) + log10f(chi_ctx_train_value))));
         float rope_freq_base_with_positive_offset = gradient_ai_rope_freq_base_value * extended_rope_positive_offset_value;
-        if (debugmode==1)
+        if (debugmode==1 && !quiet)
         {
             printf("Extended RoPE Positive Offset (multiplicator) for Solar based models. (value:%.3f).\n", extended_rope_positive_offset_value);
             printf("RoPE base calculated via Gradient AI formula for Solar based models. (value:%.1f).\n", rope_freq_base_with_positive_offset);
@@ -2679,13 +2680,13 @@ std::vector<int> gpttype_get_token_arr(const std::string & input, bool addbos)
         printf("\nWarning: KCPP text generation not initialized!\n");
         return toks;
     }
-    if (debugmode==1)
+    if (debugmode==1 && !quiet)
     {
         printf("\nFileFormat: %d, Tokenizing: %s",file_format,input.c_str());
     }
     TokenizeString(input, toks, file_format,addbos);
     int tokcount = toks.size();
-    if (debugmode==1)
+    if (debugmode==1 && !quiet)
     {
         printf("\nTokens Counted: %d\n",tokcount);
     }
@@ -2770,6 +2771,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
         llama_perf_context_reset(llama_ctx_v4);
     }
 
+    quiet = inputs.quiet;
     generation_finished = false; // Set current generation status
     generated_tokens.clear(); // New Generation, new tokens
     delayed_generated_tokens.clear();
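Because `quiet` is a file-scope flag latched from `inputs.quiet` at the top of each generation call, every later `debugmode==1 && !quiet` check in the same request sees the caller's setting; this assumes generations are serialized, which is what a single static flag implies. A compilable sketch of the latch pattern (struct and function names here are illustrative, not the real API):

```cpp
#include <cstdio>

static bool quiet = false;  // file-scope: one in-flight request at a time assumed

// Illustrative stand-in for the real generation_inputs struct.
struct inputs_sketch { bool quiet; };

static void begin_generation(const inputs_sketch & inputs)
{
    quiet = inputs.quiet;  // latch once, before any debug print can run
}

int main()
{
    begin_generation({ /*quiet=*/true });
    printf("quiet is now %d for the rest of this request\n", quiet);
    return 0;
}
```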
@@ -2848,7 +2850,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
     banned_token_ids.clear();
     if(banned_tokens.size()>0)
     {
-        if (debugmode==1)
+        if (debugmode==1 && !quiet)
         {
             printf("\nBanning %zu single character sequences...",banned_tokens.size());
         }
@@ -2865,13 +2867,13 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
                 }
             }
         }
-        if (debugmode==1)
+        if (debugmode==1 && !quiet)
         {
             printf("\nBanned a total of %zu individual tokens.\n",banned_token_ids.size());
         }
     }
 
-    if (debugmode==1 && banned_phrases.size()>0)
+    if (debugmode==1 && !quiet && banned_phrases.size()>0)
     {
         printf("\nBanned a total of %zu phrases, with max token count of %d.\n",banned_phrases.size(),delayed_generated_tokens_limit);
     }
@@ -2916,7 +2918,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
             // images have changed. swap identifiers to force reprocessing
             current_llava_identifier = (current_llava_identifier==LLAVA_TOKEN_IDENTIFIER_A?LLAVA_TOKEN_IDENTIFIER_B:LLAVA_TOKEN_IDENTIFIER_A);
             llava_composite_image_signature = new_llava_composite;
-            if (debugmode==1)
+            if (debugmode==1 && !quiet)
             {
                 printf("\nLLAVA images changed, existing cache invalidated");
             }
@@ -2972,7 +2974,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
         const int MAX_CHAR_LEN = 40;
         const int MAX_SEQ_LEN = 20;
 
-        if (debugmode == 1)
+        if (debugmode == 1 && !quiet)
         {
             printf("\nProcessing %zu dry break strings...", kcpp_data->dry_sequence_breakers.size());
         }
@@ -2984,7 +2986,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
             }
             GetOverlappingTokenSequences(sequence_break, dry_sequence_breakers, MAX_SEQ_LEN);
         }
-        if (debugmode == 1)
+        if (debugmode == 1 && !quiet)
         {
             int trivial = 0, non_trivial = 0;
             for (const auto &seq : dry_sequence_breakers)
@@ -3004,9 +3006,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
     }
 
     bool stream_sse = inputs.stream_sse;
-
-    bool allow_regular_prints = (debugmode!=-1 && !inputs.quiet) || debugmode >= 1;
-
+    bool allow_regular_prints = (!quiet && debugmode!=-1);
 
     std::string grammarstr = inputs.grammar;
     bool grammar_retain_state = inputs.grammar_retain_state;
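The hunk above changes behavior rather than just threading the flag through: the old expression let `debugmode >= 1` force regular prints even for quiet requests, while the new one lets `quiet` win unconditionally. A small standalone comparison of the two expressions over all (debugmode, quiet) pairs (nothing assumed beyond the two lines in the diff):

```cpp
#include <cstdio>

int main()
{
    // Evaluate the old and new gating for every (debugmode, quiet) pair.
    for (int debugmode = -1; debugmode <= 1; ++debugmode)
    {
        for (int q = 0; q <= 1; ++q)
        {
            bool quiet  = (q != 0);
            bool before = (debugmode != -1 && !quiet) || debugmode >= 1;
            bool after  = (!quiet && debugmode != -1);
            printf("debugmode=%2d quiet=%d : before=%d after=%d\n",
                   debugmode, quiet, before, after);
        }
    }
    return 0;  // the two expressions differ only when quiet && debugmode==1
}
```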
@@ -3039,7 +3039,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
     if (kcpp_data->seed <= 0 || kcpp_data->seed==0xFFFFFFFF)
     {
         kcpp_data->seed = (((uint32_t)time(NULL)) % 1000000u);
-        if (debugmode==1)
+        if (debugmode==1 && !quiet)
         {
             printf("\nUsing Seed: %d",kcpp_data->seed);
         }
@@ -3071,15 +3071,15 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
             }
             else
             {
-                if (debugmode==1)
+                if (debugmode==1 && !quiet)
                 {
                     printf("\nCreating clip image embed...");
                 }
                 llava_images[i].clp_image_tokens = 0;
                 if (!llava_image_embed_make_with_clip_img(clp_ctx, kcpp_data->n_threads, clp_img_data, &llava_images[i].clp_img_embd, &llava_images[i].clp_image_tokens)) {
                     printf("\nError: Clip image %d failed to create embd!",i);
                 }
-                if (debugmode==1)
+                if (debugmode==1 && !quiet)
                 {
                     printf("\nLLAVA Clip Embed %i used Tokens: %d",i,llava_images[i].clp_image_tokens);
                 }
@@ -3202,7 +3202,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
     std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);
     n_past = 0;
 
-    if (debugmode==1)
+    if (debugmode==1 && !quiet)
     {
         std::string outstr = "";
         printf("\n\n[Debug: Dump Raw Input Tokens, format: %d]\n", file_format);
@@ -3347,7 +3347,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
         printf("\n");
     }
 
-    if (debugmode==1)
+    if (debugmode==1 && !quiet)
     {
         std::string outstr = "";
         printf("\n[Debug: Dump Forwarded Input Tokens, format: %d]\n", file_format);
@@ -3396,7 +3396,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
             draft_used = true;
             draft_results = speculative_decoding_eval_chunk(draft_ctx, llama_ctx_v4, embd, n_vocab, n_past);
             evalres = draft_results.draft_success;
-            if (debugmode==1)
+            if (debugmode==1 && !quiet)
             {
                 std::string draftedtoks = get_tok_vec_str(draft_results.draftids);
                 printf("\nDrafted %d Tokens: [%s]\n",speculative_chunk_amt,draftedtoks.c_str());
@@ -3599,7 +3599,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
             if (draft_used)
             {
                 int32_t draftedid = draft_results.draftids[logits_sampled];
-                if (debugmode==1)
+                if (debugmode==1 && !quiet)
                 {
                     std::string drafttok = FileFormatTokenizeID(draftedid, file_format, true);
                     std::string realtok = FileFormatTokenizeID(id, file_format, true);
@@ -3652,7 +3652,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
             {
                 printf("\rGenerating (%d / %d tokens)", (kcpp_data->n_predict - remaining_tokens), kcpp_data->n_predict);
             }
-            if (debugmode==1 && top_picks_history.size()>0)
+            if (debugmode==1 && !quiet && top_picks_history.size()>0)
             {
                 printf(" [");
                 bool firstloop = true;
@@ -3904,7 +3904,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
         delayed_generated_tokens.pop_front();
     }
 
-    if (debugmode==1 && file_format == FileFormat::GGUF_GENERIC)
+    if (debugmode==1 && !quiet && file_format == FileFormat::GGUF_GENERIC)
     {
         printf("\n");
         llama_perf_context_print(llama_ctx_v4);