@@ -106,6 +106,7 @@ static kcpp_params * kcpp_data = nullptr;
 static int max_context_limit_at_load = 0;
 static int n_past = 0;
 static int debugmode = 0; // -1 = hide all, 0 = normal, 1 = showall
+static bool quiet = false;
 static std::vector<gpt_vocab::id> last_n_tokens;
 static std::vector<gpt_vocab::id> current_context_tokens;
 static size_t mem_per_token = 0;
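Note on semantics: `debugmode` is tri-state (-1 hides all output, 0 is normal, 1 shows all), and the new `quiet` flag suppresses per-request debug output on top of that. A minimal self-contained sketch of the gate this patch applies inline at every debug print site (the `debug_log` helper is hypothetical, not part of the patch):

```cpp
#include <cstdio>

static int  debugmode = 0;   // -1 = hide all, 0 = normal, 1 = showall
static bool quiet     = false;

// Hypothetical helper showing the combined gate used throughout the patch:
// debug output requires debugmode==1 AND the current request not being quiet.
static void debug_log(const char * msg)
{
    if (debugmode == 1 && !quiet) { printf("%s", msg); }
}

int main()
{
    debugmode = 1;
    debug_log("printed: debug on, not quiet\n");
    quiet = true;
    debug_log("suppressed: quiet wins even in debug mode\n");
    return 0;
}
```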
@@ -930,12 +931,12 @@ void sample_xtc(llama_token_data_array * candidates, float xtc_threshold, float
 
     if (last_idx>1) // if there are 2 or more viable candidates
     {
-        if (debugmode==1) {
+        if (debugmode==1 && !quiet) {
             printf("XTC penalties [");
         }
         // then remove all other tokens above threshold EXCEPT the least likely one
         for (size_t i = 0; i < last_idx - 1; ++i) {
-            if (debugmode==1)
+            if (debugmode==1 && !quiet)
             {
                 gpt_vocab::id token = candidates->data[i].id;
                 std::string tokenizedstr = FileFormatTokenizeID(token, file_format);
@@ -944,7 +945,7 @@ void sample_xtc(llama_token_data_array * candidates, float xtc_threshold, float
             }
             candidates->data[i].logit -= 999.0f; // infinity gets wonky results downstream, this hack works well enough
         }
-        if (debugmode==1) {
+        if (debugmode==1 && !quiet) {
            printf("]\n");
         }
         candidates->sorted = false;
@@ -1133,7 +1134,7 @@ void sample_dry(int n_ctx, int penalty_range, float penalty_multiplier, float pe
         max_exponent = FLOAT_MAX_LOG / std::log(penalty_base);
     }
 
-    if (debugmode==1 && !dry_max_token_repeat.empty()) {
+    if (debugmode==1 && !quiet && !dry_max_token_repeat.empty()) {
        printf("DRY penalties [");
     }
     size_t count = 0;
@@ -1144,7 +1145,7 @@ void sample_dry(int n_ctx, int penalty_range, float penalty_multiplier, float pe
             repeat_exp = max_exponent;
         }
         float penalty = penalty_multiplier * pow(penalty_base, repeat_exp);
-        if (debugmode==1)
+        if (debugmode==1 && !quiet)
         {
             std::string tokenizedstr = FileFormatTokenizeID(token, file_format);
             ::utreplace(tokenizedstr, "\n", "\\n");
@@ -1157,7 +1158,7 @@ void sample_dry(int n_ctx, int penalty_range, float penalty_multiplier, float pe
     {
         candidates->sorted = false;
     }
-    if (debugmode==1 && !dry_max_token_repeat.empty()) {
+    if (debugmode==1 && !quiet && !dry_max_token_repeat.empty()) {
         printf("]\n");
     }
 }
@@ -1688,7 +1689,7 @@ static void load_grammar(const std::string & gammarstr)
         printf("\nIgnored invalid grammar sampler.");
         return;
     }
-    if (debugmode==1)
+    if (debugmode==1 && !quiet)
    {
         parsed_grammar.print(stderr);
     }
@@ -1831,7 +1832,7 @@ static float CalcGradientAIRopeFreqBase(float original_rope_base, int n_ctx_trai
     float chi_ctx_value = (n_ctx_desired * ctx_multiplier) / 6.28318;
     float gradient_ai_rope_freq_base_value = powf(original_rope_base, log10f(chi_ctx_value) / log10f(chi_ctx_train_value));
 
-    if (debugmode==1)
+    if (debugmode==1 && !quiet)
     {
         printf("Trained max context length (value:%.d).\n", n_ctx_train);
         printf("Desired context length (value:%.d).\n", n_ctx_desired);
@@ -1848,7 +1849,7 @@ static float CalcGradientAIRopeFreqBase(float original_rope_base, int n_ctx_trai
     {
         float extended_rope_positive_offset_value = 1 + ((log10f(chi_ctx_value) - log10f(chi_ctx_train_value)) / ((log10f(chi_ctx_value) * log10f(chi_ctx_train_value)) - (log10f(chi_ctx_value) + log10f(chi_ctx_train_value))));
         float rope_freq_base_with_positive_offset = gradient_ai_rope_freq_base_value * extended_rope_positive_offset_value;
-        if (debugmode==1)
+        if (debugmode==1 && !quiet)
         {
             printf("Extended RoPE Positive Offset (multiplicator) for Solar based models. (value:%.3f).\n", extended_rope_positive_offset_value);
             printf("RoPE base calculated via Gradient AI formula for Solar based models. (value:%.1f).\n", rope_freq_base_with_positive_offset);
@@ -2679,13 +2680,13 @@ std::vector<int> gpttype_get_token_arr(const std::string & input, bool addbos)
         printf("\nWarning: KCPP text generation not initialized!\n");
         return toks;
     }
-    if (debugmode==1)
+    if (debugmode==1 && !quiet)
     {
         printf("\nFileFormat: %d, Tokenizing: %s",file_format,input.c_str());
     }
     TokenizeString(input, toks, file_format,addbos);
     int tokcount = toks.size();
-    if (debugmode==1)
+    if (debugmode==1 && !quiet)
     {
         printf("\nTokens Counted: %d\n",tokcount);
     }
@@ -2770,6 +2771,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
         llama_perf_context_reset(llama_ctx_v4);
     }
 
+    quiet = inputs.quiet;
     generation_finished = false; // Set current generation status
     generated_tokens.clear(); // New Generation, new tokens
     delayed_generated_tokens.clear();
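Because `quiet` is a file-scope flag latched from `inputs.quiet` at the top of each generation call, every later `debugmode==1 && !quiet` check in the same request sees the caller's setting; this assumes generations are serialized, which is what a single static flag implies. A compilable sketch of the latch pattern (struct and function names here are illustrative, not the real API):

```cpp
#include <cstdio>

static bool quiet = false;  // file-scope: one in-flight request at a time assumed

// Illustrative stand-in for the real generation_inputs struct.
struct inputs_sketch { bool quiet; };

static void begin_generation(const inputs_sketch & inputs)
{
    quiet = inputs.quiet;  // latch once, before any debug print can run
}

int main()
{
    begin_generation({ /*quiet=*/true });
    printf("quiet is now %d for the rest of this request\n", quiet);
    return 0;
}
```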
@@ -2848,7 +2850,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
     banned_token_ids.clear();
     if(banned_tokens.size()>0)
     {
-        if (debugmode==1)
+        if (debugmode==1 && !quiet)
         {
             printf("\nBanning %zu single character sequences...",banned_tokens.size());
         }
@@ -2865,13 +2867,13 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
                 }
             }
         }
-        if (debugmode==1)
+        if (debugmode==1 && !quiet)
         {
             printf("\nBanned a total of %zu individual tokens.\n",banned_token_ids.size());
         }
     }
 
-    if (debugmode==1 && banned_phrases.size()>0)
+    if (debugmode==1 && !quiet && banned_phrases.size()>0)
     {
         printf("\nBanned a total of %zu phrases, with max token count of %d.\n",banned_phrases.size(),delayed_generated_tokens_limit);
     }
@@ -2916,7 +2918,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
             // images have changed. swap identifiers to force reprocessing
             current_llava_identifier = (current_llava_identifier==LLAVA_TOKEN_IDENTIFIER_A?LLAVA_TOKEN_IDENTIFIER_B:LLAVA_TOKEN_IDENTIFIER_A);
             llava_composite_image_signature = new_llava_composite;
-            if (debugmode==1)
+            if (debugmode==1 && !quiet)
             {
                 printf("\nLLAVA images changed, existing cache invalidated");
             }
@@ -2972,7 +2974,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
         const int MAX_CHAR_LEN = 40;
         const int MAX_SEQ_LEN = 20;
 
-        if (debugmode == 1)
+        if (debugmode == 1 && !quiet)
         {
             printf("\nProcessing %zu dry break strings...", kcpp_data->dry_sequence_breakers.size());
         }
@@ -2984,7 +2986,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
             }
             GetOverlappingTokenSequences(sequence_break, dry_sequence_breakers, MAX_SEQ_LEN);
         }
-        if (debugmode == 1)
+        if (debugmode == 1 && !quiet)
         {
             int trivial = 0, non_trivial = 0;
             for (const auto &seq : dry_sequence_breakers)
@@ -3004,9 +3006,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
     }
 
     bool stream_sse = inputs.stream_sse;
-
-    bool allow_regular_prints = (debugmode!=-1 && !inputs.quiet) || debugmode >= 1;
-
+    bool allow_regular_prints = (!quiet && debugmode!=-1);
 
     std::string grammarstr = inputs.grammar;
     bool grammar_retain_state = inputs.grammar_retain_state;
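The hunk above changes behavior rather than just threading the flag through: the old expression let `debugmode >= 1` force regular prints even for quiet requests, while the new one lets `quiet` win unconditionally. A small standalone comparison of the two expressions over all (debugmode, quiet) pairs (nothing assumed beyond the two lines in the diff):

```cpp
#include <cstdio>

int main()
{
    // Evaluate the old and new gating for every (debugmode, quiet) pair.
    for (int debugmode = -1; debugmode <= 1; ++debugmode)
    {
        for (int q = 0; q <= 1; ++q)
        {
            bool quiet  = (q != 0);
            bool before = (debugmode != -1 && !quiet) || debugmode >= 1;
            bool after  = (!quiet && debugmode != -1);
            printf("debugmode=%2d quiet=%d : before=%d after=%d\n",
                   debugmode, quiet, before, after);
        }
    }
    return 0;  // the two expressions differ only when quiet && debugmode==1
}
```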
@@ -3039,7 +3039,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
     if (kcpp_data->seed <= 0 || kcpp_data->seed==0xFFFFFFFF)
     {
         kcpp_data->seed = (((uint32_t)time(NULL)) % 1000000u);
-        if (debugmode==1)
+        if (debugmode==1 && !quiet)
         {
             printf("\nUsing Seed: %d",kcpp_data->seed);
         }
@@ -3071,15 +3071,15 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
             }
             else
             {
-                if (debugmode==1)
+                if (debugmode==1 && !quiet)
                 {
                     printf("\nCreating clip image embed...");
                 }
                 llava_images[i].clp_image_tokens = 0;
                 if (!llava_image_embed_make_with_clip_img(clp_ctx, kcpp_data->n_threads, clp_img_data, &llava_images[i].clp_img_embd, &llava_images[i].clp_image_tokens)) {
                     printf("\nError: Clip image %d failed to create embd!",i);
                 }
-                if (debugmode==1)
+                if (debugmode==1 && !quiet)
                 {
                     printf("\nLLAVA Clip Embed %i used Tokens: %d",i,llava_images[i].clp_image_tokens);
                 }
@@ -3202,7 +3202,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
     std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);
     n_past = 0;
 
-    if (debugmode==1)
+    if (debugmode==1 && !quiet)
     {
         std::string outstr = "";
         printf("\n\n[Debug: Dump Raw Input Tokens, format: %d]\n", file_format);
@@ -3347,7 +3347,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
         printf("\n");
     }
 
-    if (debugmode==1)
+    if (debugmode==1 && !quiet)
     {
         std::string outstr = "";
         printf("\n[Debug: Dump Forwarded Input Tokens, format: %d]\n", file_format);
@@ -3396,7 +3396,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
             draft_used = true;
             draft_results = speculative_decoding_eval_chunk(draft_ctx, llama_ctx_v4, embd, n_vocab, n_past);
             evalres = draft_results.draft_success;
-            if (debugmode==1)
+            if (debugmode==1 && !quiet)
             {
                 std::string draftedtoks = get_tok_vec_str(draft_results.draftids);
                 printf("\nDrafted %d Tokens: [%s]\n",speculative_chunk_amt,draftedtoks.c_str());
@@ -3599,7 +3599,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
             if (draft_used)
             {
                 int32_t draftedid = draft_results.draftids[logits_sampled];
-                if (debugmode==1)
+                if (debugmode==1 && !quiet)
                 {
                     std::string drafttok = FileFormatTokenizeID(draftedid, file_format, true);
                     std::string realtok = FileFormatTokenizeID(id, file_format, true);
@@ -3652,7 +3652,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
             {
                 printf("\rGenerating (%d / %d tokens)", (kcpp_data->n_predict - remaining_tokens), kcpp_data->n_predict);
             }
-            if (debugmode==1 && top_picks_history.size()>0)
+            if (debugmode==1 && !quiet && top_picks_history.size()>0)
             {
                 printf(" [");
                 bool firstloop = true;
@@ -3904,7 +3904,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
         delayed_generated_tokens.pop_front();
     }
 
-    if (debugmode==1 && file_format == FileFormat::GGUF_GENERIC)
+    if (debugmode==1 && !quiet && file_format == FileFormat::GGUF_GENERIC)
     {
         printf("\n");
         llama_perf_context_print(llama_ctx_v4);