@@ -52,7 +52,9 @@ Runner::Runner(
           {kMaxContextLen, 128},
           {kUseKVCache, true},
           {kUseSDPAWithKVCache, false},
-      }) {
+      }),
+      // @lint-ignore CLANGTIDY facebook-hte-Deprecated
+      stats_(std::make_unique<llm::Stats>()) {
   if (data_path.has_value()) {
     module_ = std::make_unique<Module>(
         model_path, data_path.value(), Module::LoadMode::File);
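This hunk moves `stats_` from a value member to a heap-allocated object created once in the constructor's initializer list. The header declaring the member is not part of this diff, so the sketch below uses placeholder `Stats`/`Runner` types to illustrate the assumed pattern rather than the real classes:

```cpp
#include <memory>

// Minimal sketch of the ownership change this hunk implies. "Stats" and
// "Runner" are stand-ins for llm::Stats and the real Runner, whose
// declarations live in headers not shown here.
struct Stats {
  long model_load_start_ms = 0;
  void reset() { model_load_start_ms = 0; }
};

class Runner {
 public:
  // Before: Stats stats_;                 value member, accessed with '.'
  // After:  std::unique_ptr<Stats> stats_ heap-allocated in the init list,
  //                                       accessed with '->'
  Runner() : stats_(std::make_unique<Stats>()) {}

 private:
  std::unique_ptr<Stats> stats_;
};
```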
@@ -99,6 +101,7 @@ Error Runner::load() {
         "Failed to load %s as a Tiktoken artifact, trying BPE tokenizer",
         tokenizer_path_.c_str());
     tokenizer_.reset();
+    // @lint-ignore CLANGTIDY facebook-hte-Deprecated
     tokenizer_ = std::make_unique<::tokenizers::Llama2cTokenizer>();
     err = tokenizer_->load(tokenizer_path_);
     ET_CHECK_TK_OK_OR_RETURN_ERROR(
@@ -156,7 +159,7 @@ Error Runner::load() {
       text_decoder_runner_.get(),
       metadata_.at(kUseKVCache),
       std::move(eos_ids),
-      &stats_);
+      stats_.get());

   return Error::Ok;
 }
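The component constructed in this hunk evidently keeps a raw, non-owning `Stats*`, so the call site only changes from taking the address of the value member to handing out the `unique_ptr`'s raw pointer via `.get()`; ownership and lifetime stay with the Runner. A small illustrative sketch, where `TokenGeneratorLike` is a placeholder and not the real class:

```cpp
#include <memory>

struct Stats { /* timing fields elided */ };

// Hypothetical consumer standing in for the component built in this hunk;
// it stores a non-owning Stats* exactly as it did before the refactor.
struct TokenGeneratorLike {
  explicit TokenGeneratorLike(Stats* stats) : stats_(stats) {}
  Stats* stats_;  // not owned
};

int main() {
  auto stats = std::make_unique<Stats>();
  // Before: the address of a value member was passed (&stats_).
  // After:  the raw pointer inside the unique_ptr is passed; the caller
  //         must simply outlive the consumer, which the Runner does.
  TokenGeneratorLike generator(stats.get());
  (void)generator;
}
```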
@@ -178,9 +181,9 @@ Error Runner::generate(
   // Use ones-initialized inputs.
   ET_CHECK_MSG(!prompt.empty(), "Prompt cannot be null");
   if (!is_loaded()) {
-    stats_.model_load_start_ms = llm::time_in_ms();
+    stats_->model_load_start_ms = llm::time_in_ms();
     ET_CHECK_OK_OR_RETURN_ERROR(load());
-    stats_.model_load_end_ms = llm::time_in_ms();
+    stats_->model_load_end_ms = llm::time_in_ms();
   }

   if (config.warming) {
@@ -206,7 +209,7 @@ Error Runner::generate(
   // First token time only measures the time it takes to encode the prompt and
   // return a response token.

-  stats_.inference_start_ms = llm::time_in_ms();
+  stats_->inference_start_ms = llm::time_in_ms();
   shouldStop_ = false;

   ::tokenizers::Result<std::vector<uint64_t>> encode_res = tokenizer_->encode(
@@ -247,8 +250,8 @@ Error Runner::generate(
   auto prefill_res = text_prefiller_->prefill(prompt_tokens, pos);
   ET_CHECK_OK_OR_RETURN_ERROR(prefill_res.error());
   uint64_t cur_token = prefill_res.get();
-  stats_.first_token_ms = llm::time_in_ms();
-  stats_.prompt_eval_end_ms = llm::time_in_ms();
+  stats_->first_token_ms = llm::time_in_ms();
+  stats_->prompt_eval_end_ms = llm::time_in_ms();

   // print the first token from prefill. No prev_token so use cur_token for it.
   wrapped_callback(
@@ -269,7 +272,7 @@ Error Runner::generate(
       temperature_ == -1.0f ? config.temperature : temperature_,
       wrapped_callback));

-  stats_.inference_end_ms = llm::time_in_ms();
+  stats_->inference_end_ms = llm::time_in_ms();
   if (!config.warming) {
     printf("\n");
   }
@@ -282,17 +285,17 @@ Error Runner::generate(
     RUNNER_ET_LOG(config.warming, "Max new tokens %i reached!", max_new_tokens);
   }

-  stats_.num_prompt_tokens = num_prompt_tokens;
-  stats_.num_generated_tokens = num_generated_tokens;
+  stats_->num_prompt_tokens = num_prompt_tokens;
+  stats_->num_generated_tokens = num_generated_tokens;

   if (config.warming) {
     ET_LOG(Info, "Warmup run finished!");
   } else {
     // Do not print report during warmup
-    ::executorch::llm::print_report(stats_);
+    ::executorch::llm::print_report(*stats_);
   }
   if (stats_callback) {
-    stats_callback(stats_);
+    stats_callback(*stats_);
   }

   return Error::Ok;
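`print_report` and the user-supplied `stats_callback` apparently take the `Stats` object itself rather than a pointer, so these call sites dereference the `unique_ptr` instead of changing the consumers' signatures. A minimal illustration with hypothetical stand-in signatures, assumed to accept `Stats` by const reference:

```cpp
#include <functional>
#include <memory>

struct Stats { int num_generated_tokens = 0; };

// Stand-in for print_report; the real signature is assumed to take
// the stats by (const) reference rather than by pointer.
void print_report_like(const Stats& s) { (void)s; }

int main() {
  auto stats = std::make_unique<Stats>();
  std::function<void(const Stats&)> callback = print_report_like;

  // With a pointer member, the call sites dereference before passing:
  //   print_report(*stats_);  stats_callback(*stats_);
  print_report_like(*stats);
  callback(*stats);
}
```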
@@ -307,7 +310,7 @@ Error Runner::warmup(const std::string& prompt, int32_t max_new_tokens) {
   Error err = generate(prompt, config);

   // Reset stats after warmup
-  stats_.reset();
+  stats_->reset();
   return err;
 }

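One subtlety in this last hunk now that the member is a `std::unique_ptr`: `stats_.reset()` would call `std::unique_ptr::reset()` and destroy the managed `Stats` object, whereas `stats_->reset()` forwards to `Stats::reset()` and only clears the counters. A short sketch of the distinction, using a placeholder `Stats` type:

```cpp
#include <cassert>
#include <memory>

struct Stats {
  int num_generated_tokens = 0;
  void reset() { num_generated_tokens = 0; }  // clears counters only
};

int main() {
  auto stats = std::make_unique<Stats>();
  stats->num_generated_tokens = 42;

  stats->reset();            // Stats::reset(): object survives, fields cleared
  assert(stats != nullptr && stats->num_generated_tokens == 0);

  stats.reset();             // unique_ptr::reset(): destroys the Stats object
  assert(stats == nullptr);  // a later '->' access would be undefined behavior
}
```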