@@ -52,7 +52,9 @@ Runner::Runner(
5252 {kMaxContextLen , 128 },
5353 {kUseKVCache , true },
5454 {kUseSDPAWithKVCache , false },
55- }) {
55+ }),
56+ // @lint-ignore CLANGTIDY facebook-hte-Deprecated
57+ stats_(std::make_unique<llm::Stats>()) {
5658 if (data_path.has_value ()) {
5759 module_ = std::make_unique<Module>(
5860 model_path, data_path.value (), Module::LoadMode::File);
@@ -99,6 +101,7 @@ Error Runner::load() {
99101 " Failed to load %s as a Tiktoken artifact, trying BPE tokenizer" ,
100102 tokenizer_path_.c_str ());
101103 tokenizer_.reset ();
104+ // @lint-ignore CLANGTIDY facebook-hte-Deprecated
102105 tokenizer_ = std::make_unique<::tokenizers::Llama2cTokenizer>();
103106 err = tokenizer_->load (tokenizer_path_);
104107 ET_CHECK_TK_OK_OR_RETURN_ERROR (
@@ -156,7 +159,7 @@ Error Runner::load() {
156159 text_decoder_runner_.get (),
157160 metadata_.at (kUseKVCache ),
158161 std::move (eos_ids),
159- & stats_);
162+ stats_. get () );
160163
161164 return Error::Ok;
162165}
@@ -178,9 +181,9 @@ Error Runner::generate(
178181 // Use ones-initialized inputs.
179182 ET_CHECK_MSG (!prompt.empty (), " Prompt cannot be null" );
180183 if (!is_loaded ()) {
181- stats_. model_load_start_ms = llm::time_in_ms ();
184+ stats_-> model_load_start_ms = llm::time_in_ms ();
182185 ET_CHECK_OK_OR_RETURN_ERROR (load ());
183- stats_. model_load_end_ms = llm::time_in_ms ();
186+ stats_-> model_load_end_ms = llm::time_in_ms ();
184187 }
185188
186189 if (config.warming ) {
@@ -206,7 +209,7 @@ Error Runner::generate(
206209 // First token time only measures the time it takes to encode the prompt and
207210 // return a response token.
208211
209- stats_. inference_start_ms = llm::time_in_ms ();
212+ stats_-> inference_start_ms = llm::time_in_ms ();
210213 shouldStop_ = false ;
211214
212215 ::tokenizers::Result<std::vector<uint64_t >> encode_res = tokenizer_->encode (
@@ -247,8 +250,8 @@ Error Runner::generate(
247250 auto prefill_res = text_prefiller_->prefill (prompt_tokens, pos);
248251 ET_CHECK_OK_OR_RETURN_ERROR (prefill_res.error ());
249252 uint64_t cur_token = prefill_res.get ();
250- stats_. first_token_ms = llm::time_in_ms ();
251- stats_. prompt_eval_end_ms = llm::time_in_ms ();
253+ stats_-> first_token_ms = llm::time_in_ms ();
254+ stats_-> prompt_eval_end_ms = llm::time_in_ms ();
252255
253256 // print the first token from prefill. No prev_token so use cur_token for it.
254257 wrapped_callback (
@@ -269,7 +272,7 @@ Error Runner::generate(
269272 temperature_ == -1 .0f ? config.temperature : temperature_,
270273 wrapped_callback));
271274
272- stats_. inference_end_ms = llm::time_in_ms ();
275+ stats_-> inference_end_ms = llm::time_in_ms ();
273276 if (!config.warming ) {
274277 printf (" \n " );
275278 }
@@ -282,17 +285,17 @@ Error Runner::generate(
282285 RUNNER_ET_LOG (config.warming , " Max new tokens %i reached!" , max_new_tokens);
283286 }
284287
285- stats_. num_prompt_tokens = num_prompt_tokens;
286- stats_. num_generated_tokens = num_generated_tokens;
288+ stats_-> num_prompt_tokens = num_prompt_tokens;
289+ stats_-> num_generated_tokens = num_generated_tokens;
287290
288291 if (config.warming ) {
289292 ET_LOG (Info, " Warmup run finished!" );
290293 } else {
291294 // Do not print report during warmup
292- ::executorch::llm::print_report (stats_);
295+ ::executorch::llm::print_report (* stats_);
293296 }
294297 if (stats_callback) {
295- stats_callback (stats_);
298+ stats_callback (* stats_);
296299 }
297300
298301 return Error::Ok;
@@ -307,7 +310,7 @@ Error Runner::warmup(const std::string& prompt, int32_t max_new_tokens) {
307310 Error err = generate (prompt, config);
308311
309312 // Reset stats after warmup
310- stats_. reset ();
313+ stats_-> reset ();
311314 return err;
312315}
313316
0 commit comments