 
 #include <executorch/examples/models/llama/runner/runner.h>
 
-#include <algorithm>
-#include <ctime>
-
 #include <executorch/extension/llm/runner/util.h>
 
 #include <executorch/examples/models/llama/tokenizer/llama_tiktoken.h>
@@ -35,129 +32,161 @@ static constexpr auto kMaxSeqLen = "get_max_seq_len";
 static constexpr auto kMaxContextLen = "get_max_context_len";
 static constexpr auto kVocabSize = "get_vocab_size";
 static constexpr auto kUseKVCache = "use_kv_cache";
-static constexpr auto kUseSDPAWithKVCache = "use_sdpa_with_kv_cache";
 } // namespace
 
-Runner::Runner(
+std::unique_ptr<Runner> Runner::create(
     const std::string& model_path,
     const std::string& tokenizer_path,
-    std::optional<const std::string> data_path)
-    // NOTE: we observed ~2x loading performance increase on iPhone 15
-    // and a ~5% improvement on Galaxy S22 by switching to
-    // FileDataLoader instead of MmapDataLoader + UseMlockIgnoreErrors.
-    : tokenizer_path_(tokenizer_path),
-      metadata_({
-          {kEnableDynamicShape, false},
-          {kMaxSeqLen, 128},
-          {kMaxContextLen, 128},
-          {kUseKVCache, true},
-          {kUseSDPAWithKVCache, false},
-      }) {
-  if (data_path.has_value()) {
-    module_ = std::make_unique<Module>(
-        model_path, data_path.value(), Module::LoadMode::File);
-  } else {
-    module_ = std::make_unique<Module>(model_path, Module::LoadMode::File);
-  }
+    std::optional<const std::string> data_path,
+    float temperature) {
   ET_LOG(
       Info,
       "Creating LLaMa runner: model_path=%s, tokenizer_path=%s",
       model_path.c_str(),
       tokenizer_path.c_str());
-}
 
-[[deprecated(
-    "This constructor is deprecated. Use the constructor without temperature parameter instead.")]]
-Runner::Runner(
-    const std::string& model_path,
-    const std::string& tokenizer_path,
-    const float temperature,
-    std::optional<const std::string> data_path)
-    : Runner(model_path, tokenizer_path, std::move(data_path)) {
-  temperature_ = temperature;
-}
+  // Create the Module
+  std::unique_ptr<Module> module;
+  if (data_path.has_value()) {
+    module = std::make_unique<Module>(
+        model_path, data_path.value(), Module::LoadMode::File);
+  } else {
+    module = std::make_unique<Module>(model_path, Module::LoadMode::File);
+  }
 
-bool Runner::is_loaded() const {
-  return module_->is_loaded() && tokenizer_ && text_decoder_runner_ &&
-      text_prefiller_ && text_token_generator_;
-}
+  // Initialize metadata with default values
+  std::unordered_map<std::string, int64_t> metadata({
+      {kEnableDynamicShape, false},
+      {kMaxSeqLen, 128},
+      {kMaxContextLen, 128},
+      {kUseKVCache, true},
+  });
 
-Error Runner::load() {
-  if (is_loaded()) {
-    return Error::Ok;
-  }
-  ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method("forward"));
-  // load tokenizer. Assuming tiktoken is the default tokenizer
-  tokenizer_ = nullptr;
-  tokenizer_ = get_tiktoken_for_llama();
-  ::tokenizers::Error err = tokenizer_->load(tokenizer_path_);
-  // Rely on tiktoken to throw error if the artifact is incompatible. Then we
-  // fallback to BPE tokenizer.
-  if (err != ::tokenizers::Error::Ok) {
+  // Create and load tokenizer
+  std::unique_ptr<::tokenizers::Tokenizer> tokenizer = get_tiktoken_for_llama();
+  ::tokenizers::Error tk_err = tokenizer->load(tokenizer_path);
+
+  // Fallback to BPE tokenizer if tiktoken fails
+  if (tk_err != ::tokenizers::Error::Ok) {
     ET_LOG(
         Info,
         "Failed to load %s as a Tiktoken artifact, trying BPE tokenizer",
-        tokenizer_path_.c_str());
-    tokenizer_.reset();
-    tokenizer_ = std::make_unique<::tokenizers::Llama2cTokenizer>();
-    err = tokenizer_->load(tokenizer_path_);
-    ET_CHECK_TK_OK_OR_RETURN_ERROR(
-        err,
-        "Failed to load %s as a llama2.c tokenizer artifact",
-        tokenizer_path_.c_str());
+        tokenizer_path.c_str());
+    tokenizer.reset();
+    tokenizer = std::make_unique<::tokenizers::Llama2cTokenizer>();
+    tk_err = tokenizer->load(tokenizer_path);
+    if (tk_err != ::tokenizers::Error::Ok) {
+      ET_LOG(
+          Error,
+          "Failed to load %s as a llama2.c tokenizer artifact",
+          tokenizer_path.c_str());
+      return nullptr;
+    }
   }
 
   ET_LOG(Info, "Reading metadata from model");
 
-  metadata_[kBosId] = tokenizer_->bos_tok();
+  // Set tokenizer-related metadata
+  metadata[kBosId] = tokenizer->bos_tok();
   auto eos_ids = std::make_unique<std::unordered_set<uint64_t>>(
-      std::unordered_set<uint64_t>{tokenizer_->eos_tok()});
-  metadata_[kVocabSize] = tokenizer_->vocab_size();
-
-  const auto method_names =
-      ET_UNWRAP(module_->method_names(), "Failed reading method names");
+      std::unordered_set<uint64_t>{tokenizer->eos_tok()});
+  metadata[kVocabSize] = tokenizer->vocab_size();
+
+  // Read metadata from the model
+  auto method_names_result = module->method_names();
+  if (method_names_result.error() != Error::Ok) {
+    ET_LOG(Error, "Failed reading method names");
+    return nullptr;
+  }
+  const auto method_names = method_names_result.get();
 
-  for (auto& pair : metadata_) {
+  for (auto& pair : metadata) {
     const auto& method_name = pair.first;
     auto& value = pair.second;
 
     if (method_names.count(method_name)) {
-      value = ET_UNWRAP(module_->get(method_name))
-                  .toScalar()
-                  .to<decltype(metadata_)::mapped_type>();
+      auto get_result = module->get(method_name);
+      value = get_result.get().toScalar().to<decltype(metadata)::mapped_type>();
     } else {
       ET_LOG(
           Info,
-          "Methond %s not found, using the default value %" PRId64,
+          "Method %s not found, using the default value %" PRId64,
           method_name.c_str(),
           value);
     }
     ET_LOG(Info, "Metadata: %s = %" PRId64, method_name.c_str(), value);
   }
+
+  // Get EOS IDs if available
   if (method_names.count(kEosIds)) {
     eos_ids->clear();
-    for (const auto& eos_id : ET_UNWRAP(module_->execute(kEosIds))) {
+    auto execute_result = module->execute(kEosIds);
+    if (execute_result.error() != Error::Ok) {
+      ET_LOG(Error, "Failed to execute %s", kEosIds);
+      return nullptr;
+    }
+    for (const auto& eos_id : execute_result.get()) {
       auto value = eos_id.toScalar().to<int64_t>();
       eos_ids->emplace(value);
       ET_LOG(Info, "eos_id = %" PRId64, value);
     }
   }
-  // @lint-ignore CLANGTIDY facebook-hte-Deprecated
-  text_decoder_runner_ = std::make_unique<llm::TextDecoderRunner>(
-      module_.get(), metadata_.at(kUseKVCache));
-  text_prefiller_ = std::make_unique<llm::TextPrefiller>(
-      text_decoder_runner_.get(),
-      metadata_.at(kUseKVCache),
-      metadata_.at(kEnableDynamicShape),
-      metadata_.at(kMaxSeqLen));
-
-  text_token_generator_ = std::make_unique<llm::TextTokenGenerator>(
-      tokenizer_.get(),
-      text_decoder_runner_.get(),
-      metadata_.at(kUseKVCache),
+
+  // Create text_decoder_runner
+  auto text_decoder_runner = std::make_unique<llm::TextDecoderRunner>(
+      module.get(), metadata.at(kUseKVCache));
+
+  // Create text_prefiller
+  auto text_prefiller = std::make_unique<llm::TextPrefiller>(
+      text_decoder_runner.get(),
+      metadata.at(kUseKVCache),
+      metadata.at(kEnableDynamicShape),
+      metadata.at(kMaxSeqLen));
+
+  // Create text_token_generator with stats
+  auto stats = new llm::Stats();
+  auto text_token_generator = std::make_unique<llm::TextTokenGenerator>(
+      tokenizer.get(),
+      text_decoder_runner.get(),
+      metadata.at(kUseKVCache),
       std::move(eos_ids),
-      &stats_);
+      stats);
+
+  // Create and return the Runner instance
+  return std::make_unique<Runner>(
+      std::move(metadata),
+      std::move(tokenizer),
+      std::move(text_prefiller),
+      std::move(text_token_generator),
+      temperature);
+}
 
+Runner::Runner(
+    std::unordered_map<std::string, int64_t> metadata,
+    std::unique_ptr<::tokenizers::Tokenizer> tokenizer,
+    std::unique_ptr<::executorch::extension::llm::TextPrefiller> text_prefiller,
+    std::unique_ptr<::executorch::extension::llm::TextTokenGenerator>
+        text_token_generator,
+    float temperature)
+    : tokenizer_(std::move(tokenizer)),
+      metadata_(std::move(metadata)),
+      text_prefiller_(std::move(text_prefiller)),
+      text_token_generator_(std::move(text_token_generator)),
+      temperature_(temperature) {
+  // Note: This constructor assumes that text_prefiller and text_token_generator
+  // already have references to the Module and TextDecoderRunner they need
+}
+
+bool Runner::is_loaded() const {
+  return text_prefiller_->is_loaded() && text_token_generator_->is_loaded();
+}
+
+Error Runner::load() {
+  if (is_loaded()) {
+    return Error::Ok;
+  }
+  ET_CHECK_OK_OR_RETURN_ERROR(text_prefiller_->load());
+  ET_CHECK_OK_OR_RETURN_ERROR(text_token_generator_->load());
   return Error::Ok;
 }
 
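For reviewers, a minimal sketch of how a caller migrates from the removed constructors to the factory introduced in this diff. The create() signature and the nullptr-on-failure behavior come from the changes above; the `example` namespace, the `executorch::runtime::Error` spelling, and the file paths are assumptions for illustration, not part of this change.

    #include <executorch/examples/models/llama/runner/runner.h>

    #include <memory>
    #include <optional>

    int main() {
      // create() now performs all fallible setup (tokenizer loading,
      // metadata reads) and returns nullptr on failure, instead of leaving
      // behind a partially initialized Runner as the old constructor could.
      std::unique_ptr<example::Runner> runner = example::Runner::create(
          "llama3.pte", // model_path (hypothetical)
          "tokenizer.model", // tokenizer_path (hypothetical)
          std::nullopt, // data_path
          0.8f); // temperature
      if (!runner) {
        return 1; // tokenizer or model metadata could not be loaded
      }
      // load() is now a thin wrapper over the prefiller and token
      // generator, both already wired to the Module inside create().
      if (runner->load() != executorch::runtime::Error::Ok) {
        return 1;
      }
      return 0;
    }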