@@ -22,8 +22,13 @@ namespace llm {
 // NOTE: we observed ~2x loading performance increase on iPhone 15
 // and a ~5% improvement on Galaxy S22 by switching to
 // FileDataLoader instead of MmapDataLoader + UseMlockIgnoreErrors.
-TextDecoderRunner::TextDecoderRunner(Module* module, IOManager* io_manager)
-    : module_(module), io_manager_(io_manager) {}
+TextDecoderRunner::TextDecoderRunner(
+    Module* module,
+    IOManager* io_manager,
+    std::string method_name)
+    : module_(module),
+      io_manager_(io_manager),
+      method_name_(std::move(method_name)) {}

 // This function is functional, meaning it shouldn't modify any state of the
 // input. It should be safe to call multiple times with the same inputs. The
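For context, call sites now pass the method name explicitly. A minimal caller sketch follows; the model path, the `Module::LoadMode::File` choice (mirroring the FileDataLoader NOTE above), and the `IOManager` setup are illustrative assumptions, not part of this diff:

```cpp
#include <executorch/extension/module/module.h>

using executorch::extension::Module;

// Load with file-based loading per the perf NOTE; the path is a placeholder.
Module module("/path/to/llm.pte", Module::LoadMode::File);
IOManager io_manager;  // assumed constructible like this, for the sketch only
// The runner is now bound to one named method instead of a hardcoded "forward".
TextDecoderRunner runner(&module, &io_manager, "forward");
```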
@@ -32,7 +37,7 @@ ::executorch::runtime::Result<executorch::aten::Tensor> TextDecoderRunner::step(
     TensorPtr& tokens,
     int64_t start_pos) {
   // ET_LOG(Info, "Input token %" PRIu64, input_token);
-  auto method_meta_result = module_->method_meta("forward");
+  auto method_meta_result = module_->method_meta(method_name_);
   if (!method_meta_result.ok()) {
     return method_meta_result.error();
   }
@@ -44,25 +49,31 @@ ::executorch::runtime::Result<executorch::aten::Tensor> TextDecoderRunner::step(

   if (use_kv_cache) {
     auto start_pos_tensor_result = populate_start_pos_or_cache_position(
-        module_, start_pos, cache_positions, tokens->numel(), "forward");
+        module_,
+        start_pos,
+        cache_positions,
+        tokens->numel(),
+        method_name_.c_str());
     if (!start_pos_tensor_result.ok()) {
       return start_pos_tensor_result.error();
     }
     auto start_pos_tensor = std::move(*start_pos_tensor_result);

     std::vector<runtime::EValue> inputs;
-    auto inputs_res = io_manager_->prepare_decode(tokens, start_pos_tensor);
+    auto inputs_res =
+        io_manager_->prepare_decode(tokens, start_pos_tensor, method_name_);
     ET_CHECK_OK_OR_RETURN_ERROR(inputs_res.error());
     inputs = inputs_res.get();
-    auto outputs_res = module_->forward(inputs);
+    auto outputs_res = module_->execute(method_name_, inputs);
     ET_CHECK_OK_OR_RETURN_ERROR(outputs_res.error());

-    auto update_err = io_manager_->update_decode(outputs_res.get());
+    auto update_err =
+        io_manager_->update_decode(outputs_res.get(), method_name_);
     ET_CHECK_OK_OR_RETURN_ERROR(update_err);

     ET_CHECK_MSG(
         outputs_res.get().size() == 1,
-        "More then one output returned from executing LLM.");
+        "More than one output returned from executing LLM.");
     ET_CHECK_MSG(
         outputs_res.get()[0].isTensor(),
         "Non Tensor Output returned from executing LLM");
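Worth noting: the switch from `module_->forward(inputs)` to `module_->execute(method_name_, inputs)` is behavior-preserving when `method_name_` is `"forward"`, because `Module::forward` is a thin convenience wrapper that calls `execute("forward", ...)`. A sketch of the equivalence, assuming a loaded `Module` and populated inputs (the custom method name is purely illustrative):

```cpp
#include <executorch/extension/module/module.h>

#include <vector>

using executorch::extension::Module;
using executorch::runtime::EValue;

void equivalence_sketch(Module& module, const std::vector<EValue>& inputs) {
  auto via_forward = module.forward(inputs);               // hardcodes "forward"
  auto via_execute = module.execute("forward", inputs);    // same call path
  auto via_named = module.execute("decode_step", inputs);  // hypothetical method name
}
```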
@@ -72,11 +83,12 @@ ::executorch::runtime::Result<executorch::aten::Tensor> TextDecoderRunner::step(
   } else { // no kv cache
     (void)start_pos; // unused

-    auto outputs_res = module_->forward(tokens);
+    std::vector<runtime::EValue> inputs{tokens};
+    auto outputs_res = module_->execute(method_name_, inputs);
     ET_CHECK_OK_OR_RETURN_ERROR(outputs_res.error());
     ET_CHECK_MSG(
         outputs_res.get().size() == 1,
-        "More then one output returned from executing LLM.");
+        "More than one output returned from executing LLM.");
     ET_CHECK_MSG(
         outputs_res.get()[0].isTensor(),
         "Non Tensor Output returned from executing LLM");
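Finally, a sketch of driving `step()` once per generated token, leaning on the functional/idempotent contract documented above; the `from_blob` tokenization, shape, and start position are assumptions for illustration, not part of this diff:

```cpp
#include <executorch/extension/tensor/tensor.h>

// step() is functional (see the comment above), so a failed call can be
// retried with the same tokens/start_pos without corrupting module state.
int64_t token = 1;  // placeholder token id
auto tokens = executorch::extension::from_blob(
    &token, {1, 1}, executorch::aten::ScalarType::Long);
auto logits = runner.step(tokens, /*start_pos=*/0);
if (logits.ok()) {
  // sampling over logits.get() would pick the next token here
}
```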