@@ -22,8 +22,13 @@ namespace llm {
 // NOTE: we observed ~2x loading performance increase on iPhone 15
 // and a ~5% improvement on Galaxy S22 by switching to
 // FileDataLoader instead of MmapDataLoader + UseMlockIgnoreErrors.
-TextDecoderRunner::TextDecoderRunner(Module* module, IOManager* io_manager)
-    : module_(module), io_manager_(io_manager) {}
+TextDecoderRunner::TextDecoderRunner(
+    Module* module,
+    IOManager* io_manager,
+    std::string method_name)
+    : module_(module),
+      io_manager_(io_manager),
+      method_name_(std::move(method_name)) {}

 // This function is functional, meaning it shouldn't modify any state of the
 // input. It should be safe to call multiple times with the same inputs. The
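For context, call sites now pass the method name explicitly. A minimal caller sketch follows; the model path, the `Module::LoadMode::File` choice (mirroring the FileDataLoader NOTE above), and the `IOManager` setup are illustrative assumptions, not part of this diff:

```cpp
#include <executorch/extension/module/module.h>

using executorch::extension::Module;

// Load with file-based loading per the perf NOTE; the path is a placeholder.
Module module("/path/to/llm.pte", Module::LoadMode::File);
IOManager io_manager;  // assumed constructible like this, for the sketch only
// The runner is now bound to one named method instead of a hardcoded "forward".
TextDecoderRunner runner(&module, &io_manager, "forward");
```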
@@ -32,7 +37,7 @@ ::executorch::runtime::Result<executorch::aten::Tensor> TextDecoderRunner::step(
     TensorPtr& tokens,
     int64_t start_pos) {
   // ET_LOG(Info, "Input token %" PRIu64, input_token);
-  auto method_meta_result = module_->method_meta("forward");
+  auto method_meta_result = module_->method_meta(method_name_);
   if (!method_meta_result.ok()) {
     return method_meta_result.error();
   }
@@ -44,25 +49,31 @@ ::executorch::runtime::Result<executorch::aten::Tensor> TextDecoderRunner::step(

   if (use_kv_cache) {
     auto start_pos_tensor_result = populate_start_pos_or_cache_position(
-        module_, start_pos, cache_positions, tokens->numel(), "forward");
+        module_,
+        start_pos,
+        cache_positions,
+        tokens->numel(),
+        method_name_.c_str());
     if (!start_pos_tensor_result.ok()) {
       return start_pos_tensor_result.error();
     }
     auto start_pos_tensor = std::move(*start_pos_tensor_result);

     std::vector<runtime::EValue> inputs;
-    auto inputs_res = io_manager_->prepare_decode(tokens, start_pos_tensor);
+    auto inputs_res =
+        io_manager_->prepare_decode(tokens, start_pos_tensor, method_name_);
     ET_CHECK_OK_OR_RETURN_ERROR(inputs_res.error());
     inputs = inputs_res.get();
-    auto outputs_res = module_->forward(inputs);
+    auto outputs_res = module_->execute(method_name_, inputs);
     ET_CHECK_OK_OR_RETURN_ERROR(outputs_res.error());

-    auto update_err = io_manager_->update_decode(outputs_res.get());
+    auto update_err =
+        io_manager_->update_decode(outputs_res.get(), method_name_);
     ET_CHECK_OK_OR_RETURN_ERROR(update_err);

     ET_CHECK_MSG(
         outputs_res.get().size() == 1,
-        "More then one output returned from executing LLM.");
+        "More than one output returned from executing LLM.");
     ET_CHECK_MSG(
         outputs_res.get()[0].isTensor(),
         "Non Tensor Output returned from executing LLM");
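Worth noting: the switch from `module_->forward(inputs)` to `module_->execute(method_name_, inputs)` is behavior-preserving when `method_name_` is `"forward"`, because `Module::forward` is a thin convenience wrapper that calls `execute("forward", ...)`. A sketch of the equivalence, assuming a loaded `Module` and populated inputs (the custom method name is purely illustrative):

```cpp
#include <executorch/extension/module/module.h>

#include <vector>

using executorch::extension::Module;
using executorch::runtime::EValue;

void equivalence_sketch(Module& module, const std::vector<EValue>& inputs) {
  auto via_forward = module.forward(inputs);               // hardcodes "forward"
  auto via_execute = module.execute("forward", inputs);    // same call path
  auto via_named = module.execute("decode_step", inputs);  // hypothetical method name
}
```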
@@ -72,11 +83,12 @@ ::executorch::runtime::Result<executorch::aten::Tensor> TextDecoderRunner::step(
   } else { // no kv cache
     (void)start_pos; // unused

-    auto outputs_res = module_->forward(tokens);
+    std::vector<runtime::EValue> inputs{tokens};
+    auto outputs_res = module_->execute(method_name_, inputs);
     ET_CHECK_OK_OR_RETURN_ERROR(outputs_res.error());
     ET_CHECK_MSG(
         outputs_res.get().size() == 1,
-        "More then one output returned from executing LLM.");
+        "More than one output returned from executing LLM.");
     ET_CHECK_MSG(
         outputs_res.get()[0].isTensor(),
         "Non Tensor Output returned from executing LLM");
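Finally, a sketch of driving `step()` once per generated token, leaning on the functional/idempotent contract documented above; the `from_blob` tokenization, shape, and start position are assumptions for illustration, not part of this diff:

```cpp
#include <executorch/extension/tensor/tensor.h>

// step() is functional (see the comment above), so a failed call can be
// retried with the same tokens/start_pos without corrupting module state.
int64_t token = 1;  // placeholder token id
auto tokens = executorch::extension::from_blob(
    &token, {1, 1}, executorch::aten::ScalarType::Long);
auto logits = runner.step(tokens, /*start_pos=*/0);
if (logits.ok()) {
  // sampling over logits.get() would pick the next token here
}
```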