@@ -460,29 +460,27 @@ def __init__(
         config: "PretrainedConfig",
     ):
         super().__init__(models=models, config=config)
-        if not hasattr(self, "encoder"):
-            raise AttributeError("Expected attribute 'encoder' not found in the instance.")
-        if not hasattr(self, "text_decoder"):
-            raise AttributeError("Expected attribute 'text_decoder' not found in the instance.")
-        metadata = self.decoder.method_names()
+        if not hasattr(self, "model"):
+            raise AttributeError("Expected attribute 'model' not found in the instance.")
+        metadata = self.model.method_names()
         if "use_kv_cache" in metadata:
-            self.use_kv_cache = self.decoder.run_method("use_kv_cache")[0]
+            self.use_kv_cache = self.model.run_method("use_kv_cache")[0]
         if "get_max_seq_len" in metadata:
-            self.max_cache_size = self.decoder.run_method("get_max_seq_len")[0]
+            self.max_cache_size = self.model.run_method("get_max_seq_len")[0]
         if "get_max_batch_size" in metadata:
-            self.max_batch_size = self.decoder.run_method("get_max_batch_size")[0]
+            self.max_batch_size = self.model.run_method("get_max_batch_size")[0]
         if "get_dtype" in metadata:
-            self.dtype = self.decoder.run_method("get_dtype")[0]
+            self.dtype = self.model.run_method("get_dtype")[0]
         if "get_bos_id" in metadata:
-            self.bos_token_id = self.decoder.run_method("get_bos_id")[0]
+            self.bos_token_id = self.model.run_method("get_bos_id")[0]
         if "get_eos_id" in metadata:
-            self.eos_token_id = self.decoder.run_method("get_eos_id")[0]
+            self.eos_token_id = self.model.run_method("get_eos_id")[0]
         if "get_vocab_size" in metadata:
-            self.vocab_size = self.decoder.run_method("get_vocab_size")[0]
+            self.vocab_size = self.model.run_method("get_vocab_size")[0]
         if "max_hidden_seq_length" in metadata:
-            self.max_hidden_seq_length = self.decoder.run_method("max_hidden_seq_length")[0]
+            self.max_hidden_seq_length = self.model.run_method("max_hidden_seq_length")[0]
         if "decoder_start_token_id" in metadata:
-            self.decoder_start_token_id = self.decoder.run_method("decoder_start_token_id")[0]
+            self.decoder_start_token_id = self.model.run_method("decoder_start_token_id")[0]
 
     def forward(
         self,
@@ -491,15 +489,14 @@ def forward(
         cache_position: torch.Tensor,
         encoder_outputs: Optional[torch.Tensor] = None,
     ):
-        # Encode if needed (first prediction pass)
         is_first_prediction = encoder_outputs is None
         self.stats.on_model_execution_start()
         if is_first_prediction:
-            encoder_outputs = self.encoder.forward((input_ids,))[0]
+            encoder_outputs = self.model.run_method("encoder", (input_ids,))[0]
             self.stats.on_prompt_eval_end()
 
         result = (
-            self.decoder.forward((decoder_input_ids, encoder_outputs, cache_position))[0],
+            self.model.run_method("text_decoder", (decoder_input_ids, encoder_outputs, cache_position))[0],
             encoder_outputs,
         )
         self.stats.on_model_execution_end()
@@ -530,9 +527,6 @@ def generate(
         Returns:
             List[int]: List of generated token IDs.
 
-        Note:
-            Temporarily implemented this method in Python due to limited access to ExecuTorch's c++ LLM runner via pybind.
-            Expect improvements to the pybind interface in ExecuTorch version 0.4.1.
         """
         self.device = torch.device("cpu")
         if max_seq_len is None:
@@ -550,7 +544,6 @@
         encoder_input_ids = input_ids
         encoder_outputs = None
         generated_ids = [0]
-
        first_token_generated = False
 
         # Generate tokens one by one