
Commit 910b9ee

Bump transformers, improve SDPA, reduce Optimum reliance (#188)
Bump transformers to v5.0.0rc1
1 parent d1ac176 commit 910b9ee

16 files changed: +106 -54 lines changed

.github/workflows/test_models.yml

Lines changed: 8 additions & 2 deletions
@@ -52,11 +52,17 @@ jobs:
           python-version: ${{ matrix.python-version }}
       - name: Install dependencies for ExecuTorch
         run: |
+          # Clean up cache to save space
+          pip cache purge || true
+          rm -rf ~/.cache/huggingface/hub/* || true
+
           if [ "${{ matrix.executorch-version }}" == "nightly" ]; then
             python install_dev.py
           else
-            pip install '.[dev]'
-            pip install executorch==${{ matrix.executorch-version }}
+            # Use CPU-only torch to avoid CUDA dependencies (saves ~5GB)
+            pip install --no-cache-dir '.[dev]' \
+              --extra-index-url https://download.pytorch.org/whl/cpu
+            pip install --no-cache-dir executorch==${{ matrix.executorch-version }}
           fi
           pip list
       - name: Run tests
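As a quick sanity check (not part of the workflow above, just an illustrative sketch), a CPU-only torch install can be verified from Python; CPU wheels report no CUDA support and usually carry a `+cpu` local version suffix:

```python
# Hedged sketch: verify the CI environment ended up with a CPU-only torch build.
import torch

print(torch.__version__)          # a CPU wheel usually ends in "+cpu"
print(torch.version.cuda)         # None for CPU-only builds
print(torch.cuda.is_available())  # False when no CUDA runtime is bundled
```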

install_dev.py

Lines changed: 7 additions & 6 deletions
@@ -5,7 +5,7 @@
 
 def install_torch_nightly_deps():
     """Install torch related dependencies from pinned nightly"""
-    EXECUTORCH_NIGHTLY_VERSION = "dev20251003"
+    EXECUTORCH_NIGHTLY_VERSION = "dev20251104"
     TORCHAO_NIGHTLY_VERSION = "dev20251104"
     # Torch nightly is aligned with pinned nightly in https://github.com/pytorch/executorch/blob/main/torch_pin.py#L2
     TORCH_NIGHTLY_VERSION = "dev20251104"
@@ -15,6 +15,7 @@ def install_torch_nightly_deps():
             "-m",
             "pip",
             "install",
+            "--no-cache-dir",  # Prevent cached CUDA packages
             f"executorch==1.1.0.{EXECUTORCH_NIGHTLY_VERSION}",
             f"torch==2.10.0.{TORCH_NIGHTLY_VERSION}",
             f"torchvision==0.25.0.{TORCH_NIGHTLY_VERSION}",
@@ -34,7 +35,7 @@ def install_dep_from_source():
             "-m",
             "pip",
             "install",
-            "git+https://github.com/huggingface/transformers@91393fe4cc3266a05bc0d129e34ff5f761bb46e2#egg=transformers",  # 4.56.1
+            "git+https://github.com/huggingface/transformers@bdc85cb85c8772d37aa29ce447860b44d7fad6ef#egg=transformers",  # v5.0.0rc0
         ]
     )
     subprocess.check_call(
@@ -58,13 +59,13 @@ def main():
     )
     args = parser.parse_args()
 
-    # Install package with dev extras
-    subprocess.check_call([sys.executable, "-m", "pip", "install", ".[dev]"])
-
-    # Install nightly dependencies
+    # Install nightly torch dependencies FIRST to avoid pulling CUDA versions
     if not args.skip_override_torch:
         install_torch_nightly_deps()
 
+    # Install package with dev extras
+    subprocess.check_call([sys.executable, "-m", "pip", "install", ".[dev]"])
+
     # Install source dependencies
     install_dep_from_source()
 
optimum/commands/export/executorch.py

Lines changed: 3 additions & 2 deletions
@@ -17,7 +17,8 @@
 from pathlib import Path
 from typing import TYPE_CHECKING
 
-from ...exporters import TasksManager
+from transformers.pipelines import get_supported_tasks
+
 from ..base import BaseOptimumCLICommand, CommandInfo
 
 
@@ -46,7 +47,7 @@ def parse_args_executorch(parser):
         default="text-generation",
         help=(
             "The task to export the model for. Available tasks depend on the model, but are among:"
-            f" {str(TasksManager.get_all_tasks())}."
+            f" {str(get_supported_tasks())}."
         ),
     )
     required_group.add_argument(
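For context, `get_supported_tasks()` comes straight from transformers' pipeline registry, so the CLI help text no longer depends on Optimum's `TasksManager`. A minimal usage sketch:

```python
# Hedged sketch: list the pipeline tasks transformers knows about.
from transformers.pipelines import get_supported_tasks

tasks = get_supported_tasks()
print("text-generation" in tasks)  # expected to be True in current releases
print(len(tasks))                  # a few dozen task names
```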

optimum/commands/register/register_export.py

Lines changed: 2 additions & 2 deletions
@@ -12,8 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from ..export import ExportCommand
-from ..export.executorch import ExecuTorchExportCommand
+from optimum.commands.export.base import ExportCommand
+from optimum.commands.export.executorch import ExecuTorchExportCommand
 
 
 REGISTER_COMMANDS = [(ExecuTorchExportCommand, ExportCommand)]

optimum/executorch/attentions/custom_kv_cache.py

Lines changed: 2 additions & 2 deletions
@@ -45,8 +45,8 @@ def __init__(
             device=device,
             dtype=dtype,
         )
-        num_heads = getattr(config, "num_key_value_heads", config.num_attention_heads)
-        head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
+        head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads
+        num_heads = getattr(config, "num_key_value_heads", None) or config.num_attention_heads
         self.early_initialization(
             batch_size=max_batch_size, num_heads=num_heads, head_dim=head_dim, dtype=dtype, device=device
         )
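The swapped fallback style matters when a config defines the attribute but leaves it set to `None` (a plain `getattr` default is only used when the attribute is missing entirely). A small sketch with a stand-in config object, not a real transformers config:

```python
# Hedged sketch of the getattr-vs-or fallback difference using a dummy config.
from types import SimpleNamespace

config = SimpleNamespace(head_dim=None, hidden_size=2048, num_attention_heads=16)

old_style = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
new_style = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads

print(old_style)  # None -- the default is ignored because the attribute exists
print(new_style)  # 128  -- falls back whenever the attribute is missing or None
```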

optimum/executorch/attentions/custom_sdpa.py

Lines changed: 54 additions & 4 deletions
@@ -18,12 +18,59 @@
 from executorch.extension.llm.custom_ops.custom_ops import custom_sdpa  # noqa
 
 
+def sdpa_mask_passthrough(
+    batch_size: int,
+    cache_position: torch.Tensor,
+    kv_length: int,
+    kv_offset: int = 0,
+    mask_function: Optional[Callable] = None,
+    attention_mask: Optional[torch.Tensor] = None,
+    local_size: Optional[int] = None,
+    allow_is_causal_skip: bool = True,
+    allow_torch_fix: bool = True,
+    **kwargs,
+) -> Optional[torch.Tensor]:
+    """
+    Pass-through for attention mask creation since it is never used:
+    - For regular attention, the custom sdpa op in causal mode creates its own attention mask
+    - For sliding window attention, the attention mask from the attention mask API is ditched and re-created during the attention API since it needs to know about cache internals
+
+    Additionally, there were some vmap export issues with sliding window attention mask creation in Transformers.
+
+    Args:
+        batch_size (`int`):
+            The batch size of the input sequence.
+        cache_position (`torch.Tensor`):
+            A tensor of shape (query_length,) indicating the current indices of the input sequence elements.
+        kv_length (`int`):
+            The size that the key and value states will have during the attention computation.
+        kv_offset (`int`, optional):
+            An optional offset to indicate at which first position the key and values states will refer to.
+        mask_function (`Callable`):
+            The mask factory function describing the mask pattern.
+        attention_mask (`torch.Tensor`, optional):
+            The 2D attention mask corresponding to padded tokens of shape (batch_size, number_of_seen_tokens+q_length)
+        local_size (`int`, optional):
+            The size of the local attention, if we do not use full attention. This is used only if `allow_is_causal_skip=True`
+            to try to skip mask creation if possible.
+        allow_is_causal_skip (`bool`, optional):
+            Whether to allow to return `None` for the mask under conditions where we can use the `is_causal` argument in
+            `torch.sdpa` instead. Default to `True`.
+        allow_torch_fix (`bool`, optional):
+            Whether to update the mask in case a query is not attending to any tokens, to solve a bug in torch's older
+            versions. We need an arg to skip it when using eager. By default `True`.
+
+    """
+    return None
+
+
 def custom_sdpa_with_start_pos_forward(
     module: torch.nn.Module,
     query: torch.Tensor,
     key: torch.Tensor,
     value: torch.Tensor,
     attention_mask: Union[torch.Tensor, "BlockMask"],  # noqa
+    position_ids: Optional[torch.Tensor] = None,
     scaling: Optional[float] = None,
     softcap: Optional[float] = None,
     head_mask: Optional[torch.Tensor] = None,
@@ -56,10 +103,10 @@ def custom_sdpa_with_start_pos_forward(
         # Calculate the input pos from attention mask.
         # Branch out for float vs bool mask
         # assert attention_mask.dim() == 2, f"attention_mask must be a 2D matrix."
-        attention_mask = attention_mask.reshape(-1, attention_mask.shape[-1])
-        first_row_mask = attention_mask[0, :]
-        # [0, 0, 0, 0, -inf, -inf, -inf, -inf], start_pos = 3
-        start_pos = torch.argmin(first_row_mask.to(torch.long)).item() - 1
+        assert (
+            position_ids is not None
+        ), "position_ids must be provided to find start position for causal attention"
+        start_pos = position_ids[0][0].item()
     else:
         start_pos = 0
 
@@ -95,6 +142,7 @@ def _custom_sdpa_for_ring_kv_cache(
     key: torch.Tensor,
     value: torch.Tensor,
     attention_mask: Union[torch.Tensor, "BlockMask"],  # noqa
+    position_ids: Optional[torch.Tensor] = None,
     scaling: Optional[float] = None,
     softcap: Optional[float] = None,
     head_mask: Optional[torch.Tensor] = None,
@@ -122,6 +170,7 @@ def _custom_sdpa_for_ring_kv_cache(
             key,
             value,
             attention_mask,
+            position_ids,
             scaling,
             softcap,
             head_mask,
@@ -134,6 +183,7 @@ def _custom_sdpa_for_ring_kv_cache(
             key,
             value,
             attention_mask,
+            position_ids,
             scaling,
             softcap,
             head_mask,
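The behavioural change in this file is how the start position for causal attention is found: instead of scanning the first row of a float mask for its first `-inf` entry, it is now read directly from `position_ids`, which the new attention interface forwards to attention functions. A hedged sketch with made-up tensors contrasting the two derivations:

```python
# Hedged sketch contrasting the old mask-scan with the new position_ids lookup.
import torch

# Decode step with 3 tokens already cached and one new query token.
position_ids = torch.tensor([[3]])
start_pos = position_ids[0][0].item()
print(start_pos)  # 3

# Legacy derivation removed above: find the first -inf in the mask row.
first_row_mask = torch.tensor([0.0, 0.0, 0.0, 0.0, float("-inf"), float("-inf")])
legacy_start_pos = torch.argmin(first_row_mask.to(torch.long)).item() - 1
print(legacy_start_pos)  # 3 as well, but it required materializing the mask
```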

optimum/executorch/modeling.py

Lines changed: 6 additions & 12 deletions
@@ -23,7 +23,7 @@
 from typing import Dict, List, Optional, Union
 
 import torch
-from huggingface_hub import hf_hub_download
+from huggingface_hub import hf_hub_download, is_offline_mode
 from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE
 from torch.ao.quantization.fx._decomposed import quantized_decomposed_lib  # noqa
 from transformers import (
@@ -34,25 +34,22 @@
     AutoModelForSeq2SeqLM,
     AutoModelForSpeechSeq2Seq,
     PreTrainedTokenizer,
-    add_start_docstrings,
 )
 from transformers.configuration_utils import PretrainedConfig
+from transformers.pipelines import get_task
 from transformers.processing_utils import ProcessorMixin
-from transformers.utils import is_offline_mode
 
 from executorch.extension.pybindings.portable_lib import (
     ExecuTorchModule,
     _load_for_executorch,
 )
 from executorch.kernels import quantized  # noqa
 
-from ..exporters import TasksManager
 from ..exporters.executorch import main_export
 from ..exporters.executorch.utils import (
     process_conversation_inputs,
     verify_eos_tokens_in_pretrained_tokenizer,
 )
-from ..modeling_base import FROM_PRETRAINED_START_DOCSTRING, OptimizedModel
 from ..utils.file_utils import find_files_matching_pattern
 from .stats import Stats
 
@@ -63,7 +60,7 @@
 logger = logging.getLogger(__name__)
 
 
-class ExecuTorchModelBase(OptimizedModel, ABC):
+class ExecuTorchModelBase(ABC):
     """
     ExecuTorch model for inference using the ExecuTorch Runtime.
 
@@ -99,8 +96,6 @@ def __init__(
         models: Dict[str, "ExecuTorchModule"],
         config: "PretrainedConfig",
     ):
-        super().__init__(model=None, config=config)
-
         if self.__class__.auto_model_class is None:
             raise ValueError(
                 f"Class {self.__class__.__name__} must set auto_model_class. "
@@ -268,6 +263,7 @@ def _export(
         cls,
         model_id: str,
         recipe: str,
+        task: Optional[str] = None,
         config: Optional[PretrainedConfig] = None,
         token: Optional[Union[bool, str]] = None,
         revision: Optional[str] = None,
@@ -278,9 +274,8 @@ def _export(
         local_files_only: bool = False,
         **kwargs,
     ) -> Dict[str, "ExecuTorchModule"]:
-        task = kwargs.pop("task", None)
-        inferred_task = TasksManager.infer_task_from_model(cls.auto_model_class) if not task else task
-        logging.info(f"Inferred task from model class: {inferred_task}")
+        inferred_task = get_task(model_id) if not task else task
+        logging.info(f"Using task: {inferred_task}")
 
         save_dir = TemporaryDirectory(prefix="executorch_export_")
         save_dir_path = Path(save_dir.name)
@@ -316,7 +311,6 @@ def _save_pretrained(self, save_directory):
         raise NotImplementedError
 
     @classmethod
-    @add_start_docstrings(FROM_PRETRAINED_START_DOCSTRING)
     def from_pretrained(
         cls,
         model_id: Union[str, Path],
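Task inference now goes through transformers' pipeline utilities rather than Optimum's `TasksManager`: `get_task` resolves the pipeline tag recorded for the model on the Hugging Face Hub, so it needs network access unless a task is passed explicitly. A hedged sketch with an illustrative model id:

```python
# Hedged sketch of the new task-inference path; the model id is just an example.
from transformers.pipelines import get_task

task = get_task("HuggingFaceTB/SmolLM2-135M")
print(task)  # expected to be "text-generation" for a causal LM
```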

optimum/exporters/executorch/convert.py

Lines changed: 2 additions & 3 deletions
@@ -19,8 +19,7 @@
 from pathlib import Path
 from typing import Union
 
-from transformers.integrations.executorch import sdpa_mask_without_vmap
-from transformers.masking_utils import AttentionMaskInterface
+from transformers.masking_utils import ALL_MASK_ATTENTION_FUNCTIONS, AttentionMaskInterface
 from transformers.modeling_utils import AttentionInterface
 
 from optimum.executorch.attentions.custom_sdpa import custom_sdpa_with_start_pos_forward
@@ -29,7 +28,7 @@
 
 
 AttentionInterface.register("custom_sdpa", custom_sdpa_with_start_pos_forward)
-AttentionMaskInterface.register("custom_sdpa", sdpa_mask_without_vmap)
+AttentionMaskInterface.register("custom_sdpa", ALL_MASK_ATTENTION_FUNCTIONS["sdpa"])
 
 
 def export_to_executorch(
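Once both registrations run, `"custom_sdpa"` behaves like any other attention implementation name: models can opt into it through `attn_implementation`, and mask construction is looked up under the same key. A hedged sketch of how a caller might select it; the model id is illustrative and loading it downloads weights:

```python
# Hedged sketch: selecting the registered "custom_sdpa" implementation by name.
from transformers import AutoModelForCausalLM

import optimum.exporters.executorch.convert  # noqa: F401  # runs the registrations above

model = AutoModelForCausalLM.from_pretrained(
    "HuggingFaceTB/SmolLM2-135M",
    attn_implementation="custom_sdpa",  # resolved through AttentionInterface
)
```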

optimum/exporters/executorch/integrations.py

Lines changed: 8 additions & 10 deletions
@@ -31,12 +31,11 @@
 )
 from transformers.integrations.executorch import (
     TorchExportableModuleForDecoderOnlyLM,
-    sdpa_mask_without_vmap,
 )
 from transformers.masking_utils import AttentionMaskInterface
 from transformers.modeling_utils import AttentionInterface
 
-from optimum.executorch.attentions.custom_sdpa import get_custom_sdpa_for_ring_kv_cache
+from optimum.executorch.attentions.custom_sdpa import get_custom_sdpa_for_ring_kv_cache, sdpa_mask_passthrough
 
 from .utils import apply_chat_template_with_fallback, save_config_to_constant_methods
 
@@ -212,7 +211,7 @@ def __init__(
                 additional_metadata_kwargs[f"{modality}_token_id"] = getattr(self.config, "image_token_id")
         self.metadata = save_config_to_constant_methods(
             config=model.config.text_config,
-            generation_config=model.generation_config,
+            generation_config=getattr(model, "generation_config", None),
             processor_config=processor_config,
             get_max_seq_len=max_seq_len,
             **additional_metadata_kwargs,
@@ -269,7 +268,7 @@ def _register_custom_attention(self, exportable_module: torch.nn.Module):
         if self.use_custom_sdpa:
             if self.use_custom_kv_cache:
                 AttentionInterface.register("custom_sdpa_ring_kv_cache", _custom_sdpa_for_ring_kv_cache)
-                AttentionMaskInterface.register("custom_sdpa_ring_kv_cache", sdpa_mask_without_vmap)
+                AttentionMaskInterface.register("custom_sdpa_ring_kv_cache", sdpa_mask_passthrough)
                 # Manually set the attention implementation to custom_sdpa_ring_kv_cache
                 # This handles both regular sdpa and one for sliding window/local attention
                 exportable_module.model.model.config._attn_implementation = "custom_sdpa_ring_kv_cache"
@@ -425,7 +424,7 @@ def __init__(
         self.disable_dynamic_shapes = disable_dynamic_shapes
         self.metadata = save_config_to_constant_methods(
             model.config,
-            model.generation_config,
+            generation_config=getattr(model, "generation_config", None),
             get_max_seq_len=max_seq_len,
             enable_dynamic_shape=not self.disable_dynamic_shapes,
         )
@@ -455,7 +454,7 @@ def _prepare_export_inputs(self):
 
         if not self.disable_dynamic_shapes and not is_using_hybrid_cache_wo_custom_sdpa_kv_cache:
             # Prepare inputs with dynamic shapes
-            seq_length = 3  # Sequence length > 1 to avoid specialization issues
+            seq_length = 3  # Sequence length > 1 to avoid specialization issue
             example_input_ids = torch.zeros((1, seq_length), dtype=torch.long, device=self.model.device)
             example_cache_position = torch.arange(seq_length, dtype=torch.long, device=self.model.device)
             max_seq_len = self.metadata.get("get_max_seq_len")
@@ -471,15 +470,14 @@ def _prepare_export_inputs(self):
         return example_input_ids, example_cache_position, dynamic_shapes, strict
 
     def _register_custom_attention(self, exportable_module: torch.nn.Module):
-        from transformers.integrations.executorch import sdpa_mask_without_vmap
         from transformers.masking_utils import AttentionMaskInterface
         from transformers.modeling_utils import AttentionInterface
 
         if self.use_custom_sdpa:
             if self.use_custom_kv_cache:
                 _custom_sdpa_for_ring_kv_cache = get_custom_sdpa_for_ring_kv_cache(exportable_module)
                 AttentionInterface.register("custom_sdpa_ring_kv_cache", _custom_sdpa_for_ring_kv_cache)
-                AttentionMaskInterface.register("custom_sdpa_ring_kv_cache", sdpa_mask_without_vmap)
+                AttentionMaskInterface.register("custom_sdpa_ring_kv_cache", sdpa_mask_passthrough)
                 # Manually set the attention implementation to custom_sdpa_ring_kv_cache
                 # This handles both regular sdpa and one for sliding window/local attention
                 exportable_module.model.model.config._attn_implementation = "custom_sdpa_ring_kv_cache"
@@ -554,7 +552,7 @@ def __init__(self, model):
         self.model = model
         self.config = model.config
         # Metadata to be recorded in the pte model file
-        self.metadata = save_config_to_constant_methods(model.config, model.generation_config)
+        self.metadata = save_config_to_constant_methods(model.config, getattr(model, "generation_config", None))
 
     def forward(self, pixel_values):
         print(f"DEBUG: pixel_values: {pixel_values.shape}")
@@ -593,7 +591,7 @@ def __init__(self, model):
         self.model = model
         self.config = model.config
         # Metadata to be recorded in the pte model file
-        self.metadata = save_config_to_constant_methods(model.config, model.generation_config)
+        self.metadata = save_config_to_constant_methods(model.config, getattr(model, "generation_config", None))
 
     def forward(self, input_ids, attention_mask):
         return self.model(input_ids, attention_mask)
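The repeated `getattr(model, "generation_config", None)` change is a defensive guard: some wrapped modules and non-generative models do not expose a `generation_config` attribute, and the metadata helper is expected to tolerate `None`. A tiny sketch with a stand-in model object, not a real transformers class:

```python
# Hedged sketch of the defensive generation_config lookup with a dummy model.
class DummyVisionEncoder:
    """Stand-in for a wrapped module that has a config but no generation_config."""
    config = {"model_type": "dummy"}


model = DummyVisionEncoder()
generation_config = getattr(model, "generation_config", None)
print(generation_config)  # None instead of raising AttributeError
```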
