Commit df3d5ae

Browse files
committed
fix: align several usages with the authors' example; fix indexing issues when the lengths of the two modalities don't match; normalize embeddings as recommended by the authors.
- Prepare inputs for generation with cache_position handling for Qwen2.5-Omni.

1 file changed (+15 −3):
mteb/models/model_implementations/e5_omni.py

```diff
@@ -104,9 +104,9 @@ def encode(
         max_len = max(len(batch_texts), len(batch_images))
         for i in range(max_len):
             content = []
-            if batch_texts:
+            if i < len(batch_texts):
                 content.append({"type": "text", "text": batch_texts[i]})
-            if batch_images:
+            if i < len(batch_images):
                 content.append({"type": "image", "image": batch_images[i]})
             messages.append([{"role": "user", "content": content}])
 
```
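This hunk is the indexing fix from the commit message: `if batch_texts:` only checks that the list is non-empty, so once `i` runs past the end of the shorter modality list, `batch_texts[i]` or `batch_images[i]` raises an `IndexError`. A minimal, standalone sketch of the corrected loop (the literal inputs are made up for illustration):

```python
# Standalone sketch of the bounds-checked message building; the sample
# inputs are hypothetical, not taken from the repository.
batch_texts = ["a photo of a cat", "a photo of a dog"]
batch_images = ["cat.png"]  # shorter list: only the first item has an image

messages = []
max_len = max(len(batch_texts), len(batch_images))
for i in range(max_len):
    content = []
    # The old guard `if batch_texts:` was True whenever the list was non-empty,
    # so batch_texts[i] / batch_images[i] could index past the shorter list.
    if i < len(batch_texts):
        content.append({"type": "text", "text": batch_texts[i]})
    if i < len(batch_images):
        content.append({"type": "image", "image": batch_images[i]})
    messages.append([{"role": "user", "content": content}])

print(messages[1])  # text-only message instead of an IndexError
```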
```diff
@@ -121,6 +121,7 @@ def encode(
 
         image_inputs = None
         video_inputs = None
+        audio_inputs = None
         if batch_images:
             from qwen_vl_utils import process_vision_info
 
```
```diff
@@ -130,12 +131,21 @@ def encode(
             text=texts,
             images=image_inputs,
             videos=video_inputs,
+            audio=audio_inputs,
             padding=True,
             return_tensors="pt",
             truncation=True,
             max_length=512,
         ).to(self.device)
 
+        # Prepare inputs for generation to handle cache_position and other requirements for Qwen2.5-Omni
+        cache_position = torch.arange(
+            0, model_inputs["input_ids"].shape[1], device=self.device
+        )
+        model_inputs = self.model.prepare_inputs_for_generation(
+            **model_inputs, use_cache=True, cache_position=cache_position
+        )
+
         outputs = self.model(**model_inputs, output_hidden_states=True)
 
         # For E5-Omni, we use the last hidden state of the last token
```
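The second addition builds a `cache_position` tensor covering every position in the padded batch and routes the processor output through `prepare_inputs_for_generation`, which is how the commit message's "cache_position handling for Qwen2.5-Omni" is implemented. A small sketch of the tensor being constructed (the token ids and device below are placeholders, not from the repository):

```python
# Sketch of the cache_position tensor: one index per token position in the
# padded batch, built with torch.arange. Ids and device are placeholders.
import torch

device = "cpu"  # stands in for self.device in the diff
input_ids = torch.tensor([[101, 7592, 2088, 102],
                          [101, 7592,  102,   0]])  # hypothetical padded batch

cache_position = torch.arange(0, input_ids.shape[1], device=device)
print(cache_position)  # tensor([0, 1, 2, 3])

# In the diff this tensor is then passed along with use_cache=True to
# self.model.prepare_inputs_for_generation(...), so the Qwen2.5-Omni forward
# pass receives the cache bookkeeping it expects.
```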
```diff
@@ -144,12 +154,14 @@ def encode(
 
         # Find the last non-padding token
         attention_mask = model_inputs["attention_mask"]
-        # Qwen2.5-Omni uses right padding by default in many setups
         sequence_lengths = attention_mask.sum(dim=1) - 1
         embeddings = last_hidden_state[
             torch.arange(last_hidden_state.size(0)), sequence_lengths
         ]
 
+        # Normalize embeddings as recommended by the authors
+        embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=-1)
+
         all_embeddings.append(embeddings.cpu().to(torch.float32))
 
         return torch.cat(all_embeddings, dim=0).numpy()
```
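The final hunk keeps the last-token pooling (take the hidden state at each sequence's last non-padding position, which assumes right padding) and adds L2 normalization, so cosine similarity between embeddings reduces to a dot product. A self-contained sketch with made-up shapes:

```python
# Sketch of last-token pooling followed by L2 normalization; the tensor
# shapes and attention mask below are made up for illustration.
import torch

batch, seq_len, hidden = 2, 5, 8
last_hidden_state = torch.randn(batch, seq_len, hidden)
attention_mask = torch.tensor([[1, 1, 1, 1, 1],
                               [1, 1, 1, 0, 0]])  # second sequence is right-padded

sequence_lengths = attention_mask.sum(dim=1) - 1   # index of the last real token
embeddings = last_hidden_state[torch.arange(batch), sequence_lengths]
embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=-1)

print(embeddings.norm(dim=-1))  # each row now has unit L2 norm
```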
