elixir-nx
diff --git a/lib/bumblebee.ex b/lib/bumblebee.ex
Lines changed: 4 additions & 0 deletions
diff --git a/lib/bumblebee/layers/transformer.ex b/lib/bumblebee/layers/transformer.ex
Lines changed: 38 additions & 3 deletions
diff --git a/lib/bumblebee/text.ex b/lib/bumblebee/text.ex
Lines changed: 79 additions & 0 deletions
diff --git a/lib/bumblebee/text/pre_trained_tokenizer.ex b/lib/bumblebee/text/pre_trained_tokenizer.ex
Lines changed: 7 additions & 0 deletions
@@ -178,6 +178,9 @@ defmodule Bumblebee do
     "Phi3ForCausalLM" => {Bumblebee.Text.Phi3, :for_causal_language_modeling},
     "Phi3ForSequenceClassification" => {Bumblebee.Text.Phi3, :for_sequence_classification},
     "Phi3ForTokenClassification" => {Bumblebee.Text.Phi3, :for_token_classification},
+    "Qwen3Model" => {Bumblebee.Text.Qwen3, :base},
+    "Qwen3ForCausalLM" => {Bumblebee.Text.Qwen3, :for_causal_language_modeling},
+    "Qwen3ForSequenceClassification" => {Bumblebee.Text.Qwen3, :for_sequence_classification},
     "ResNetForImageClassification" => {Bumblebee.Vision.ResNet, :for_image_classification},
     "ResNetModel" => {Bumblebee.Vision.ResNet, :base},
     "RobertaForMaskedLM" => {Bumblebee.Text.Roberta, :for_masked_language_modeling},
@@ -258,6 +261,7 @@ defmodule Bumblebee do
     "mbart" => :mbart,
     "phi" => :code_gen,
     "phi3" => :llama,
+    "qwen3" => :qwen2,
     "roberta" => :roberta,
     "smollm3" => :smollm3,
     "t5" => :t5,
 
@@ -53,7 +53,9 @@ defmodule Bumblebee.Layers.Transformer do
       :layer_norm,
       :block_type,
       :attention_window_size,
-      :scale_attention_weights
+      :scale_attention_weights,
+      :query_norm,
+      :key_norm
     ]
 
     opts =
@@ -330,7 +332,9 @@ defmodule Bumblebee.Layers.Transformer do
         layer_norm: [],
         attention_window_size: nil,
         scale_attention_weights: true,
-        rotary_embedding: nil
+        rotary_embedding: nil,
+        query_norm: nil,
+        key_norm: nil
       ])
 
     name = opts[:name]
@@ -360,6 +364,8 @@ defmodule Bumblebee.Layers.Transformer do
     attention_window_size = opts[:attention_window_size]
     scale_attention_weights = opts[:scale_attention_weights]
     rotary_embedding = opts[:rotary_embedding]
+    query_norm = opts[:query_norm]
+    key_norm = opts[:key_norm]
 
     ffn_fun =
       case ffn do
@@ -418,6 +424,8 @@ defmodule Bumblebee.Layers.Transformer do
           attention_window_size: attention_window_size,
           scale_attention_weights: scale_attention_weights,
           rotary_embedding: rotary_embedding,
+          query_norm: query_norm,
+          key_norm: key_norm,
           name: join(name, "self_attention")
         )
 
@@ -703,6 +711,14 @@ defmodule Bumblebee.Layers.Transformer do
 
         * `:max_positions` - the maximum number of distinct positions
 
+    * `:query_norm` - a function that applies normalization to the query
+      projection before rotary embedding. It receives two arguments: the input
+      and a name for the layer. Defaults to `nil` (no normalization is applied)
+
+    * `:key_norm` - a function that applies normalization to the key
+      projection before rotary embedding. It receives two arguments: the input
+      and a name for the layer. Defaults to `nil` (no normalization is applied)
+
     * `:name` - the prefix for layer names
 
   ## References
@@ -734,7 +750,9 @@ defmodule Bumblebee.Layers.Transformer do
         key_use_bias: true,
         value_use_bias: true,
         output_use_bias: true,
-        rotary_embedding: nil
+        rotary_embedding: nil,
+        query_norm: nil,
+        key_norm: nil
       ])
 
     attention_mask = opts[:attention_mask]
@@ -752,6 +770,8 @@ defmodule Bumblebee.Layers.Transformer do
     scale_attention_weights = opts[:scale_attention_weights]
     dropout_rate = opts[:dropout_rate]
     rotary_embedding = opts[:rotary_embedding]
+    query_norm = opts[:query_norm]
+    key_norm = opts[:key_norm]
 
     query_use_bias = opts[:query_use_bias]
     key_use_bias = opts[:key_use_bias]
@@ -791,6 +811,21 @@ defmodule Bumblebee.Layers.Transformer do
       )
       |> Layers.split_heads(num_key_value_heads)
 
+    # Apply query and key normalization if configured (before rotary embedding)
+    query =
+      if query_norm do
+        query_norm.(query, join(name, "query_norm"))
+      else
+        query
+      end
+
+    key =
+      if key_norm do
+        key_norm.(key, join(name, "key_norm"))
+      else
+        key
+      end
+
     {query, key} =
       case rotary_embedding do
         opts when is_list(opts) ->
 
@@ -385,6 +385,9 @@ defmodule Bumblebee.Text do
           Note that we currently assume that the CLS token is the first token
           in the sequence
 
+        * `:last_token_pooling` - takes the embedding for the last non-padding
+          token in each sequence
+
       By default no pooling is applied
 
     * `:embedding_processor` - a post-processing step to apply to the
@@ -444,6 +447,82 @@ defmodule Bumblebee.Text do
   defdelegate text_embedding(model_info, tokenizer, opts \\ []),
     to: Bumblebee.Text.TextEmbedding
 
+  @type text_reranking_qwen3_input :: {String.t(), String.t()} | [{String.t(), String.t()}]
+  @type text_reranking_qwen3_output :: %{
+          scores: text_reranking_qwen3_score() | list(text_reranking_qwen3_score())
+        }
+  @type text_reranking_qwen3_score :: %{score: number(), query: String.t(), document: String.t()}
+
+  @doc """
+  Builds a serving for text reranking with Qwen3 reranker models.
+
+  The serving expects input in one of the following formats:
+
+    * `{query, document}` - a tuple with query and document text
+    * `[{query1, doc1}, {query2, doc2}, ...]` - a list of query-document pairs
+
+  ## Options
+
+    * `:yes_token` - the token ID corresponding to "yes" for relevance scoring.
+      If not provided, it is inferred from the tokenizer
+
+    * `:no_token` - the token ID corresponding to "no" for relevance scoring.
+      If not provided, it is inferred from the tokenizer
+
+    * `:instruction_prefix` - the instruction prefix to use. Defaults to the
+      Qwen3 reranker format
+
+    * `:instruction_suffix` - the instruction suffix to use. Defaults to the
+      Qwen3 reranker format
+
+    * `:task_description` - the task description to include in prompts. Defaults
+      to "Given a web search query, retrieve relevant passages that answer the query"
+
+    * `:compile` - compiles all computations for predefined input shapes
+      during serving initialization. Should be a keyword list with the
+      following keys:
+
+        * `:batch_size` - the maximum batch size of the input. Inputs
+          are optionally padded to always match this batch size
+
+        * `:sequence_length` - the maximum input sequence length. Input
+          sequences are always padded/truncated to match that length
+
+      It is advised to set this option in production and also configure
+      a defn compiler using `:defn_options` to maximally reduce inference
+      time
+
+    * `:defn_options` - the options for JIT compilation. Defaults to `[]`
+
+    * `:preallocate_params` - when `true`, explicitly allocates params
+      on the device configured in `:defn_options`. You may want to set
+      this option when using partitioned models on the GPU. Defaults to `false`
+
+  ## Examples
+
+      {:ok, model_info} = Bumblebee.load_model({:hf, "Qwen/Qwen3-Reranker-0.6B"})
+      {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "Qwen/Qwen3-Reranker-0.6B"})
+
+      serving = Bumblebee.Text.text_reranking_qwen3(model_info, tokenizer)
+
+      query = "What is the capital of France?"
+      documents = [
+        "Paris is the capital of France.",
+        "Berlin is the capital of Germany."
+      ]
+
+      pairs = Enum.map(documents, &{query, &1})
+      Nx.Serving.run(serving, pairs)
+
+  """
+  @spec text_reranking_qwen3(
+          Bumblebee.model_info(),
+          Bumblebee.Tokenizer.t(),
+          keyword()
+        ) :: Nx.Serving.t()
+  defdelegate text_reranking_qwen3(model_info, tokenizer, opts \\ []),
+    to: Bumblebee.Text.TextRerankingQwen3
+
   @type fill_mask_input :: String.t()
   @type fill_mask_output :: %{predictions: list(fill_mask_prediction())}
   @type fill_mask_prediction :: %{score: number(), token: String.t()}
 
@@ -200,6 +200,13 @@ defmodule Bumblebee.Text.PreTrainedTokenizer do
       },
       default_template_options: [language_token: "eng_Latn"]
     },
+    qwen2: %{
+      special_tokens: %{
+        unk: "<|endoftext|>",
+        eos: "<|endoftext|>",
+        pad: "<|endoftext|>"
+      }
+    },
     roberta: %{
       special_tokens: %{
         bos: "<s>",