Commit 0c4cfc3

nyo16 and claude committed
feat: Add native FP8 model support with scale_inv dequantization
Add comprehensive FP8 quantized model support for models like Qwen3-FP8. This enables loading and running FP8 models with per-block scale factors.

Changes:

bumblebee.ex:
- Add :preserve_source_types option to load_model/2 to keep FP8 types

pytorch_params.ex:
- Pass preserve_source_types through the param loading pipeline
- Modify ensure_type/3 to preserve FP8 types when the option is set

layers.ex:
- Add fp8_aware_dense/3 layer that handles FP8 quantized weights
- Implements block-wise dequantization using the scale_inv parameter
- Automatically falls back to identity scaling for non-FP8 models

layers/transformer.ex:
- Add :attention_dense option to blocks/2, block/2 and multi_head_attention/4
- Allows a custom dense function for the Q, K, V and output projections

text/qwen3.ex:
- Update the decoder to use fp8_aware_dense for attention via attention_dense
- Update gated_ffn to use fp8_aware_dense for the FFN layers
- Add scale_inv to params_mapping for all attention and FFN layers

The implementation supports both:
- Pre-dequantization: convert FP8 -> F32 before loading
- Native FP8: load FP8 weights directly and apply scale_inv at runtime

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent bbd4d83 commit 0c4cfc3

5 files changed: +257 -27 lines changed

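The native FP8 path described in the commit message is driven entirely by the new :preserve_source_types option on Bumblebee.load_model/2. A minimal usage sketch of that path, assuming an FP8 checkpoint with per-block scale_inv tensors (the repository id below is a placeholder, not something this commit pins down):

# Keep FP8 weight types from the checkpoint instead of casting them to the
# model's parameter type; scale_inv is then applied at runtime by the
# fp8_aware_dense layers. The repo id is illustrative only.
repo = {:hf, "Qwen/Qwen3-8B-FP8"}

{:ok, %{model: model, params: params, spec: spec}} =
  Bumblebee.load_model(repo, preserve_source_types: true)

With the default (preserve_source_types: false) the loader keeps its existing behaviour and casts loaded tensors to the type the model expects.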

lib/bumblebee.ex

Lines changed: 3 additions & 2 deletions
@@ -607,7 +607,8 @@ defmodule Bumblebee do
       :params_filename,
       :log_params_diff,
       :backend,
-      :type
+      :type,
+      :preserve_source_types
     ])
 
   with {:ok, repo_files} <- get_repo_files(repository),
@@ -654,7 +655,7 @@ defmodule Bumblebee do
     [
       params_mapping: params_mapping,
       loader_fun: loader_fun
-    ] ++ Keyword.take(opts, [:backend, :log_params_diff])
+    ] ++ Keyword.take(opts, [:backend, :log_params_diff, :preserve_source_types])
 
   params = Bumblebee.Conversion.PyTorchParams.load_params!(model, input_template, paths, opts)
   {:ok, params}

lib/bumblebee/conversion/pytorch_params.ex

Lines changed: 17 additions & 8 deletions
@@ -28,6 +28,11 @@ defmodule Bumblebee.Conversion.PyTorchParams do
       and loads the params file. Defaults to
       `Bumblebee.Conversion.PyTorchLoader.load!/1`
 
+    * `:preserve_source_types` - when `true`, preserves FP8 types from the
+      source file instead of converting them to the model's expected type.
+      This is useful for loading quantized models that use FP8 weights.
+      Defaults to `false`
+
   """
   @spec load_params!(Axon.t(), map(), Path.t() | list(Path.t()), keyword()) :: %Axon.ModelState{}
   def load_params!(model, input_template, path, opts \\ []) do
@@ -36,6 +41,7 @@ defmodule Bumblebee.Conversion.PyTorchParams do
       |> Keyword.validate!([
         :log_params_diff,
         :backend,
+        :preserve_source_types,
         params_mapping: %{},
         loader_fun: &Bumblebee.Conversion.PyTorchLoader.load!/1
       ])
@@ -58,7 +64,8 @@ defmodule Bumblebee.Conversion.PyTorchParams do
     model_state = Axon.trace_init(model, input_template)
 
     params_expr = model_state.data
-    {params, diff} = init_params(model, params_expr, pytorch_state, opts[:params_mapping])
+    preserve_source_types = opts[:preserve_source_types] || false
+    {params, diff} = init_params(model, params_expr, pytorch_state, opts[:params_mapping], preserve_source_types)
     model_state = %{model_state | data: params}
 
     params_complete? = diff.missing == [] and diff.mismatched == []
@@ -95,15 +102,15 @@ defmodule Bumblebee.Conversion.PyTorchParams do
     Nx.Container.impl_for(value) != nil
   end
 
-  defp init_params(model, params_expr, pytorch_state, params_mapping) do
+  defp init_params(model, params_expr, pytorch_state, params_mapping, preserve_source_types) do
     layers =
       model
       |> Utils.Axon.nodes_with_names()
       |> Enum.filter(fn {layer, _name} -> layer.parameters != [] end)
 
     prefixes = infer_prefixes(layers, pytorch_state, params_mapping)
 
-    diff = %{missing: [], mismatched: [], used_keys: []}
+    diff = %{missing: [], mismatched: [], used_keys: [], preserve_source_types: preserve_source_types}
 
     {params, diff} =
       layers
@@ -155,7 +162,7 @@ defmodule Bumblebee.Conversion.PyTorchParams do
 
         case verify_param_shape(param_expr, value) do
           :ok ->
-            value = ensure_type(param_expr, value)
+            value = ensure_type(param_expr, value, diff.preserve_source_types)
             {value, diff}
 
           {:error, expected, actual} ->
@@ -507,11 +514,13 @@ defmodule Bumblebee.Conversion.PyTorchParams do
     Utils.Nx.map(expr, &Nx.shape/1)
   end
 
-  defp ensure_type(param_expr, value) do
+  defp ensure_type(param_expr, value, preserve_source_types \\ false) do
     Utils.Nx.zip_with(param_expr, value, fn expr, tensor ->
-      case {Nx.type(expr), Nx.type(tensor)} do
-        {type, type} -> tensor
-        {expected, _actual} -> Nx.as_type(tensor, expected)
+      case {Nx.type(expr), Nx.type(tensor), preserve_source_types} do
+        {type, type, _} -> tensor
+        # Preserve FP8 types when preserve_source_types is enabled
+        {_expected, {:f, 8, _format}, true} -> tensor
+        {expected, _actual, _} -> Nx.as_type(tensor, expected)
       end
     end)
  end
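
The new third clause in ensure_type/3 is the core of the change: an FP8 tensor is kept as-is only when the flag is set. Below is a standalone sketch of just that decision, using the same {:f, 8, _format} pattern the commit matches on; the concrete FP8 type tuple is an assumption of this sketch and depends on the Nx/loader version in use.

defmodule EnsureTypeSketch do
  # Returns the type a loaded tensor should end up with, mirroring the
  # three case clauses in ensure_type/3 above.

  # Types already agree: keep the tensor as-is.
  def resolve(type, type, _preserve?), do: type

  # FP8 source tensor and preservation enabled: keep the quantized type.
  # The 3-tuple FP8 representation here is illustrative.
  def resolve(_expected, {:f, 8, _format} = fp8_type, true), do: fp8_type

  # Otherwise cast to the type the model expects.
  def resolve(expected, _actual, _preserve?), do: expected
end

EnsureTypeSketch.resolve({:bf, 16}, {:f, 8, :e4m3}, true)  #=> {:f, 8, :e4m3}
EnsureTypeSketch.resolve({:bf, 16}, {:f, 8, :e4m3}, false) #=> {:bf, 16}
EnsureTypeSketch.resolve({:f, 32}, {:f, 32}, false)        #=> {:f, 32}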

lib/bumblebee/layers.ex

Lines changed: 122 additions & 0 deletions
@@ -438,6 +438,128 @@ defmodule Bumblebee.Layers do
     |> Nx.add(bias)
   end
 
+  @doc """
+  Adds an FP8-aware dense layer to the network.
+
+  This layer supports an optional `scale_inv` parameter for FP8 quantized
+  weights. When `scale_inv` is provided, it is used to dequantize the kernel
+  block-wise before the matmul, accounting for FP8 quantization scaling.
+
+  The kernel parameter uses the standard dense layout (transposed from PyTorch).
+
+  ## Options
+
+    * `:name` - layer name
+
+    * `:kernel_initializer` - initializer for `kernel` weights.
+      Defaults to `:glorot_uniform`
+
+    * `:use_bias` - whether the layer should add bias to the output.
+      Defaults to `false`
+
+    * `:block_size` - the block size used for FP8 quantization.
+      Defaults to 128
+
+  """
+  def fp8_aware_dense(%Axon{} = x, units, opts \\ []) do
+    opts =
+      Keyword.validate!(opts, [
+        :name,
+        kernel_initializer: :glorot_uniform,
+        use_bias: false,
+        block_size: 128
+      ])
+
+    name = opts[:name]
+    block_size = opts[:block_size]
+
+    kernel_shape = &Axon.Shape.dense_kernel(&1, units)
+    bias_shape = &Axon.Shape.dense_bias(&1, units)
+
+    # Scale shape: [input_blocks, output_blocks] where block_size is typically 128.
+    # This matches the transposed layout from PyTorch (the kernel is transposed, so is the scale).
+    # For non-FP8 models, scale_inv will be initialized to 1.0
+    scale_shape = fn input_shape ->
+      in_features = elem(input_shape, tuple_size(input_shape) - 1)
+      out_features = units
+      # Round up to handle cases where dimensions aren't exact multiples of block_size
+      out_blocks = div(out_features + block_size - 1, block_size)
+      in_blocks = div(in_features + block_size - 1, block_size)
+      # Note: [in_blocks, out_blocks] to match the transposed scale_inv from PyTorch
+      {in_blocks, out_blocks}
+    end
+
+    kernel = Axon.param("kernel", kernel_shape, initializer: opts[:kernel_initializer])
+
+    # scale_inv is initialized to 1.0 (identity) for non-FP8 models.
+    # For FP8 models, it will be loaded from the checkpoint
+    scale_inv = Axon.param("scale_inv", scale_shape, initializer: :ones)
+
+    {inputs, op} =
+      if opts[:use_bias] do
+        bias = Axon.param("bias", bias_shape, initializer: :zeros)
+        {[x, kernel, scale_inv, bias], &fp8_aware_dense_impl(&1, &2, &3, &4, &5, block_size)}
+      else
+        {[x, kernel, scale_inv], &fp8_aware_dense_impl(&1, &2, &3, nil, &4, block_size)}
+      end
+
+    Axon.layer(op, inputs, name: name, op_name: :fp8_aware_dense)
+  end
+
+  deftransformp fp8_aware_dense_impl(x, kernel, scale_inv, bias, _opts, block_size) do
+    # Dequantize the kernel using scale_inv before the matmul
+    # kernel: [in_features, out_features]
+    # scale_inv: [in_blocks, out_blocks] (transposed from the PyTorch layout)
+    # Each block_size x block_size block of the kernel is multiplied by its scale
+    kernel_dequant = dequantize_kernel(kernel, scale_inv, block_size)
+
+    # Do the matmul with the dequantized kernel
+    # x: [batch, seq_len, in_features]
+    # kernel_dequant: [in_features, out_features]
+    # result: [batch, seq_len, out_features]
+    result = Nx.dot(x, [-1], kernel_dequant, [0])
+
+    # Add bias if present
+    if bias do
+      Nx.add(result, bias)
+    else
+      result
+    end
+  end
+
+  defp dequantize_kernel(kernel, scale_inv, block_size) do
+    # kernel: [in_features, out_features]
+    # scale_inv: [in_blocks, out_blocks] where in_blocks = ceil(in_features / block_size)
+    #
+    # To dequantize: each element kernel[i, o] is multiplied by
+    # scale_inv[div(i, block_size), div(o, block_size)].
+    # This is done by expanding scale_inv to match the kernel shape
+
+    {in_features, out_features} = Nx.shape(kernel)
+    {in_blocks, out_blocks} = Nx.shape(scale_inv)
+
+    # Expand scale_inv to [in_features, out_features].
+    # Each scale value is replicated block_size times in both dimensions
+    scale_expanded =
+      scale_inv
+      # Replicate along the input dimension: [in_blocks, out_blocks] -> [in_blocks * block_size, out_blocks]
+      |> Nx.reshape({in_blocks, 1, out_blocks})
+      |> Nx.broadcast({in_blocks, block_size, out_blocks})
+      |> Nx.reshape({in_blocks * block_size, out_blocks})
+      # Replicate along the output dimension: [..., out_blocks] -> [..., out_blocks * block_size]
+      |> Nx.reshape({in_blocks * block_size, out_blocks, 1})
+      |> Nx.broadcast({in_blocks * block_size, out_blocks, block_size})
+      |> Nx.reshape({in_blocks * block_size, out_blocks * block_size})
+
+    # Slice to the exact kernel dimensions (in case they're not exact multiples of block_size)
+    scale_expanded =
+      scale_expanded
+      |> Nx.slice([0, 0], [in_features, out_features])
+
+    # Convert the kernel to higher precision for dequantization, then multiply by the scale
+    kernel_f32 = Nx.as_type(kernel, {:f, 32})
+    Nx.multiply(kernel_f32, scale_expanded)
+  end
+
   @doc """
   Adds a 1-dimensional convolution layer to the network.
 
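
The reshape/broadcast dance in dequantize_kernel/3 just tiles each scale over its block_size x block_size block. A toy-sized sketch with block_size 2 (values and shapes invented for illustration) makes the expansion visible:

# block_size shrunk to 2 so the expanded scale matrix is easy to inspect
block_size = 2
kernel = Nx.iota({4, 4}, type: :f32)             # stands in for the FP8 kernel, [in, out]
scale_inv = Nx.tensor([[0.5, 2.0], [1.0, 4.0]])  # [in_blocks, out_blocks]

{in_blocks, out_blocks} = Nx.shape(scale_inv)

scale_expanded =
  scale_inv
  # replicate each scale block_size times along the input dimension
  |> Nx.reshape({in_blocks, 1, out_blocks})
  |> Nx.broadcast({in_blocks, block_size, out_blocks})
  |> Nx.reshape({in_blocks * block_size, out_blocks})
  # replicate each scale block_size times along the output dimension
  |> Nx.reshape({in_blocks * block_size, out_blocks, 1})
  |> Nx.broadcast({in_blocks * block_size, out_blocks, block_size})
  |> Nx.reshape({in_blocks * block_size, out_blocks * block_size})

# scale_expanded is now
#   [[0.5, 0.5, 2.0, 2.0],
#    [0.5, 0.5, 2.0, 2.0],
#    [1.0, 1.0, 4.0, 4.0],
#    [1.0, 1.0, 4.0, 4.0]]
# so every 2x2 block of the kernel gets multiplied by its own scale factor.
dequantized = Nx.multiply(kernel, scale_expanded)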

lib/bumblebee/layers/transformer.ex

Lines changed: 23 additions & 7 deletions
@@ -63,7 +63,8 @@ defmodule Bumblebee.Layers.Transformer do
       :block_type,
       :attention_scale,
       :query_norm,
-      :key_norm
+      :key_norm,
+      :attention_dense
     ]
 
     opts =
@@ -354,7 +355,8 @@ defmodule Bumblebee.Layers.Transformer do
         attention_scale: nil,
         rotary_embedding: nil,
         query_norm: nil,
-        key_norm: nil
+        key_norm: nil,
+        attention_dense: nil
       ])
 
     name = opts[:name]
@@ -386,6 +388,7 @@ defmodule Bumblebee.Layers.Transformer do
     rotary_embedding = opts[:rotary_embedding]
     query_norm = opts[:query_norm]
     key_norm = opts[:key_norm]
+    attention_dense = opts[:attention_dense]
 
     ffn_fun =
       case ffn do
@@ -446,6 +449,7 @@ defmodule Bumblebee.Layers.Transformer do
         rotary_embedding: rotary_embedding,
         query_norm: query_norm,
         key_norm: key_norm,
+        attention_dense: attention_dense,
         name: join(name, "self_attention")
       )
 
@@ -491,6 +495,7 @@ defmodule Bumblebee.Layers.Transformer do
         attention_window_size: attention_window_size,
         attention_scale: attention_scale,
         rotary_embedding: rotary_embedding,
+        attention_dense: attention_dense,
         name: join(name, "cross_attention")
       )
 
@@ -772,7 +777,8 @@ defmodule Bumblebee.Layers.Transformer do
        output_use_bias: true,
        rotary_embedding: nil,
        query_norm: nil,
-       key_norm: nil
+       key_norm: nil,
+       attention_dense: nil
      ])
 
    attention_mask = opts[:attention_mask]
@@ -792,6 +798,7 @@ defmodule Bumblebee.Layers.Transformer do
    rotary_embedding = opts[:rotary_embedding]
    query_norm = opts[:query_norm]
    key_norm = opts[:key_norm]
+   attention_dense = opts[:attention_dense]
 
    query_use_bias = opts[:query_use_bias]
    key_use_bias = opts[:key_use_bias]
@@ -804,9 +811,18 @@ defmodule Bumblebee.Layers.Transformer do
    inner_size = num_heads * attention_head_size
    inner_kv_size = num_key_value_heads * attention_head_size
 
+   # Helper to create dense layer, using custom attention_dense if provided
+   dense_fn = fn input, units, dense_opts ->
+     if attention_dense do
+       attention_dense.(input, units, dense_opts)
+     else
+       Axon.dense(input, units, dense_opts)
+     end
+   end
+
    query =
      query
-     |> Axon.dense(inner_size,
+     |> dense_fn.(inner_size,
        kernel_initializer: kernel_initializer,
        name: join(name, "query"),
        use_bias: query_use_bias
@@ -815,7 +831,7 @@ defmodule Bumblebee.Layers.Transformer do
 
    key =
      key
-     |> Axon.dense(inner_kv_size,
+     |> dense_fn.(inner_kv_size,
        kernel_initializer: kernel_initializer,
        name: join(name, "key"),
        use_bias: key_use_bias
@@ -824,7 +840,7 @@ defmodule Bumblebee.Layers.Transformer do
 
    value =
      value
-     |> Axon.dense(inner_kv_size,
+     |> dense_fn.(inner_kv_size,
        kernel_initializer: kernel_initializer,
        name: join(name, "value"),
        use_bias: value_use_bias
@@ -937,7 +953,7 @@ defmodule Bumblebee.Layers.Transformer do
    attention_output =
      attention_output
      |> Layers.flatten_trailing()
-     |> Axon.dense(hidden_size,
+     |> dense_fn.(hidden_size,
        kernel_initializer: kernel_initializer,
        name: join(name, "output"),
        use_bias: output_use_bias
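
According to the commit message, text/qwen3.ex routes its attention projections through fp8_aware_dense/3 using this new :attention_dense option; that file's diff is not shown on this page, so the fragment below is only an illustrative sketch of such wiring, not the actual qwen3.ex change. The spec fields and the remaining block options are elided.

# :attention_dense expects a 3-arity builder with the same shape as
# Axon.dense/3 (input, units, opts). The opts passed for the query, key,
# value and output projections are :kernel_initializer, :name and
# :use_bias, all of which fp8_aware_dense/3 accepts, so the capture can be
# passed directly.
Bumblebee.Layers.Transformer.blocks(hidden_state,
  attention_dense: &Bumblebee.Layers.fp8_aware_dense/3,
  num_blocks: spec.num_blocks,
  num_attention_heads: spec.num_attention_heads,
  hidden_size: spec.hidden_size,
  # ...plus the block options the model already configures
  # (ffn, attention_head_size, rotary_embedding, layer_norm, and so on)
  name: "decoder.blocks"
)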
