ashvardanian
diff --git a/include/numkong/dots.h b/include/numkong/dots.h
Lines changed: 36 additions & 36 deletions
Lines changed: 36 additions & 36 deletions
diff --git a/include/numkong/numkong.h b/include/numkong/numkong.h
Lines changed: 2 additions & 0 deletions
Lines changed: 2 additions & 0 deletions
diff --git a/include/numkong/set.h b/include/numkong/set.h
Lines changed: 1 addition & 0 deletions
Lines changed: 1 addition & 0 deletions
@@ -144,7 +144,6 @@
 #define NK_DOTS_H
 
 #include "numkong/types.h"
-#include "numkong/dot.h" // nk_bf16x16_to_f32x16_skylake_
 
 #if defined(__cplusplus)
 extern "C" {
@@ -253,6 +252,8 @@ NK_DYNAMIC void nk_dots_packed_u4(nk_u4x2_t const *a, void const *b_packed, nk_u
  *  @param[in] stride Row stride in bytes for the input matrix.
  *  @param[out] result Output symmetric matrix (n_vectors × n_vectors).
  *  @param[in] result_stride Row stride in bytes for the result matrix.
+ *  @param[in] row_start Starting row offset of results to compute (needed for parallelism).
+ *  @param[in] row_count Number of rows of results to compute (needed for parallelism).
  */
 NK_DYNAMIC void nk_dots_symmetric_bf16(nk_bf16_t const *vectors, nk_size_t n_vectors, nk_size_t depth, nk_size_t stride,
                                        nk_f32_t *result, nk_size_t result_stride, nk_size_t row_start,
@@ -294,6 +295,38 @@ NK_DYNAMIC void nk_dots_symmetric_u4(nk_u4x2_t const *vectors, nk_size_t n_vecto
                                      nk_u32_t *result, nk_size_t result_stride, nk_size_t row_start,
                                      nk_size_t row_count);
 
+/**
+ *  @brief Compacts f32 GEMM output to bf16 (in-place).
+ *
+ *  After computing C_f32 = A × Bᵀ in f32, truncates to bf16 with rounding.
+ *  The operation is done in-place: reads f32 values and writes bf16 to the same buffer.
+ *  Output is tightly packed with stride = n × sizeof(bf16).
+ *
+ *  @param c Buffer containing f32 values, will be overwritten with bf16 output (m × n).
+ *  @param m Number of rows.
+ *  @param n Number of columns.
+ *  @param c_stride Row stride of input f32 matrix in bytes.
+ */
+NK_DYNAMIC void nk_dots_compact_bf16(void *c, nk_size_t m, nk_size_t n, nk_size_t c_stride);
+
+/**
+ *  @brief Compacts i32 GEMM output to normalized i8 (in-place).
+ *
+ *  After computing C_i32 = A × Bᵀ in i32, normalizes to cosine similarity in [-128, 127].
+ *  Uses squared norms for normalization: result[i,j] = 127 × C[i,j] / sqrt(a_norm[i] × b_norm[j]).
+ *  The operation is done in-place: reads i32 values and writes i8 to the same buffer.
+ *  Output is tightly packed with stride = n × sizeof(i8).
+ *
+ *  @param c Buffer containing i32 values, will be overwritten with i8 output (m × n).
+ *  @param m Number of rows.
+ *  @param n Number of columns.
+ *  @param c_stride Row stride of input i32 matrix in bytes.
+ *  @param a_squared_norms Squared L2 norms for A rows (length m).
+ *  @param b_squared_norms Squared L2 norms for B rows (length n).
+ */
+NK_DYNAMIC void nk_dots_compact_i8(void *c, nk_size_t m, nk_size_t n, nk_size_t c_stride,
+                                   nk_i32_t const *a_squared_norms, nk_i32_t const *b_squared_norms);
+
 /** @copydoc nk_dots_packed_size_f32 */
 NK_PUBLIC nk_size_t nk_dots_packed_size_f32_serial(nk_size_t n, nk_size_t k);
 /** @copydoc nk_dots_pack_f32 */
@@ -302,9 +335,8 @@ NK_PUBLIC void nk_dots_pack_f32_serial(nk_f32_t const *b, nk_size_t n, nk_size_t
 NK_PUBLIC void nk_dots_packed_f32_serial(nk_f32_t const *a, void const *b_packed, nk_f32_t *c, nk_size_t m, nk_size_t n,
                                          nk_size_t k, nk_size_t a_stride, nk_size_t c_stride);
 /** @copydoc nk_dots_symmetric_f32 */
-NK_PUBLIC void nk_dots_symmetric_f32_serial(nk_f32_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
-                                            nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
-                                            nk_size_t row_start, nk_size_t row_count);
+NK_PUBLIC void nk_dots_symmetric_f32_serial(nk_f32_t const *vectors, nk_size_t n_vectors, nk_size_t depth, nk_size_t stride, nk_f32_t *result,
+                                            nk_size_t result_stride, nk_size_t row_start, nk_size_t row_count);
 
 /** @copydoc nk_dots_packed_size_f64 */
 NK_PUBLIC nk_size_t nk_dots_packed_size_f64_serial(nk_size_t n, nk_size_t k);
@@ -330,38 +362,6 @@ NK_PUBLIC void nk_dots_symmetric_f16_serial(nk_f16_t const *vectors, nk_size_t n
                                             nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
                                             nk_size_t row_start, nk_size_t row_count);
 
-/**
- *  @brief Compacts f32 GEMM output to bf16 (in-place).
- *
- *  After computing C_f32 = A × Bᵀ in f32, truncates to bf16 with rounding.
- *  The operation is done in-place: reads f32 values and writes bf16 to the same buffer.
- *  Output is tightly packed with stride = n × sizeof(bf16).
- *
- *  @param c Buffer containing f32 values, will be overwritten with bf16 output (m × n).
- *  @param m Number of rows.
- *  @param n Number of columns.
- *  @param c_stride Row stride of input f32 matrix in bytes.
- */
-NK_DYNAMIC void nk_dots_compact_bf16(void *c, nk_size_t m, nk_size_t n, nk_size_t c_stride);
-
-/**
- *  @brief Compacts i32 GEMM output to normalized i8 (in-place).
- *
- *  After computing C_i32 = A × Bᵀ in i32, normalizes to cosine similarity in [-128, 127].
- *  Uses squared norms for normalization: result[i,j] = 127 × C[i,j] / sqrt(a_norm[i] × b_norm[j]).
- *  The operation is done in-place: reads i32 values and writes i8 to the same buffer.
- *  Output is tightly packed with stride = n × sizeof(i8).
- *
- *  @param c Buffer containing i32 values, will be overwritten with i8 output (m × n).
- *  @param m Number of rows.
- *  @param n Number of columns.
- *  @param c_stride Row stride of input i32 matrix in bytes.
- *  @param a_squared_norms Squared L2 norms for A rows (length m).
- *  @param b_squared_norms Squared L2 norms for B rows (length n).
- */
-NK_DYNAMIC void nk_dots_compact_i8(void *c, nk_size_t m, nk_size_t n, nk_size_t c_stride,
-                                   nk_i32_t const *a_squared_norms, nk_i32_t const *b_squared_norms);
-
 /** @copydoc nk_dots_packed_size_bf16 */
 NK_PUBLIC nk_size_t nk_dots_packed_size_bf16_serial(nk_size_t n, nk_size_t k);
 /** @copydoc nk_dots_pack_bf16 */
 
@@ -3448,6 +3448,7 @@ NK_DYNAMIC int nk_uses_smehalf(void);
 NK_DYNAMIC int nk_uses_smebf16(void);
 NK_DYNAMIC int nk_uses_smelut2(void);
 NK_DYNAMIC int nk_uses_smefa64(void);
+NK_DYNAMIC int nk_uses_smebi32(void);
 NK_DYNAMIC int nk_uses_haswell(void);
 NK_DYNAMIC int nk_uses_skylake(void);
 NK_DYNAMIC int nk_uses_ice(void);
@@ -3492,6 +3493,7 @@ NK_PUBLIC int nk_uses_smehalf(void) { return NK_TARGET_ARM_ && NK_TARGET_SMEHALF
 NK_PUBLIC int nk_uses_smebf16(void) { return NK_TARGET_ARM_ && NK_TARGET_SMEBF16; }
 NK_PUBLIC int nk_uses_smelut2(void) { return NK_TARGET_ARM_ && NK_TARGET_SMELUT2; }
 NK_PUBLIC int nk_uses_smefa64(void) { return NK_TARGET_ARM_ && NK_TARGET_SMEFA64; }
+NK_PUBLIC int nk_uses_smebi32(void) { return NK_TARGET_ARM_ && NK_TARGET_SMEBI32; }
 NK_PUBLIC int nk_uses_haswell(void) { return NK_TARGET_X86_ && NK_TARGET_HASWELL; }
 NK_PUBLIC int nk_uses_skylake(void) { return NK_TARGET_X86_ && NK_TARGET_SKYLAKE; }
 NK_PUBLIC int nk_uses_ice(void) { return NK_TARGET_X86_ && NK_TARGET_ICE; }
 
@@ -7,6 +7,7 @@
  *  Contains following similarity measures:
  *
  *  - Bit-level Hamming distance → `u32` counter
+ *  - Byte-level Hamming distance → `u32` counter
  *  - Bit-level Jaccard distance (Tanimoto coefficient) → `f32` ratio
  *  - Jaccard distance for `u32` integral MinHash vectors from StringZilla → `f32` ratio
  *  - TODO: Weighted Jaccard distance for `u32` integral Count-Min-Sketch vectors → `f32` ratio
Original file line number	Diff line number	Diff line change
`@@ -7,6 +7,7 @@`
`7`	`7`	`* Contains following similarity measures:`
`8`	`8`	`*`
`9`	`9`	* - Bit-level Hamming distance → `u32` counter
	`10`	+ * - Byte-level Hamming distance → `u32` counter
`10`	`11`	* - Bit-level Jaccard distance (Tanimoto coefficient) → `f32` ratio
`11`	`12`	* - Jaccard distance for `u32` integral MinHash vectors from StringZilla → `f32` ratio
`12`	`13`	* - TODO: Weighted Jaccard distance for `u32` integral Count-Min-Sketch vectors → `f32` ratio