Skip to content

Commit 47b2ffe

Browse files
committed
Merge branch 'main-elementwise' of https://github.com/ashvardanian/SimSIMD into main-elementwise
2 parents 1065061 + ed1cd76 commit 47b2ffe

File tree

5 files changed

+330
-36
lines changed

5 files changed

+330
-36
lines changed

include/numkong/dots.h

Lines changed: 36 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -144,7 +144,6 @@
144144
#define NK_DOTS_H
145145

146146
#include "numkong/types.h"
147-
#include "numkong/dot.h" // nk_bf16x16_to_f32x16_skylake_
148147

149148
#if defined(__cplusplus)
150149
extern "C" {
@@ -253,6 +252,8 @@ NK_DYNAMIC void nk_dots_packed_u4(nk_u4x2_t const *a, void const *b_packed, nk_u
253252
* @param[in] stride Row stride in bytes for the input matrix.
254253
* @param[out] result Output symmetric matrix (n_vectors × n_vectors).
255254
* @param[in] result_stride Row stride in bytes for the result matrix.
255+
* @param[in] row_start Starting row offset of results to compute (needed for parallelism).
256+
* @param[in] row_count Number of rows of results to compute (needed for parallelism).
256257
*/
257258
NK_DYNAMIC void nk_dots_symmetric_bf16(nk_bf16_t const *vectors, nk_size_t n_vectors, nk_size_t depth, nk_size_t stride,
258259
nk_f32_t *result, nk_size_t result_stride, nk_size_t row_start,
@@ -294,6 +295,38 @@ NK_DYNAMIC void nk_dots_symmetric_u4(nk_u4x2_t const *vectors, nk_size_t n_vecto
294295
nk_u32_t *result, nk_size_t result_stride, nk_size_t row_start,
295296
nk_size_t row_count);
296297

298+
/**
299+
* @brief Compacts f32 GEMM output to bf16 (in-place).
300+
*
301+
* After computing C_f32 = A × Bᵀ in f32, truncates to bf16 with rounding.
302+
* The operation is done in-place: reads f32 values and writes bf16 to the same buffer.
303+
* Output is tightly packed with stride = n × sizeof(bf16).
304+
*
305+
* @param c Buffer containing f32 values, will be overwritten with bf16 output (m × n).
306+
* @param m Number of rows.
307+
* @param n Number of columns.
308+
* @param c_stride Row stride of input f32 matrix in bytes.
309+
*/
310+
NK_DYNAMIC void nk_dots_compact_bf16(void *c, nk_size_t m, nk_size_t n, nk_size_t c_stride);
311+
312+
/**
313+
* @brief Compacts i32 GEMM output to normalized i8 (in-place).
314+
*
315+
* After computing C_i32 = A × Bᵀ in i32, normalizes to cosine similarity in [-128, 127].
316+
* Uses squared norms for normalization: result[i,j] = 127 × C[i,j] / sqrt(a_norm[i] × b_norm[j]).
317+
* The operation is done in-place: reads i32 values and writes i8 to the same buffer.
318+
* Output is tightly packed with stride = n × sizeof(i8).
319+
*
320+
* @param c Buffer containing i32 values, will be overwritten with i8 output (m × n).
321+
* @param m Number of rows.
322+
* @param n Number of columns.
323+
* @param c_stride Row stride of input i32 matrix in bytes.
324+
* @param a_squared_norms Squared L2 norms for A rows (length m).
325+
* @param b_squared_norms Squared L2 norms for B rows (length n).
326+
*/
327+
NK_DYNAMIC void nk_dots_compact_i8(void *c, nk_size_t m, nk_size_t n, nk_size_t c_stride,
328+
nk_i32_t const *a_squared_norms, nk_i32_t const *b_squared_norms);
329+
297330
/** @copydoc nk_dots_packed_size_f32 */
298331
NK_PUBLIC nk_size_t nk_dots_packed_size_f32_serial(nk_size_t n, nk_size_t k);
299332
/** @copydoc nk_dots_pack_f32 */
@@ -302,9 +335,8 @@ NK_PUBLIC void nk_dots_pack_f32_serial(nk_f32_t const *b, nk_size_t n, nk_size_t
302335
NK_PUBLIC void nk_dots_packed_f32_serial(nk_f32_t const *a, void const *b_packed, nk_f32_t *c, nk_size_t m, nk_size_t n,
303336
nk_size_t k, nk_size_t a_stride, nk_size_t c_stride);
304337
/** @copydoc nk_dots_symmetric_f32 */
305-
NK_PUBLIC void nk_dots_symmetric_f32_serial(nk_f32_t const *vectors, nk_size_t n_vectors, nk_size_t depth,
306-
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
307-
nk_size_t row_start, nk_size_t row_count);
338+
NK_PUBLIC void nk_dots_symmetric_f32_serial(nk_f32_t const *vectors, nk_size_t n_vectors, nk_size_t depth, nk_size_t stride, nk_f32_t *result,
339+
nk_size_t result_stride, nk_size_t row_start, nk_size_t row_count);
308340

309341
/** @copydoc nk_dots_packed_size_f64 */
310342
NK_PUBLIC nk_size_t nk_dots_packed_size_f64_serial(nk_size_t n, nk_size_t k);
@@ -330,38 +362,6 @@ NK_PUBLIC void nk_dots_symmetric_f16_serial(nk_f16_t const *vectors, nk_size_t n
330362
nk_size_t stride, nk_f32_t *result, nk_size_t result_stride,
331363
nk_size_t row_start, nk_size_t row_count);
332364

333-
/**
334-
* @brief Compacts f32 GEMM output to bf16 (in-place).
335-
*
336-
* After computing C_f32 = A × Bᵀ in f32, truncates to bf16 with rounding.
337-
* The operation is done in-place: reads f32 values and writes bf16 to the same buffer.
338-
* Output is tightly packed with stride = n × sizeof(bf16).
339-
*
340-
* @param c Buffer containing f32 values, will be overwritten with bf16 output (m × n).
341-
* @param m Number of rows.
342-
* @param n Number of columns.
343-
* @param c_stride Row stride of input f32 matrix in bytes.
344-
*/
345-
NK_DYNAMIC void nk_dots_compact_bf16(void *c, nk_size_t m, nk_size_t n, nk_size_t c_stride);
346-
347-
/**
348-
* @brief Compacts i32 GEMM output to normalized i8 (in-place).
349-
*
350-
* After computing C_i32 = A × Bᵀ in i32, normalizes to cosine similarity in [-128, 127].
351-
* Uses squared norms for normalization: result[i,j] = 127 × C[i,j] / sqrt(a_norm[i] × b_norm[j]).
352-
* The operation is done in-place: reads i32 values and writes i8 to the same buffer.
353-
* Output is tightly packed with stride = n × sizeof(i8).
354-
*
355-
* @param c Buffer containing i32 values, will be overwritten with i8 output (m × n).
356-
* @param m Number of rows.
357-
* @param n Number of columns.
358-
* @param c_stride Row stride of input i32 matrix in bytes.
359-
* @param a_squared_norms Squared L2 norms for A rows (length m).
360-
* @param b_squared_norms Squared L2 norms for B rows (length n).
361-
*/
362-
NK_DYNAMIC void nk_dots_compact_i8(void *c, nk_size_t m, nk_size_t n, nk_size_t c_stride,
363-
nk_i32_t const *a_squared_norms, nk_i32_t const *b_squared_norms);
364-
365365
/** @copydoc nk_dots_packed_size_bf16 */
366366
NK_PUBLIC nk_size_t nk_dots_packed_size_bf16_serial(nk_size_t n, nk_size_t k);
367367
/** @copydoc nk_dots_pack_bf16 */

include/numkong/numkong.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3448,6 +3448,7 @@ NK_DYNAMIC int nk_uses_smehalf(void);
34483448
NK_DYNAMIC int nk_uses_smebf16(void);
34493449
NK_DYNAMIC int nk_uses_smelut2(void);
34503450
NK_DYNAMIC int nk_uses_smefa64(void);
3451+
NK_DYNAMIC int nk_uses_smebi32(void);
34513452
NK_DYNAMIC int nk_uses_haswell(void);
34523453
NK_DYNAMIC int nk_uses_skylake(void);
34533454
NK_DYNAMIC int nk_uses_ice(void);
@@ -3492,6 +3493,7 @@ NK_PUBLIC int nk_uses_smehalf(void) { return NK_TARGET_ARM_ && NK_TARGET_SMEHALF
34923493
NK_PUBLIC int nk_uses_smebf16(void) { return NK_TARGET_ARM_ && NK_TARGET_SMEBF16; }
34933494
NK_PUBLIC int nk_uses_smelut2(void) { return NK_TARGET_ARM_ && NK_TARGET_SMELUT2; }
34943495
NK_PUBLIC int nk_uses_smefa64(void) { return NK_TARGET_ARM_ && NK_TARGET_SMEFA64; }
3496+
NK_PUBLIC int nk_uses_smebi32(void) { return NK_TARGET_ARM_ && NK_TARGET_SMEBI32; }
34953497
NK_PUBLIC int nk_uses_haswell(void) { return NK_TARGET_X86_ && NK_TARGET_HASWELL; }
34963498
NK_PUBLIC int nk_uses_skylake(void) { return NK_TARGET_X86_ && NK_TARGET_SKYLAKE; }
34973499
NK_PUBLIC int nk_uses_ice(void) { return NK_TARGET_X86_ && NK_TARGET_ICE; }

include/numkong/set.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
* Contains following similarity measures:
88
*
99
* - Bit-level Hamming distance → `u32` counter
10+
* - Byte-level Hamming distance → `u32` counter
1011
* - Bit-level Jaccard distance (Tanimoto coefficient) → `f32` ratio
1112
* - Jaccard distance for `u32` integral MinHash vectors from StringZilla → `f32` ratio
1213
* - TODO: Weighted Jaccard distance for `u32` integral Count-Min-Sketch vectors → `f32` ratio

0 commit comments

Comments
 (0)