Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions c/include/cuvs/cluster/kmeans.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,22 @@ typedef enum {
Array = 2
} cuvsKMeansInitMethod;

/**
 * @brief Centroid update mode for the k-means algorithm.
 *
 * Selects when the centroids are updated during training.
 */
typedef enum {
/**
 * Standard k-means (Lloyd's algorithm): assignments are accumulated over the
 * entire dataset and the centroids are then updated once per iteration.
 */
CUVS_KMEANS_UPDATE_FULL_BATCH = 0,

/**
 * Mini-batch k-means: the centroids are updated after each randomly sampled
 * batch instead of once per pass over the dataset.
 */
CUVS_KMEANS_UPDATE_MINI_BATCH = 1
} cuvsKMeansCentroidUpdateMode;

/**
* @brief Hyper-parameters for the kmeans algorithm
*/
Expand Down Expand Up @@ -90,6 +106,13 @@ struct cuvsKMeansParams {
*/
int batch_centroids;

/**
* Centroid update mode:
* - CUVS_KMEANS_UPDATE_FULL_BATCH: Standard Lloyd's algorithm, update after full dataset pass
* - CUVS_KMEANS_UPDATE_MINI_BATCH: Mini-batch k-means, update after each batch
*/
cuvsKMeansCentroidUpdateMode update_mode;

bool inertia_check;

/**
Expand Down
166 changes: 165 additions & 1 deletion cpp/include/cuvs/cluster/kmeans.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,22 @@ struct params : base_params {
Array
};

/**
 * Selects the point in training at which centroids are recomputed.
 */
enum CentroidUpdateMode {
  /**
   * Lloyd's algorithm (standard k-means): assignments are accumulated over
   * the whole dataset and centroids are updated once per iteration.
   */
  FullBatch = 0,

  /**
   * Mini-batch k-means: centroids are updated after every randomly sampled
   * batch.
   */
  MiniBatch = 1
};

/**
* The number of clusters to form as well as the number of centroids to generate (default:8).
*/
Expand Down Expand Up @@ -104,7 +120,14 @@ struct params : base_params {
/**
* if 0 then batch_centroids = n_clusters
*/
int batch_centroids = 0; //
int batch_centroids = 0;

/**
* Centroid update mode:
* - FullBatch: Standard Lloyd's algorithm, update centroids after full dataset pass
* - MiniBatch: Mini-batch k-means, update centroids after each batch
*/
CentroidUpdateMode update_mode = FullBatch;

bool inertia_check = false;
};
Expand Down Expand Up @@ -139,6 +162,147 @@ enum class kmeans_type { KMeans = 0, KMeansBalanced = 1 };
* @{
*/

/**
* @defgroup kmeans_batched Batched k-means for out-of-core / host data
* @{
*/

/**
 * @brief Find clusters with k-means algorithm using batched processing.
 *
 * This version supports out-of-core computation where the dataset resides
 * on the host. Data is processed in batches of `batch_size` samples, with
 * partial sums accumulated across batches and centroids finalized at the end
 * of each iteration. In this form the result is mathematically equivalent to
 * standard k-means.
 *
 * NOTE(review): `params.update_mode` also offers `MiniBatch`, which is
 * documented as updating centroids after each batch; the equivalence above
 * presumably only holds for `FullBatch` -- confirm against the implementation.
 *
 * This overload takes single-precision (`float`) data with `int` indexing.
 *
 * @code{.cpp}
 *   #include <raft/core/resources.hpp>
 *   #include <cuvs/cluster/kmeans.hpp>
 *   #include <optional>
 *   #include <vector>
 *   using namespace cuvs::cluster;
 *   ...
 *   raft::resources handle;
 *   cuvs::cluster::kmeans::params params;
 *   int n_samples  = 1000000;
 *   int n_features = 15;
 *   float inertia;
 *   int n_iter;
 *
 *   // Data on host
 *   std::vector<float> h_X(n_samples * n_features);
 *   auto X = raft::make_host_matrix_view<const float, int>(h_X.data(), n_samples, n_features);
 *
 *   // Centroids on device
 *   auto centroids = raft::make_device_matrix<float, int>(handle, params.n_clusters, n_features);
 *
 *   kmeans::fit_batched(handle,
 *                       params,
 *                       X,
 *                       100000,  // batch_size
 *                       std::nullopt,
 *                       centroids.view(),
 *                       raft::make_host_scalar_view(&inertia),
 *                       raft::make_host_scalar_view(&n_iter));
 * @endcode
 *
 * @param[in]     handle        The raft handle.
 * @param[in]     params        Parameters for KMeans model.
 * @param[in]     X             Training instances on HOST memory. The data must
 *                              be in row-major format.
 *                              [dim = n_samples x n_features]
 * @param[in]     batch_size    Number of samples to process per batch.
 * @param[in]     sample_weight Optional weights for each observation in X (on host).
 *                              Pass std::nullopt when no weights are supplied.
 *                              [len = n_samples]
 * @param[inout]  centroids     [in] When init is InitMethod::Array, use
 *                              centroids as the initial cluster centers.
 *                              [out] The generated centroids from the
 *                              kmeans algorithm are stored at the address
 *                              pointed by 'centroids' (device memory).
 *                              [dim = n_clusters x n_features]
 * @param[out]    inertia       Sum of squared distances of samples to their
 *                              closest cluster center (host scalar).
 * @param[out]    n_iter        Number of iterations run (host scalar).
 */
void fit_batched(raft::resources const& handle,
const cuvs::cluster::kmeans::params& params,
raft::host_matrix_view<const float, int> X,
int batch_size,
std::optional<raft::host_vector_view<const float, int>> sample_weight,
raft::device_matrix_view<float, int> centroids,
raft::host_scalar_view<float> inertia,
raft::host_scalar_view<int> n_iter);

/**
 * @brief Find clusters with k-means algorithm using batched processing.
 *
 * Overload for single-precision (`float`) data with 64-bit (`int64_t`)
 * indexing; `batch_size` and `n_iter` are `int64_t` as well.
 *
 * @param[in]     handle        The raft handle.
 * @param[in]     params        Parameters for KMeans model.
 * @param[in]     X             Training instances on HOST memory.
 *                              [dim = n_samples x n_features]
 * @param[in]     batch_size    Number of samples to process per batch.
 * @param[in]     sample_weight Optional weights for each observation in X (on host).
 * @param[inout]  centroids     Cluster centers on device.
 *                              [dim = n_clusters x n_features]
 * @param[out]    inertia       Sum of squared distances to nearest centroid (host scalar).
 * @param[out]    n_iter        Number of iterations run (host scalar).
 */
void fit_batched(raft::resources const& handle,
const cuvs::cluster::kmeans::params& params,
raft::host_matrix_view<const float, int64_t> X,
int64_t batch_size,
std::optional<raft::host_vector_view<const float, int64_t>> sample_weight,
raft::device_matrix_view<float, int64_t> centroids,
raft::host_scalar_view<float> inertia,
raft::host_scalar_view<int64_t> n_iter);

/**
 * @brief Find clusters with k-means algorithm using batched processing.
 *
 * Overload for double-precision (`double`) data with `int` indexing.
 *
 * @param[in]     handle        The raft handle.
 * @param[in]     params        Parameters for KMeans model.
 * @param[in]     X             Training instances on HOST memory.
 *                              [dim = n_samples x n_features]
 * @param[in]     batch_size    Number of samples to process per batch.
 * @param[in]     sample_weight Optional weights for each observation in X (on host).
 * @param[inout]  centroids     Cluster centers on device.
 *                              [dim = n_clusters x n_features]
 * @param[out]    inertia       Sum of squared distances to nearest centroid (host scalar).
 * @param[out]    n_iter        Number of iterations run (host scalar).
 */
void fit_batched(raft::resources const& handle,
const cuvs::cluster::kmeans::params& params,
raft::host_matrix_view<const double, int> X,
int batch_size,
std::optional<raft::host_vector_view<const double, int>> sample_weight,
raft::device_matrix_view<double, int> centroids,
raft::host_scalar_view<double> inertia,
raft::host_scalar_view<int> n_iter);

/**
 * @brief Find clusters with k-means algorithm using batched processing.
 *
 * Overload for double-precision (`double`) data with 64-bit (`int64_t`)
 * indexing; `batch_size` and `n_iter` are `int64_t` as well.
 *
 * @param[in]     handle        The raft handle.
 * @param[in]     params        Parameters for KMeans model.
 * @param[in]     X             Training instances on HOST memory.
 *                              [dim = n_samples x n_features]
 * @param[in]     batch_size    Number of samples to process per batch.
 * @param[in]     sample_weight Optional weights for each observation in X (on host).
 * @param[inout]  centroids     Cluster centers on device.
 *                              [dim = n_clusters x n_features]
 * @param[out]    inertia       Sum of squared distances to nearest centroid (host scalar).
 * @param[out]    n_iter        Number of iterations run (host scalar).
 */
void fit_batched(raft::resources const& handle,
const cuvs::cluster::kmeans::params& params,
raft::host_matrix_view<const double, int64_t> X,
int64_t batch_size,
std::optional<raft::host_vector_view<const double, int64_t>> sample_weight,
raft::device_matrix_view<double, int64_t> centroids,
raft::host_scalar_view<double> inertia,
raft::host_scalar_view<int64_t> n_iter);

/**
* @}
*/

/**
* @brief Find clusters with k-means algorithm.
* Initial centroids are chosen with k-means++ algorithm. Empty
Expand Down
Loading