Commit 531c3f0e authored by Julien Jerphanion

Compile minimal kmeans extension

parent e11442ac
all:
python setup.py build_ext --inplace
mrproper:
rm -f *.cpp
rm -f *.c
rm -f *.so
rm -Rf build
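# Usage note: `make` (or `make all`) builds the extension in place via
# setup.py; `make mrproper` removes the generated C/C++ sources, the
# shared objects and the build directory.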
from cython cimport floating
cpdef enum BLAS_Order:
RowMajor # C contiguous
ColMajor # Fortran contiguous
cpdef enum BLAS_Trans:
    NoTrans = 110  # corresponds to 'n'
    Trans = 116    # corresponds to 't'
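# The enum values are the ASCII codes of 'n' and 't', so a BLAS_Trans value
# can be cast directly to the char argument expected by the Fortran BLAS
# routines (see the `cdef char ta_ = ta` casts in the implementation).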
# BLAS Level 1 ################################################################
cdef floating _dot(int, floating*, int, floating*, int) nogil
cdef floating _asum(int, floating*, int) nogil
cdef void _axpy(int, floating, floating*, int, floating*, int) nogil
cdef floating _nrm2(int, floating*, int) nogil
cdef void _copy(int, floating*, int, floating*, int) nogil
cdef void _scal(int, floating, floating*, int) nogil
cdef void _rotg(floating*, floating*, floating*, floating*) nogil
cdef void _rot(int, floating*, int, floating*, int, floating, floating) nogil
# BLAS Level 2 ################################################################
cdef void _gemv(BLAS_Order, BLAS_Trans, int, int, floating, floating*, int,
floating*, int, floating, floating*, int) nogil
cdef void _ger(BLAS_Order, int, int, floating, floating*, int, floating*, int,
floating*, int) nogil
# BLAS Level 3 ###############################################################
cdef void _gemm(BLAS_Order, BLAS_Trans, BLAS_Trans, int, int, int, floating,
floating*, int, floating*, int, floating, floating*,
int) nogil
# cython: language_level = 3
from cython cimport floating
from scipy.linalg.cython_blas cimport sdot, ddot
from scipy.linalg.cython_blas cimport sasum, dasum
from scipy.linalg.cython_blas cimport saxpy, daxpy
from scipy.linalg.cython_blas cimport snrm2, dnrm2
from scipy.linalg.cython_blas cimport scopy, dcopy
from scipy.linalg.cython_blas cimport sscal, dscal
from scipy.linalg.cython_blas cimport srotg, drotg
from scipy.linalg.cython_blas cimport srot, drot
from scipy.linalg.cython_blas cimport sgemv, dgemv
from scipy.linalg.cython_blas cimport sger, dger
from scipy.linalg.cython_blas cimport sgemm, dgemm
################
# BLAS Level 1 #
################
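# The cpdef *_memview wrappers below take typed memoryviews so that the raw
# pointer kernels can be exercised from Python (e.g. in unit tests); the
# cdef pointer versions are the ones meant to be called from other Cython
# code under nogil.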
cdef floating _dot(int n, floating *x, int incx,
floating *y, int incy) nogil:
"""x.T.y"""
if floating is float:
return sdot(&n, x, &incx, y, &incy)
else:
return ddot(&n, x, &incx, y, &incy)
cpdef _dot_memview(floating[::1] x, floating[::1] y):
return _dot(x.shape[0], &x[0], 1, &y[0], 1)
cdef floating _asum(int n, floating *x, int incx) nogil:
"""sum(|x_i|)"""
if floating is float:
return sasum(&n, x, &incx)
else:
return dasum(&n, x, &incx)
cpdef _asum_memview(floating[::1] x):
return _asum(x.shape[0], &x[0], 1)
cdef void _axpy(int n, floating alpha, floating *x, int incx,
floating *y, int incy) nogil:
"""y := alpha * x + y"""
if floating is float:
saxpy(&n, &alpha, x, &incx, y, &incy)
else:
daxpy(&n, &alpha, x, &incx, y, &incy)
cpdef _axpy_memview(floating alpha, floating[::1] x, floating[::1] y):
_axpy(x.shape[0], alpha, &x[0], 1, &y[0], 1)
cdef floating _nrm2(int n, floating *x, int incx) nogil:
"""sqrt(sum((x_i)^2))"""
if floating is float:
return snrm2(&n, x, &incx)
else:
return dnrm2(&n, x, &incx)
cpdef _nrm2_memview(floating[::1] x):
return _nrm2(x.shape[0], &x[0], 1)
cdef void _copy(int n, floating *x, int incx, floating *y, int incy) nogil:
"""y := x"""
if floating is float:
scopy(&n, x, &incx, y, &incy)
else:
dcopy(&n, x, &incx, y, &incy)
cpdef _copy_memview(floating[::1] x, floating[::1] y):
_copy(x.shape[0], &x[0], 1, &y[0], 1)
cdef void _scal(int n, floating alpha, floating *x, int incx) nogil:
"""x := alpha * x"""
if floating is float:
sscal(&n, &alpha, x, &incx)
else:
dscal(&n, &alpha, x, &incx)
cpdef _scal_memview(floating alpha, floating[::1] x):
_scal(x.shape[0], alpha, &x[0], 1)
cdef void _rotg(floating *a, floating *b, floating *c, floating *s) nogil:
"""Generate plane rotation"""
if floating is float:
srotg(a, b, c, s)
else:
drotg(a, b, c, s)
cpdef _rotg_memview(floating a, floating b, floating c, floating s):
_rotg(&a, &b, &c, &s)
return a, b, c, s
cdef void _rot(int n, floating *x, int incx, floating *y, int incy,
floating c, floating s) nogil:
"""Apply plane rotation"""
if floating is float:
srot(&n, x, &incx, y, &incy, &c, &s)
else:
drot(&n, x, &incx, y, &incy, &c, &s)
cpdef _rot_memview(floating[::1] x, floating[::1] y, floating c, floating s):
_rot(x.shape[0], &x[0], 1, &y[0], 1, c, s)
################
# BLAS Level 2 #
################
cdef void _gemv(BLAS_Order order, BLAS_Trans ta, int m, int n, floating alpha,
floating *A, int lda, floating *x, int incx,
floating beta, floating *y, int incy) nogil:
"""y := alpha * op(A).x + beta * y"""
cdef char ta_ = ta
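    # Fortran BLAS assumes column-major storage. A C-ordered (row-major)
    # matrix has the memory layout of its transpose in column-major terms,
    # so for RowMajor input we flip the transpose flag and swap the m/n
    # dimensions passed to the routine.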
if order == RowMajor:
ta_ = NoTrans if ta == Trans else Trans
if floating is float:
sgemv(&ta_, &n, &m, &alpha, A, &lda, x, &incx, &beta, y, &incy)
else:
dgemv(&ta_, &n, &m, &alpha, A, &lda, x, &incx, &beta, y, &incy)
else:
if floating is float:
sgemv(&ta_, &m, &n, &alpha, A, &lda, x, &incx, &beta, y, &incy)
else:
dgemv(&ta_, &m, &n, &alpha, A, &lda, x, &incx, &beta, y, &incy)
cpdef _gemv_memview(BLAS_Trans ta, floating alpha, floating[:, :] A,
floating[::1] x, floating beta, floating[::1] y):
cdef:
int m = A.shape[0]
int n = A.shape[1]
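        # If stepping along the first axis moves by exactly one item,
        # A is Fortran-ordered (column-major); otherwise treat it as
        # C-ordered (row-major).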
BLAS_Order order = ColMajor if A.strides[0] == A.itemsize else RowMajor
int lda = m if order == ColMajor else n
_gemv(order, ta, m, n, alpha, &A[0, 0], lda, &x[0], 1, beta, &y[0], 1)
cdef void _ger(BLAS_Order order, int m, int n, floating alpha, floating *x,
int incx, floating *y, int incy, floating *A, int lda) nogil:
"""A := alpha * x.y.T + A"""
if order == RowMajor:
if floating is float:
sger(&n, &m, &alpha, y, &incy, x, &incx, A, &lda)
else:
dger(&n, &m, &alpha, y, &incy, x, &incx, A, &lda)
else:
if floating is float:
sger(&m, &n, &alpha, x, &incx, y, &incy, A, &lda)
else:
dger(&m, &n, &alpha, x, &incx, y, &incy, A, &lda)
cpdef _ger_memview(floating alpha, floating[::1] x, floating[::1] y,
floating[:, :] A):
cdef:
int m = A.shape[0]
int n = A.shape[1]
BLAS_Order order = ColMajor if A.strides[0] == A.itemsize else RowMajor
int lda = m if order == ColMajor else n
_ger(order, m, n, alpha, &x[0], 1, &y[0], 1, &A[0, 0], lda)
################
# BLAS Level 3 #
################
cdef void _gemm(BLAS_Order order, BLAS_Trans ta, BLAS_Trans tb, int m, int n,
int k, floating alpha, floating *A, int lda, floating *B,
int ldb, floating beta, floating *C, int ldc) nogil:
"""C := alpha * op(A).op(B) + beta * C"""
cdef:
char ta_ = ta
char tb_ = tb
if order == RowMajor:
if floating is float:
sgemm(&tb_, &ta_, &n, &m, &k, &alpha, B,
&ldb, A, &lda, &beta, C, &ldc)
else:
dgemm(&tb_, &ta_, &n, &m, &k, &alpha, B,
&ldb, A, &lda, &beta, C, &ldc)
else:
if floating is float:
sgemm(&ta_, &tb_, &m, &n, &k, &alpha, A,
&lda, B, &ldb, &beta, C, &ldc)
else:
dgemm(&ta_, &tb_, &m, &n, &k, &alpha, A,
&lda, B, &ldb, &beta, C, &ldc)
cpdef _gemm_memview(BLAS_Trans ta, BLAS_Trans tb, floating alpha,
floating[:, :] A, floating[:, :] B, floating beta,
floating[:, :] C):
cdef:
int m = A.shape[0] if ta == NoTrans else A.shape[1]
int n = B.shape[1] if tb == NoTrans else B.shape[0]
int k = A.shape[1] if ta == NoTrans else A.shape[0]
int lda, ldb, ldc
BLAS_Order order = ColMajor if A.strides[0] == A.itemsize else RowMajor
if order == RowMajor:
lda = k if ta == NoTrans else m
ldb = n if tb == NoTrans else k
ldc = n
else:
lda = m if ta == NoTrans else k
ldb = k if tb == NoTrans else n
ldc = m
_gemm(order, ta, tb, m, n, k, alpha, &A[0, 0],
lda, &B[0, 0], ldb, beta, &C[0, 0], ldc)
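# Illustrative use (a sketch, assuming C-contiguous float64 arrays of
# compatible shapes): computing C = A @ B corresponds to
#   _gemm_memview(NoTrans, NoTrans, 1.0, A, B, 0.0, C)
# and C = A.T @ B to
#   _gemm_memview(Trans, NoTrans, 1.0, A, B, 0.0, C)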
# cython: language_level=3
from cython cimport floating
cimport numpy as np
cdef floating _euclidean_dense_dense(floating*, floating*, int, bint) nogil
cdef floating _euclidean_sparse_dense(floating[::1], int[::1], floating[::1],
floating, bint) nogil
cpdef void _relocate_empty_clusters_dense(
np.ndarray[floating, ndim=2, mode='c'], floating[::1], floating[:, ::1],
floating[:, ::1], floating[::1], int[::1])
cpdef void _relocate_empty_clusters_sparse(
floating[::1], int[::1], int[::1], floating[::1], floating[:, ::1],
floating[:, ::1], floating[::1], int[::1])
cdef void _average_centers(floating[:, ::1], floating[::1])
cdef void _center_shift(floating[:, ::1], floating[:, ::1], floating[::1])
\ No newline at end of file
# cython: profile=True, boundscheck=False, wraparound=False, cdivision=True
# Profiling is enabled by default as the overhead does not seem to be
# measurable for this specific use case.
# Author: Peter Prettenhofer <peter.prettenhofer@gmail.com>
# Olivier Grisel <olivier.grisel@ensta.org>
# Lars Buitinck
#
# License: BSD 3 clause
# TODO: We still need to use ndarrays instead of typed memoryviews when using
# fused types and when the array may be read-only (for instance when it's
# provided by the user). This is fixed in Cython >= 3.0.
import numpy as np
cimport numpy as np
from cython cimport floating
from libc.math cimport sqrt
from scipy import sparse
np.import_array()
ctypedef fused integral:
int
long long
ctypedef np.float64_t DOUBLE
ctypedef np.int32_t INT
# Number of samples per data chunk defined as a global constant.
CHUNK_SIZE = 256
def csr_row_norms(X):
"""L2 norm of each row in CSR matrix X."""
if X.dtype not in [np.float32, np.float64]:
X = X.astype(np.float64)
return _csr_row_norms(X.data, X.shape, X.indices, X.indptr)
def _csr_row_norms(np.ndarray[floating, ndim=1, mode="c"] X_data,
shape,
np.ndarray[integral, ndim=1, mode="c"] X_indices,
np.ndarray[integral, ndim=1, mode="c"] X_indptr):
cdef:
unsigned long long n_samples = shape[0]
unsigned long long i
integral j
double sum_
norms = np.empty(n_samples, dtype=X_data.dtype)
cdef floating[::1] norms_view = norms
for i in range(n_samples):
sum_ = 0.0
for j in range(X_indptr[i], X_indptr[i + 1]):
sum_ += X_data[j] * X_data[j]
norms_view[i] = sum_
return norms
def row_norms(X, squared=False):
"""Row-wise (squared) Euclidean norm of X.
Equivalent to np.sqrt((X * X).sum(axis=1)), but also supports sparse
matrices and does not create an X.shape-sized temporary.
Performs no input validation.
Parameters
----------
X : array-like
The input array.
squared : bool, default=False
If True, return squared norms.
Returns
-------
array-like
The row-wise (squared) Euclidean norm of X.
"""
if sparse.issparse(X):
if not isinstance(X, sparse.csr_matrix):
X = sparse.csr_matrix(X)
norms = csr_row_norms(X)
else:
norms = np.einsum('ij,ij->i', X, X)
if not squared:
np.sqrt(norms, norms)
return norms
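# For instance, row_norms(np.array([[3., 4.]])) gives array([5.]) while
# row_norms(np.array([[3., 4.]]), squared=True) gives array([25.]).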
cdef floating _euclidean_dense_dense(
floating* a, # IN
floating* b, # IN
int n_features,
bint squared) nogil:
"""Euclidean distance between a dense and b dense"""
cdef:
int i
int n = n_features // 4
int rem = n_features % 4
floating result = 0
    # We manually unroll the loop for better cache optimization, processing
    # four elements per iteration; the tail loop below handles the remainder.
for i in range(n):
result += ((a[0] - b[0]) * (a[0] - b[0])
+(a[1] - b[1]) * (a[1] - b[1])
+(a[2] - b[2]) * (a[2] - b[2])
+(a[3] - b[3]) * (a[3] - b[3]))
a += 4; b += 4
for i in range(rem):
result += (a[i] - b[i]) * (a[i] - b[i])
return result if squared else sqrt(result)
def _euclidean_dense_dense_wrapper(floating[::1] a, floating[::1] b,
bint squared):
"""Wrapper of _euclidean_dense_dense for testing purpose"""
return _euclidean_dense_dense(&a[0], &b[0], a.shape[0], squared)
cdef floating _euclidean_sparse_dense(
floating[::1] a_data, # IN
int[::1] a_indices, # IN
floating[::1] b, # IN
floating b_squared_norm,
bint squared) nogil:
"""Euclidean distance between a sparse and b dense"""
cdef:
int nnz = a_indices.shape[0]
int i
floating tmp, bi
floating result = 0.0
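    # ||a - b||² = Σ_i (a_i - b_i)². For the indices where a is zero the
    # term reduces to b_i², so we only iterate over the nonzeros of a,
    # accumulate (a_i - b_i)² - b_i² there, and add back the precomputed
    # ||b||² at the end.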
for i in range(nnz):
bi = b[a_indices[i]]
tmp = a_data[i] - bi
result += tmp * tmp - bi * bi
result += b_squared_norm
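    # Catastrophic cancellation in the expression above can make the result
    # slightly negative; clamp it to zero before the sqrt.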
if result < 0: result = 0.0
return result if squared else sqrt(result)
def _euclidean_sparse_dense_wrapper(
floating[::1] a_data,
int[::1] a_indices,
floating[::1] b,
floating b_squared_norm,
bint squared):
"""Wrapper of _euclidean_sparse_dense for testing purpose"""
return _euclidean_sparse_dense(
a_data, a_indices, b, b_squared_norm, squared)
cpdef floating _inertia_dense(
np.ndarray[floating, ndim=2, mode='c'] X, # IN
floating[::1] sample_weight, # IN
floating[:, ::1] centers, # IN
int[::1] labels): # IN
"""Compute inertia for dense input data
Sum of squared distance between each sample and its assigned center.
"""
cdef:
int n_samples = X.shape[0]
int n_features = X.shape[1]
int i, j
floating sq_dist = 0.0
floating inertia = 0.0
for i in range(n_samples):
j = labels[i]
sq_dist = _euclidean_dense_dense(&X[i, 0], &centers[j, 0],
n_features, True)
inertia += sq_dist * sample_weight[i]
return inertia
cpdef floating _inertia_sparse(
X, # IN
floating[::1] sample_weight, # IN
floating[:, ::1] centers, # IN
int[::1] labels): # IN
"""Compute inertia for sparse input data
Sum of squared distance between each sample and its assigned center.
"""
cdef:
floating[::1] X_data = X.data
int[::1] X_indices = X.indices
int[::1] X_indptr = X.indptr
int n_samples = X.shape[0]
int n_features = X.shape[1]
int i, j
floating sq_dist = 0.0
floating inertia = 0.0
floating[::1] centers_squared_norms = row_norms(centers, squared=True)
for i in range(n_samples):
j = labels[i]
sq_dist = _euclidean_sparse_dense(
X_data[X_indptr[i]: X_indptr[i + 1]],
X_indices[X_indptr[i]: X_indptr[i + 1]],
centers[j], centers_squared_norms[j], True)
inertia += sq_dist * sample_weight[i]
return inertia
cpdef void _relocate_empty_clusters_dense(
np.ndarray[floating, ndim=2, mode='c'] X, # IN
floating[::1] sample_weight, # IN
floating[:, ::1] centers_old, # IN
floating[:, ::1] centers_new, # INOUT
floating[::1] weight_in_clusters, # INOUT
int[::1] labels): # IN
"""Relocate centers which have no sample assigned to them."""
cdef:
int[::1] empty_clusters = np.where(np.equal(weight_in_clusters, 0))[0].astype(np.int32)
int n_empty = empty_clusters.shape[0]
if n_empty == 0:
return
cdef:
int n_features = X.shape[1]
floating[::1] distances = ((np.asarray(X) - np.asarray(centers_old)[labels])**2).sum(axis=1)
int[::1] far_from_centers = np.argpartition(distances, -n_empty)[:-n_empty-1:-1].astype(np.int32)
int new_cluster_id, old_cluster_id, far_idx, idx, k
floating weight
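    # At this point centers_new holds weighted sums, not means (the division
    # by the cluster weights happens later in _average_centers). Relocating a
    # sample therefore means removing its weighted features from its old
    # cluster's sum and seeding the empty cluster with that sample alone.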
for idx in range(n_empty):
new_cluster_id = empty_clusters[idx]
far_idx = far_from_centers[idx]
weight = sample_weight[far_idx]
old_cluster_id = labels[far_idx]
for k in range(n_features):
centers_new[old_cluster_id, k] -= X[far_idx, k] * weight
centers_new[new_cluster_id, k] = X[far_idx, k] * weight
weight_in_clusters[new_cluster_id] = weight
weight_in_clusters[old_cluster_id] -= weight
cpdef void _relocate_empty_clusters_sparse(
floating[::1] X_data, # IN
int[::1] X_indices, # IN
int[::1] X_indptr, # IN
floating[::1] sample_weight, # IN
floating[:, ::1] centers_old, # IN
floating[:, ::1] centers_new, # INOUT
floating[::1] weight_in_clusters, # INOUT
int[::1] labels): # IN
"""Relocate centers which have no sample assigned to them."""
cdef:
int[::1] empty_clusters = np.where(np.equal(weight_in_clusters, 0))[0].astype(np.int32)
int n_empty = empty_clusters.shape[0]
if n_empty == 0:
return
cdef:
int n_samples = X_indptr.shape[0] - 1
int n_features = centers_old.shape[1]
floating x
int i, j, k
floating[::1] distances = np.zeros(n_samples, dtype=X_data.base.dtype)
floating[::1] centers_squared_norms = row_norms(centers_old, squared=True)
for i in range(n_samples):
j = labels[i]
distances[i] = _euclidean_sparse_dense(
X_data[X_indptr[i]: X_indptr[i + 1]],
X_indices[X_indptr[i]: X_indptr[i + 1]],
centers_old[j], centers_squared_norms[j], True)
cdef:
int[::1] far_from_centers = np.argpartition(distances, -n_empty)[:-n_empty-1:-1].astype(np.int32)
int new_cluster_id, old_cluster_id, far_idx, idx
floating weight
for idx in range(n_empty):
new_cluster_id = empty_clusters[idx]
far_idx = far_from_centers[idx]
weight = sample_weight[far_idx]
old_cluster_id = labels[far_idx]
for k in range(X_indptr[far_idx], X_indptr[far_idx + 1]):
centers_new[old_cluster_id, X_indices[k]] -= X_data[k] * weight
centers_new[new_cluster_id, X_indices[k]] = X_data[k] * weight
weight_in_clusters[new_cluster_id] = weight
weight_in_clusters[old_cluster_id] -= weight
cdef void _average_centers(
floating[:, ::1] centers, # INOUT
floating[::1] weight_in_clusters): # IN
"""Average new centers wrt weights."""
cdef:
int n_clusters = centers.shape[0]
int n_features = centers.shape[1]
int j, k
floating alpha
for j in range(n_clusters):
if weight_in_clusters[j] > 0:
alpha = 1.0 / weight_in_clusters[j]
for k in range(n_features):
centers[j, k] *= alpha
cdef void _center_shift(
floating[:, ::1] centers_old, # IN
floating[:, ::1] centers_new, # IN
floating[::1] center_shift): # OUT
"""Compute shift between old and new centers."""
cdef:
int n_clusters = centers_old.shape[0]
int n_features = centers_old.shape[1]
int j
for j in range(n_clusters):
center_shift[j] = _euclidean_dense_dense(
&centers_new[j, 0], &centers_old[j, 0], n_features, False)
def _mini_batch_update_csr(X, np.ndarray[floating, ndim=1] sample_weight,
np.ndarray[floating, ndim=1] x_squared_norms,
np.ndarray[floating, ndim=2] centers,
np.ndarray[floating, ndim=1] weight_sums,
np.ndarray[INT, ndim=1] nearest_center,
np.ndarray[floating, ndim=1] old_center,
int compute_squared_diff):
"""Incremental update of the centers for sparse MiniBatchKMeans.
Parameters
----------
X : CSR matrix, dtype float
        The complete (pre-allocated) training set as a CSR matrix.
centers : array, shape (n_clusters, n_features)
The cluster centers
    weight_sums : array, shape (n_clusters,)
        The vector in which we keep track of the accumulated weight of the
        samples assigned to each cluster
Returns
-------
inertia : float
The inertia of the batch prior to centers update, i.e. the sum
of squared distances to the closest center for each sample. This
is the objective function being minimized by the k-means algorithm.
squared_diff : float
The sum of squared update (squared norm of the centers position
change). If compute_squared_diff is 0, this computation is skipped and
0.0 is returned instead.
Both squared diff and inertia are commonly used to monitor the convergence
of the algorithm.
"""
cdef:
np.ndarray[floating, ndim=1] X_data = X.data
np.ndarray[int, ndim=1] X_indices = X.indices
np.ndarray[int, ndim=1] X_indptr = X.indptr
unsigned int n_samples = X.shape[0]
unsigned int n_clusters = centers.shape[0]
unsigned int n_features = centers.shape[1]
unsigned int sample_idx, center_idx, feature_idx
unsigned int k
DOUBLE old_weight_sum, new_weight_sum
DOUBLE center_diff
DOUBLE squared_diff = 0.0
# move centers to the mean of both old and newly assigned samples
for center_idx in range(n_clusters):
old_weight_sum = weight_sums[center_idx]
new_weight_sum = old_weight_sum
# count the number of samples assigned to this center
for sample_idx in range(n_samples):
if nearest_center[sample_idx] == center_idx:
new_weight_sum += sample_weight[sample_idx]
if new_weight_sum == old_weight_sum:
# no new sample: leave this center as it stands
continue
        # rescale the old center to reflect its previous accumulated weight
# with regards to the new data that will be incrementally contributed
if compute_squared_diff:
old_center[:] = centers[center_idx]
centers[center_idx] *= old_weight_sum
        # iterate over samples assigned to this cluster to move the center
# location by inplace summation
for sample_idx in range(n_samples):
if nearest_center[sample_idx] != center_idx:
continue
# inplace sum with new samples that are members of this cluster
# and update of the incremental squared difference update of the
# center position
for k in range(X_indptr[sample_idx], X_indptr[sample_idx + 1]):
centers[center_idx, X_indices[k]] += X_data[k]
# inplace rescale center with updated count
if new_weight_sum > old_weight_sum:
# update the count statistics for this center
weight_sums[center_idx] = new_weight_sum
# re-scale the updated center with the total new counts
centers[center_idx] /= new_weight_sum
# update the incremental computation of the squared total
# centers position change
if compute_squared_diff:
for feature_idx in range(n_features):
squared_diff += (old_center[feature_idx]
- centers[center_idx, feature_idx]) ** 2
return squared_diff
@@ -12,17 +12,32 @@ from cython cimport floating
from cython.parallel import prange, parallel
from libc.stdlib cimport malloc, calloc, free
from libc.string cimport memset
from libc.float cimport DBL_MAX, FLT_MAX
from libc.math cimport sqrt
from scipy.linalg.cython_blas cimport sgemm, dgemm
from scipy import sparse
from ._cython_blas cimport _gemm
from ._cython_blas cimport RowMajor, Trans, NoTrans
from ._k_means_fast import CHUNK_SIZE, row_norms
from ._k_means_fast cimport _relocate_empty_clusters_dense
from ._k_means_fast cimport _relocate_empty_clusters_sparse
from ._k_means_fast cimport _average_centers, _center_shift
np.import_array()
ctypedef fused integral:
int
long long
np.import_array()
ctypedef np.float64_t DOUBLE
ctypedef np.int32_t INT
# Number of samples per data chunk defined as a global constant.
_CHUNK_SIZE = 256
cdef enum _BLAS_Order:
RowMajor # C contiguous
ColMajor # Fortran contiguous
cdef enum _BLAS_Trans:
    NoTrans = 110  # corresponds to 'n'
    Trans = 116    # corresponds to 't'
def lloyd_iter_chunked_dense(
@@ -90,7 +105,7 @@ def lloyd_iter_chunked_dense(
# hard-coded number of samples per chunk. Appeared to be close to
# optimal in all situations.
-        int n_samples_chunk = CHUNK_SIZE if n_samples > CHUNK_SIZE else n_samples
+        int n_samples_chunk = _CHUNK_SIZE if n_samples > _CHUNK_SIZE else n_samples
int n_chunks = n_samples // n_samples_chunk
int n_samples_rem = n_samples % n_samples_chunk
int chunk_idx, n_samples_chunk_eff
@@ -159,208 +174,219 @@ def lloyd_iter_chunked_dense(
_average_centers(centers_new, weight_in_clusters)
_center_shift(centers_old, centers_new, center_shift)
# Fixtures
cdef void _update_chunk_dense(
floating *X, # IN
    # expecting C aligned 2D array. XXX: Can be
    # replaced by const memoryview when cython min
    # version is >= 3.0
floating[::1] sample_weight, # IN
floating[::1] x_squared_norms, # IN
floating[:, ::1] centers_old, # IN
floating[::1] centers_squared_norms, # IN
int[::1] labels, # OUT
floating *centers_new, # OUT
floating *weight_in_clusters, # OUT
floating *pairwise_distances, # OUT
bint update_centers) nogil:
"""K-means combined EM step for one dense data chunk.
cdef void _gemm(_BLAS_Order order, _BLAS_Trans ta, _BLAS_Trans tb, int m, int n,
int k, floating alpha, floating *A, int lda, floating *B,
int ldb, floating beta, floating *C, int ldc) nogil:
"""C := alpha * op(A).op(B) + beta * C"""
cdef:
char ta_ = ta
char tb_ = tb
if order == RowMajor:
if floating is float:
sgemm(&tb_, &ta_, &n, &m, &k, &alpha, B,
&ldb, A, &lda, &beta, C, &ldc)
else:
dgemm(&tb_, &ta_, &n, &m, &k, &alpha, B,
&ldb, A, &lda, &beta, C, &ldc)
else:
if floating is float:
sgemm(&ta_, &tb_, &m, &n, &k, &alpha, A,
&lda, B, &ldb, &beta, C, &ldc)
else:
dgemm(&ta_, &tb_, &m, &n, &k, &alpha, A,
&lda, B, &ldb, &beta, C, &ldc)
cpdef void _relocate_empty_clusters_dense(
np.ndarray[floating, ndim=2, mode='c'] X, # IN
floating[::1] sample_weight, # IN
floating[:, ::1] centers_old, # IN
floating[:, ::1] centers_new, # INOUT
floating[::1] weight_in_clusters, # INOUT
int[::1] labels): # IN
"""Relocate centers which have no sample assigned to them."""
cdef:
int[::1] empty_clusters = np.where(np.equal(weight_in_clusters, 0))[0].astype(np.int32)
int n_empty = empty_clusters.shape[0]
if n_empty == 0:
return
Compute the partial contribution of a single data chunk to the labels and
centers.
"""
cdef:
int n_samples = labels.shape[0]
int n_clusters = centers_old.shape[0]
int n_features = centers_old.shape[1]
int n_features = X.shape[1]
floating sq_dist, min_sq_dist
int i, j, k, label
floating[::1] distances = ((np.asarray(X) - np.asarray(centers_old)[labels])**2).sum(axis=1)
int[::1] far_from_centers = np.argpartition(distances, -n_empty)[:-n_empty-1:-1].astype(np.int32)
# Instead of computing the full pairwise squared distances matrix,
# ||X - C||² = ||X||² - 2 X.C^T + ||C||², we only need to store
# the - 2 X.C^T + ||C||² term since the argmin for a given sample only
# depends on the centers.
# pairwise_distances = ||C||²
for i in range(n_samples):
for j in range(n_clusters):
pairwise_distances[i * n_clusters + j] = centers_squared_norms[j]
int new_cluster_id, old_cluster_id, far_idx, idx, k
floating weight
# pairwise_distances += -2 * X.dot(C.T)
_gemm(RowMajor, NoTrans, Trans, n_samples, n_clusters, n_features,
-2.0, X, n_features, &centers_old[0, 0], n_features,
1.0, pairwise_distances, n_clusters)
for idx in range(n_empty):
for i in range(n_samples):
min_sq_dist = pairwise_distances[i * n_clusters]
label = 0
for j in range(1, n_clusters):
sq_dist = pairwise_distances[i * n_clusters + j]
if sq_dist < min_sq_dist:
min_sq_dist = sq_dist
label = j
labels[i] = label
new_cluster_id = empty_clusters[idx]
if update_centers:
weight_in_clusters[label] += sample_weight[i]
for k in range(n_features):
centers_new[label * n_features + k] += X[i * n_features + k] * sample_weight[i]
def lloyd_iter_chunked_sparse(
X, # IN
floating[::1] sample_weight, # IN
floating[::1] x_squared_norms, # IN
floating[:, ::1] centers_old, # IN
floating[:, ::1] centers_new, # OUT
floating[::1] weight_in_clusters, # OUT
int[::1] labels, # OUT
floating[::1] center_shift, # OUT
int n_threads,
bint update_centers=True):
"""Single iteration of K-means lloyd algorithm with sparse input.
far_idx = far_from_centers[idx]
weight = sample_weight[far_idx]
Update labels and centers (inplace), for one iteration, distributed
over data chunks.
old_cluster_id = labels[far_idx]
Parameters
----------
X : sparse matrix of shape (n_samples, n_features), dtype=floating
The observations to cluster. Must be in CSR format.
for k in range(n_features):
centers_new[old_cluster_id, k] -= X[far_idx, k] * weight
centers_new[new_cluster_id, k] = X[far_idx, k] * weight
sample_weight : ndarray of shape (n_samples,), dtype=floating
The weights for each observation in X.
weight_in_clusters[new_cluster_id] = weight
weight_in_clusters[old_cluster_id] -= weight
x_squared_norms : ndarray of shape (n_samples,), dtype=floating
Squared L2 norm of X.
def _csr_row_norms(X):
"""L2 norm of each row in CSR matrix X."""
if X.dtype not in [np.float32, np.float64]:
X = X.astype(np.float64)
return _csr_row_norms(X.data, X.shape, X.indices, X.indptr)
centers_old : ndarray of shape (n_clusters, n_features), dtype=floating
Centers before previous iteration, placeholder for the centers after
previous iteration.
def _csr_row_norms(np.ndarray[floating, ndim=1, mode="c"] X_data,
shape,
np.ndarray[integral, ndim=1, mode="c"] X_indices,
np.ndarray[integral, ndim=1, mode="c"] X_indptr):
cdef:
unsigned long long n_samples = shape[0]
unsigned long long i
integral j
double sum_
centers_new : ndarray of shape (n_clusters, n_features), dtype=floating
Centers after previous iteration, placeholder for the new centers
computed during this iteration.
norms = np.empty(n_samples, dtype=X_data.dtype)
cdef floating[::1] norms_view = norms
centers_squared_norms : ndarray of shape (n_clusters,), dtype=floating
Squared L2 norm of the centers.
for i in range(n_samples):
sum_ = 0.0
for j in range(X_indptr[i], X_indptr[i + 1]):
sum_ += X_data[j] * X_data[j]
norms_view[i] = sum_
weight_in_clusters : ndarray of shape (n_clusters,), dtype=floating
Placeholder for the sums of the weights of every observation assigned
to each center.
return norms
labels : ndarray of shape (n_samples,), dtype=int
labels assignment.
def row_norms(X, squared=False):
"""Row-wise (squared) Euclidean norm of X.
center_shift : ndarray of shape (n_clusters,), dtype=floating
Distance between old and new centers.
Equivalent to np.sqrt((X * X).sum(axis=1)), but also supports sparse
matrices and does not create an X.shape-sized temporary.
n_threads : int
The number of threads to be used by openmp.
Performs no input validation.
update_centers : bool
- If True, the labels and the new centers will be computed, i.e. runs
the E-step and the M-step of the algorithm.
- If False, only the labels will be computed, i.e runs the E-step of
the algorithm. This is useful especially when calling predict on a
fitted model.
Parameters
----------
X : array-like
The input array.
squared : bool, default=False
If True, return squared norms.
Returns
-------
array-like
The row-wise (squared) Euclidean norm of X.
"""
if sparse.issparse(X):
if not isinstance(X, sparse.csr_matrix):
X = sparse.csr_matrix(X)
norms = _csr_row_norms(X)
else:
norms = np.einsum('ij,ij->i', X, X)
if not squared:
np.sqrt(norms, norms)
return norms
cdef floating _euclidean_dense_dense(
floating* a, # IN
floating* b, # IN
int n_features,
bint squared) nogil:
"""Euclidean distance between a dense and b dense"""
cdef:
int n_samples = X.shape[0]
int n_features = X.shape[1]
int n_clusters = centers_new.shape[0]
int i
int n = n_features // 4
int rem = n_features % 4
floating result = 0
    # Chosen to be the same as for dense. Does not have the same impact since
    # with sparse data the pairwise distances matrix is not precomputed.
    # However, splitting in chunks is necessary to get parallelism.
int n_samples_chunk = CHUNK_SIZE if n_samples > CHUNK_SIZE else n_samples
int n_chunks = n_samples // n_samples_chunk
int n_samples_rem = n_samples % n_samples_chunk
int chunk_idx, n_samples_chunk_eff = 0
int start = 0, end = 0
# We manually unroll the loop for better cache optimization.
for i in range(n):
result += ((a[0] - b[0]) * (a[0] - b[0])
+(a[1] - b[1]) * (a[1] - b[1])
+(a[2] - b[2]) * (a[2] - b[2])
+(a[3] - b[3]) * (a[3] - b[3]))
a += 4; b += 4
int j, k
for i in range(rem):
result += (a[i] - b[i]) * (a[i] - b[i])
floating[::1] X_data = X.data
int[::1] X_indices = X.indices
int[::1] X_indptr = X.indptr
return result if squared else sqrt(result)
floating[::1] centers_squared_norms = row_norms(centers_old, squared=True)
floating *centers_new_chunk
floating *weight_in_clusters_chunk
cpdef floating _inertia_dense(
np.ndarray[floating, ndim=2, mode='c'] X, # IN
floating[::1] sample_weight, # IN
floating[:, ::1] centers, # IN
int[::1] labels): # IN
"""Compute inertia for dense input data
# count remainder chunk in total number of chunks
n_chunks += n_samples != n_chunks * n_samples_chunk
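    # (the comparison yields 0 or 1, so this adds one extra chunk exactly
    # when n_samples is not a multiple of n_samples_chunk)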
Sum of squared distance between each sample and its assigned center.
"""
cdef:
int n_samples = X.shape[0]
int n_features = X.shape[1]
int i, j
# number of threads should not be bigger than number of chunks
n_threads = min(n_threads, n_chunks)
floating sq_dist = 0.0
floating inertia = 0.0
if update_centers:
memset(&centers_new[0, 0], 0, n_clusters * n_features * sizeof(floating))
memset(&weight_in_clusters[0], 0, n_clusters * sizeof(floating))
for i in range(n_samples):
j = labels[i]
sq_dist = _euclidean_dense_dense(&X[i, 0], &centers[j, 0],
n_features, True)
inertia += sq_dist * sample_weight[i]
with nogil, parallel(num_threads=n_threads):
# thread local buffers
centers_new_chunk = <floating*> calloc(n_clusters * n_features, sizeof(floating))
weight_in_clusters_chunk = <floating*> calloc(n_clusters, sizeof(floating))
return inertia
for chunk_idx in prange(n_chunks, schedule='static'):
start = chunk_idx * n_samples_chunk
if chunk_idx == n_chunks - 1 and n_samples_rem > 0:
end = start + n_samples_rem
else:
end = start + n_samples_chunk
_update_chunk_sparse(
X_data[X_indptr[start]: X_indptr[end]],
X_indices[X_indptr[start]: X_indptr[end]],
X_indptr[start: end],
sample_weight[start: end],
x_squared_norms[start: end],
centers_old,
centers_squared_norms,
labels[start: end],
centers_new_chunk,
weight_in_clusters_chunk,
update_centers)
cdef void _average_centers(
floating[:, ::1] centers, # INOUT
floating[::1] weight_in_clusters): # IN
"""Average new centers wrt weights."""
cdef:
int n_clusters = centers.shape[0]
int n_features = centers.shape[1]
int j, k
floating alpha
# reduction from local buffers. The gil is necessary for that to avoid
# race conditions.
if update_centers:
with gil:
for j in range(n_clusters):
weight_in_clusters[j] += weight_in_clusters_chunk[j]
for k in range(n_features):
centers_new[j, k] += centers_new_chunk[j * n_features + k]
for j in range(n_clusters):
if weight_in_clusters[j] > 0:
alpha = 1.0 / weight_in_clusters[j]
for k in range(n_features):
centers[j, k] *= alpha
free(centers_new_chunk)
free(weight_in_clusters_chunk)
if update_centers:
_relocate_empty_clusters_sparse(
X_data, X_indices, X_indptr, sample_weight,
centers_old, centers_new, weight_in_clusters, labels)
cdef void _center_shift(
floating[:, ::1] centers_old, # IN
floating[:, ::1] centers_new, # IN
floating[::1] center_shift): # OUT
"""Compute shift between old and new centers."""
cdef:
int n_clusters = centers_old.shape[0]
int n_features = centers_old.shape[1]
int j
_average_centers(centers_new, weight_in_clusters)
_center_shift(centers_old, centers_new, center_shift)
for j in range(n_clusters):
center_shift[j] = _euclidean_dense_dense(
&centers_new[j, 0], &centers_old[j, 0], n_features, False)
cdef void _update_chunk_sparse(
floating[::1] X_data, # IN
int[::1] X_indices, # IN
int[::1] X_indptr, # IN
cdef void _update_chunk_dense(
floating *X, # IN
    # expecting C aligned 2D array. XXX: Can be
    # replaced by const memoryview when cython min
    # version is >= 3.0
floating[::1] sample_weight, # IN
floating[::1] x_squared_norms, # IN
floating[:, ::1] centers_old, # IN
@@ -368,8 +394,9 @@ cdef void _update_chunk_sparse(
int[::1] labels, # OUT
floating *centers_new, # OUT
floating *weight_in_clusters, # OUT
floating *pairwise_distances, # OUT
bint update_centers) nogil:
"""K-means combined EM step for one sparse data chunk.
"""K-means combined EM step for one dense data chunk.
Compute the partial contribution of a single data chunk to the labels and
centers.
@@ -381,33 +408,32 @@ cdef void _update_chunk_sparse(
floating sq_dist, min_sq_dist
int i, j, k, label
floating max_floating = FLT_MAX if floating is float else DBL_MAX
int s = X_indptr[0]
    # XXX Precomputing the pairwise distances matrix is not worth it for
    # sparse data currently. Should be revisited when BLAS (sparse x dense)
    # matrix multiplication is available.
# Instead of computing the full pairwise squared distances matrix,
# ||X - C||² = ||X||² - 2 X.C^T + ||C||², we only need to store
# the - 2 X.C^T + ||C||² term since the argmin for a given sample only
# depends on the centers.
# pairwise_distances = ||C||²
for i in range(n_samples):
min_sq_dist = max_floating
label = 0
for j in range(n_clusters):
sq_dist = 0.0
for k in range(X_indptr[i] - s, X_indptr[i + 1] - s):
sq_dist += centers_old[j, X_indices[k]] * X_data[k]
# Instead of computing the full squared distance with each cluster,
# ||X - C||² = ||X||² - 2 X.C^T + ||C||², we only need to compute
# the - 2 X.C^T + ||C||² term since the argmin for a given sample
# only depends on the centers C.
            sq_dist = centers_squared_norms[j] - 2 * sq_dist
pairwise_distances[i * n_clusters + j] = centers_squared_norms[j]
# pairwise_distances += -2 * X.dot(C.T)
_gemm(RowMajor, NoTrans, Trans, n_samples, n_clusters, n_features,
-2.0, X, n_features, &centers_old[0, 0], n_features,
1.0, pairwise_distances, n_clusters)
for i in range(n_samples):
min_sq_dist = pairwise_distances[i * n_clusters]
label = 0
for j in range(1, n_clusters):
sq_dist = pairwise_distances[i * n_clusters + j]
if sq_dist < min_sq_dist:
min_sq_dist = sq_dist
label = j
labels[i] = label
if update_centers:
weight_in_clusters[label] += sample_weight[i]
for k in range(X_indptr[i] - s, X_indptr[i + 1] - s):
centers_new[label * n_features + X_indices[k]] += X_data[k] * sample_weight[i]
for k in range(n_features):
centers_new[label * n_features + k] += X[i * n_features + k] * sample_weight[i]
\ No newline at end of file
@@ -9,7 +9,7 @@ from Cython.Build import build_ext
extensions = [
Extension("kmeans",
sources=["_k_means_fast.pyx"],
sources=["_kmeans.pyx"],
include_dirs=[numpy.get_include()],
)
]