Commit 531c3f0e authored by Julien Jerphanion

Compile minimal kmeans extension

parent e11442ac
all:
python setup.py build_ext --inplace
mrproper:
rm -f *.cpp
rm -f *.c
rm -f *.so
rm -Rf build
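# Usage note: `make` (or `make all`) builds the extension in place via
# setup.py; `make mrproper` removes the generated C/C++ sources, the
# shared objects and the build directory.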
from cython cimport floating
cpdef enum BLAS_Order:
RowMajor # C contiguous
ColMajor # Fortran contiguous
cpdef enum BLAS_Trans:
    NoTrans = 110  # corresponds to 'n'
    Trans = 116    # corresponds to 't'
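# The enum values are the ASCII codes of 'n' and 't', so a BLAS_Trans value
# can be cast directly to the char argument expected by the Fortran BLAS
# routines (see the `cdef char ta_ = ta` casts in the implementation).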
# BLAS Level 1 ################################################################
cdef floating _dot(int, floating*, int, floating*, int) nogil
cdef floating _asum(int, floating*, int) nogil
cdef void _axpy(int, floating, floating*, int, floating*, int) nogil
cdef floating _nrm2(int, floating*, int) nogil
cdef void _copy(int, floating*, int, floating*, int) nogil
cdef void _scal(int, floating, floating*, int) nogil
cdef void _rotg(floating*, floating*, floating*, floating*) nogil
cdef void _rot(int, floating*, int, floating*, int, floating, floating) nogil
# BLAS Level 2 ################################################################
cdef void _gemv(BLAS_Order, BLAS_Trans, int, int, floating, floating*, int,
floating*, int, floating, floating*, int) nogil
cdef void _ger(BLAS_Order, int, int, floating, floating*, int, floating*, int,
floating*, int) nogil
# BLAS Level 3 ###############################################################
cdef void _gemm(BLAS_Order, BLAS_Trans, BLAS_Trans, int, int, int, floating,
floating*, int, floating*, int, floating, floating*,
int) nogil
# cython: language_level = 3
from cython cimport floating
from scipy.linalg.cython_blas cimport sdot, ddot
from scipy.linalg.cython_blas cimport sasum, dasum
from scipy.linalg.cython_blas cimport saxpy, daxpy
from scipy.linalg.cython_blas cimport snrm2, dnrm2
from scipy.linalg.cython_blas cimport scopy, dcopy
from scipy.linalg.cython_blas cimport sscal, dscal
from scipy.linalg.cython_blas cimport srotg, drotg
from scipy.linalg.cython_blas cimport srot, drot
from scipy.linalg.cython_blas cimport sgemv, dgemv
from scipy.linalg.cython_blas cimport sger, dger
from scipy.linalg.cython_blas cimport sgemm, dgemm
################
# BLAS Level 1 #
################
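# The cpdef *_memview wrappers below take typed memoryviews so that the raw
# pointer kernels can be exercised from Python (e.g. in unit tests); the
# cdef pointer versions are the ones meant to be called from other Cython
# code under nogil.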
cdef floating _dot(int n, floating *x, int incx,
floating *y, int incy) nogil:
"""x.T.y"""
if floating is float:
return sdot(&n, x, &incx, y, &incy)
else:
return ddot(&n, x, &incx, y, &incy)
cpdef _dot_memview(floating[::1] x, floating[::1] y):
return _dot(x.shape[0], &x[0], 1, &y[0], 1)
cdef floating _asum(int n, floating *x, int incx) nogil:
"""sum(|x_i|)"""
if floating is float:
return sasum(&n, x, &incx)
else:
return dasum(&n, x, &incx)
cpdef _asum_memview(floating[::1] x):
return _asum(x.shape[0], &x[0], 1)
cdef void _axpy(int n, floating alpha, floating *x, int incx,
floating *y, int incy) nogil:
"""y := alpha * x + y"""
if floating is float:
saxpy(&n, &alpha, x, &incx, y, &incy)
else:
daxpy(&n, &alpha, x, &incx, y, &incy)
cpdef _axpy_memview(floating alpha, floating[::1] x, floating[::1] y):
_axpy(x.shape[0], alpha, &x[0], 1, &y[0], 1)
cdef floating _nrm2(int n, floating *x, int incx) nogil:
"""sqrt(sum((x_i)^2))"""
if floating is float:
return snrm2(&n, x, &incx)
else:
return dnrm2(&n, x, &incx)
cpdef _nrm2_memview(floating[::1] x):
return _nrm2(x.shape[0], &x[0], 1)
cdef void _copy(int n, floating *x, int incx, floating *y, int incy) nogil:
"""y := x"""
if floating is float:
scopy(&n, x, &incx, y, &incy)
else:
dcopy(&n, x, &incx, y, &incy)
cpdef _copy_memview(floating[::1] x, floating[::1] y):
_copy(x.shape[0], &x[0], 1, &y[0], 1)
cdef void _scal(int n, floating alpha, floating *x, int incx) nogil:
"""x := alpha * x"""
if floating is float:
sscal(&n, &alpha, x, &incx)
else:
dscal(&n, &alpha, x, &incx)
cpdef _scal_memview(floating alpha, floating[::1] x):
_scal(x.shape[0], alpha, &x[0], 1)
cdef void _rotg(floating *a, floating *b, floating *c, floating *s) nogil:
"""Generate plane rotation"""
if floating is float:
srotg(a, b, c, s)
else:
drotg(a, b, c, s)
cpdef _rotg_memview(floating a, floating b, floating c, floating s):
_rotg(&a, &b, &c, &s)
return a, b, c, s
cdef void _rot(int n, floating *x, int incx, floating *y, int incy,
floating c, floating s) nogil:
"""Apply plane rotation"""
if floating is float:
srot(&n, x, &incx, y, &incy, &c, &s)
else:
drot(&n, x, &incx, y, &incy, &c, &s)
cpdef _rot_memview(floating[::1] x, floating[::1] y, floating c, floating s):
_rot(x.shape[0], &x[0], 1, &y[0], 1, c, s)
################
# BLAS Level 2 #
################
cdef void _gemv(BLAS_Order order, BLAS_Trans ta, int m, int n, floating alpha,
floating *A, int lda, floating *x, int incx,
floating beta, floating *y, int incy) nogil:
"""y := alpha * op(A).x + beta * y"""
cdef char ta_ = ta
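    # Fortran BLAS assumes column-major storage. A C-ordered (row-major)
    # matrix has the memory layout of its transpose in column-major terms,
    # so for RowMajor input we flip the transpose flag and swap the m/n
    # dimensions passed to the routine.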
if order == RowMajor:
ta_ = NoTrans if ta == Trans else Trans
if floating is float:
sgemv(&ta_, &n, &m, &alpha, A, &lda, x, &incx, &beta, y, &incy)
else:
dgemv(&ta_, &n, &m, &alpha, A, &lda, x, &incx, &beta, y, &incy)
else:
if floating is float:
sgemv(&ta_, &m, &n, &alpha, A, &lda, x, &incx, &beta, y, &incy)
else:
dgemv(&ta_, &m, &n, &alpha, A, &lda, x, &incx, &beta, y, &incy)
cpdef _gemv_memview(BLAS_Trans ta, floating alpha, floating[:, :] A,
floating[::1] x, floating beta, floating[::1] y):
cdef:
int m = A.shape[0]
int n = A.shape[1]
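        # If stepping along the first axis moves by exactly one item,
        # A is Fortran-ordered (column-major); otherwise treat it as
        # C-ordered (row-major).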
BLAS_Order order = ColMajor if A.strides[0] == A.itemsize else RowMajor
int lda = m if order == ColMajor else n
_gemv(order, ta, m, n, alpha, &A[0, 0], lda, &x[0], 1, beta, &y[0], 1)
cdef void _ger(BLAS_Order order, int m, int n, floating alpha, floating *x,
int incx, floating *y, int incy, floating *A, int lda) nogil:
"""A := alpha * x.y.T + A"""
if order == RowMajor:
if floating is float:
sger(&n, &m, &alpha, y, &incy, x, &incx, A, &lda)
else:
dger(&n, &m, &alpha, y, &incy, x, &incx, A, &lda)
else:
if floating is float:
sger(&m, &n, &alpha, x, &incx, y, &incy, A, &lda)
else:
dger(&m, &n, &alpha, x, &incx, y, &incy, A, &lda)
cpdef _ger_memview(floating alpha, floating[::1] x, floating[::1] y,
floating[:, :] A):
cdef:
int m = A.shape[0]
int n = A.shape[1]
BLAS_Order order = ColMajor if A.strides[0] == A.itemsize else RowMajor
int lda = m if order == ColMajor else n
_ger(order, m, n, alpha, &x[0], 1, &y[0], 1, &A[0, 0], lda)
################
# BLAS Level 3 #
################
cdef void _gemm(BLAS_Order order, BLAS_Trans ta, BLAS_Trans tb, int m, int n,
int k, floating alpha, floating *A, int lda, floating *B,
int ldb, floating beta, floating *C, int ldc) nogil:
"""C := alpha * op(A).op(B) + beta * C"""
cdef:
char ta_ = ta
char tb_ = tb
if order == RowMajor:
if floating is float:
sgemm(&tb_, &ta_, &n, &m, &k, &alpha, B,
&ldb, A, &lda, &beta, C, &ldc)
else:
dgemm(&tb_, &ta_, &n, &m, &k, &alpha, B,
&ldb, A, &lda, &beta, C, &ldc)
else:
if floating is float:
sgemm(&ta_, &tb_, &m, &n, &k, &alpha, A,
&lda, B, &ldb, &beta, C, &ldc)
else:
dgemm(&ta_, &tb_, &m, &n, &k, &alpha, A,
&lda, B, &ldb, &beta, C, &ldc)
cpdef _gemm_memview(BLAS_Trans ta, BLAS_Trans tb, floating alpha,
floating[:, :] A, floating[:, :] B, floating beta,
floating[:, :] C):
cdef:
int m = A.shape[0] if ta == NoTrans else A.shape[1]
int n = B.shape[1] if tb == NoTrans else B.shape[0]
int k = A.shape[1] if ta == NoTrans else A.shape[0]
int lda, ldb, ldc
BLAS_Order order = ColMajor if A.strides[0] == A.itemsize else RowMajor
if order == RowMajor:
lda = k if ta == NoTrans else m
ldb = n if tb == NoTrans else k
ldc = n
else:
lda = m if ta == NoTrans else k
ldb = k if tb == NoTrans else n
ldc = m
_gemm(order, ta, tb, m, n, k, alpha, &A[0, 0],
lda, &B[0, 0], ldb, beta, &C[0, 0], ldc)
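# Illustrative use (a sketch, assuming C-contiguous float64 arrays of
# compatible shapes): computing C = A @ B corresponds to
#   _gemm_memview(NoTrans, NoTrans, 1.0, A, B, 0.0, C)
# and C = A.T @ B to
#   _gemm_memview(Trans, NoTrans, 1.0, A, B, 0.0, C)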
# cython: language_level=3
from cython cimport floating
cimport numpy as np
cdef floating _euclidean_dense_dense(floating*, floating*, int, bint) nogil
cdef floating _euclidean_sparse_dense(floating[::1], int[::1], floating[::1],
floating, bint) nogil
cpdef void _relocate_empty_clusters_dense(
np.ndarray[floating, ndim=2, mode='c'], floating[::1], floating[:, ::1],
floating[:, ::1], floating[::1], int[::1])
cpdef void _relocate_empty_clusters_sparse(
floating[::1], int[::1], int[::1], floating[::1], floating[:, ::1],
floating[:, ::1], floating[::1], int[::1])
cdef void _average_centers(floating[:, ::1], floating[::1])
cdef void _center_shift(floating[:, ::1], floating[:, ::1], floating[::1])
\ No newline at end of file
# cython: profile=True, boundscheck=False, wraparound=False, cdivision=True
# Profiling is enabled by default as the overhead does not seem to be
# measurable for this specific use case.
# Author: Peter Prettenhofer <peter.prettenhofer@gmail.com>
# Olivier Grisel <olivier.grisel@ensta.org>
# Lars Buitinck
#
# License: BSD 3 clause
# TODO: We still need to use ndarrays instead of typed memoryviews when using
# fused types and when the array may be read-only (for instance when it's
# provided by the user). This is fixed in Cython >= 3.0.
import numpy as np
cimport numpy as np
from cython cimport floating
from libc.math cimport sqrt
from scipy import sparse
np.import_array()
ctypedef fused integral:
int
long long
ctypedef np.float64_t DOUBLE
ctypedef np.int32_t INT
# Number of samples per data chunk defined as a global constant.
CHUNK_SIZE = 256
def csr_row_norms(X):
"""L2 norm of each row in CSR matrix X."""
if X.dtype not in [np.float32, np.float64]:
X = X.astype(np.float64)
return _csr_row_norms(X.data, X.shape, X.indices, X.indptr)
def _csr_row_norms(np.ndarray[floating, ndim=1, mode="c"] X_data,
shape,
np.ndarray[integral, ndim=1, mode="c"] X_indices,
np.ndarray[integral, ndim=1, mode="c"] X_indptr):
cdef:
unsigned long long n_samples = shape[0]
unsigned long long i
integral j
double sum_
norms = np.empty(n_samples, dtype=X_data.dtype)
cdef floating[::1] norms_view = norms
for i in range(n_samples):
sum_ = 0.0
for j in range(X_indptr[i], X_indptr[i + 1]):
sum_ += X_data[j] * X_data[j]
norms_view[i] = sum_
return norms
def row_norms(X, squared=False):
"""Row-wise (squared) Euclidean norm of X.
Equivalent to np.sqrt((X * X).sum(axis=1)), but also supports sparse
matrices and does not create an X.shape-sized temporary.
Performs no input validation.
Parameters
----------
X : array-like
The input array.
squared : bool, default=False
If True, return squared norms.
Returns
-------
array-like
The row-wise (squared) Euclidean norm of X.
"""
if sparse.issparse(X):
if not isinstance(X, sparse.csr_matrix):
X = sparse.csr_matrix(X)
norms = csr_row_norms(X)
else:
norms = np.einsum('ij,ij->i', X, X)
if not squared:
np.sqrt(norms, norms)
return norms
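# For instance, row_norms(np.array([[3., 4.]])) gives array([5.]) while
# row_norms(np.array([[3., 4.]]), squared=True) gives array([25.]).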
cdef floating _euclidean_dense_dense(
floating* a, # IN
floating* b, # IN
int n_features,
bint squared) nogil:
"""Euclidean distance between a dense and b dense"""
cdef:
int i
int n = n_features // 4
int rem = n_features % 4
floating result = 0
    # We manually unroll the loop for better cache optimization, processing
    # four elements per iteration; the tail loop below handles the remainder.
for i in range(n):
result += ((a[0] - b[0]) * (a[0] - b[0])
+(a[1] - b[1]) * (a[1] - b[1])
+(a[2] - b[2]) * (a[2] - b[2])
+(a[3] - b[3]) * (a[3] - b[3]))
a += 4; b += 4
for i in range(rem):
result += (a[i] - b[i]) * (a[i] - b[i])
return result if squared else sqrt(result)
def _euclidean_dense_dense_wrapper(floating[::1] a, floating[::1] b,
bint squared):
"""Wrapper of _euclidean_dense_dense for testing purpose"""
return _euclidean_dense_dense(&a[0], &b[0], a.shape[0], squared)
cdef floating _euclidean_sparse_dense(
floating[::1] a_data, # IN
int[::1] a_indices, # IN
floating[::1] b, # IN
floating b_squared_norm,
bint squared) nogil:
"""Euclidean distance between a sparse and b dense"""
cdef:
int nnz = a_indices.shape[0]
int i
floating tmp, bi
floating result = 0.0
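    # ||a - b||² = Σ_i (a_i - b_i)². For the indices where a is zero the
    # term reduces to b_i², so we only iterate over the nonzeros of a,
    # accumulate (a_i - b_i)² - b_i² there, and add back the precomputed
    # ||b||² at the end.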
for i in range(nnz):
bi = b[a_indices[i]]
tmp = a_data[i] - bi
result += tmp * tmp - bi * bi
result += b_squared_norm
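    # Catastrophic cancellation in the expression above can make the result
    # slightly negative; clamp it to zero before the sqrt.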
if result < 0: result = 0.0
return result if squared else sqrt(result)
def _euclidean_sparse_dense_wrapper(
floating[::1] a_data,
int[::1] a_indices,
floating[::1] b,
floating b_squared_norm,
bint squared):
"""Wrapper of _euclidean_sparse_dense for testing purpose"""
return _euclidean_sparse_dense(
a_data, a_indices, b, b_squared_norm, squared)
cpdef floating _inertia_dense(
np.ndarray[floating, ndim=2, mode='c'] X, # IN
floating[::1] sample_weight, # IN
floating[:, ::1] centers, # IN
int[::1] labels): # IN
"""Compute inertia for dense input data
Sum of squared distance between each sample and its assigned center.
"""
cdef:
int n_samples = X.shape[0]
int n_features = X.shape[1]
int i, j
floating sq_dist = 0.0
floating inertia = 0.0
for i in range(n_samples):
j = labels[i]
sq_dist = _euclidean_dense_dense(&X[i, 0], &centers[j, 0],
n_features, True)
inertia += sq_dist * sample_weight[i]
return inertia
cpdef floating _inertia_sparse(
X, # IN
floating[::1] sample_weight, # IN
floating[:, ::1] centers, # IN
int[::1] labels): # IN
"""Compute inertia for sparse input data
Sum of squared distance between each sample and its assigned center.
"""
cdef:
floating[::1] X_data = X.data
int[::1] X_indices = X.indices
int[::1] X_indptr = X.indptr
int n_samples = X.shape[0]
int n_features = X.shape[1]
int i, j
floating sq_dist = 0.0
floating inertia = 0.0
floating[::1] centers_squared_norms = row_norms(centers, squared=True)
for i in range(n_samples):
j = labels[i]
sq_dist = _euclidean_sparse_dense(
X_data[X_indptr[i]: X_indptr[i + 1]],
X_indices[X_indptr[i]: X_indptr[i + 1]],
centers[j], centers_squared_norms[j], True)
inertia += sq_dist * sample_weight[i]
return inertia
cpdef void _relocate_empty_clusters_dense(
np.ndarray[floating, ndim=2, mode='c'] X, # IN
floating[::1] sample_weight, # IN
floating[:, ::1] centers_old, # IN
floating[:, ::1] centers_new, # INOUT
floating[::1] weight_in_clusters, # INOUT
int[::1] labels): # IN
"""Relocate centers which have no sample assigned to them."""
cdef:
int[::1] empty_clusters = np.where(np.equal(weight_in_clusters, 0))[0].astype(np.int32)
int n_empty = empty_clusters.shape[0]
if n_empty == 0:
return
cdef:
int n_features = X.shape[1]
floating[::1] distances = ((np.asarray(X) - np.asarray(centers_old)[labels])**2).sum(axis=1)
int[::1] far_from_centers = np.argpartition(distances, -n_empty)[:-n_empty-1:-1].astype(np.int32)
int new_cluster_id, old_cluster_id, far_idx, idx, k
floating weight
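    # At this point centers_new holds weighted sums, not means (the division
    # by the cluster weights happens later in _average_centers). Relocating a
    # sample therefore means removing its weighted features from its old
    # cluster's sum and seeding the empty cluster with that sample alone.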
for idx in range(n_empty):
new_cluster_id = empty_clusters[idx]
far_idx = far_from_centers[idx]
weight = sample_weight[far_idx]
old_cluster_id = labels[far_idx]
for k in range(n_features):
centers_new[old_cluster_id, k] -= X[far_idx, k] * weight
centers_new[new_cluster_id, k] = X[far_idx, k] * weight
weight_in_clusters[new_cluster_id] = weight
weight_in_clusters[old_cluster_id] -= weight
cpdef void _relocate_empty_clusters_sparse(
floating[::1] X_data, # IN
int[::1] X_indices, # IN
int[::1] X_indptr, # IN
floating[::1] sample_weight, # IN
floating[:, ::1] centers_old, # IN
floating[:, ::1] centers_new, # INOUT
floating[::1] weight_in_clusters, # INOUT
int[::1] labels): # IN
"""Relocate centers which have no sample assigned to them."""
cdef:
int[::1] empty_clusters = np.where(np.equal(weight_in_clusters, 0))[0].astype(np.int32)
int n_empty = empty_clusters.shape[0]
if n_empty == 0:
return
cdef:
int n_samples = X_indptr.shape[0] - 1
int n_features = centers_old.shape[1]
floating x
int i, j, k
floating[::1] distances = np.zeros(n_samples, dtype=X_data.base.dtype)
floating[::1] centers_squared_norms = row_norms(centers_old, squared=True)
for i in range(n_samples):
j = labels[i]
distances[i] = _euclidean_sparse_dense(
X_data[X_indptr[i]: X_indptr[i + 1]],
X_indices[X_indptr[i]: X_indptr[i + 1]],
centers_old[j], centers_squared_norms[j], True)
cdef:
int[::1] far_from_centers = np.argpartition(distances, -n_empty)[:-n_empty-1:-1].astype(np.int32)
int new_cluster_id, old_cluster_id, far_idx, idx
floating weight
for idx in range(n_empty):
new_cluster_id = empty_clusters[idx]
far_idx = far_from_centers[idx]
weight = sample_weight[far_idx]
old_cluster_id = labels[far_idx]
for k in range(X_indptr[far_idx], X_indptr[far_idx + 1]):
centers_new[old_cluster_id, X_indices[k]] -= X_data[k] * weight
centers_new[new_cluster_id, X_indices[k]] = X_data[k] * weight
weight_in_clusters[new_cluster_id] = weight
weight_in_clusters[old_cluster_id] -= weight
cdef void _average_centers(
floating[:, ::1] centers, # INOUT
floating[::1] weight_in_clusters): # IN
"""Average new centers wrt weights."""
cdef:
int n_clusters = centers.shape[0]
int n_features = centers.shape[1]
int j, k
floating alpha
for j in range(n_clusters):
if weight_in_clusters[j] > 0:
alpha = 1.0 / weight_in_clusters[j]
for k in range(n_features):
centers[j, k] *= alpha
cdef void _center_shift(
floating[:, ::1] centers_old, # IN
floating[:, ::1] centers_new, # IN
floating[::1] center_shift): # OUT
"""Compute shift between old and new centers."""
cdef:
int n_clusters = centers_old.shape[0]
int n_features = centers_old.shape[1]
int j
for j in range(n_clusters):
center_shift[j] = _euclidean_dense_dense(
&centers_new[j, 0], &centers_old[j, 0], n_features, False)
def _mini_batch_update_csr(X, np.ndarray[floating, ndim=1] sample_weight,
np.ndarray[floating, ndim=1] x_squared_norms,
np.ndarray[floating, ndim=2] centers,
np.ndarray[floating, ndim=1] weight_sums,
np.ndarray[INT, ndim=1] nearest_center,
np.ndarray[floating, ndim=1] old_center,
int compute_squared_diff):
"""Incremental update of the centers for sparse MiniBatchKMeans.
Parameters
----------
X : CSR matrix, dtype float
        The complete (pre-allocated) training set as a CSR matrix.
centers : array, shape (n_clusters, n_features)
The cluster centers
    weight_sums : array, shape (n_clusters,)
        The vector in which we keep track of the accumulated weight of the
        samples assigned to each cluster
Returns
-------
inertia : float
The inertia of the batch prior to centers update, i.e. the sum
of squared distances to the closest center for each sample. This
is the objective function being minimized by the k-means algorithm.
squared_diff : float
The sum of squared update (squared norm of the centers position
change). If compute_squared_diff is 0, this computation is skipped and
0.0 is returned instead.
Both squared diff and inertia are commonly used to monitor the convergence
of the algorithm.
"""
cdef:
np.ndarray[floating, ndim=1] X_data = X.data
np.ndarray[int, ndim=1] X_indices = X.indices
np.ndarray[int, ndim=1] X_indptr = X.indptr
unsigned int n_samples = X.shape[0]
unsigned int n_clusters = centers.shape[0]
unsigned int n_features = centers.shape[1]
unsigned int sample_idx, center_idx, feature_idx
unsigned int k
DOUBLE old_weight_sum, new_weight_sum
DOUBLE center_diff
DOUBLE squared_diff = 0.0
# move centers to the mean of both old and newly assigned samples
for center_idx in range(n_clusters):
old_weight_sum = weight_sums[center_idx]
new_weight_sum = old_weight_sum
# count the number of samples assigned to this center
for sample_idx in range(n_samples):
if nearest_center[sample_idx] == center_idx:
new_weight_sum += sample_weight[sample_idx]
if new_weight_sum == old_weight_sum:
# no new sample: leave this center as it stands
continue
        # rescale the old center to reflect its previous accumulated weight
# with regards to the new data that will be incrementally contributed
if compute_squared_diff:
old_center[:] = centers[center_idx]
centers[center_idx] *= old_weight_sum
        # iterate over samples assigned to this cluster to move the center
# location by inplace summation
for sample_idx in range(n_samples):
if nearest_center[sample_idx] != center_idx:
continue
# inplace sum with new samples that are members of this cluster
# and update of the incremental squared difference update of the
# center position
for k in range(X_indptr[sample_idx], X_indptr[sample_idx + 1]):
centers[center_idx, X_indices[k]] += X_data[k]
# inplace rescale center with updated count
if new_weight_sum > old_weight_sum:
# update the count statistics for this center
weight_sums[center_idx] = new_weight_sum
# re-scale the updated center with the total new counts
centers[center_idx] /= new_weight_sum
# update the incremental computation of the squared total
# centers position change
if compute_squared_diff:
for feature_idx in range(n_features):
squared_diff += (old_center[feature_idx]
- centers[center_idx, feature_idx]) ** 2
return squared_diff
@@ -12,17 +12,32 @@ from cython cimport floating
from cython.parallel import prange, parallel
from libc.stdlib cimport malloc, calloc, free
from libc.string cimport memset
from libc.float cimport DBL_MAX, FLT_MAX
from libc.math cimport sqrt
from scipy.linalg.cython_blas cimport sgemm, dgemm
from scipy import sparse
from ._cython_blas cimport _gemm
from ._cython_blas cimport RowMajor, Trans, NoTrans
from ._k_means_fast import CHUNK_SIZE, row_norms
from ._k_means_fast cimport _relocate_empty_clusters_dense
from ._k_means_fast cimport _relocate_empty_clusters_sparse
from ._k_means_fast cimport _average_centers, _center_shift
np.import_array()
ctypedef fused integral:
int
long long
np.import_array()
ctypedef np.float64_t DOUBLE
ctypedef np.int32_t INT
# Number of samples per data chunk defined as a global constant.
_CHUNK_SIZE = 256
cdef enum _BLAS_Order:
RowMajor # C contiguous
ColMajor # Fortran contiguous
cdef enum _BLAS_Trans:
    NoTrans = 110  # corresponds to 'n'
    Trans = 116    # corresponds to 't'
def lloyd_iter_chunked_dense(
@@ -90,7 +105,7 @@ def lloyd_iter_chunked_dense(
# hard-coded number of samples per chunk. Appeared to be close to
# optimal in all situations.
-        int n_samples_chunk = CHUNK_SIZE if n_samples > CHUNK_SIZE else n_samples
+        int n_samples_chunk = _CHUNK_SIZE if n_samples > _CHUNK_SIZE else n_samples
int n_chunks = n_samples // n_samples_chunk
int n_samples_rem = n_samples % n_samples_chunk
int chunk_idx, n_samples_chunk_eff
@@ -159,208 +174,219 @@ def lloyd_iter_chunked_dense(
_average_centers(centers_new, weight_in_clusters)
_center_shift(centers_old, centers_new, center_shift)
# Fixtures
cdef void _update_chunk_dense(
floating *X, # IN
    # expecting C aligned 2D array. XXX: Can be
    # replaced by const memoryview when cython min
    # version is >= 3.0
floating[::1] sample_weight, # IN
floating[::1] x_squared_norms, # IN
floating[:, ::1] centers_old, # IN
floating[::1] centers_squared_norms, # IN
int[::1] labels, # OUT
floating *centers_new, # OUT
floating *weight_in_clusters, # OUT
floating *pairwise_distances, # OUT
bint update_centers) nogil:
"""K-means combined EM step for one dense data chunk.
cdef void _gemm(_BLAS_Order order, _BLAS_Trans ta, _BLAS_Trans tb, int m, int n,
int k, floating alpha, floating *A, int lda, floating *B,
int ldb, floating beta, floating *C, int ldc) nogil:
"""C := alpha * op(A).op(B) + beta * C"""
cdef:
char ta_ = ta
char tb_ = tb
if order == RowMajor:
if floating is float:
sgemm(&tb_, &ta_, &n, &m, &k, &alpha, B,
&ldb, A, &lda, &beta, C, &ldc)
else:
dgemm(&tb_, &ta_, &n, &m, &k, &alpha, B,
&ldb, A, &lda, &beta, C, &ldc)
else:
if floating is float:
sgemm(&ta_, &tb_, &m, &n, &k, &alpha, A,
&lda, B, &ldb, &beta, C, &ldc)
else:
dgemm(&ta_, &tb_, &m, &n, &k, &alpha, A,
&lda, B, &ldb, &beta, C, &ldc)
cpdef void _relocate_empty_clusters_dense(
np.ndarray[floating, ndim=2, mode='c'] X, # IN
floating[::1] sample_weight, # IN
floating[:, ::1] centers_old, # IN
floating[:, ::1] centers_new, # INOUT
floating[::1] weight_in_clusters, # INOUT
int[::1] labels): # IN
"""Relocate centers which have no sample assigned to them."""
cdef:
int[::1] empty_clusters = np.where(np.equal(weight_in_clusters, 0))[0].astype(np.int32)
int n_empty = empty_clusters.shape[0]
if n_empty == 0:
return
Compute the partial contribution of a single data chunk to the labels and
centers.
"""
cdef:
int n_samples = labels.shape[0]
int n_clusters = centers_old.shape[0]
int n_features = centers_old.shape[1]
int n_features = X.shape[1]
floating sq_dist, min_sq_dist
int i, j, k, label
floating[::1] distances = ((np.asarray(X) - np.asarray(centers_old)[labels])**2).sum(axis=1)
int[::1] far_from_centers = np.argpartition(distances, -n_empty)[:-n_empty-1:-1].astype(np.int32)
# Instead of computing the full pairwise squared distances matrix,
# ||X - C||² = ||X||² - 2 X.C^T + ||C||², we only need to store
# the - 2 X.C^T + ||C||² term since the argmin for a given sample only
# depends on the centers.
# pairwise_distances = ||C||²
for i in range(n_samples):
for j in range(n_clusters):
pairwise_distances[i * n_clusters + j] = centers_squared_norms[j]
int new_cluster_id, old_cluster_id, far_idx, idx, k
floating weight
# pairwise_distances += -2 * X.dot(C.T)
_gemm(RowMajor, NoTrans, Trans, n_samples, n_clusters, n_features,
-2.0, X, n_features, &centers_old[0, 0], n_features,
1.0, pairwise_distances, n_clusters)
for idx in range(n_empty):
for i in range(n_samples):
min_sq_dist = pairwise_distances[i * n_clusters]
label = 0
for j in range(1, n_clusters):
sq_dist = pairwise_distances[i * n_clusters + j]
if sq_dist < min_sq_dist:
min_sq_dist = sq_dist
label = j
labels[i] = label
new_cluster_id = empty_clusters[idx]
if update_centers:
weight_in_clusters[label] += sample_weight[i]
for k in range(n_features):
centers_new[label * n_features + k] += X[i * n_features + k] * sample_weight[i]
def lloyd_iter_chunked_sparse(
X, # IN
floating[::1] sample_weight, # IN
floating[::1] x_squared_norms, # IN
floating[:, ::1] centers_old, # IN
floating[:, ::1] centers_new, # OUT
floating[::1] weight_in_clusters, # OUT
int[::1] labels, # OUT
floating[::1] center_shift, # OUT
int n_threads,
bint update_centers=True):
"""Single iteration of K-means lloyd algorithm with sparse input.
far_idx = far_from_centers[idx]
weight = sample_weight[far_idx]
Update labels and centers (inplace), for one iteration, distributed
over data chunks.
old_cluster_id = labels[far_idx]
Parameters
----------
X : sparse matrix of shape (n_samples, n_features), dtype=floating
The observations to cluster. Must be in CSR format.
for k in range(n_features):
centers_new[old_cluster_id, k] -= X[far_idx, k] * weight
centers_new[new_cluster_id, k] = X[far_idx, k] * weight
sample_weight : ndarray of shape (n_samples,), dtype=floating
The weights for each observation in X.
weight_in_clusters[new_cluster_id] = weight
weight_in_clusters[old_cluster_id] -= weight
x_squared_norms : ndarray of shape (n_samples,), dtype=floating
Squared L2 norm of X.
def _csr_row_norms(X):
"""L2 norm of each row in CSR matrix X."""
if X.dtype not in [np.float32, np.float64]:
X = X.astype(np.float64)
return _csr_row_norms(X.data, X.shape, X.indices, X.indptr)
centers_old : ndarray of shape (n_clusters, n_features), dtype=floating
Centers before previous iteration, placeholder for the centers after
previous iteration.
def _csr_row_norms(np.ndarray[floating, ndim=1, mode="c"] X_data,
shape,
np.ndarray[integral, ndim=1, mode="c"] X_indices,
np.ndarray[integral, ndim=1, mode="c"] X_indptr):
cdef:
unsigned long long n_samples = shape[0]
unsigned long long i
integral j
double sum_
centers_new : ndarray of shape (n_clusters, n_features), dtype=floating
Centers after previous iteration, placeholder for the new centers
computed during this iteration.
norms = np.empty(n_samples, dtype=X_data.dtype)
cdef floating[::1] norms_view = norms
centers_squared_norms : ndarray of shape (n_clusters,), dtype=floating
Squared L2 norm of the centers.
for i in range(n_samples):
sum_ = 0.0
for j in range(X_indptr[i], X_indptr[i + 1]):
sum_ += X_data[j] * X_data[j]
norms_view[i] = sum_
weight_in_clusters : ndarray of shape (n_clusters,), dtype=floating
Placeholder for the sums of the weights of every observation assigned
to each center.
return norms
labels : ndarray of shape (n_samples,), dtype=int
labels assignment.
def row_norms(X, squared=False):
"""Row-wise (squared) Euclidean norm of X.
center_shift : ndarray of shape (n_clusters,), dtype=floating
Distance between old and new centers.
Equivalent to np.sqrt((X * X).sum(axis=1)), but also supports sparse
matrices and does not create an X.shape-sized temporary.
n_threads : int
The number of threads to be used by openmp.
Performs no input validation.
update_centers : bool
- If True, the labels and the new centers will be computed, i.e. runs
the E-step and the M-step of the algorithm.
- If False, only the labels will be computed, i.e runs the E-step of
the algorithm. This is useful especially when calling predict on a
fitted model.
Parameters
----------
X : array-like
The input array.
squared : bool, default=False
If True, return squared norms.
Returns
-------
array-like
The row-wise (squared) Euclidean norm of X.
"""
if sparse.issparse(X):
if not isinstance(X, sparse.csr_matrix):
X = sparse.csr_matrix(X)
norms = _csr_row_norms(X)
else:
norms = np.einsum('ij,ij->i', X, X)
if not squared:
np.sqrt(norms, norms)
return norms
cdef floating _euclidean_dense_dense(
floating* a, # IN
floating* b, # IN
int n_features,
bint squared) nogil:
"""Euclidean distance between a dense and b dense"""
cdef:
int n_samples = X.shape[0]
int n_features = X.shape[1]
int n_clusters = centers_new.shape[0]
int i
int n = n_features // 4
int rem = n_features % 4
floating result = 0
    # Chosen to be the same as for dense. Does not have the same impact since
    # with sparse data the pairwise distances matrix is not precomputed.
    # However, splitting in chunks is necessary to get parallelism.
int n_samples_chunk = CHUNK_SIZE if n_samples > CHUNK_SIZE else n_samples
int n_chunks = n_samples // n_samples_chunk
int n_samples_rem = n_samples % n_samples_chunk
int chunk_idx, n_samples_chunk_eff = 0
int start = 0, end = 0
# We manually unroll the loop for better cache optimization.
for i in range(n):
result += ((a[0] - b[0]) * (a[0] - b[0])
+(a[1] - b[1]) * (a[1] - b[1])
+(a[2] - b[2]) * (a[2] - b[2])
+(a[3] - b[3]) * (a[3] - b[3]))
a += 4; b += 4
int j, k
for i in range(rem):
result += (a[i] - b[i]) * (a[i] - b[i])
floating[::1] X_data = X.data
int[::1] X_indices = X.indices
int[::1] X_indptr = X.indptr
return result if squared else sqrt(result)
floating[::1] centers_squared_norms = row_norms(centers_old, squared=True)
floating *centers_new_chunk
floating *weight_in_clusters_chunk
cpdef floating _inertia_dense(
np.ndarray[floating, ndim=2, mode='c'] X, # IN
floating[::1] sample_weight, # IN
floating[:, ::1] centers, # IN
int[::1] labels): # IN
"""Compute inertia for dense input data
# count remainder chunk in total number of chunks
n_chunks += n_samples != n_chunks * n_samples_chunk
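    # (the comparison yields 0 or 1, so this adds one extra chunk exactly
    # when n_samples is not a multiple of n_samples_chunk)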
Sum of squared distance between each sample and its assigned center.
"""
cdef:
int n_samples = X.shape[0]
int n_features = X.shape[1]
int i, j
# number of threads should not be bigger than number of chunks
n_threads = min(n_threads, n_chunks)
floating sq_dist = 0.0
floating inertia = 0.0
if update_centers:
memset(&centers_new[0, 0], 0, n_clusters * n_features * sizeof(floating))
memset(&weight_in_clusters[0], 0, n_clusters * sizeof(floating))
for i in range(n_samples):
j = labels[i]
sq_dist = _euclidean_dense_dense(&X[i, 0], &centers[j, 0],
n_features, True)
inertia += sq_dist * sample_weight[i]
with nogil, parallel(num_threads=n_threads):
# thread local buffers
centers_new_chunk = <floating*> calloc(n_clusters * n_features, sizeof(floating))
weight_in_clusters_chunk = <floating*> calloc(n_clusters, sizeof(floating))
return inertia
for chunk_idx in prange(n_chunks, schedule='static'):
start = chunk_idx * n_samples_chunk
if chunk_idx == n_chunks - 1 and n_samples_rem > 0:
end = start + n_samples_rem
else:
end = start + n_samples_chunk
_update_chunk_sparse(
X_data[X_indptr[start]: X_indptr[end]],
X_indices[X_indptr[start]: X_indptr[end]],
X_indptr[start: end],
sample_weight[start: end],
x_squared_norms[start: end],
centers_old,
centers_squared_norms,
labels[start: end],
centers_new_chunk,
weight_in_clusters_chunk,
update_centers)
cdef void _average_centers(
floating[:, ::1] centers, # INOUT
floating[::1] weight_in_clusters): # IN
"""Average new centers wrt weights."""
cdef:
int n_clusters = centers.shape[0]
int n_features = centers.shape[1]
int j, k
floating alpha
# reduction from local buffers. The gil is necessary for that to avoid
# race conditions.
if update_centers:
with gil:
for j in range(n_clusters):
weight_in_clusters[j] += weight_in_clusters_chunk[j]
for k in range(n_features):
centers_new[j, k] += centers_new_chunk[j * n_features + k]
for j in range(n_clusters):
if weight_in_clusters[j] > 0:
alpha = 1.0 / weight_in_clusters[j]
for k in range(n_features):
centers[j, k] *= alpha
free(centers_new_chunk)
free(weight_in_clusters_chunk)
if update_centers:
_relocate_empty_clusters_sparse(
X_data, X_indices, X_indptr, sample_weight,
centers_old, centers_new, weight_in_clusters, labels)
cdef void _center_shift(
floating[:, ::1] centers_old, # IN
floating[:, ::1] centers_new, # IN
floating[::1] center_shift): # OUT
"""Compute shift between old and new centers."""
cdef:
int n_clusters = centers_old.shape[0]
int n_features = centers_old.shape[1]
int j
_average_centers(centers_new, weight_in_clusters)
_center_shift(centers_old, centers_new, center_shift)
for j in range(n_clusters):
center_shift[j] = _euclidean_dense_dense(
&centers_new[j, 0], &centers_old[j, 0], n_features, False)
cdef void _update_chunk_sparse(
floating[::1] X_data, # IN
int[::1] X_indices, # IN
int[::1] X_indptr, # IN
cdef void _update_chunk_dense(
floating *X, # IN
    # expecting C aligned 2D array. XXX: Can be
    # replaced by const memoryview when cython min
    # version is >= 3.0
floating[::1] sample_weight, # IN
floating[::1] x_squared_norms, # IN
floating[:, ::1] centers_old, # IN
@@ -368,8 +394,9 @@ cdef void _update_chunk_sparse(
int[::1] labels, # OUT
floating *centers_new, # OUT
floating *weight_in_clusters, # OUT
floating *pairwise_distances, # OUT
bint update_centers) nogil:
"""K-means combined EM step for one sparse data chunk.
"""K-means combined EM step for one dense data chunk.
Compute the partial contribution of a single data chunk to the labels and
centers.
@@ -381,33 +408,32 @@ cdef void _update_chunk_sparse(
floating sq_dist, min_sq_dist
int i, j, k, label
floating max_floating = FLT_MAX if floating is float else DBL_MAX
int s = X_indptr[0]
    # XXX Precomputing the pairwise distances matrix is not worth it for
    # sparse data currently. Should be revisited when BLAS (sparse x dense)
    # matrix multiplication is available.
# Instead of computing the full pairwise squared distances matrix,
# ||X - C||² = ||X||² - 2 X.C^T + ||C||², we only need to store
# the - 2 X.C^T + ||C||² term since the argmin for a given sample only
# depends on the centers.
# pairwise_distances = ||C||²
for i in range(n_samples):
min_sq_dist = max_floating
label = 0
for j in range(n_clusters):
sq_dist = 0.0
for k in range(X_indptr[i] - s, X_indptr[i + 1] - s):
sq_dist += centers_old[j, X_indices[k]] * X_data[k]
# Instead of computing the full squared distance with each cluster,
# ||X - C||² = ||X||² - 2 X.C^T + ||C||², we only need to compute
# the - 2 X.C^T + ||C||² term since the argmin for a given sample
# only depends on the centers C.
            sq_dist = centers_squared_norms[j] - 2 * sq_dist
pairwise_distances[i * n_clusters + j] = centers_squared_norms[j]
# pairwise_distances += -2 * X.dot(C.T)
_gemm(RowMajor, NoTrans, Trans, n_samples, n_clusters, n_features,
-2.0, X, n_features, &centers_old[0, 0], n_features,
1.0, pairwise_distances, n_clusters)
for i in range(n_samples):
min_sq_dist = pairwise_distances[i * n_clusters]
label = 0
for j in range(1, n_clusters):
sq_dist = pairwise_distances[i * n_clusters + j]
if sq_dist < min_sq_dist:
min_sq_dist = sq_dist
label = j
labels[i] = label
if update_centers:
weight_in_clusters[label] += sample_weight[i]
for k in range(X_indptr[i] - s, X_indptr[i + 1] - s):
centers_new[label * n_features + X_indices[k]] += X_data[k] * sample_weight[i]
for k in range(n_features):
centers_new[label * n_features + k] += X[i * n_features + k] * sample_weight[i]
\ No newline at end of file
@@ -9,7 +9,7 @@ from Cython.Build import build_ext
extensions = [
Extension("kmeans",
sources=["_k_means_fast.pyx"],
sources=["_kmeans.pyx"],
include_dirs=[numpy.get_include()],
)
]