[WIP] KDTree implementation

Add some types definitions and node partitionning in.

[WIP] KDTree implementation
Add some types definitions and node partitionning in.
c952315e · Julien Jerphanion · 1a848e92 · c952315e
Commit c952315e authored Apr 19, 2021 by Julien Jerphanion
Hide whitespace changes
Inline Side-by-side

Showing with 132 additions and 26 deletions

kdtree/main.pyx kdtree/main.pyx +132 -26

No files found.
--- a/kdtree/main.pyx
+++ b/kdtree/main.pyx
 # distutils: language = c++

+import numpy as np
+cimport numpy as cnp
+
+cnp.import_array()
+
 from cython.view cimport array as cvarray
 import numpy as np

@@ -17,9 +22,106 @@ from stdlib.string cimport string

 from posix.unistd cimport readlink

+## Types declaration
+ctypedef int I_t
+ctypedef double D_t

 cdef lock Scheduler scheduler

+cdef extern from *:
+    """
+    #include <algorithm>
+    template<class D, class I>
+    class IndexComparator {
+    private:
+        const D *data;
+        I split_dim, n_features;
+    public:
+        IndexComparator(const D *data, const I &split_dim, const I &n_features):
+            data(data), split_dim(split_dim), n_features(n_features) {}
+        bool operator()(const I &a, const I &b) const {
+            D a_value = data[a * n_features + split_dim];
+            D b_value = data[b * n_features + split_dim];
+            return a_value == b_value ? a < b : a_value < b_value;
+        }
+    };
+    template<class D, class I>
+    void partition_node_indices_inner(
+        const D *data,
+        I *node_indices,
+        const I &split_dim,
+        const I &split_index,
+        const I &n_features,
+        const I &n_points) {
+        IndexComparator<D, I> index_comparator(data, split_dim, n_features);
+        std::nth_element(
+            node_indices,
+            node_indices + split_index,
+            node_indices + n_points,
+            index_comparator);
+    }
+    """
+    void partition_node_indices_inner[D, I](
+                D *data,
+                I *node_indices,
+                I split_dim,
+                I split_index,
+                I n_features,
+                I n_points) except +
+
+
+cdef I_t partition_node_indices(
+        D_t *data,
+        I_t *node_indices,
+        I_t split_dim,
+        I_t split_index,
+        I_t n_features,
+        I_t n_points) except -1:
+    """Partition points in the node into two equal-sized groups.
+    Upon return, the values in node_indices will be rearranged such that
+    (assuming numpy-style indexing):
+        data[node_indices[0:split_index], split_dim]
+          <= data[node_indices[split_index], split_dim]
+    and
+        data[node_indices[split_index], split_dim]
+          <= data[node_indices[split_index:n_points], split_dim]
+    The algorithm is essentially a partial in-place quicksort around a
+    set pivot.
+    Parameters
+    ----------
+    data : D_t pointer
+        Pointer to a 2D array of the training data, of shape [N, n_features].
+        N must be greater than any of the values in node_indices.
+    node_indices : I_t pointer
+        Pointer to a 1D array of length n_points.  This lists the indices of
+        each of the points within the current node.  This will be modified
+        in-place.
+    split_dim : int
+        the dimension on which to split.  This will usually be computed via
+        the routine ``find_node_split_dim``.
+    split_index : int
+        the index within node_indices around which to split the points.
+    n_features: int
+        the number of features (i.e columns) in the 2D array pointed by data.
+    n_points : int
+        the length of node_indices. This is also the number of points in
+        the original dataset.
+    Returns
+    -------
+    status : int
+        integer exit status.  On return, the contents of node_indices are
+        modified as noted above.
+    """
+    partition_node_indices_inner(
+        data,
+        node_indices,
+        split_dim,
+        split_index,
+        n_features,
+        n_points)
+    return 0
+
+

 cdef cypclass Node activable:
    """A KDTree Node"""
@@ -36,22 +138,24 @@ cdef cypclass Node activable:

    void build_node(
            self,
-            double * points,
-            int n,
-            int depth,
-            int dims,
-            int dim_for_split
+            D_t * points,
+            I_t * indices,
+            I_t n,
+            I_t depth,
+            I_t n_dims,
+            I_t dim
    ):
-        cdef int i
+        cdef I_t i
        if (depth < 0):
            return

        printf("Depth %d\n", depth)
-        printf("Dim %d\n", dim_for_split)
+        printf("Number of dimensions %d\n", n_dims)
+        printf("Splitting dimension %d\n", dim)
        for i in range(n):
            printf("X[%d, %d] = %f\n",
-                    i, dim_for_split,
-                    points[i*dims+dim_for_split])
+                    i, dim,
+                    points[i*n_dims + dim])

        printf("\n")
        # TODO: find a way to partitions indices here
@@ -59,32 +163,33 @@ cdef cypclass Node activable:
        self.left = activate(consume Node())
        self.right = activate(consume Node())

-        self.left.build_node(NULL, points, depth - 1, n, dims, (dim_for_split +
-                1)
-                % dims)
-        self.right.build_node(NULL, points, depth - 1, n, dims, (dim_for_split
-                + 1
-                %
-                dims))
+        self.left.build_node(NULL, points, indices,
+                n, depth - 1, n_dims, (dim + 1) % n_dims)
+        self.right.build_node(NULL, points, indices,
+                n, depth - 1, n_dims, (dim + 1) % n_dims)


-cdef int start() nogil:
+cdef I_t start() nogil:
    global scheduler
    scheduler = Scheduler()
-    cdef int i
-    cdef int n = 12
-    cdef int d = 2
+    cdef I_t i
+    cdef I_t n = 12
+    cdef I_t d = 2
+    cdef I_t depth = 5

    # TODO: use memory view for convenience
-    # cdef double p[12][2]
-    # cdef double [:, ::1] points_views = p
+    # cdef D_t p[12][2]
+    # cdef D_t [:, ::1] points_views = p

    # Use Golden Spiral for the layout
-    cdef double golden_ratio = (1 + 5**0.5)/2
-    cdef double * points = <double *> malloc(n * d * sizeof(double))
+    cdef D_t golden_ratio = (1 + 5**0.5)/2
+    cdef D_t * points = <D_t *> malloc(n * d * sizeof(D_t))
+    cdef I_t * indices = <I_t *> malloc(n* sizeof(I_t))
+
    for i in range(n):
+        indices[i] = i
        points[i * d] = (i / golden_ratio) % 1
-        points[i *d +1] = i / n
+        points[i * d + 1] = i / n

    printf("Before\n")
    node = consume Node()
@@ -92,11 +197,12 @@ cdef int start() nogil:
        return -1

    root = activate(consume node)
-    root.build_node(NULL, points, n, 5, d, 0)
+    root.build_node(NULL, points, indices, n, depth, d, 0)
    scheduler.finish()

    del scheduler
    free(points)
+    free(indices)

    return 0