Add heuristic for dimension choice

2b26f7d8 · Julien Jerphanion · 78ab113b · 2b26f7d8
Commit 2b26f7d8 authored Nov 04, 2021 by Julien Jerphanion
Hide whitespace changes
Inline Side-by-side

Showing with 55 additions and 5 deletions

kdtree.pyx kdtree.pyx +55 -5

No files found.
--- a/kdtree.pyx
+++ b/kdtree.pyx
@@ -6,7 +6,7 @@ import numpy as np
 np.import_array()

 from runtime.runtime cimport BatchMailBox, NullResult, Scheduler, WaitResult
-from libc.math cimport log2, fmax
+from libc.math cimport log2, fmax, fmin
 from libc.stdio cimport printf
 from libc.stdlib cimport malloc, free
 from openmp cimport omp_get_max_threads
@@ -75,6 +75,54 @@ cdef extern from *:
                I n_features
    ) nogil except +

+cdef I_t find_node_split_dim(D_t* data,
+                             I_t* node_indices,
+                             I_t n_features,
+                             I_t n_points) nogil except -1:
+    """Find the dimension with the largest spread among a node's points.
+
+    The spread of a dimension is the difference between the largest and the
+    smallest value taken by the node's points along that dimension.
+
+    Parameters
+    ----------
+    data : D_t pointer
+        Pointer to a 2D array of the training data, of shape
+        (n_samples, n_features), indexed row by row
+        (element (i, j) is read at data[i * n_features + j]).
+        n_samples must be greater than any of the values in node_indices.
+    node_indices : I_t pointer
+        Pointer to a 1D array of length n_points.  This lists the indices of
+        each of the points within the current node.
+    n_features : I_t
+        Number of features (dimensions) of each point, i.e. the row stride
+        of data.
+    n_points : I_t
+        Number of points within the current node.
+
+    Returns
+    -------
+    j_max : I_t
+        The index of the feature (dimension) within the node that has the
+        largest spread.  Ties are broken in favour of the lowest index, and
+        0 is returned when the node is empty, when there are no features,
+        or when no dimension has a strictly positive spread.
+
+    Notes
+    -----
+    For a non-empty node, this operation is equivalent in numpy to
+
+    def find_node_split_dim(data, node_indices):
+        return np.argmax(data[node_indices].max(0) - data[node_indices].min(0))
+
+    The cython version is much more efficient in both computation and memory.
+    """
+    cdef D_t min_val, max_val, val, spread, max_spread
+    cdef I_t i, j, j_max
+
+    j_max = 0
+    max_spread = 0
+
+    # An empty node has no first point to seed the extrema with: reading
+    # node_indices[0] would be out of bounds, so fall back to dimension 0.
+    if n_points <= 0:
+        return j_max
+
+    for j in range(n_features):
+        # Seed the running extrema with the first point of the node.
+        max_val = data[node_indices[0] * n_features + j]
+        min_val = max_val
+        for i in range(1, n_points):
+            val = data[node_indices[i] * n_features + j]
+            max_val = fmax(max_val, val)
+            min_val = fmin(min_val, val)
+        spread = max_val - min_val
+        # Strict comparison: the first dimension reaching the maximum wins.
+        if spread > max_spread:
+            max_spread = spread
+            j_max = j
+    return j_max

 cdef cypclass Counter activable:
    """ A simple Counter.
@@ -379,9 +427,11 @@ cdef cypclass Node activable:
            I_t end,
            active Counter counter,
    ):
-        # Simple round-robin on dimensions.
-        # TODO: Choose the dimension with maximum spread at each recursion instead.
-        cdef I_t next_dim = (dim + 1) % n_dims
+        # Choose the dimension with maximum spread at each recursion.
+        cdef I_t next_dim = find_node_split_dim(data_ptr,
+                                                indices_ptr + start,
+                                                n_dims,
+                                                end - start)
        cdef I_t mid = (start + end) // 2

        cdef NodeData_t * node_data = self._node_data_ptr + node_index
@@ -391,7 +441,7 @@ cdef cypclass Node activable:
        if (end - start <= leaf_size):
            deref(node_data).is_leaf = True
            # Adding to the global counter the number
-            # of samples the leaf is responsible of.
+            # of samples the leaf is responsible for.
            counter.add(NULL, end - start)
            return