Commit eea55685 authored by Julien Jerphanion's avatar Julien Jerphanion

[WIP] Adapt querying logic

This changes the logic used to query the tree for nearest neighbors.
It starts with a simple sequential query for each query point.
parent 5ba8649c
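For context, the strategy this commit moves to (one depth-first traversal per query point, with a bounded max-heap of squared distances and pruning via a per-node lower bound) can be summarised by the small pure-Python sketch below. It is only an illustration of the traversal, not the cypclass code in this diff: the class name, the helper names and the bounding-box lower bound are assumptions standing in for min_rdist and NeighborsHeaps.

import heapq

import numpy as np


def _sq_dist(a, b):
    d = a - b
    return float(np.dot(d, d))


class SketchKDTree:
    """Toy kd-tree with the children of node i stored at 2*i + 1 and 2*i + 2."""

    def __init__(self, X, leaf_size=16):
        self.X = np.asarray(X, dtype=float)
        self.leaf_size = leaf_size
        self.nodes = {}  # node index -> (point indices, box lower corner, box upper corner)
        self._build(0, np.arange(len(self.X)))

    def _build(self, i_node, indices):
        pts = self.X[indices]
        self.nodes[i_node] = (indices, pts.min(axis=0), pts.max(axis=0))
        if len(indices) > self.leaf_size:
            dim = int(np.argmax(pts.max(axis=0) - pts.min(axis=0)))
            order = indices[np.argsort(pts[:, dim])]
            mid = len(order) // 2
            self._build(2 * i_node + 1, order[:mid])   # left child
            self._build(2 * i_node + 2, order[mid:])   # right child

    def _is_leaf(self, i_node):
        return 2 * i_node + 1 not in self.nodes

    def _min_sq_dist(self, i_node, pt):
        # Squared distance from pt to the node's bounding box (0 when inside).
        _, lo, hi = self.nodes[i_node]
        delta = np.maximum(lo - pt, 0.0) + np.maximum(pt - hi, 0.0)
        return float(np.dot(delta, delta))

    def _query_one(self, i_node, pt, heap, k, lower_bound):
        # Case 1: prune -- the node cannot beat the current k-th best distance.
        # The heap keeps (-distance, index), so -heap[0][0] is the largest distance.
        if len(heap) == k and lower_bound > -heap[0][0]:
            return
        if self._is_leaf(i_node):
            # Case 2: leaf -- try every point stored in it.
            for i in self.nodes[i_node][0]:
                d = _sq_dist(pt, self.X[i])
                if len(heap) < k:
                    heapq.heappush(heap, (-d, int(i)))
                elif d < -heap[0][0]:
                    heapq.heapreplace(heap, (-d, int(i)))
        else:
            # Case 3: recurse into both children, closest one first.
            left, right = 2 * i_node + 1, 2 * i_node + 2
            lb_left = self._min_sq_dist(left, pt)
            lb_right = self._min_sq_dist(right, pt)
            first, second = (left, lb_left), (right, lb_right)
            if lb_right < lb_left:
                first, second = second, first
            self._query_one(first[0], pt, heap, k, first[1])
            self._query_one(second[0], pt, heap, k, second[1])

    def query(self, query_points, k):
        # Sequential strategy: one independent depth-first traversal per point.
        # Assumes k <= number of indexed points.
        query_points = np.asarray(query_points, dtype=float)
        indices = np.empty((len(query_points), k), dtype=np.intp)
        distances = np.empty((len(query_points), k))
        for row, pt in enumerate(query_points):
            heap = []
            self._query_one(0, pt, heap, k, self._min_sq_dist(0, pt))
            for col, (d, i) in enumerate(sorted((-neg_d, i) for neg_d, i in heap)):
                distances[row, col] = d
                indices[row, col] = i
        return distances, indices

With k <= n_samples, SketchKDTree(X).query(Q, k) should agree with a brute-force computation of the k smallest squared distances, which is essentially the property test_against_sklearn below checks against scikit-learn's KDTree.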
@@ -300,7 +300,7 @@ cdef cypclass NeighborsHeaps:
     ):
         self._n_pts = n_pts
         self._n_nbrs = n_nbrs
-        self._distances = distances # <D_t *> malloc(n_pts * n_nbrs * sizeof(D_t))
+        self._distances = distances
         self._indices = indices
         self._n_pushes = 0
         self._sorted = False
@@ -321,11 +321,8 @@ cdef cypclass NeighborsHeaps:
         self._n_pushes += 1
-        # printf("Pushing for %d, (%d, %lf)\n", row, i_val, val)

         # check if val should be in heap
         if val > distances[0]:
-            # printf("Discarding %d\n", row)
            return

         # insert val at position zero
@@ -494,7 +491,7 @@ cdef cypclass Node activable:
         # Recursing on both partitions.
         self._left.build_node(
             sync_method=NULL,
-            node_index=2 * node_index,
+            node_index=2 * node_index + 1,
             data_ptr=data_ptr,
             indices_ptr=indices_ptr,
             leaf_size=leaf_size,
@@ -507,7 +504,7 @@ cdef cypclass Node activable:
         self._right.build_node(
             sync_method=NULL,
-            node_index=2 * node_index + 1,
+            node_index=2 * node_index + 2,
             data_ptr=data_ptr,
             indices_ptr=indices_ptr,
             leaf_size=leaf_size,
@@ -566,7 +563,7 @@ cdef cypclass KDTree:
         self._leaf_size = leaf_size
         self._n_levels = <I_t> (log2(fmax(1, (self._n_samples - 1) / self._leaf_size)) + 1)
-        self._n_nodes = <I_t> (2 ** self._n_levels)
+        self._n_nodes = <I_t> (2 ** (self._n_levels + 1))
         self._data_ptr = <D_t *> X.data
         self._indices_ptr = <I_t *> malloc(self._n_samples * sizeof(I_t))
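A quick sizing check for the updated formula (illustrative numbers, not from this commit): with n_samples = 10000 and leaf_size = 256, n_levels = <I_t>(log2(max(1, 9999 / 256)) + 1) = <I_t>(6.29) = 6, so 2 ** (6 + 1) = 128 node slots are allocated, enough for every heap index of a complete binary tree with children at 2 * i + 1 and 2 * i + 2 over 7 levels (at most 2 ** 7 - 1 = 127 nodes).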
@@ -628,65 +625,62 @@ cdef cypclass KDTree:
         free(self._node_bounds_ptr)

     int _query_single_depthfirst(self,
-        I_t i_node,
-        D_t* pt,
-        I_t i_pt,
+        I_t idx_node,
+        D_t* query_points,
+        I_t idx_pt,
         NeighborsHeaps heaps,
         D_t reduced_dist_LB,
     ) except -1:
         """Recursive Single-tree k-neighbors query, depth-first approach"""
-        cdef NodeData_t node_info = self._node_data_ptr[i_node]
+        cdef NodeData_t node_info = self._node_data_ptr[idx_node]

-        cdef D_t dist_pt, reduced_dist_LB_1, reduced_dist_LB_2
-        cdef I_t i, i1, i2
+        cdef D_t sq_dist, reduced_dist_LB_1, reduced_dist_LB_2
+        cdef I_t i, idx_left_node, idx_right_node
+        cdef D_t * this_query_point = query_points + idx_pt * self._n_features

         #------------------------------------------------------------
         # Case 1: query point is outside node radius:
         #         trim it from the query
-        cdef D_t largest = heaps.largest(i_pt)
-        # printf("reduced_dist_LB=%lf\n", reduced_dist_LB)
+        cdef D_t largest = heaps.largest(idx_pt)
         if reduced_dist_LB > largest:
-            # printf("Discarding node %d because reduced_dist_LB=%lf > largest=%lf\n", reduced_dist_LB, largest)
             pass

         #------------------------------------------------------------
         # Case 2: this is a leaf node. Update set of nearby points
         elif node_info.is_leaf:
-            # printf("Inspecting vector in leaf %d\n", i_node)
             for i in range(node_info.idx_start, node_info.idx_end):
-                dist_pt = sqeuclidean_dist(
-                    x1=pt,
+                sq_dist = sqeuclidean_dist(
+                    x1=this_query_point,
                     x2=self._data_ptr + self._indices_ptr[i] * self._n_features,
                     k=self._n_features,
                 )
-                heaps.push(i_pt, dist_pt, self._indices_ptr[i])
+                heaps.push(idx_pt, sq_dist, self._indices_ptr[i])

         #------------------------------------------------------------
         # Case 3: Node is not a leaf. Recursively query subnodes
         #         starting with the closest
         else:
-            # printf("Deleguating to children %d\n", i_node)
-            i1 = 2 * i_node + 1
-            i2 = i1 + 1
-            reduced_dist_LB_1 = self.min_rdist(i1, pt)
-            reduced_dist_LB_2 = self.min_rdist(i2, pt)
+            idx_left_node = 2 * idx_node + 1
+            idx_right_node = idx_left_node + 1
+            reduced_dist_LB_1 = self.min_rdist(idx_left_node, this_query_point)
+            reduced_dist_LB_2 = self.min_rdist(idx_right_node, this_query_point)

             # recursively query subnodes
             if reduced_dist_LB_1 <= reduced_dist_LB_2:
-                self._query_single_depthfirst(i1, pt, i_pt, heaps, reduced_dist_LB_1)
-                self._query_single_depthfirst(i2, pt, i_pt, heaps, reduced_dist_LB_2)
+                self._query_single_depthfirst(idx_left_node, query_points, idx_pt, heaps, reduced_dist_LB_1)
+                self._query_single_depthfirst(idx_right_node, query_points, idx_pt, heaps, reduced_dist_LB_2)
             else:
-                self._query_single_depthfirst(i2, pt, i_pt, heaps, reduced_dist_LB_2)
-                self._query_single_depthfirst(i1, pt, i_pt, heaps, reduced_dist_LB_1)
+                self._query_single_depthfirst(idx_right_node, query_points, idx_pt, heaps, reduced_dist_LB_2)
+                self._query_single_depthfirst(idx_left_node, query_points, idx_pt, heaps, reduced_dist_LB_1)

         return 0

     void query(self,
         np.ndarray query_points,   # IN
         np.ndarray knn_indices,    # IN/OUT
-        np.ndarray knn_distances,  # IN/OUT
+        np.ndarray knn_distances,  # IN/OUT
     ):
         cdef:
             I_t completed_queries = 0
@@ -694,7 +688,6 @@ cdef cypclass KDTree:
             I_t n_query = query_points.shape[0]
             I_t n_features = query_points.shape[1]
             I_t n_neighbors = knn_indices.shape[1]
-            I_t total_n_pushes = n_query * self._n_samples
             D_t * _query_points_ptr = <D_t *> query_points.data
             D_t rdist_lower_bound
@@ -706,10 +699,8 @@ cdef cypclass KDTree:
         )

         for i in range(n_query):
-            # printf("Querying vector %d\n", i)
             rdist_lower_bound = self.min_rdist(0, _query_points_ptr + i * n_features)
             self._query_single_depthfirst(0, _query_points_ptr, i, heaps, rdist_lower_bound)
-            # printf("Done Querying vector %d\n\n", i)

         heaps.sort()
......
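The IN/OUT annotations on query() above imply that the caller preallocates the result buffers and the tree fills them in place. A hypothetical call could look like the sketch below; the dtypes (and whether the distance buffer has to be pre-filled, e.g. with +inf) depend on the I_t/D_t typedefs and on how NeighborsHeaps is initialised, neither of which is shown in this excerpt.

import numpy as np
import kdtree

rng = np.random.RandomState(0)
X = rng.rand(1000, 10)
query_points = rng.rand(5, 10)
k = 3

tree = kdtree.KDTree(X, leaf_size=256)

# Preallocated IN/OUT buffers, filled in place by query().
# int64/float64 are assumptions for the I_t and D_t typedefs.
knn_indices = np.zeros((5, k), dtype=np.int64)
knn_distances = np.full((5, k), np.inf)

tree.query(query_points, knn_indices, knn_distances)
# After the call (and the final heaps.sort()), row i should hold the k nearest
# neighbours of query_points[i] by squared Euclidean distance, in increasing order.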
@@ -4,24 +4,24 @@ import kdtree
 from sklearn.neighbors import KDTree


-@pytest.mark.parametrize("n", [10, 100, 1000, 10000])
-@pytest.mark.parametrize("d", [10, 100])
+@pytest.mark.parametrize("n_samples", [10, 100, 1000, 10000])
+@pytest.mark.parametrize("n_features", [10, 100])
 @pytest.mark.parametrize("leaf_size", [256, 1024])
-def test_creation_deletion(n, d, leaf_size):
-    np.random.seed(1)
-    X = np.random.rand(n, d)
+def test_creation_deletion(n_samples, n_features, leaf_size):
+    rng = np.random.RandomState(1)
+    X = rng.rand(n_samples, n_features)
     tree = kdtree.KDTree(X, leaf_size=256)
     del tree


-@pytest.mark.parametrize("n", [10, 100, 1000, 10000])
-@pytest.mark.parametrize("d", [10, 100])
+@pytest.mark.parametrize("n_samples", [10, 100, 1000])
+@pytest.mark.parametrize("n_features", [10, 100])
 @pytest.mark.parametrize("k", [1, 2, 5, 10])
 @pytest.mark.parametrize("leaf_size", [256, 1024])
-def test_against_sklearn(n, d, k, leaf_size, n_query=1):
-    np.random.seed(2)
-    X = np.random.rand(n, d)
-    query_points = np.random.rand(n_query, d)
+def test_against_sklearn(n_samples, n_features, k, leaf_size, n_query=10):
+    rng = np.random.RandomState(1)
+    X = rng.rand(n_samples, n_features)
+    query_points = rng.rand(n_query, n_features)
     tree = kdtree.KDTree(X, leaf_size=256)
     skl_tree = KDTree(X, leaf_size=256)
......