Commit eea55685 authored by Julien Jerphanion's avatar Julien Jerphanion

[WIP] Adapt querying logic

This changes the logic used to query the tree for nearest neighbors.
It starts with a simple sequential query for each query point.
parent 5ba8649c
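For context, the strategy this commit moves to (one depth-first traversal per query point, with a bounded max-heap of squared distances and pruning via a per-node lower bound) can be summarised by the small pure-Python sketch below. It is only an illustration of the traversal, not the cypclass code in this diff: the class name, the helper names and the bounding-box lower bound are assumptions standing in for min_rdist and NeighborsHeaps.

import heapq

import numpy as np


def _sq_dist(a, b):
    d = a - b
    return float(np.dot(d, d))


class SketchKDTree:
    """Toy kd-tree with the children of node i stored at 2*i + 1 and 2*i + 2."""

    def __init__(self, X, leaf_size=16):
        self.X = np.asarray(X, dtype=float)
        self.leaf_size = leaf_size
        self.nodes = {}  # node index -> (point indices, box lower corner, box upper corner)
        self._build(0, np.arange(len(self.X)))

    def _build(self, i_node, indices):
        pts = self.X[indices]
        self.nodes[i_node] = (indices, pts.min(axis=0), pts.max(axis=0))
        if len(indices) > self.leaf_size:
            dim = int(np.argmax(pts.max(axis=0) - pts.min(axis=0)))
            order = indices[np.argsort(pts[:, dim])]
            mid = len(order) // 2
            self._build(2 * i_node + 1, order[:mid])   # left child
            self._build(2 * i_node + 2, order[mid:])   # right child

    def _is_leaf(self, i_node):
        return 2 * i_node + 1 not in self.nodes

    def _min_sq_dist(self, i_node, pt):
        # Squared distance from pt to the node's bounding box (0 when inside).
        _, lo, hi = self.nodes[i_node]
        delta = np.maximum(lo - pt, 0.0) + np.maximum(pt - hi, 0.0)
        return float(np.dot(delta, delta))

    def _query_one(self, i_node, pt, heap, k, lower_bound):
        # Case 1: prune -- the node cannot beat the current k-th best distance.
        # The heap keeps (-distance, index), so -heap[0][0] is the largest distance.
        if len(heap) == k and lower_bound > -heap[0][0]:
            return
        if self._is_leaf(i_node):
            # Case 2: leaf -- try every point stored in it.
            for i in self.nodes[i_node][0]:
                d = _sq_dist(pt, self.X[i])
                if len(heap) < k:
                    heapq.heappush(heap, (-d, int(i)))
                elif d < -heap[0][0]:
                    heapq.heapreplace(heap, (-d, int(i)))
        else:
            # Case 3: recurse into both children, closest one first.
            left, right = 2 * i_node + 1, 2 * i_node + 2
            lb_left = self._min_sq_dist(left, pt)
            lb_right = self._min_sq_dist(right, pt)
            first, second = (left, lb_left), (right, lb_right)
            if lb_right < lb_left:
                first, second = second, first
            self._query_one(first[0], pt, heap, k, first[1])
            self._query_one(second[0], pt, heap, k, second[1])

    def query(self, query_points, k):
        # Sequential strategy: one independent depth-first traversal per point.
        # Assumes k <= number of indexed points.
        query_points = np.asarray(query_points, dtype=float)
        indices = np.empty((len(query_points), k), dtype=np.intp)
        distances = np.empty((len(query_points), k))
        for row, pt in enumerate(query_points):
            heap = []
            self._query_one(0, pt, heap, k, self._min_sq_dist(0, pt))
            for col, (d, i) in enumerate(sorted((-neg_d, i) for neg_d, i in heap)):
                distances[row, col] = d
                indices[row, col] = i
        return distances, indices

With k <= n_samples, SketchKDTree(X).query(Q, k) should agree with a brute-force computation of the k smallest squared distances, which is essentially the property test_against_sklearn below checks against scikit-learn's KDTree.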
@@ -300,7 +300,7 @@ cdef cypclass NeighborsHeaps:
     ):
         self._n_pts = n_pts
         self._n_nbrs = n_nbrs
-        self._distances = distances # <D_t *> malloc(n_pts * n_nbrs * sizeof(D_t))
+        self._distances = distances
         self._indices = indices
         self._n_pushes = 0
         self._sorted = False
@@ -321,11 +321,8 @@ cdef cypclass NeighborsHeaps:
         self._n_pushes += 1
-        # printf("Pushing for %d, (%d, %lf)\n", row, i_val, val)

         # check if val should be in heap
         if val > distances[0]:
-            # printf("Discarding %d\n", row)
            return

         # insert val at position zero
@@ -494,7 +491,7 @@ cdef cypclass Node activable:
         # Recursing on both partitions.
         self._left.build_node(
             sync_method=NULL,
-            node_index=2 * node_index,
+            node_index=2 * node_index + 1,
             data_ptr=data_ptr,
             indices_ptr=indices_ptr,
             leaf_size=leaf_size,
@@ -507,7 +504,7 @@ cdef cypclass Node activable:
         self._right.build_node(
             sync_method=NULL,
-            node_index=2 * node_index + 1,
+            node_index=2 * node_index + 2,
             data_ptr=data_ptr,
             indices_ptr=indices_ptr,
             leaf_size=leaf_size,
@@ -566,7 +563,7 @@ cdef cypclass KDTree:
         self._leaf_size = leaf_size
         self._n_levels = <I_t> (log2(fmax(1, (self._n_samples - 1) / self._leaf_size)) + 1)
-        self._n_nodes = <I_t> (2 ** self._n_levels)
+        self._n_nodes = <I_t> (2 ** (self._n_levels + 1))
         self._data_ptr = <D_t *> X.data
         self._indices_ptr = <I_t *> malloc(self._n_samples * sizeof(I_t))
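A quick sizing check for the updated formula (illustrative numbers, not from this commit): with n_samples = 10000 and leaf_size = 256, n_levels = <I_t>(log2(max(1, 9999 / 256)) + 1) = <I_t>(6.29) = 6, so 2 ** (6 + 1) = 128 node slots are allocated, enough for every heap index of a complete binary tree with children at 2 * i + 1 and 2 * i + 2 over 7 levels (at most 2 ** 7 - 1 = 127 nodes).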
@@ -628,65 +625,62 @@ cdef cypclass KDTree:
         free(self._node_bounds_ptr)

     int _query_single_depthfirst(self,
-        I_t i_node,
-        D_t* pt,
-        I_t i_pt,
+        I_t idx_node,
+        D_t* query_points,
+        I_t idx_pt,
         NeighborsHeaps heaps,
         D_t reduced_dist_LB,
     ) except -1:
         """Recursive Single-tree k-neighbors query, depth-first approach"""
-        cdef NodeData_t node_info = self._node_data_ptr[i_node]
+        cdef NodeData_t node_info = self._node_data_ptr[idx_node]

-        cdef D_t dist_pt, reduced_dist_LB_1, reduced_dist_LB_2
-        cdef I_t i, i1, i2
+        cdef D_t sq_dist, reduced_dist_LB_1, reduced_dist_LB_2
+        cdef I_t i, idx_left_node, idx_right_node
+        cdef D_t * this_query_point = query_points + idx_pt * self._n_features

         #------------------------------------------------------------
         # Case 1: query point is outside node radius:
         #         trim it from the query
-        cdef D_t largest = heaps.largest(i_pt)
-        # printf("reduced_dist_LB=%lf\n", reduced_dist_LB)
+        cdef D_t largest = heaps.largest(idx_pt)
         if reduced_dist_LB > largest:
-            # printf("Discarding node %d because reduced_dist_LB=%lf > largest=%lf\n", reduced_dist_LB, largest)
             pass

         #------------------------------------------------------------
         # Case 2: this is a leaf node. Update set of nearby points
         elif node_info.is_leaf:
-            # printf("Inspecting vector in leaf %d\n", i_node)
             for i in range(node_info.idx_start, node_info.idx_end):
-                dist_pt = sqeuclidean_dist(
-                    x1=pt,
+                sq_dist = sqeuclidean_dist(
+                    x1=this_query_point,
                     x2=self._data_ptr + self._indices_ptr[i] * self._n_features,
                     k=self._n_features,
                 )
-                heaps.push(i_pt, dist_pt, self._indices_ptr[i])
+                heaps.push(idx_pt, sq_dist, self._indices_ptr[i])

         #------------------------------------------------------------
         # Case 3: Node is not a leaf. Recursively query subnodes
         #         starting with the closest
         else:
-            # printf("Deleguating to children %d\n", i_node)
-            i1 = 2 * i_node + 1
-            i2 = i1 + 1
-            reduced_dist_LB_1 = self.min_rdist(i1, pt)
-            reduced_dist_LB_2 = self.min_rdist(i2, pt)
+            idx_left_node = 2 * idx_node + 1
+            idx_right_node = idx_left_node + 1
+            reduced_dist_LB_1 = self.min_rdist(idx_left_node, this_query_point)
+            reduced_dist_LB_2 = self.min_rdist(idx_right_node, this_query_point)

             # recursively query subnodes
             if reduced_dist_LB_1 <= reduced_dist_LB_2:
-                self._query_single_depthfirst(i1, pt, i_pt, heaps, reduced_dist_LB_1)
-                self._query_single_depthfirst(i2, pt, i_pt, heaps, reduced_dist_LB_2)
+                self._query_single_depthfirst(idx_left_node, query_points, idx_pt, heaps, reduced_dist_LB_1)
+                self._query_single_depthfirst(idx_right_node, query_points, idx_pt, heaps, reduced_dist_LB_2)
             else:
-                self._query_single_depthfirst(i2, pt, i_pt, heaps, reduced_dist_LB_2)
-                self._query_single_depthfirst(i1, pt, i_pt, heaps, reduced_dist_LB_1)
+                self._query_single_depthfirst(idx_right_node, query_points, idx_pt, heaps, reduced_dist_LB_2)
+                self._query_single_depthfirst(idx_left_node, query_points, idx_pt, heaps, reduced_dist_LB_1)

         return 0

     void query(self,
         np.ndarray query_points,   # IN
         np.ndarray knn_indices,    # IN/OUT
-        np.ndarray knn_distances,  # IN/OUT
+        np.ndarray knn_distances,  # IN/OUT
     ):
         cdef:
             I_t completed_queries = 0
@@ -694,7 +688,6 @@ cdef cypclass KDTree:
             I_t n_query = query_points.shape[0]
             I_t n_features = query_points.shape[1]
             I_t n_neighbors = knn_indices.shape[1]
-            I_t total_n_pushes = n_query * self._n_samples
             D_t * _query_points_ptr = <D_t *> query_points.data
             D_t rdist_lower_bound
@@ -706,10 +699,8 @@ cdef cypclass KDTree:
         )

         for i in range(n_query):
-            # printf("Querying vector %d\n", i)
             rdist_lower_bound = self.min_rdist(0, _query_points_ptr + i * n_features)
             self._query_single_depthfirst(0, _query_points_ptr, i, heaps, rdist_lower_bound)
-            # printf("Done Querying vector %d\n\n", i)

         heaps.sort()
......
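The IN/OUT annotations on query() above imply that the caller preallocates the result buffers and the tree fills them in place. A hypothetical call could look like the sketch below; the dtypes (and whether the distance buffer has to be pre-filled, e.g. with +inf) depend on the I_t/D_t typedefs and on how NeighborsHeaps is initialised, neither of which is shown in this excerpt.

import numpy as np
import kdtree

rng = np.random.RandomState(0)
X = rng.rand(1000, 10)
query_points = rng.rand(5, 10)
k = 3

tree = kdtree.KDTree(X, leaf_size=256)

# Preallocated IN/OUT buffers, filled in place by query().
# int64/float64 are assumptions for the I_t and D_t typedefs.
knn_indices = np.zeros((5, k), dtype=np.int64)
knn_distances = np.full((5, k), np.inf)

tree.query(query_points, knn_indices, knn_distances)
# After the call (and the final heaps.sort()), row i should hold the k nearest
# neighbours of query_points[i] by squared Euclidean distance, in increasing order.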
@@ -4,24 +4,24 @@ import kdtree
 from sklearn.neighbors import KDTree


-@pytest.mark.parametrize("n", [10, 100, 1000, 10000])
-@pytest.mark.parametrize("d", [10, 100])
+@pytest.mark.parametrize("n_samples", [10, 100, 1000, 10000])
+@pytest.mark.parametrize("n_features", [10, 100])
 @pytest.mark.parametrize("leaf_size", [256, 1024])
-def test_creation_deletion(n, d, leaf_size):
-    np.random.seed(1)
-    X = np.random.rand(n, d)
+def test_creation_deletion(n_samples, n_features, leaf_size):
+    rng = np.random.RandomState(1)
+    X = rng.rand(n_samples, n_features)
     tree = kdtree.KDTree(X, leaf_size=256)
     del tree


-@pytest.mark.parametrize("n", [10, 100, 1000, 10000])
-@pytest.mark.parametrize("d", [10, 100])
+@pytest.mark.parametrize("n_samples", [10, 100, 1000])
+@pytest.mark.parametrize("n_features", [10, 100])
 @pytest.mark.parametrize("k", [1, 2, 5, 10])
 @pytest.mark.parametrize("leaf_size", [256, 1024])
-def test_against_sklearn(n, d, k, leaf_size, n_query=1):
-    np.random.seed(2)
-    X = np.random.rand(n, d)
-    query_points = np.random.rand(n_query, d)
+def test_against_sklearn(n_samples, n_features, k, leaf_size, n_query=10):
+    rng = np.random.RandomState(1)
+    X = rng.rand(n_samples, n_features)
+    query_points = rng.rand(n_query, n_features)
     tree = kdtree.KDTree(X, leaf_size=256)
     skl_tree = KDTree(X, leaf_size=256)
......