Commit acfcdbe0 authored by Julien Jerphanion's avatar Julien Jerphanion

[WIP] Adapt querying logic

This change the logic to query the tree for nearest neighbors.
Start with a simple sequential query for each point.
parent d49871ec
......@@ -6,7 +6,7 @@ import numpy as np
np.import_array()
from runtime.runtime cimport BatchMailBox, NullResult, Scheduler, WaitResult
from libc.math cimport log2, fmax, fmin
from libc.math cimport log2, fmax, fmin, fabs
from libc.stdio cimport printf
from libc.stdlib cimport malloc, free
from openmp cimport omp_get_max_threads
......@@ -175,10 +175,21 @@ cdef cypclass Container activable:
self._n_updates += 1
self._array[idx] = value
int n_updates(self):
I_t n_updates(self):
return self._n_updates
cdef inline D_t sqeuclidean_dist(D_t* x1, D_t* x2, I_t k) nogil:
cdef:
I_t i = 0
D_t distance = 0
for i in range(k):
distance += (x1[i] - x2[i]) ** 2
return distance
cdef inline void dual_swap(D_t* darr, I_t* iarr, I_t i1, I_t i2) nogil:
"""swap the values at inex i1 and i2 of both darr and iarr"""
cdef D_t dtmp = darr[i1]
......@@ -254,7 +265,7 @@ cdef void _simultaneous_sort(
size - pivot_idx - 1)
cdef cypclass NeighborsHeaps activable:
cdef cypclass NeighborsHeaps:
"""A max-heap structure to keep track of distances/indices of neighbors
This implements an efficient pre-allocated set of fixed-size heaps
......@@ -265,12 +276,6 @@ cdef cypclass NeighborsHeaps activable:
Taken and adapted from:
https://github.com/scikit-learn/scikit-learn/blob/e4bb9fa86b0df873ad750b6d59090843d9d23d50/sklearn/neighbors/_binary_tree.pxi#L513
The initial caller is responsible for providing the array of indices
which will be modified in place by the actors logic.
n_pushes and is_sorted can be used by the initial caller to know
when to pursue.
Parameters
----------
n_pts : int
......@@ -290,8 +295,6 @@ cdef cypclass NeighborsHeaps activable:
cdef I_t i
self._n_pts = n_pts
self._n_nbrs = n_nbrs
self._active_result_class = WaitResult.construct
self._active_queue_class = consume BatchMailBox(scheduler)
self._distances = <D_t *> malloc(n_pts * n_nbrs * sizeof(D_t))
self._indices = indices
self._n_pushes = 0
......@@ -355,7 +358,6 @@ cdef cypclass NeighborsHeaps activable:
distances[i] = val
indices[i] = i_val
void sort(self):
# NOTE: Ideally we could sort results in parallel, but
# OpenMP threadpool and this runtime's aren't working
......@@ -381,6 +383,9 @@ cdef cypclass NeighborsHeaps activable:
# use of getIntResult
return 1 if self._sorted else 0
I_t largest(self, I_t index):
return self._indices[index * self._n_nbrs]
cdef cypclass Node activable:
"""A KDTree Node
......@@ -488,6 +493,8 @@ cdef cypclass KDTree:
NodeData_t *_node_data_ptr
D_t * _node_bounds_ptr
I_t _node_bounds_ptr_offset
__init__(self,
np.ndarray X,
......@@ -515,12 +522,17 @@ cdef cypclass KDTree:
self._indices_ptr = <I_t *> malloc(n * sizeof(I_t))
self._node_data_ptr = <NodeData_t *> malloc(self._n_nodes * sizeof(NodeData_t))
self._node_bounds_ptr = <D_t *> malloc(2 * self._n_nodes * self._d * sizeof(D_t))
self._node_bounds_ptr_offset = self._n_nodes * self._d
# To be seen as a [2, n_nodes, d] with:
# - elements in [0, :, :] as min
# - elements in [1, :, :] as max
self._node_bounds_ptr = <D_t *> malloc(2 * self._node_bounds_ptr_offset * sizeof(D_t))
for i in range(n):
self._indices_ptr[i] = i
# Recurvisely building the tree here
# Recursively building the tree here
global scheduler
scheduler = Scheduler(num_workers)
......@@ -567,20 +579,18 @@ cdef cypclass KDTree:
free(self._node_bounds_ptr)
cdef int _query_single_depthfirst(self,
ITYPE_t i_node,
DTYPE_t* pt,
ITYPE_t i_pt,
int _query_single_depthfirst(self,
I_t i_node,
D_t* pt,
I_t i_pt,
NeighborsHeaps heaps,
DTYPE_t reduced_dist_LB,
D_t reduced_dist_LB,
) nogil except -1:
"""Recursive Single-tree k-neighbors query, depth-first approach"""
cdef NodeData_t node_info = self._node_data_ptr[i_node]
cdef DTYPE_t dist_pt, reduced_dist_LB_1, reduced_dist_LB_2
cdef ITYPE_t i, i1, i2
cdef DTYPE_t* data = &self.data[0, 0]
cdef D_t dist_pt, reduced_dist_LB_1, reduced_dist_LB_2
cdef I_t i, i1, i2
#------------------------------------------------------------
# Case 1: query point is outside node radius:
......@@ -593,11 +603,11 @@ cdef cypclass KDTree:
elif node_info.is_leaf:
for i in range(node_info.idx_start, node_info.idx_end):
dist_pt = sqeuclidean_dist(
pt,
&self.data[self.idx_array[i], 0],
self.data.shape[1]
x1=pt,
x2=self._data_ptr + self._indices_ptr[i] * self._d,
k=self._d,
)
heaps._push(i_pt, dist_pt, self.idx_array[i])
heaps.push(i_pt, dist_pt, self._indices_ptr[i])
#------------------------------------------------------------
# Case 3: Node is not a leaf. Recursively query subnodes
......@@ -605,8 +615,8 @@ cdef cypclass KDTree:
else:
i1 = 2 * i_node + 1
i2 = i1 + 1
reduced_dist_LB_1 = min_rdist(self, i1, pt)
reduced_dist_LB_2 = min_rdist(self, i2, pt)
reduced_dist_LB_1 = self.min_rdist(i1, pt)
reduced_dist_LB_2 = self.min_rdist(i2, pt)
# recursively query subnodes
if reduced_dist_LB_1 <= reduced_dist_LB_2:
......@@ -629,45 +639,42 @@ cdef cypclass KDTree:
I_t n_features = query_points.shape[1]
I_t n_neighbors = closests.shape[1]
I_t total_n_pushes = n_query * self._n
D_t reduced_dist_LB
D_t * query_point = query_points.data
D_t * _query_points_ptr = <D_t *> query_points.data
D_t rdist_lower_bound
active NeighborsHeaps heaps
NeighborsHeaps heaps = NeighborsHeaps(<I_t *> closests.data, n_query, n_neighbors)
heaps = consume NeighborsHeaps(<I_t *> closests.data,
n_query,
n_neighbors)
for i in range(n_query):
rdist_lower_bound = self.min_rdist(0, _query_points_ptr + i * n_features)
self._query_single_depthfirst(0, _query_points_ptr, i, heaps, rdist_lower_bound)
for i in range(Xarr.shape[0]):
rdist_lower_bound = min_rdist(self, 0, query_point)
_query_single_depthfirst(0, pt, i, heaps, rdist_lower_bound)
query_point += n_features
heaps.sort()
heaps.sort(NULL)
while not(heaps.is_sorted(NULL).getIntResult()):
pass
D_t min_rdist(
KDTree self,
I_t i_node,
D_t* pt,
) nogil except -1:
"""Compute the minimum reduced-distance between a point and a node"""
cdef I_t node_idx
cdef D_t d, d_lo, d_hi, rdist=0.0
cdef I_t j
cdef D_t node_min_j, node_max_j
cdef inline DTYPE_t min_rdist(
KDTree tree,
ITYPE_t i_node,
DTYPE_t* pt,
) nogil except -1:
"""Compute the minimum reduced-distance between a point and a node"""
cdef ITYPE_t n_features = tree.data.shape[1]
cdef DTYPE_t d, d_lo, d_hi, rdist=0.0
cdef ITYPE_t j
for j in range(self._d):
node_min_j = deref(self._node_bounds_ptr + node_idx + j)
node_max_j = deref(self._node_bounds_ptr + node_idx + j + self._node_bounds_ptr_offset)
for j in range(n_features):
d_lo = tree._node_bounds_ptr[0, i_node, j] - pt[j]
d_hi = pt[j] - tree._node_bounds_ptr[1, i_node, j]
d_lo = node_min_j - pt[j]
d_hi = pt[j] - node_max_j
# We use the following identity:
#
# 0.5 * (x + abs(x) = max(x, 0)
# 0.5 * (x + abs(x)) = 0.5 * max(x, 0)
#
# twice.
d = 0.5 (d_lo + fabs(d_lo)) + (d_hi + fabs(d_hi))
d = 0.5 * ((d_lo + fabs(d_lo)) + (d_hi + fabs(d_hi)))
rdist += d ** 2
return rdist
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment