Commit 06debbda authored by Julien Jerphanion

Add query on closest point

This makes use of a class of actors wrapping a numpy array of
the nearest neighbors indices.

Also:
 - Replace depth with leaf_size
 - Make the counter an argument
 - Correctly manage the scheduler lifecycle
parent b8f4a92c
......@@ -66,53 +66,101 @@ cdef extern from *:
cdef cypclass Node activable:
"""A KDTree Node"""
D_t * data_ptr
I_t * indices_ptr
D_t * point
I_t n_dims
active Counter counter
I_t dim
I_t start
I_t end
bint is_leaf
active Node left
active Node right
__init__(self, active Counter counter):
self.counter = counter
self._active_result_class = NullResult
__init__(self):
self._active_result_class = WaitResult.construct
self._active_queue_class = consume BatchMailBox(scheduler)
self.left = NULL
self.right = NULL
self.is_leaf = False
void build_node(
self,
D_t * data_ptr,
I_t * indices_ptr,
I_t depth,
I_t leaf_size,
I_t n_dims,
I_t dim,
I_t start,
I_t end,
active Counter counter,
):
cdef I_t i
cdef I_t next_dim = (dim + 1) % n_dims
cdef I_t mid = (start + end) // 2
self.data_ptr = data_ptr
self.indices_ptr = indices_ptr
self.dim = dim
self.n_dims = n_dims
self.start = start
self.end = end
if (depth < 0) or (end - start <= 1):
self.counter.add(NULL, end - start)
if (end - start <= leaf_size):
self.is_leaf = True
counter.add(NULL, end - start)
return
partition_node_indices(data_ptr, indices_ptr, start, mid, end, dim, n_dims)
self.point = data_ptr + mid
self.left = consume Node(self.counter)
self.right = consume Node(self.counter)
self.left = consume Node()
self.right = consume Node()
self.left.build_node(NULL,
data_ptr, indices_ptr,
depth - 1, n_dims, next_dim,
start, mid)
leaf_size, n_dims, next_dim,
start, mid, counter)
self.right.build_node(NULL,
data_ptr, indices_ptr,
depth - 1, n_dims, next_dim,
mid, end)
leaf_size, n_dims, next_dim,
mid, end, counter)
void get_closest(self,
                 D_t * query_points,
                 I_t i,
                 active Container container
):
    # Answer query number `i` by descending to a single leaf and scanning it.
    #
    # query_points: flat buffer of queries, indexed as
    #               query_points[i * n_dims + k].
    # i:            row of the query to answer.
    # container:    actor receiving the result through `update(i, closest)`.
    #
    # Internal nodes forward the query to exactly one child, chosen by
    # comparing the query coordinate along `self.dim` with the split point.
    # No backtracking into the sibling subtree is done here, so the reported
    # point is the closest one *within the reached leaf*.
    #
    # The reported value is the row index into the data buffer
    # (`indices_ptr[j]`), i.e. the same index used to compute the distance,
    # or -1 when the leaf holds no point.
    cdef:
        I_t j, k, closest = -1
        D_t dist = 1e38
        D_t tmp
        D_t min_distance = 1e38

    if self.is_leaf:
        # Brute-force scan of the points owned by this leaf; the bounds are
        # the node's own fields, hence `self.start` / `self.end`.
        for j in range(self.start, self.end):
            # Squared Euclidean distance between query `i` and the data row
            # referenced by the j-th entry of the (permuted) index array.
            dist = 0
            for k in range(self.n_dims):
                tmp = (
                    query_points[i * self.n_dims + k] -
                    self.data_ptr[self.indices_ptr[j] * self.n_dims + k]
                )
                dist += tmp * tmp
            if dist < min_distance:
                # Report the data row, not the position inside indices_ptr.
                closest = self.indices_ptr[j]
                min_distance = dist

        # Leading NULL: unused predicate argument of the reified method.
        container.update(NULL, i, closest)
        return

    if query_points[i * self.n_dims + self.dim] < self.point[self.dim]:
        self.left.get_closest(NULL, query_points, i, container)
    else:
        self.right.get_closest(NULL, query_points, i, container)
cdef cypclass Counter activable:
""" A simple Counter.
......@@ -136,6 +184,33 @@ cdef cypclass Counter activable:
I_t value(self):
return self.n
cdef cypclass Container activable:
""" A simple wrapper of an array.
Useful for synchronisation, as it can be used as a barrier.
"""
# Pointer to the wrapped buffer. It is supplied by the caller; nothing in
# this class allocates or frees it.
I_t * _array
# Size given at construction. It is stored but not read by the methods
# below (in particular, `update` does no bounds check against it).
I_t _size
# Number of `update` calls processed so far.
I_t _n_updates
# Wrap `array` and start with zero recorded updates.
# `size` is presumably the number of entries in `array`;
# KDTree.get_closest passes the number of query points.
__init__(self, I_t *array, I_t size):
self._array = array
self._size = size
# Result and mailbox classes used once the object is activated.
# NOTE(review): their exact semantics come from the actor runtime and are
# not visible in this file.
self._active_result_class = WaitResult.construct
self._active_queue_class = consume BatchMailBox(scheduler)
self._n_updates = 0
# Count one more update, then store `value` at position `idx`.
void update(self, I_t idx, I_t value):
self._n_updates += 1
self._array[idx] = value
# Return how many updates were recorded. KDTree.get_closest polls this
# until it reaches the number of queries, which is what makes the
# container usable as a barrier.
int get_n_updates(self):
return self._n_updates
cdef cypclass KDTree:
"""A KDTree based on asynchronous and parallel computations.
......@@ -150,69 +225,89 @@ cdef cypclass KDTree:
I_t n # number of data
I_t d # number of dimensions / features
I_t depth # max_depth of the tree (to be unified with leaf_size)
I_t leaf_size # maximum number of vectors at leaf
active Node root
active Counter initialised_vec_counter
active Node _root
D_t *data_ptr
I_t *indices_ptr
__init__(self,
np.ndarray data,
I_t depth,
np.ndarray X,
I_t leaf_size,
):
cdef I_t i
cdef I_t n = data.shape[0]
cdef I_t d = data.shape[1]
cdef I_t n = X.shape[0]
cdef I_t d = X.shape[1]
self.n = n
self.d = d
self.depth = depth
self.leaf_size = leaf_size
self.data_ptr = <D_t *> data.data
self.data_ptr = <D_t *> X.data
self.indices_ptr = <I_t *> malloc(n * sizeof(I_t))
for i in range(n):
self.indices_ptr[i] = i
global scheduler
scheduler = Scheduler()
self._recursive_build()
void _recursive_build(self):
# TODO: introducing a context manager for the runtime
# would be nice here:
# ```
# with scheduler:
# self.root = ...
# ```
cdef I_t initialised
global scheduler
scheduler = Scheduler()
self.initialised_vec_counter = consume Counter()
self.root = consume Node(self.initialised_vec_counter)
if self.root is NULL:
cdef active Counter counter = consume Counter()
self._root = consume Node()
if self._root is NULL:
printf("Error consuming node\n")
# When objects are activated (set as Actors), their methods
# are reified. Calling a reified method takes an extra
# leading argument for a predicate, which we are not
# using here, hence the extra NULL.
self.root.build_node(NULL,
self.data_ptr,
self.indices_ptr,
depth, n_dims=d, dim=0, start=0, end=n)
self._root.build_node(NULL,
self.data_ptr,
self.indices_ptr,
leaf_size, n_dims=d, dim=0, start=0, end=n,
counter=counter)
initialised = self.initialised_vec_counter.value(NULL).getIntResult()
initialised = counter.value(NULL).getIntResult()
while(initialised < self.n):
initialised = self.initialised_vec_counter.value(NULL).getIntResult()
scheduler.finish()
initialised = counter.value(NULL).getIntResult()
counter.reset(NULL)
void __dealloc__(self):
# Call finish() on the global scheduler, then release the index buffer
# that __init__ obtained with malloc.
scheduler.finish()
free(self.indices_ptr)
void get_closest(self,
np.ndarray query_points, # IN
np.ndarray closests, # OUT
):
# For each row of `query_points`, ask the root node for a close point and
# have the result written into `closests`. Returns once one result has
# been recorded per query.
# NOTE(review): both arrays are reinterpreted as raw D_t / I_t buffers
# with no dtype or layout check here. Node.get_closest indexes the
# queries as query_points[i * n_dims + k], so a row-major contiguous
# array of matching dtype is presumably required; confirm with callers.
cdef:
I_t completed_queries = 0
I_t i
I_t n_query = query_points.shape[0]
active Container closests_container
global scheduler
# Actor wrapping the output buffer; it also counts the results written.
closests_container = consume Container(<I_t *> closests.data, n_query)
# Send one query per row to the root node. The leading NULL is the
# unused predicate argument of reified methods.
for i in range(n_query):
self._root.get_closest(NULL,
<D_t *> query_points.data,
i, closests_container)
# Poll the container until it has recorded one update per query.
while(completed_queries < n_query):
completed_queries = closests_container.get_n_updates(NULL).getIntResult()
cdef public int main() nogil:
# Entry point for the compiled binary file
printf("empty public int main() nogil:")
......
......@@ -2,11 +2,15 @@ import numpy as np
import kdtree
if __name__ == "__main__":
n, d = 1000000, 10
np.random.seed(1)
n, d = 1000000, 2
golden_ratio = (1 + 5 ** 0.5) / 2
X = np.zeros((n, d))
for i in range(n):
X[i, 0] = (i / golden_ratio) % 1
X[i, 1] = i / n
tree = kdtree.KDTree(X, depth=2)
del tree
query_points = np.random.rand(100000, 2)
closests = np.zeros((query_points.shape[0]), dtype=np.int32)
tree.get_closest(query_points, closests)
\ No newline at end of file
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment