Commit 78ab113b authored by Julien Jerphanion's avatar Julien Jerphanion

[WIP] Use KDTree-local compact datastructures

This moves the Node information to a dedicated data structure,
decoupling the actor hierarchy from the data and allowing the use of
adapted patterns for both constructing and querying the tree.
parent cb190664
...@@ -6,11 +6,11 @@ import numpy as np ...@@ -6,11 +6,11 @@ import numpy as np
np.import_array() np.import_array()
from runtime.runtime cimport BatchMailBox, NullResult, Scheduler, WaitResult from runtime.runtime cimport BatchMailBox, NullResult, Scheduler, WaitResult
from libc.math cimport log2, fmax
from libc.stdio cimport printf from libc.stdio cimport printf
from libc.stdlib cimport malloc, free from libc.stdlib cimport malloc, free
from openmp cimport omp_get_max_threads from openmp cimport omp_get_max_threads
from cython.operator cimport dereference as deref
from cython.parallel import prange from cython.parallel import prange
## Types declaration ## Types declaration
...@@ -20,6 +20,11 @@ ctypedef double D_t ...@@ -20,6 +20,11 @@ ctypedef double D_t
cdef lock Scheduler scheduler cdef lock Scheduler scheduler
cdef D_t INF = 1e38 cdef D_t INF = 1e38
# Per-node record stored in the array that KDTree allocates with
# malloc(_n_nodes * sizeof(NodeData_t)); Node.build_node addresses the entry
# of a node as `_node_data_ptr + node_index`.
# NOTE(review): in the visible code is_leaf is only written on the leaf
# branch and the array comes from malloc -- confirm it is initialised for
# internal nodes before it is read.
cdef struct NodeData_t:
# Receives build_node's `start` argument for this node.
I_t idx_start
# Receives build_node's `end` argument for this node.
I_t idx_end
# Set to True by build_node when end - start <= leaf_size.
I_t is_leaf
# NOTE: The following extern definition is used to interface # NOTE: The following extern definition is used to interface
# std::nth_element, a robust partitioning algorithm, in Cython # std::nth_element, a robust partitioning algorithm, in Cython
cdef extern from *: cdef extern from *:
...@@ -339,39 +344,32 @@ cdef cypclass Node activable: ...@@ -339,39 +344,32 @@ cdef cypclass Node activable:
Leafs are terminal Nodes and do not have children. Leafs are terminal Nodes and do not have children.
""" """
# KDTree pointer reference
# TODO: can those be made class attributes?
NodeData_t *_node_data_ptr
D_t * _node_bounds_ptr
# Reference to the head of the allocated arrays # Reference to the head of the allocated arrays
# data gets not modified via _data_ptr # data gets not modified via _data_ptr
D_t * _data_ptr I_t node_index
I_t * _indices_ptr
# The point the Node split on
D_t * _point
I_t _n_dims
I_t _dim
# Portion of _indices covered by the Node is:
# _indices_ptr[_start:_end]
I_t _start
I_t _end
bint _is_leaf
active Node _left active Node _left
active Node _right active Node _right
__init__(self): __init__(self, NodeData_t * node_data_ptr, D_t * node_bounds_ptr):
# Needed for Cython+ actors
self._active_result_class = WaitResult.construct self._active_result_class = WaitResult.construct
self._active_queue_class = consume BatchMailBox(scheduler) self._active_queue_class = consume BatchMailBox(scheduler)
self._left = NULL
self._right = NULL self._node_data_ptr = node_data_ptr
self._is_leaf = False self._node_bounds_ptr = node_bounds_ptr
# We use this to allow using actors for initialisation # We use this to allow using actors for initialisation
# because __init__ can't be reified. # because __init__ can't be reified.
void build_node( void build_node(
self, self,
I_t node_index,
D_t * data_ptr, D_t * data_ptr,
I_t * indices_ptr, I_t * indices_ptr,
I_t leaf_size, I_t leaf_size,
...@@ -381,20 +379,17 @@ cdef cypclass Node activable: ...@@ -381,20 +379,17 @@ cdef cypclass Node activable:
I_t end, I_t end,
active Counter counter, active Counter counter,
): ):
cdef I_t i # Simple round-robin on dimensions.
# TODO: Choose the dimension with maximum spread at each recursion instead.
cdef I_t next_dim = (dim + 1) % n_dims cdef I_t next_dim = (dim + 1) % n_dims
cdef I_t mid = (start + end) // 2 cdef I_t mid = (start + end) // 2
self._data_ptr = data_ptr cdef NodeData_t * node_data = self._node_data_ptr + node_index
self._indices_ptr = indices_ptr deref(node_data).idx_start = start
deref(node_data).idx_end = end
self._dim = dim
self._n_dims = n_dims
self._start = start
self._end = end
if (end - start <= leaf_size): if (end - start <= leaf_size):
self._is_leaf = True deref(node_data).is_leaf = True
# Adding to the global counter the number # Adding to the global counter the number
# of samples the leaf is responsible for. # of samples the leaf is responsible for.
counter.add(NULL, end - start) counter.add(NULL, end - start)
...@@ -403,17 +398,16 @@ cdef cypclass Node activable: ...@@ -403,17 +398,16 @@ cdef cypclass Node activable:
# We partition the samples in two nodes on a given dimension, # We partition the samples in two nodes on a given dimension,
# with the middle point as a pivot. # with the middle point as a pivot.
partition_node_indices(data_ptr, indices_ptr, start, mid, end, dim, n_dims) partition_node_indices(data_ptr, indices_ptr, start, mid, end, dim, n_dims)
self._point = data_ptr + mid
self._left = consume Node() self._left = consume Node(self._node_data_ptr, self._node_bounds_ptr)
self._right = consume Node() self._right = consume Node(self._node_data_ptr, self._node_bounds_ptr)
# Recursing on both partitions. # Recursing on both partitions.
self._left.build_node(NULL, self._left.build_node(NULL, <I_t> 2 * node_index,
data_ptr, indices_ptr, data_ptr, indices_ptr,
leaf_size, n_dims, next_dim, leaf_size, n_dims, next_dim,
start, mid, counter) start, mid, counter)
self._right.build_node(NULL, self._right.build_node(NULL, <I_t> (2 * node_index + 1),
data_ptr, indices_ptr, data_ptr, indices_ptr,
leaf_size, n_dims, next_dim, leaf_size, n_dims, next_dim,
mid, end, counter) mid, end, counter)
...@@ -428,21 +422,23 @@ cdef cypclass Node activable: ...@@ -428,21 +422,23 @@ cdef cypclass Node activable:
D_t dist D_t dist
D_t tmp D_t tmp
if self._is_leaf: if True: #self._is_leaf:
# Computing all the Euclidean distances here. # Computing all the Euclidean distances here.
for j in range(self._start, self._end): # for j in range(self._start, self._end):
dist = 0 # dist = 0
for k in range(self._n_dims): # for k in range(self._n_dims):
tmp = ( # tmp = (
query_points[i * self._n_dims + k] - # query_points[i * self._n_dims + k] -
self._data_ptr[self._indices_ptr[j] * self._n_dims + k] # self._data_ptr[self._indices_ptr[j] * self._n_dims + k]
) # )
dist += tmp * tmp # dist += tmp * tmp
#
# The heap is responsible for keeping the closest #
# points for each query point i. # # The heap is responsible for keeping the closest
heaps.push(NULL, i, dist, self._indices_ptr[j]) # # points for each query point i.
# heaps.push(NULL, i, dist, self._indices_ptr[j])
#
# return
return return
# TODO: one can implement a pruning strategy here. # TODO: one can implement a pruning strategy here.
...@@ -465,6 +461,8 @@ cdef cypclass KDTree: ...@@ -465,6 +461,8 @@ cdef cypclass KDTree:
I_t _n # number of examples I_t _n # number of examples
I_t _d # number of dimensions / features I_t _d # number of dimensions / features
I_t _leaf_size # maximum number of vectors at leaf I_t _leaf_size # maximum number of vectors at leaf
I_t _n_levels # number of nodes levels in the tree
I_t _n_nodes # number of nodes in the tree
I_t _n_leafs I_t _n_leafs
...@@ -472,6 +470,9 @@ cdef cypclass KDTree: ...@@ -472,6 +470,9 @@ cdef cypclass KDTree:
D_t *_data_ptr D_t *_data_ptr
I_t *_indices_ptr I_t *_indices_ptr
NodeData_t *_node_data_ptr
D_t * _node_bounds_ptr
__init__(self, __init__(self,
np.ndarray X, np.ndarray X,
...@@ -492,17 +493,27 @@ cdef cypclass KDTree: ...@@ -492,17 +493,27 @@ cdef cypclass KDTree:
self._d = d self._d = d
self._leaf_size = leaf_size self._leaf_size = leaf_size
self._n_levels = <I_t> (log2(fmax(1, (self._n - 1) / self._leaf_size)) + 1)
self._n_nodes = <I_t> (2 ** self._n_levels)
self._data_ptr = <D_t *> X.data self._data_ptr = <D_t *> X.data
self._indices_ptr = <I_t *> malloc(n * sizeof(I_t)) self._indices_ptr = <I_t *> malloc(n * sizeof(I_t))
self._node_data_ptr = <NodeData_t *> malloc(self._n_nodes * sizeof(NodeData_t))
self._node_bounds_ptr = <D_t *> malloc(2 * self._n_nodes * self._d * sizeof(D_t))
for i in range(n): for i in range(n):
self._indices_ptr[i] = i self._indices_ptr[i] = i
# Recursively building the tree here # Recursively building the tree here
global scheduler global scheduler
scheduler = Scheduler(num_workers) scheduler = Scheduler(num_workers)
# This Counter is used as a way to implement a barrier for
# the asynchronous construction of the tree.
cdef active Counter counter = consume Counter() cdef active Counter counter = consume Counter()
self._root = consume Node() self._root = consume Node(self._node_data_ptr, self._node_bounds_ptr)
if self._root is NULL: if self._root is NULL:
printf("Error consuming node\n") printf("Error consuming node\n")
...@@ -513,7 +524,7 @@ cdef cypclass KDTree: ...@@ -513,7 +524,7 @@ cdef cypclass KDTree:
# #
# Also using this separate method allowing using actors # Also using this separate method allowing using actors
# because __init__ can't be reified. # because __init__ can't be reified.
self._root.build_node(NULL, self._root.build_node(NULL, 0,
self._data_ptr, self._data_ptr,
self._indices_ptr, self._indices_ptr,
self._leaf_size, n_dims=self._d, self._leaf_size, n_dims=self._d,
...@@ -531,6 +542,8 @@ cdef cypclass KDTree: ...@@ -531,6 +542,8 @@ cdef cypclass KDTree:
void __dealloc__(self): void __dealloc__(self):
scheduler.finish() scheduler.finish()
free(self._indices_ptr) free(self._indices_ptr)
free(self._node_data_ptr)
free(self._node_bounds_ptr)
void query(self, void query(self,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment