[WIP] Use KDTree-local compact datastructures

This moves the Node information into a dedicated datastructure,
decoupling the actor hierarchy from the data and allowing
adapted patterns for both constructing and querying the tree.
parent cb190664
......@@ -6,11 +6,11 @@ import numpy as np
np.import_array()
from runtime.runtime cimport BatchMailBox, NullResult, Scheduler, WaitResult
from libc.math cimport log2, fmax
from libc.stdio cimport printf
from libc.stdlib cimport malloc, free
from openmp cimport omp_get_max_threads
from cython.operator cimport dereference as deref
from cython.parallel import prange
## Types declaration
......@@ -20,6 +20,11 @@ ctypedef double D_t
cdef lock Scheduler scheduler
cdef D_t INF = 1e38
# Compact per-node metadata, stored in a flat array owned by the KDTree
# (_node_data_ptr) and indexed by node_index, so the tree data is
# decoupled from the actor hierarchy.
cdef struct NodeData_t:
    I_t idx_start  # first position in _indices covered by this node
    I_t idx_end    # one-past-last position in _indices covered by this node
    I_t is_leaf    # nonzero when the node is terminal (end - start <= leaf_size)
# NOTE: The following extern definition is used to interface
# std::nth_element, a robust partitioning algorithm, in Cython
cdef extern from *:
......@@ -339,39 +344,32 @@ cdef cypclass Node activable:
Leafs are terminal Nodes and do not have children.
"""
# KDTree pointer reference
# TODO: can those be made class attributes?
NodeData_t *_node_data_ptr
D_t * _node_bounds_ptr
# Reference to the head of the allocated arrays
# data gets not modified via _data_ptr
D_t * _data_ptr
I_t * _indices_ptr
# The point the Node split on
D_t * _point
I_t _n_dims
I_t _dim
# Portion of _indices covered by the Node is:
# _indices_ptr[_start:_end]
I_t _start
I_t _end
bint _is_leaf
I_t node_index
active Node _left
active Node _right
__init__(self):
__init__(self, NodeData_t * node_data_ptr, D_t * node_bounds_ptr):
# Needed for Cython+ actors
self._active_result_class = WaitResult.construct
self._active_queue_class = consume BatchMailBox(scheduler)
self._left = NULL
self._right = NULL
self._is_leaf = False
self._node_data_ptr = node_data_ptr
self._node_bounds_ptr = node_bounds_ptr
# We use this to allow using actors for initialisation
# because __init__ can't be reified.
void build_node(
self,
I_t node_index,
D_t * data_ptr,
I_t * indices_ptr,
I_t leaf_size,
......@@ -381,20 +379,17 @@ cdef cypclass Node activable:
I_t end,
active Counter counter,
):
cdef I_t i
# Simple round-robin on dimensions.
# TODO: Choose the dimension with maximum spread at each recursion instead.
cdef I_t next_dim = (dim + 1) % n_dims
cdef I_t mid = (start + end) // 2
self._data_ptr = data_ptr
self._indices_ptr = indices_ptr
self._dim = dim
self._n_dims = n_dims
self._start = start
self._end = end
cdef NodeData_t * node_data = self._node_data_ptr + node_index
deref(node_data).idx_start = start
deref(node_data).idx_end = end
if (end - start <= leaf_size):
self._is_leaf = True
deref(node_data).is_leaf = True
# Adding to the global counter the number
# of samples the leaf is responsible for.
counter.add(NULL, end - start)
......@@ -403,46 +398,47 @@ cdef cypclass Node activable:
# We partition the samples in two nodes on a given dimension,
# with the middle point as a pivot.
partition_node_indices(data_ptr, indices_ptr, start, mid, end, dim, n_dims)
self._point = data_ptr + mid
self._left = consume Node()
self._right = consume Node()
self._left = consume Node(self._node_data_ptr, self._node_bounds_ptr)
self._right = consume Node(self._node_data_ptr, self._node_bounds_ptr)
# Recursing on both partitions.
self._left.build_node(NULL,
self._left.build_node(NULL, <I_t> 2 * node_index,
data_ptr, indices_ptr,
leaf_size, n_dims, next_dim,
start, mid, counter)
self._right.build_node(NULL,
self._right.build_node(NULL, <I_t> (2 * node_index + 1),
data_ptr, indices_ptr,
leaf_size, n_dims, next_dim,
mid, end, counter)
void query(self,
D_t * query_points,
I_t i,
active NeighborsHeap heaps,
D_t * query_points,
I_t i,
active NeighborsHeap heaps,
):
cdef:
I_t j, k
D_t dist
D_t tmp
if self._is_leaf:
if True: #self._is_leaf:
# Computing all the euclideans distances here.
for j in range(self._start, self._end):
dist = 0
for k in range(self._n_dims):
tmp = (
query_points[i * self._n_dims + k] -
self._data_ptr[self._indices_ptr[j] * self._n_dims + k]
)
dist += tmp * tmp
# The heap is responsible for keeping the closest
# points for each query point i.
heaps.push(NULL, i, dist, self._indices_ptr[j])
# for j in range(self._start, self._end):
# dist = 0
# for k in range(self._n_dims):
# tmp = (
# query_points[i * self._n_dims + k] -
# self._data_ptr[self._indices_ptr[j] * self._n_dims + k]
# )
# dist += tmp * tmp
#
#
# # The heap is responsible for keeping the closest
# # points for each query point i.
# heaps.push(NULL, i, dist, self._indices_ptr[j])
#
# return
return
# TODO: one can implement a pruning strategy here.
......@@ -465,6 +461,8 @@ cdef cypclass KDTree:
I_t _n # number of examples
I_t _d # number of dimensions / features
I_t _leaf_size # maximum number of vectors at leaf
I_t _n_levels # number of nodes levels in the tree
I_t _n_nodes # number of nodes in the tree
I_t _n_leafs
......@@ -472,6 +470,9 @@ cdef cypclass KDTree:
D_t *_data_ptr
I_t *_indices_ptr
NodeData_t *_node_data_ptr
D_t * _node_bounds_ptr
__init__(self,
np.ndarray X,
......@@ -492,17 +493,27 @@ cdef cypclass KDTree:
self._d = d
self._leaf_size = leaf_size
self._n_levels = <I_t> (log2(fmax(1, (self._n - 1) / self._leaf_size)) + 1)
self._n_nodes = <I_t> (2 ** self._n_levels)
self._data_ptr = <D_t *> X.data
self._indices_ptr = <I_t *> malloc(n * sizeof(I_t))
self._node_data_ptr = <NodeData_t *> malloc(self._n_nodes * sizeof(NodeData_t))
self._node_bounds_ptr = <D_t *> malloc(2 * self._n_nodes * self._d * sizeof(D_t))
for i in range(n):
self._indices_ptr[i] = i
# Recursively building the tree here
global scheduler
scheduler = Scheduler(num_workers)
# This Counter is used as a way to implement a barrier for
# the asynchronous construction of the tree.
cdef active Counter counter = consume Counter()
self._root = consume Node()
self._root = consume Node(self._node_data_ptr, self._node_bounds_ptr)
if self._root is NULL:
printf("Error consuming node\n")
......@@ -513,7 +524,7 @@ cdef cypclass KDTree:
#
# Also, this separate method allows using actors
# because __init__ can't be reified.
self._root.build_node(NULL,
self._root.build_node(NULL, 0,
self._data_ptr,
self._indices_ptr,
self._leaf_size, n_dims=self._d,
......@@ -531,6 +542,8 @@ cdef cypclass KDTree:
void __dealloc__(self):
    # NOTE(review): finish() presumably stops/joins the global scheduler's
    # workers before memory is released — confirm against runtime.runtime.
    scheduler.finish()
    # Release the buffers allocated with malloc() in __init__.
    free(self._indices_ptr)
    free(self._node_data_ptr)
    free(self._node_bounds_ptr)
void query(self,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment