Commit c952315e authored by Julien Jerphanion's avatar Julien Jerphanion

[WIP] KDTree implementation

Add some types definitions and node partitionning in.
parent 1a848e92
# distutils: language = c++
import numpy as np
cimport numpy as cnp
cnp.import_array()
from cython.view cimport array as cvarray
import numpy as np
......@@ -17,9 +22,106 @@ from stdlib.string cimport string
from posix.unistd cimport readlink
## Types declaration
ctypedef int I_t
ctypedef double D_t
cdef lock Scheduler scheduler
cdef extern from *:
"""
#include <algorithm>
template<class D, class I>
class IndexComparator {
private:
const D *data;
I split_dim, n_features;
public:
IndexComparator(const D *data, const I &split_dim, const I &n_features):
data(data), split_dim(split_dim), n_features(n_features) {}
bool operator()(const I &a, const I &b) const {
D a_value = data[a * n_features + split_dim];
D b_value = data[b * n_features + split_dim];
return a_value == b_value ? a < b : a_value < b_value;
}
};
template<class D, class I>
void partition_node_indices_inner(
const D *data,
I *node_indices,
const I &split_dim,
const I &split_index,
const I &n_features,
const I &n_points) {
IndexComparator<D, I> index_comparator(data, split_dim, n_features);
std::nth_element(
node_indices,
node_indices + split_index,
node_indices + n_points,
index_comparator);
}
"""
void partition_node_indices_inner[D, I](
D *data,
I *node_indices,
I split_dim,
I split_index,
I n_features,
I n_points) except +
cdef I_t partition_node_indices(
D_t *data,
I_t *node_indices,
I_t split_dim,
I_t split_index,
I_t n_features,
I_t n_points) except -1:
"""Partition points in the node into two equal-sized groups.
Upon return, the values in node_indices will be rearranged such that
(assuming numpy-style indexing):
data[node_indices[0:split_index], split_dim]
<= data[node_indices[split_index], split_dim]
and
data[node_indices[split_index], split_dim]
<= data[node_indices[split_index:n_points], split_dim]
The algorithm is essentially a partial in-place quicksort around a
set pivot.
Parameters
----------
data : D_t pointer
Pointer to a 2D array of the training data, of shape [N, n_features].
N must be greater than any of the values in node_indices.
node_indices : I_t pointer
Pointer to a 1D array of length n_points. This lists the indices of
each of the points within the current node. This will be modified
in-place.
split_dim : int
the dimension on which to split. This will usually be computed via
the routine ``find_node_split_dim``.
split_index : int
the index within node_indices around which to split the points.
n_features: int
the number of features (i.e columns) in the 2D array pointed by data.
n_points : int
the length of node_indices. This is also the number of points in
the original dataset.
Returns
-------
status : int
integer exit status. On return, the contents of node_indices are
modified as noted above.
"""
partition_node_indices_inner(
data,
node_indices,
split_dim,
split_index,
n_features,
n_points)
return 0
cdef cypclass Node activable:
"""A KDTree Node"""
......@@ -36,22 +138,24 @@ cdef cypclass Node activable:
void build_node(
self,
double * points,
int n,
int depth,
int dims,
int dim_for_split
D_t * points,
I_t * indices,
I_t n,
I_t depth,
I_t n_dims,
I_t dim
):
cdef int i
cdef I_t i
if (depth < 0):
return
printf("Depth %d\n", depth)
printf("Dim %d\n", dim_for_split)
printf("Number of dimensions %d\n", n_dims)
printf("Splitting dimension %d\n", dim)
for i in range(n):
printf("X[%d, %d] = %f\n",
i, dim_for_split,
points[i*dims+dim_for_split])
i, dim,
points[i*n_dims + dim])
printf("\n")
# TODO: find a way to partitions indices here
......@@ -59,32 +163,33 @@ cdef cypclass Node activable:
self.left = activate(consume Node())
self.right = activate(consume Node())
self.left.build_node(NULL, points, depth - 1, n, dims, (dim_for_split +
1)
% dims)
self.right.build_node(NULL, points, depth - 1, n, dims, (dim_for_split
+ 1
%
dims))
self.left.build_node(NULL, points, indices,
n, depth - 1, n_dims, (dim + 1) % n_dims)
self.right.build_node(NULL, points, indices,
n, depth - 1, n_dims, (dim + 1) % n_dims)
cdef int start() nogil:
cdef I_t start() nogil:
global scheduler
scheduler = Scheduler()
cdef int i
cdef int n = 12
cdef int d = 2
cdef I_t i
cdef I_t n = 12
cdef I_t d = 2
cdef I_t depth = 5
# TODO: use memory view for convenience
# cdef double p[12][2]
# cdef double [:, ::1] points_views = p
# cdef D_t p[12][2]
# cdef D_t [:, ::1] points_views = p
# Use Golden Spiral for the layout
cdef double golden_ratio = (1 + 5**0.5)/2
cdef double * points = <double *> malloc(n * d * sizeof(double))
cdef D_t golden_ratio = (1 + 5**0.5)/2
cdef D_t * points = <D_t *> malloc(n * d * sizeof(D_t))
cdef I_t * indices = <I_t *> malloc(n* sizeof(I_t))
for i in range(n):
indices[i] = i
points[i * d] = (i / golden_ratio) % 1
points[i *d +1] = i / n
points[i * d + 1] = i / n
printf("Before\n")
node = consume Node()
......@@ -92,11 +197,12 @@ cdef int start() nogil:
return -1
root = activate(consume node)
root.build_node(NULL, points, n, 5, d, 0)
root.build_node(NULL, points, indices, n, depth, d, 0)
scheduler.finish()
del scheduler
free(points)
free(indices)
return 0
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment