Extract prototype from experiments

This has been extracted from:
https://lab.nexedi.com/jjerphan/cython_plus_experiments/tree/queue-dispatch/kdtree

The runtime was written by Xavier T. and adapted by Julien J.

The experimental KDTree was written by Julien J.
Co-authored-by: Xavier Thompson <xavier.thompson@nexedi.com>
.idea
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
**/*.c
**/*.cpp
**/kdtree
**/*.h
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
SHELL = /bin/bash
PROJECT = cython+
VENV_PATH=`conda info --base`/envs/${PROJECT}
PIP_EXECUTABLE=${VENV_PATH}/bin/pip
PYTHON_EXECUTABLE=${VENV_PATH}/bin/python
PYTEST_EXECUTABLE=${VENV_PATH}/bin/pytest
# Used when not using the python runtime
INCLUDE_DIRS = -I/usr/include/python3.9
EXE = kdtree
CXX = g++
CPPFLAGS = -O2 -g -Wno-unused-result -Wsign-compare -pthread $(INCLUDE_DIRS) -fopenmp
LDFLAGS += -Wl,--unresolved-symbols=ignore-all
MACROS = -DNPY_NO_DEPRECATED_API=NPY_1_7_API_VERSION
EXT_SUFFIX := $(shell python -c "import sysconfig; print(sysconfig.get_config_var('EXT_SUFFIX'))")
EXT = $(EXE)$(EXT_SUFFIX)
.DEFAULT_GOAL := all
## help: Display list of commands
.PHONY: help
help: Makefile
@sed -n 's|^##||p' $< | column -t -s ':' | sed -e 's|^| |'
## all: Run the main targets
.PHONY: all
all: setup benchmark
## setup: Setup the conda environment
.PHONY: setup
setup: clean
conda env create --force -f environment.yml
${PIP_EXECUTABLE} install -e . -v
## install: install the project in the env
.PHONY: install
install:
${PIP_EXECUTABLE} install -e . -v
# nopython: Build without the Python runtime
.PHONY: nopython
nopython: $(EXE)
%.cpp: %.pyx
@echo "[Cython Compiling $^ -> $@]"
	${PYTHON_EXECUTABLE} -c "from Cython.Compiler.Main import main; main(command_line=1)" $^ --cplus -3
@rm -f $(subst .cpp,.h,$@)
%: %.cpp
@echo "[C++ Compiling $^ -> $@]"
$(LINK.cpp) $^ $(MACROS) -o $@
## runnopython: Run without Python runtime
.PHONY: runnopython
runnopython: $(EXE)
# Runtime information is currently written to stderr;
# redirecting it to /dev/null is a simple way to mute it.
./$(EXE) 2>/dev/null
## clean: Remove generated files from Cython and C/C++ compilation
.PHONY: clean
clean:
-rm -f *.c *.cpp *.html
-rm -f *.h
-rm -f *.so
-rm -f $(EXE)
-rm -f *.o
-rm -f -r build
-rm -f *.json
.PRECIOUS: %.cpp
## benchmark: Run benchmarks
# Uses taskset to pin the run to 1, 2, 4, 8, 16 and 32 CPUs in turn
.PHONY: benchmark
benchmark:
for i in {0..5}; do \
taskset -c 0-$$((2**i-1)) ${PYTHON_EXECUTABLE} benchmarks/benchmark.py `git rev-parse --short HEAD`_$$((2**i))_threads ;\
done
${PYTHON_EXECUTABLE} benchmarks/report.py `git rev-parse --short HEAD`
## report: Report benchmark results
.PHONY: report
report:
${PYTHON_EXECUTABLE} benchmarks/report.py `git rev-parse --short HEAD`
## test: Launch all the tests
.PHONY: test
test:
${PYTEST_EXECUTABLE} tests
import numpy as np
import kdtree
if __name__ == "__main__":
X = np.load("X.npy")
tree = kdtree.KDTree(X, 256)
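
The script above expects an X.npy array on disk. A minimal way to produce one (the 100000 x 32 shape is a hypothetical choice mirroring the benchmark configuration below; any 2D float64 array works):

import numpy as np

# Hypothetical dataset: 100,000 points in 32 dimensions, saved where the
# script above will find it.
X = np.random.rand(100_000, 32)
np.save("X.npy", X)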
import argparse
import glob
import json
import os
import sys
import subprocess
import time
import kdtree
import numpy as np
import pandas as pd
import seaborn as sns
import threadpoolctl
import yaml
from pprint import pprint
from matplotlib import pyplot as plt
from memory_profiler import memory_usage
from sklearn import set_config
from sklearn.neighbors import KDTree
from sklearn.utils._openmp_helpers import _openmp_effective_n_threads
# Be gentle on the eyes
plt.rcParams["figure.dpi"] = 200
def benchmark(config, results_folder, bench_name):
datasets = config["datasets"]
estimators = config["estimators"]
leaf_sizes = config["leaf_sizes"]
n_neighbors = config.get("n_neighbors", [])
n_trials = config.get("n_trials", 3)
return_distance = config.get("return_distance", False)
    one_GB = 1e9  # throughput is reported in GB/s
benchmarks = pd.DataFrame()
n_threads = _openmp_effective_n_threads()
env_specs_file = f"{results_folder}/{bench_name}.json"
    commit = (
        subprocess.check_output(["git", "rev-parse", "--short", "HEAD"])
        .decode()
        .strip()
    )
env_specs = dict(
threadpool_info=threadpoolctl.threadpool_info(),
commit=commit,
config=config,
n_threads=n_threads,
)
set_config(assume_finite=True)
with open(env_specs_file, "w") as outfile:
json.dump(env_specs, outfile)
for dataset in datasets:
for leaf_size in leaf_sizes:
for trial in range(n_trials):
dataset = {k: int(float(v)) for k, v in dataset.items()}
ns_train, ns_test, n_features = dataset.values()
X_train = np.random.rand(ns_train, n_features)
X_test = np.random.rand(ns_test, n_features)
bytes_processed_data_init = X_train.nbytes
bytes_processed_data_query = X_test.nbytes
t0_ = time.perf_counter()
sk_tree = KDTree(X_train, leaf_size=leaf_size)
t1_ = time.perf_counter()
time_elapsed = round(t1_ - t0_, 5)
row = dict(
trial=trial,
func="init",
implementation="sklearn",
n_threads=n_threads,
leaf_size=leaf_size,
n_samples_train=ns_train,
n_samples_test=ns_test,
n_features=n_features,
n_neighbors=np.nan,
time_elapsed=time_elapsed,
                    throughput=bytes_processed_data_init / time_elapsed / one_GB,
)
                benchmarks = pd.concat([benchmarks, pd.DataFrame([row])], ignore_index=True)
pprint(row)
print("---")
t0_ = time.perf_counter()
tree = kdtree.KDTree(X_train, leaf_size=leaf_size)
t1_ = time.perf_counter()
time_elapsed = round(t1_ - t0_, 5)
row = dict(
trial=trial,
func="init",
implementation="kdtree",
n_threads=n_threads,
leaf_size=leaf_size,
n_samples_train=ns_train,
n_samples_test=ns_test,
n_features=n_features,
n_neighbors=np.nan,
time_elapsed=time_elapsed,
                    throughput=bytes_processed_data_init / time_elapsed / one_GB,
)
                benchmarks = pd.concat([benchmarks, pd.DataFrame([row])], ignore_index=True)
pprint(row)
print("---")
benchmarks.to_csv(
f"{results_folder}/{bench_name}.csv",
mode="w+",
index=False,
)
for k in n_neighbors:
t0_ = time.perf_counter()
sk_tree.query(X_test, k=k, return_distance=False)
t1_ = time.perf_counter()
time_elapsed = round(t1_ - t0_, 5)
row = dict(
trial=trial,
func="query",
implementation="sklearn",
leaf_size=leaf_size,
n_samples_train=ns_train,
n_samples_test=ns_test,
n_features=n_features,
n_neighbors=k,
time_elapsed=time_elapsed,
                        throughput=bytes_processed_data_query / time_elapsed / one_GB,
)
                    benchmarks = pd.concat([benchmarks, pd.DataFrame([row])], ignore_index=True)
pprint(row)
print("---")
closests = np.zeros((ns_test, k), dtype=np.int32)
t0_ = time.perf_counter()
tree.query(X_test, closests)
t1_ = time.perf_counter()
time_elapsed = round(t1_ - t0_, 5)
row = dict(
trial=trial,
func="query",
implementation="kdtree",
leaf_size=leaf_size,
n_samples_train=ns_train,
n_samples_test=ns_test,
n_features=n_features,
n_neighbors=k,
time_elapsed=time_elapsed,
                        throughput=bytes_processed_data_query / time_elapsed / one_GB,
)
                    benchmarks = pd.concat([benchmarks, pd.DataFrame([row])], ignore_index=True)
pprint(row)
print("---")
benchmarks.to_csv(
f"{results_folder}/{bench_name}.csv",
mode="w+",
index=False,
)
    # Overwrite the specs now that all the dynamic libraries have been loaded
env_specs["threadpool_info"] = threadpoolctl.threadpool_info()
with open(env_specs_file, "w") as outfile:
json.dump(env_specs, outfile)
if __name__ == "__main__":
parser = argparse.ArgumentParser("benchmark")
parser.add_argument("bench_name")
args = parser.parse_args()
bench_name = args.bench_name
with open("benchmarks/config.yml", "r") as f:
config = yaml.full_load(f)
results_folder = f"benchmarks/results/{bench_name}"
os.makedirs(results_folder, exist_ok=True)
print(f"Benchmarking {bench_name}")
benchmark(config, results_folder, bench_name)
print(f"Benchmark results wrote in {results_folder}")
estimators:
- name: sklearn
estimator: sklearn.neighbors.KDTree
- name: cython+
estimator: kdtree.KDTree
n_trials: 5
datasets:
- n_samples_train: 1e5
n_samples_test: 1 # not used yet
n_features: 32
- n_samples_train: 1e6
n_samples_test: 1 # not used yet
n_features: 32
leaf_sizes:
- 512
- 1024
- 2048
- 4096
- 8192
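
Note that PyYAML follows YAML 1.1, which only recognizes floats written like '1.0e+5'; a bare '1e5' is loaded as a string. This is why benchmark.py converts every dataset value with int(float(v)). A quick illustration:

import yaml

loaded = yaml.full_load("n_samples_train: 1e5")
value = loaded["n_samples_train"]
print(type(value))        # <class 'str'>: '1e5' is not a YAML 1.1 float
print(int(float(value)))  # 100000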
import os
import argparse
import numpy as np
import glob
import subprocess
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
if __name__ == "__main__":
parser = argparse.ArgumentParser("report")
parser.add_argument("commit")
args = parser.parse_args()
results_folder = os.path.abspath(os.path.join(__file__, os.pardir, "results"))
commit = args.commit
def n_threads(filename):
        # Extract the thread count from a filename like '2742685_1_threads.csv'
basename = os.path.basename(filename)
return int(basename.split("_")[1])
commit_result_folder = f"{results_folder}/{commit}"
csv_bench_results = sorted(glob.glob(f"{commit_result_folder}*/*.csv"),
key=n_threads)
if len(csv_bench_results) == 0:
raise RuntimeError(f"No results for commit {commit}")
os.makedirs(commit_result_folder, exist_ok=True)
df = pd.concat(map(pd.read_csv, csv_bench_results))
df = df.drop(columns=["n_neighbors", "func"])
cols = [
"n_samples_train",
"n_samples_test",
"n_features",
"leaf_size",
]
    # Cast thread counts to strings so they are treated as categories for plotting
df['t'] = df.n_threads.apply(str)
df_grouped = df.groupby(cols)
for i, (vals, df_g) in enumerate(df_grouped):
# 16:9 ratio
fig = plt.figure(figsize=(24, 13.5))
ax = plt.gca()
splot = sns.barplot(
y="t", x="throughput", hue="implementation", data=df_g, ax=ax
)
_ = ax.set_xlabel("Throughput (in GB/s)")
_ = ax.set_ylabel("Number of threads")
_ = ax.tick_params(labelrotation=45)
        # Annotate each bar with its numerical throughput value
for p in splot.patches:
_ = splot.annotate(
f"{p.get_width():.4e}",
(p.get_width(), p.get_y() + p.get_height() / 2),
ha="center",
va="center",
size=10,
xytext=(0, -12),
textcoords="offset points",
)
title = (
f"KDTree.__init__@{commit} - "
f"Euclidean Distance, dtype=np.float64, {df_g.trial.max() + 1} trials\n"
)
title += (
"n_samples_train=%s - n_samples_test=%s - "
"n_features=%s - leaf_size=%s"
% vals
)
_ = fig.suptitle(title, fontsize=16)
plt.savefig(f"{commit_result_folder}/{i}.pdf", bbox_inches="tight")
name: cython+
channels:
- conda-forge
dependencies:
- python=3.9
- compilers
- jupyter
- numpy
- matplotlib
- seaborn
- pandas
- pyaml
- pip
- threadpoolctl
- pytest
- scikit-learn
- memory_profiler
- pip:
# Install cython+ from upstream directly
- -e git+https://lab.nexedi.com/nexedi/cython.git@b30eafec6a7b174afdc4f023b45b21f85104e2fe#egg=Cython
  # The 'kdtree' module itself is installed afterwards (see the Makefile's install target)
cdef extern from "<sys/types.h>" nogil:
ctypedef long unsigned int pthread_t
ctypedef union pthread_attr_t:
pass
ctypedef union pthread_mutex_t:
pass
ctypedef union pthread_mutexattr_t:
pass
ctypedef union pthread_barrier_t:
pass
ctypedef union pthread_barrierattr_t:
pass
ctypedef union pthread_cond_t:
pass
ctypedef union pthread_condattr_t:
pass
cdef extern from "<pthread.h>" nogil:
int pthread_create(pthread_t *, const pthread_attr_t *, void *(*)(void *), void *)
void pthread_exit(void *)
int pthread_join(pthread_t, void **)
int pthread_cancel(pthread_t thread)
int pthread_attr_init(pthread_attr_t *)
int pthread_attr_setdetachstate(pthread_attr_t *, int)
int pthread_attr_destroy(pthread_attr_t *)
int pthread_mutex_init(pthread_mutex_t *, const pthread_mutexattr_t *)
int pthread_mutex_destroy(pthread_mutex_t *)
int pthread_mutex_lock(pthread_mutex_t *)
int pthread_mutex_unlock(pthread_mutex_t *)
int pthread_mutex_trylock(pthread_mutex_t *)
int pthread_barrier_init(pthread_barrier_t *, const pthread_barrierattr_t *, unsigned int)
int pthread_barrier_destroy(pthread_barrier_t *)
int pthread_barrier_wait(pthread_barrier_t *)
int pthread_cond_init(pthread_cond_t * cond, const pthread_condattr_t * attr)
int pthread_cond_destroy(pthread_cond_t *cond)
int pthread_cond_wait(pthread_cond_t * cond, pthread_mutex_t * mutex)
int pthread_cond_broadcast(pthread_cond_t *cond)
int pthread_cond_signal(pthread_cond_t *cond)
enum: PTHREAD_CREATE_JOINABLE
# distutils: language = c++
from libcpp.deque cimport deque
from libcpp.vector cimport vector
from libcpp.atomic cimport atomic
from libc.stdio cimport printf
from libc.stdlib cimport rand
from posix.unistd cimport sysconf
from runtime.pthreads cimport *
from runtime.semaphore cimport *
cdef extern from "<unistd.h>" nogil:
    enum: _SC_NPROCESSORS_ONLN # Seems not to be included in "posix.unistd".
cdef cypclass Scheduler
cdef cypclass Worker
# The 'inline' qualifier on this function is a hack to convince Cython to allow a definition in a .pxd file.
# The C compiler will ignore it: the function's address is taken to create a thread, which prevents inlining.
cdef inline void * worker_function(void * arg) nogil:
worker = <lock Worker> arg
sch = <Scheduler> <void*> worker.scheduler
# Wait until all the workers are ready.
pthread_barrier_wait(&sch.barrier)
    while True:
# Wait until a queue becomes available.
sem_wait(&sch.num_free_queues)
# If the scheduler is done there is nothing to do anymore.
if sch.is_done:
return <void*> 0
# Pop or steal a queue.
queue = worker.get_queue()
with wlocked queue:
# Do one task on the queue.
queue.activate()
if queue.is_empty():
# Mark the empty queue as not assigned to any worker.
queue.has_worker = False
# Decrement the number of non-completed queues.
if sch.num_pending_queues.fetch_sub(1) == 1:
# Signal that there are no more queues.
sem_post(&sch.done)
# Discard the empty queue and continue the main loop.
continue
# The queue is not empty: reinsert it in this worker's queues.
worker.queues.push_back(queue)
# Signal that the queue is available.
sem_post(&sch.num_free_queues)
cdef cypclass Worker:
deque[lock SequentialMailBox] queues
lock Scheduler scheduler
pthread_t thread
lock Worker __new__(alloc, lock Scheduler scheduler):
instance = consume alloc()
instance.scheduler = scheduler
locked_instance = <lock Worker> consume instance
if not pthread_create(&locked_instance.thread, NULL, worker_function, <void *> locked_instance):
return locked_instance
printf("pthread_create() failed\n")
lock SequentialMailBox get_queue(lock self):
# Get the next queue in the worker's list or steal one.
with wlocked self:
if not self.queues.empty():
queue = self.queues.front()
self.queues.pop_front()
return queue
return self.steal_queue()
lock SequentialMailBox steal_queue(lock self):
# Steal a queue from another worker:
# - inspect each worker in order starting at a random offset
# - skip any worker with an empty queue list
# - return the last queue of the first worker with a non-empty list
# - continue looping until a queue is found
cdef int i, index, num_workers, random_offset
sch = <Scheduler> <void*> self.scheduler
num_workers = <int> sch.workers.size()
index = rand() % num_workers
while True:
victim = sch.workers[index]
with wlocked victim:
if not victim.queues.empty():
stolen_queue = victim.queues.back()
victim.queues.pop_back()
return stolen_queue
index += 1
if index >= num_workers:
index = 0
int join(self):
# Join the worker thread.
return pthread_join(self.thread, NULL)
cdef cypclass Scheduler:
vector[lock Worker] workers
pthread_barrier_t barrier
sem_t num_free_queues
atomic[int] num_pending_queues
sem_t done
volatile bint is_done
int num_workers
lock Scheduler __new__(alloc, int num_workers=0):
self = <lock Scheduler> consume alloc()
if num_workers == 0: num_workers = sysconf(_SC_NPROCESSORS_ONLN)
self.num_workers = num_workers
sem_init(&self.num_free_queues, 0, 0)
sem_init(&self.done, 0, 0)
self.num_pending_queues.store(0)
if pthread_barrier_init(&self.barrier, NULL, num_workers + 1):
printf("Could not allocate memory for the thread barrier\n")
# Signal that no work will be done.
sem_post(&self.done)
return self
self.is_done = False
self.workers.reserve(num_workers)
for i in range(num_workers):
worker = Worker(self)
if worker is NULL:
# Signal that no work will be done.
sem_post(&self.done)
return self
self.workers.push_back(worker)
# Wait until all the worker threads are ready.
pthread_barrier_wait(&self.barrier)
return self
__dealloc__(self):
pthread_barrier_destroy(&self.barrier)
sem_destroy(&self.num_free_queues)
sem_destroy(&self.done)