Commit ed81d102 authored by Julien Jerphanion

Merge branch 'synchro' into 'main'

Add querying for KNN

See merge request jjerphan/cython_plus_experiments!1
parents f68236da 146eee39
INCLUDE_DIRS = -I/usr/include/python3.8
SHELL = /bin/bash
PROJECT = cython+
VENV_PATH=`conda info --base`/envs/${PROJECT}
PIP_EXECUTABLE=${VENV_PATH}/bin/pip
PYTHON_EXECUTABLE=${VENV_PATH}/bin/python
PYTEST_EXECUTABLE=${VENV_PATH}/bin/pytest
# Used when not using the Python runtime
INCLUDE_DIRS = -I/usr/include/python3.9
EXE = kdtree
CXX = g++
CPPFLAGS = -O2 -g -Wno-unused-result -Wsign-compare -pthread $(INCLUDE_DIRS)
CPPFLAGS = -O2 -g -Wno-unused-result -Wsign-compare -pthread $(INCLUDE_DIRS) -fopenmp
LDFLAGS += -Wl,--unresolved-symbols=ignore-all
MACROS = -DNPY_NO_DEPRECATED_API=NPY_1_7_API_VERSION
LDLIBS = -lcrypto -lfmt
EXT_SUFFIX := $(shell python3 -c "import sysconfig; print(sysconfig.get_config_var('EXT_SUFFIX'))")
EXT_SUFFIX := $(shell python -c "import sysconfig; print(sysconfig.get_config_var('EXT_SUFFIX'))")
EXT = $(EXE)$(EXT_SUFFIX)
# Build with Python runtime
all: $(EXT)
.DEFAULT_GOAL := all
$(EXT): setup.py
@echo "[Cython Compiling $^ -> $@]"
python3 setup.py build_ext --inplace
## help: Display list of commands
.PHONY: help
help: Makefile
@sed -n 's|^##||p' $< | column -t -s ':' | sed -e 's|^| |'
# Run with Python runtime
run: $(EXT)
python3 -c "import $(EXE); $(EXE).python_main()" 2>/dev/null
## all: Run the main targets
.PHONY: all
all: install benchmark
# Build without Python runtime
## install: Install conda env.
.PHONY: install
install: clean
conda env create --force -f environment.yml
${PIP_EXECUTABLE} install -e . -v
## nopython: Build without the Python runtime
.PHONY: nopython
nopython: $(EXE)
%.cpp: %.pyx
@echo "[Cython Compiling $^ -> $@]"
python3 -c "from Cython.Compiler.Main import main; main(command_line=1)" $^ --cplus -3
@rm -f $(subst .cpp,.h,$@)
%: %.cpp
@echo "[C++ Compiling $^ -> $@]"
$(LINK.cpp) $^ $(LOADLIBES) $(LDLIBS) $(MACROS) -o $@
# Run without Python runtime
## runnopython: Run without Python runtime
.PHONY: runnopython
runnopython: $(EXE)
# Runtime information is currently written to stderr.
# Redirecting it to /dev/null is a simple way to mute it.
./$(EXE) 2>/dev/null
## clean: Remove generated files from Cython and C/C++ compilation
.PHONY: clean
clean:
-rm -f *.c *.cpp *.html
-rm -f *.h
@@ -46,5 +64,17 @@ clean:
-rm -f -r build
-rm -f *.json
.PHONY: all run nopython runnopython clean
.PRECIOUS: %.cpp
## benchmark: Run benchmarks
# Use taskset to pin the benchmark to a fixed set of CPU cores
.PHONY: benchmark
benchmark:
for i in {0..5}; do \
taskset -c 0-$$((2**i-1)) ${PYTHON_EXECUTABLE} benchmarks/benchmark.py `git rev-parse --short HEAD`_$$((2**i-1))_thread ;\
done
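# For illustration: successive iterations pin the run to 1, 2, 4, 8, 16 and 32 cores.
# For example, i=3 expands to:
#   taskset -c 0-7 ${PYTHON_EXECUTABLE} benchmarks/benchmark.py <short-sha>_7_thread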
## test: Launch all the tests.
.PHONY: test
test:
${PYTEST_EXECUTABLE} tests
import argparse
import glob
import importlib
import json
import os
import subprocess
import time
import kdtree
import numpy as np
import pandas as pd
import seaborn as sns
import threadpoolctl
import yaml
from pprint import pprint
from matplotlib import pyplot as plt
from memory_profiler import memory_usage
from sklearn import set_config
from sklearn.neighbors import KDTree
# Higher DPI figures, to be gentle with the eyes
plt.rcParams["figure.dpi"] = 200
def benchmark(config, results_folder, bench_name):
datasets = config["datasets"]
estimators = config["estimators"]
leaf_sizes = config["leaf_sizes"]
n_neighbors = config.get("n_neighbors", [])
n_trials = config.get("n_trials", 3)
return_distance = config.get("return_distance", False)
one_GiB = 1e9  # NB: 1e9 bytes is strictly 1 GB (the plots report GB/s); the GiB name is a slight misnomer
benchmarks = pd.DataFrame()
env_specs_file = f"{results_folder}/{bench_name}.json"
# TODO: This is ugly, but I haven't found something better.
commit = (
str(subprocess.check_output(["git", "rev-parse", "--short", "HEAD"]))
.replace("b'", "")
.replace("\\n'", "")
)
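# A terser equivalent of the block above (a sketch, not part of the original script):
#   commit = subprocess.check_output(
#       ["git", "rev-parse", "--short", "HEAD"]
#   ).decode().strip()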
env_specs = dict(
threadpool_info=threadpoolctl.threadpool_info(),
commit=commit,
config=config,
)
set_config(assume_finite=True)
with open(env_specs_file, "w") as outfile:
json.dump(env_specs, outfile)
for dataset in datasets:
for leaf_size in leaf_sizes:
for trial in range(n_trials):
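# Sizes such as 1e6 in benchmarks/config.yml are typically loaded as strings by PyYAML,
# hence the float -> int coercion below.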
dataset = {k: int(float(v)) for k, v in dataset.items()}
ns_train, ns_test, n_features = dataset.values()
X_train = np.random.rand(ns_train, n_features)
X_test = np.random.rand(ns_test, n_features)
bytes_processed_data_init = X_train.nbytes
bytes_processed_data_query = X_test.nbytes
t0_ = time.perf_counter()
sk_tree = KDTree(X_train, leaf_size=leaf_size)
t1_ = time.perf_counter()
time_elapsed = round(t1_ - t0_, 5)
row = dict(
trial=trial,
func="init",
implementation="sklearn",
leaf_size=leaf_size,
n_samples_train=ns_train,
n_samples_test=ns_test,
n_features=n_features,
n_neighbors=np.nan,
time_elapsed=time_elapsed,
throughput=bytes_processed_data_init / time_elapsed / one_GiB,
)
benchmarks = benchmarks.append(row, ignore_index=True)
pprint(row)
print("---")
t0_ = time.perf_counter()
tree = kdtree.KDTree(X_train, leaf_size=leaf_size)
t1_ = time.perf_counter()
time_elapsed = round(t1_ - t0_, 5)
row = dict(
trial=trial,
func="init",
implementation="kdtree",
leaf_size=leaf_size,
n_samples_train=ns_train,
n_samples_test=ns_test,
n_features=n_features,
n_neighbors=np.nan,
time_elapsed=time_elapsed,
throughput=bytes_processed_data_init / time_elapsed / one_GiB,
)
benchmarks = benchmarks.append(row, ignore_index=True)
pprint(row)
print("---")
benchmarks.to_csv(
f"{results_folder}/{bench_name}.csv",
mode="w+",
index=False,
)
for k in n_neighbors:
t0_ = time.perf_counter()
sk_tree.query(X_test, k=k, return_distance=False)
t1_ = time.perf_counter()
time_elapsed = round(t1_ - t0_, 5)
row = dict(
trial=trial,
func="query",
implementation="sklearn",
leaf_size=leaf_size,
n_samples_train=ns_train,
n_samples_test=ns_test,
n_features=n_features,
n_neighbors=k,
time_elapsed=time_elapsed,
throughput=bytes_processed_data_query / time_elapsed / one_GiB,
)
benchmarks = benchmarks.append(row, ignore_index=True)
pprint(row)
print("---")
closests = np.zeros((ns_test, k), dtype=np.int32)
t0_ = time.perf_counter()
tree.query(X_test, closests)
t1_ = time.perf_counter()
time_elapsed = round(t1_ - t0_, 5)
row = dict(
trial=trial,
func="query",
implementation="kdtree",
leaf_size=leaf_size,
n_samples_train=ns_train,
n_samples_test=ns_test,
n_features=n_features,
n_neighbors=k,
time_elapsed=time_elapsed,
throughput=bytes_processed_data_query / time_elapsed / one_GiB,
)
benchmarks = benchmarks.append(row, ignore_index=True)
pprint(row)
print("---")
benchmarks.to_csv(
f"{results_folder}/{bench_name}.csv",
mode="w+",
index=False,
)
# Override again now that all the dynamic libraries have been loaded
env_specs["threadpool_info"] = threadpoolctl.threadpool_info()
with open(env_specs_file, "w") as outfile:
json.dump(env_specs, outfile)
def report(results_folder, bench_name):
df = pd.read_csv(glob.glob(f"{results_folder}/*.csv")[0])
with open(glob.glob(f"{results_folder}/*.json")[0], "r") as json_file:
env_specs = json.load(json_file)
cols = [
"n_samples_train",
"n_features",
"leaf_size",
]
df[cols] = df[cols].astype(np.uint32)
df['d'] = df.n_features.apply(str)
df['leaf'] = df.leaf_size.apply(str)
df_grouped = df.groupby(cols)
for i, (vals, df) in enumerate(df_grouped):
# 16:9 ratio
fig = plt.figure(figsize=(24, 13.5))
ax = plt.gca()
splot = sns.barplot(
y="leaf", x="throughput", hue="implementation", data=df, ax=ax
)
_ = ax.set_xlabel("Throughput (in GB/s)")
_ = ax.set_ylabel("Leaf Size")
_ = ax.tick_params(labelrotation=45)
# Add the numerical value of each bar next to it
for p in splot.patches:
_ = splot.annotate(
f"{p.get_width():.4e}",
(p.get_width(), p.get_y() + p.get_height() / 2),
ha="center",
va="center",
size=10,
xytext=(0, -12),
textcoords="offset points",
)
title = (
f"KDTree@{env_specs['commit']} - "
f"Euclidean Distance, dtype=np.float64, {df.trial.max() + 1} trials - Bench. Name: {bench_name}\n"
)
title += (
"n_samples_train=%s - n_features=%s - leaf_size=%s" % vals
)
_ = fig.suptitle(title, fontsize=16)
plt.savefig(f"{results_folder}/{bench_name}_{i}.pdf", bbox_inches="tight")
# Merge the individual PDF files into a single document
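# `pdfunite` (from poppler-utils) is assumed to be available on the PATH.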
pdf_files = sorted(glob.glob(f"{results_folder}/{bench_name}*.pdf"))
subprocess.check_output(
["pdfunite", *pdf_files, f"{results_folder}/{bench_name}.pdf"]
)
if __name__ == "__main__":
parser = argparse.ArgumentParser("benchmark")
parser.add_argument("bench_name")
args = parser.parse_args()
bench_name = args.bench_name
with open("benchmarks/config.yml", "r") as f:
config = yaml.full_load(f)
results_folder = f"benchmarks/results/{bench_name}"
os.makedirs(results_folder, exist_ok=True)
print(f"Benchmarking {bench_name}")
benchmark(config, results_folder, bench_name)
print(f"Benchmark results wrote in {results_folder}")
print(f"Reporting results for {bench_name}")
report(results_folder, bench_name)
print(f"Reporting results wrote in {results_folder}")
\ No newline at end of file
estimators:
- name: sklearn
estimator: sklearn.neighbors.KDTree
- name: cython+
estimator: kdtree.KDTree
n_trials: 3
datasets:
- n_samples_train: 1e6
n_samples_test: 1e6
n_features: 5
- n_samples_train: 1e6
n_samples_test: 1e6
n_features: 10
- n_samples_train: 1e6
n_samples_test: 1e6
n_features: 50
- n_samples_train: 1e6
n_samples_test: 1e6
n_features: 100
leaf_sizes:
- 64
- 128
- 256
- 512
- 1024
- 2048
- 4096
name: cython+
channels:
- conda-forge
dependencies:
- python=3.9
- compilers
- jupyter
- numpy
- matplotlib
- seaborn
- pandas
- pyaml
- pip
- threadpoolctl
- pytest
- scikit-learn
- memory_profiler
- pip:
# Install cython+ from upstream directly
- -e git+https://gitlab.inria.fr/jjerphan/cython.git@b30eafec6a7b174afdc4f023b45b21f85104e2fe#egg=Cython
# The 'kdtree' module itself is then installed with `pip install -e .` (see the Makefile's install target)
import numpy as np
import kdtree
if __name__ == "__main__":
n, d = 1000000, 2
golden_ratio = (1 + 5 ** 0.5) / 2
X = np.zeros((n, d))
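# Fill X with a 2D golden-ratio (Fibonacci-lattice-style) low-discrepancy point set.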
for i in range(n):
X[i, 0] = (i / golden_ratio) % 1
X[i, 1] = i / n
tree = kdtree.KDTree(X, depth=10)
del tree
@@ -217,3 +217,47 @@ cdef cypclass BatchMailBox(SequentialMailBox):
cdef inline ActhonResultInterface NullResult() nogil:
return NULL
# Taken from:
# https://lab.nexedi.com/nexedi/cython/blob/3.0a6-cypclass/tests/run/cypclass_acthon.pyx#L66
cdef cypclass WaitResult(ActhonResultInterface):
union result_t:
int int_val
void* ptr
result_t result
sem_t semaphore
__init__(self):
self.result.ptr = NULL
sem_init(&self.semaphore, 0, 0)
__dealloc__(self):
sem_destroy(&self.semaphore)
@staticmethod
ActhonResultInterface construct():
return WaitResult()
void pushVoidStarResult(self, void* result):
self.result.ptr = result
sem_post(&self.semaphore)
void pushIntResult(self, int result):
self.result.int_val = result
sem_post(&self.semaphore)
result_t _getRawResult(const self):
# We must ensure a result exists, but we can let others access it immediately
# The cast here is a way of const-casting (we're modifying the semaphore in a const method)
sem_wait(<sem_t*> &self.semaphore)
sem_post(<sem_t*> &self.semaphore)
return self.result
void* getVoidStarResult(const self):
res = self._getRawResult()
return res.ptr
int getIntResult(const self):
res = self._getRawResult()
return res.int_val
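# Typical usage, sketched from the upstream cypclass Acthon tests linked above: a worker
# calls pushIntResult()/pushVoidStarResult() when its result is ready, while the consumer
# blocks in getIntResult()/getVoidStarResult() on the semaphore until a result was pushed.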
from distutils.core import setup
from distutils.extension import Extension
import numpy
from Cython.Build import cythonize
extensions = [
@@ -7,11 +9,15 @@ extensions = [
"kdtree",
language="c++",
define_macros=[("NPY_NO_DEPRECATED_API", "NPY_1_7_API_VERSION")],
include_dirs=[numpy.get_include()],
extra_compile_args=["-fopenmp"],
extra_link_args=["-fopenmp"],
sources=["kdtree.pyx"],
libraries=["crypto", "fmt"],
),
]
setup(
ext_modules=cythonize(extensions)
ext_modules=cythonize(extensions),
name="kdtree",
version="0.1",
)
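# Built in place via `python3 setup.py build_ext --inplace`, driven by the Makefile's default target above.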
import numpy as np
import pytest
import kdtree
from sklearn.neighbors import KDTree
@pytest.mark.parametrize("n", [10, 100, 1000, 10000])
@pytest.mark.parametrize("d", [10, 100])
@pytest.mark.parametrize("k", [1, 2, 5, 10])
@pytest.mark.parametrize("leaf_size", [256, 1024])
def test_against_sklearn(n, d, k, leaf_size):
np.random.seed(1)
X = np.random.rand(n, d)
query_points = np.random.rand(n, d)
tree = kdtree.KDTree(X, leaf_size=leaf_size)
skl_tree = KDTree(X, leaf_size=leaf_size)
closests = np.zeros((n, k), dtype=np.int32)
tree.query(query_points, closests)
skl_closests = skl_tree.query(query_points, k=k, return_distance=False).astype(np.int32)
np.testing.assert_equal(closests, skl_closests)
\ No newline at end of file
ipython==7.22.0
jupyter==1.0.0
line-profiler==3.1.0
matplotlib==3.4.1
numpy==1.20.2
-e git+https://lab.nexedi.com/nexedi/cython.git@608c29a9ab7b7803c900621424279a9e71fac106#egg=Cython&subdirectory=../../cython