Commit ed81d102 authored by Julien Jerphanion

Merge branch 'synchro' into 'main'

Add querying for KNN

See merge request jjerphan/cython_plus_experiments!1
parents f68236da 146eee39
INCLUDE_DIRS = -I/usr/include/python3.8
SHELL = /bin/bash
PROJECT = cython+
VENV_PATH=`conda info --base`/envs/${PROJECT}
PIP_EXECUTABLE=${VENV_PATH}/bin/pip
PYTHON_EXECUTABLE=${VENV_PATH}/bin/python
PYTEST_EXECUTABLE=${VENV_PATH}/bin/pytest
# Used when not using the Python runtime
INCLUDE_DIRS = -I/usr/include/python3.9
EXE = kdtree
CXX = g++
CPPFLAGS = -O2 -g -Wno-unused-result -Wsign-compare -pthread $(INCLUDE_DIRS)
CPPFLAGS = -O2 -g -Wno-unused-result -Wsign-compare -pthread $(INCLUDE_DIRS) -fopenmp
LDFLAGS += -Wl,--unresolved-symbols=ignore-all
MACROS = -DNPY_NO_DEPRECATED_API=NPY_1_7_API_VERSION
LDLIBS = -lcrypto -lfmt
EXT_SUFFIX := $(shell python3 -c "import sysconfig; print(sysconfig.get_config_var('EXT_SUFFIX'))")
EXT_SUFFIX := $(shell python -c "import sysconfig; print(sysconfig.get_config_var('EXT_SUFFIX'))")
EXT = $(EXE)$(EXT_SUFFIX)
# Build with Python runtime
all: $(EXT)
.DEFAULT_GOAL := all
$(EXT): setup.py
@echo "[Cython Compiling $^ -> $@]"
python3 setup.py build_ext --inplace
## help: Display list of commands
.PHONY: help
help: Makefile
@sed -n 's|^##||p' $< | column -t -s ':' | sed -e 's|^| |'
# Run with Python runtime
run: $(EXT)
python3 -c "import $(EXE); $(EXE).python_main()" 2>/dev/null
## all: Run the main targets
.PHONY: all
all: install benchmark
# Build without Python runtime
## install: Install conda env.
.PHONY: install
install: clean
conda env create --force -f environment.yml
${PIP_EXECUTABLE} install -e . -v
## nopython: Build without the Python runtime
.PHONY: nopython
nopython: $(EXE)
%.cpp: %.pyx
@echo "[Cython Compiling $^ -> $@]"
python3 -c "from Cython.Compiler.Main import main; main(command_line=1)" $^ --cplus -3
@rm -f $(subst .cpp,.h,$@)
%: %.cpp
@echo "[C++ Compiling $^ -> $@]"
$(LINK.cpp) $^ $(LOADLIBES) $(LDLIBS) $(MACROS) -o $@
# Run without Python runtime
## runnopython: Run without Python runtime
.PHONY: runnopython
runnopython: $(EXE)
# Runtime information is currently written to stderr.
# Redirecting it to /dev/null is a simple way to mute it.
./$(EXE) 2>/dev/null
## clean: Remove generated files from Cython and C/C++ compilation
.PHONY: clean
clean:
-rm -f *.c *.cpp *.html
-rm -f *.h
@@ -46,5 +64,17 @@ clean:
-rm -f -r build
-rm -f *.json
.PHONY: all run nopython runnopython clean
.PRECIOUS: %.cpp
## benchmark: Run benchmarks
# Use taskset to pin the benchmark to a fixed set of CPU cores
.PHONY: benchmark
benchmark:
for i in {0..5}; do \
taskset -c 0-$$((2**i-1)) ${PYTHON_EXECUTABLE} benchmarks/benchmark.py `git rev-parse --short HEAD`_$$((2**i-1))_thread ;\
done
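# For illustration: successive iterations pin the run to 1, 2, 4, 8, 16 and 32 cores.
# For example, i=3 expands to:
#   taskset -c 0-7 ${PYTHON_EXECUTABLE} benchmarks/benchmark.py <short-sha>_7_thread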
## test: Launch all the tests.
.PHONY: test
test:
${PYTEST_EXECUTABLE} tests
import argparse
import glob
import importlib
import json
import os
import subprocess
import time
import kdtree
import numpy as np
import pandas as pd
import seaborn as sns
import threadpoolctl
import yaml
from pprint import pprint
from matplotlib import pyplot as plt
from memory_profiler import memory_usage
from sklearn import set_config
from sklearn.neighbors import KDTree
# Higher DPI figures, to be gentle with the eyes
plt.rcParams["figure.dpi"] = 200
def benchmark(config, results_folder, bench_name):
datasets = config["datasets"]
estimators = config["estimators"]
leaf_sizes = config["leaf_sizes"]
n_neighbors = config.get("n_neighbors", [])
n_trials = config.get("n_trials", 3)
return_distance = config.get("return_distance", False)
one_GiB = 1e9  # NB: 1e9 bytes is strictly 1 GB (the plots report GB/s); the GiB name is a slight misnomer
benchmarks = pd.DataFrame()
env_specs_file = f"{results_folder}/{bench_name}.json"
# TODO: This is ugly, but I haven't found something better.
commit = (
str(subprocess.check_output(["git", "rev-parse", "--short", "HEAD"]))
.replace("b'", "")
.replace("\\n'", "")
)
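# A terser equivalent of the block above (a sketch, not part of the original script):
#   commit = subprocess.check_output(
#       ["git", "rev-parse", "--short", "HEAD"]
#   ).decode().strip()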
env_specs = dict(
threadpool_info=threadpoolctl.threadpool_info(),
commit=commit,
config=config,
)
set_config(assume_finite=True)
with open(env_specs_file, "w") as outfile:
json.dump(env_specs, outfile)
for dataset in datasets:
for leaf_size in leaf_sizes:
for trial in range(n_trials):
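# Sizes such as 1e6 in benchmarks/config.yml are typically loaded as strings by PyYAML,
# hence the float -> int coercion below.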
dataset = {k: int(float(v)) for k, v in dataset.items()}
ns_train, ns_test, n_features = dataset.values()
X_train = np.random.rand(ns_train, n_features)
X_test = np.random.rand(ns_test, n_features)
bytes_processed_data_init = X_train.nbytes
bytes_processed_data_query = X_test.nbytes
t0_ = time.perf_counter()
sk_tree = KDTree(X_train, leaf_size=leaf_size)
t1_ = time.perf_counter()
time_elapsed = round(t1_ - t0_, 5)
row = dict(
trial=trial,
func="init",
implementation="sklearn",
leaf_size=leaf_size,
n_samples_train=ns_train,
n_samples_test=ns_test,
n_features=n_features,
n_neighbors=np.nan,
time_elapsed=time_elapsed,
throughput=bytes_processed_data_init / time_elapsed / one_GiB,
)
benchmarks = benchmarks.append(row, ignore_index=True)
pprint(row)
print("---")
t0_ = time.perf_counter()
tree = kdtree.KDTree(X_train, leaf_size=leaf_size)
t1_ = time.perf_counter()
time_elapsed = round(t1_ - t0_, 5)
row = dict(
trial=trial,
func="init",
implementation="kdtree",
leaf_size=leaf_size,
n_samples_train=ns_train,
n_samples_test=ns_test,
n_features=n_features,
n_neighbors=np.nan,
time_elapsed=time_elapsed,
throughput=bytes_processed_data_init / time_elapsed / one_GiB,
)
benchmarks = benchmarks.append(row, ignore_index=True)
pprint(row)
print("---")
benchmarks.to_csv(
f"{results_folder}/{bench_name}.csv",
mode="w+",
index=False,
)
for k in n_neighbors:
t0_ = time.perf_counter()
sk_tree.query(X_test, k=k, return_distance=False)
t1_ = time.perf_counter()
time_elapsed = round(t1_ - t0_, 5)
row = dict(
trial=trial,
func="query",
implementation="sklearn",
leaf_size=leaf_size,
n_samples_train=ns_train,
n_samples_test=ns_test,
n_features=n_features,
n_neighbors=k,
time_elapsed=time_elapsed,
throughput=bytes_processed_data_query / time_elapsed / one_GiB,
)
benchmarks = benchmarks.append(row, ignore_index=True)
pprint(row)
print("---")
closests = np.zeros((ns_test, k), dtype=np.int32)
t0_ = time.perf_counter()
tree.query(X_test, closests)
t1_ = time.perf_counter()
time_elapsed = round(t1_ - t0_, 5)
row = dict(
trial=trial,
func="query",
implementation="kdtree",
leaf_size=leaf_size,
n_samples_train=ns_train,
n_samples_test=ns_test,
n_features=n_features,
n_neighbors=k,
time_elapsed=time_elapsed,
throughput=bytes_processed_data_query / time_elapsed / one_GiB,
)
benchmarks = benchmarks.append(row, ignore_index=True)
pprint(row)
print("---")
benchmarks.to_csv(
f"{results_folder}/{bench_name}.csv",
mode="w+",
index=False,
)
# Override again now that all the dynamic libraries have been loaded
env_specs["threadpool_info"] = threadpoolctl.threadpool_info()
with open(env_specs_file, "w") as outfile:
json.dump(env_specs, outfile)
def report(results_folder, bench_name):
df = pd.read_csv(glob.glob(f"{results_folder}/*.csv")[0])
with open(glob.glob(f"{results_folder}/*.json")[0], "r") as json_file:
env_specs = json.load(json_file)
cols = [
"n_samples_train",
"n_features",
"leaf_size",
]
df[cols] = df[cols].astype(np.uint32)
df['d'] = df.n_features.apply(str)
df['leaf'] = df.leaf_size.apply(str)
df_grouped = df.groupby(cols)
for i, (vals, df) in enumerate(df_grouped):
# 16:9 ratio
fig = plt.figure(figsize=(24, 13.5))
ax = plt.gca()
splot = sns.barplot(
y="leaf", x="throughput", hue="implementation", data=df, ax=ax
)
_ = ax.set_xlabel("Throughput (in GB/s)")
_ = ax.set_ylabel("Leaf Size")
_ = ax.tick_params(labelrotation=45)
# Add the numerical value of each bar next to it
for p in splot.patches:
_ = splot.annotate(
f"{p.get_width():.4e}",
(p.get_width(), p.get_y() + p.get_height() / 2),
ha="center",
va="center",
size=10,
xytext=(0, -12),
textcoords="offset points",
)
title = (
f"KDTree@{env_specs['commit']} - "
f"Euclidean Distance, dtype=np.float64, {df.trial.max() + 1} trials - Bench. Name: {bench_name}\n"
)
title += (
"n_samples_train=%s - n_features=%s - leaf_size=%s" % vals
)
_ = fig.suptitle(title, fontsize=16)
plt.savefig(f"{results_folder}/{bench_name}_{i}.pdf", bbox_inches="tight")
# Merge the individual PDF files into a single document
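# `pdfunite` (from poppler-utils) is assumed to be available on the PATH.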
pdf_files = sorted(glob.glob(f"{results_folder}/{bench_name}*.pdf"))
subprocess.check_output(
["pdfunite", *pdf_files, f"{results_folder}/{bench_name}.pdf"]
)
if __name__ == "__main__":
parser = argparse.ArgumentParser("benchmark")
parser.add_argument("bench_name")
args = parser.parse_args()
bench_name = args.bench_name
with open("benchmarks/config.yml", "r") as f:
config = yaml.full_load(f)
results_folder = f"benchmarks/results/{bench_name}"
os.makedirs(results_folder, exist_ok=True)
print(f"Benchmarking {bench_name}")
benchmark(config, results_folder, bench_name)
print(f"Benchmark results wrote in {results_folder}")
print(f"Reporting results for {bench_name}")
report(results_folder, bench_name)
print(f"Reporting results wrote in {results_folder}")
\ No newline at end of file
estimators:
- name: sklearn
estimator: sklearn.neighbors.KDTree
- name: cython+
estimator: kdtree.KDTree
n_trials: 3
datasets:
- n_samples_train: 1e6
n_samples_test: 1e6
n_features: 5
- n_samples_train: 1e6
n_samples_test: 1e6
n_features: 10
- n_samples_train: 1e6
n_samples_test: 1e6
n_features: 50
- n_samples_train: 1e6
n_samples_test: 1e6
n_features: 100
leaf_sizes:
- 64
- 128
- 256
- 512
- 1024
- 2048
- 4096
name: cython+
channels:
- conda-forge
dependencies:
- python=3.9
- compilers
- jupyter
- numpy
- matplotlib
- seaborn
- pandas
- pyaml
- pip
- threadpoolctl
- pytest
- scikit-learn
- memory_profiler
- pip:
# Install cython+ from upstream directly
- -e git+https://gitlab.inria.fr/jjerphan/cython.git@b30eafec6a7b174afdc4f023b45b21f85104e2fe#egg=Cython
# The 'kdtree' module itself is then installed with `pip install -e .` (see the Makefile's install target)
import numpy as np
import kdtree
if __name__ == "__main__":
n, d = 1000000, 2
golden_ratio = (1 + 5 ** 0.5) / 2
X = np.zeros((n, d))
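# Fill X with a 2D golden-ratio (Fibonacci-lattice-style) low-discrepancy point set.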
for i in range(n):
X[i, 0] = (i / golden_ratio) % 1
X[i, 1] = i / n
tree = kdtree.KDTree(X, depth=10)
del tree
@@ -217,3 +217,47 @@ cdef cypclass BatchMailBox(SequentialMailBox):
cdef inline ActhonResultInterface NullResult() nogil:
return NULL
# Taken from:
# https://lab.nexedi.com/nexedi/cython/blob/3.0a6-cypclass/tests/run/cypclass_acthon.pyx#L66
cdef cypclass WaitResult(ActhonResultInterface):
union result_t:
int int_val
void* ptr
result_t result
sem_t semaphore
__init__(self):
self.result.ptr = NULL
sem_init(&self.semaphore, 0, 0)
__dealloc__(self):
sem_destroy(&self.semaphore)
@staticmethod
ActhonResultInterface construct():
return WaitResult()
void pushVoidStarResult(self, void* result):
self.result.ptr = result
sem_post(&self.semaphore)
void pushIntResult(self, int result):
self.result.int_val = result
sem_post(&self.semaphore)
result_t _getRawResult(const self):
# We must ensure a result exists, but we can let others access it immediately
# The cast here is a way of const-casting (we're modifying the semaphore in a const method)
sem_wait(<sem_t*> &self.semaphore)
sem_post(<sem_t*> &self.semaphore)
return self.result
void* getVoidStarResult(const self):
res = self._getRawResult()
return res.ptr
int getIntResult(const self):
res = self._getRawResult()
return res.int_val
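# Typical usage, sketched from the upstream cypclass Acthon tests linked above: a worker
# calls pushIntResult()/pushVoidStarResult() when its result is ready, while the consumer
# blocks in getIntResult()/getVoidStarResult() on the semaphore until a result was pushed.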
from distutils.core import setup
from distutils.extension import Extension
import numpy
from Cython.Build import cythonize
extensions = [
@@ -7,11 +9,15 @@ extensions = [
"kdtree",
language="c++",
define_macros=[("NPY_NO_DEPRECATED_API", "NPY_1_7_API_VERSION")],
include_dirs=[numpy.get_include()],
extra_compile_args=["-fopenmp"],
extra_link_args=["-fopenmp"],
sources=["kdtree.pyx"],
libraries=["crypto", "fmt"],
),
]
setup(
ext_modules=cythonize(extensions)
ext_modules=cythonize(extensions),
name="kdtree",
version="0.1",
)
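# Built in place via `python3 setup.py build_ext --inplace`, driven by the Makefile's default target above.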
import numpy as np
import pytest
import kdtree
from sklearn.neighbors import KDTree
@pytest.mark.parametrize("n", [10, 100, 1000, 10000])
@pytest.mark.parametrize("d", [10, 100])
@pytest.mark.parametrize("k", [1, 2, 5, 10])
@pytest.mark.parametrize("leaf_size", [256, 1024])
def test_against_sklearn(n, d, k, leaf_size):
np.random.seed(1)
X = np.random.rand(n, d)
query_points = np.random.rand(n, d)
tree = kdtree.KDTree(X, leaf_size=leaf_size)
skl_tree = KDTree(X, leaf_size=leaf_size)
closests = np.zeros((n, k), dtype=np.int32)
tree.query(query_points, closests)
skl_closests = skl_tree.query(query_points, k=k, return_distance=False).astype(np.int32)
np.testing.assert_equal(closests, skl_closests)
\ No newline at end of file
ipython==7.22.0
jupyter==1.0.0
line-profiler==3.1.0
matplotlib==3.4.1
numpy==1.20.2
-e git+https://lab.nexedi.com/nexedi/cython.git@608c29a9ab7b7803c900621424279a9e71fac106#egg=Cython&subdirectory=../../cython