Commit c286077c authored by Julien Jerphanion's avatar Julien Jerphanion

Add simple benchmark setup

parent f672883e
import argparse
import glob
import importlib
import json
import os
import subprocess
import time
import kdtree
import numpy as np
import pandas as pd
import seaborn as sns
import threadpoolctl
import yaml
from pprint import pprint
from matplotlib import pyplot as plt
from memory_profiler import memory_usage
from sklearn import set_config
from sklearn.neighbors import KDTree
# Be gentle with eyes: render all matplotlib figures at high DPI.
plt.rcParams["figure.dpi"] = 200
def benchmark(config, results_folder, bench_name):
    """Time KDTree construction for every (dataset, leaf_size, trial) combo.

    Writes two artifacts:
      - ``{results_folder}/{bench_name}.csv``: one row per timed build with
        the elapsed time and throughput (GB/s of training data processed).
      - ``{results_folder}/{bench_name}.json``: environment specification
        (threadpool info, short git commit hash, the config itself).

    Parameters
    ----------
    config : dict
        Parsed YAML config with keys ``datasets``, ``estimators``,
        ``leaf_sizes`` and optionally ``n_trials`` (default 3) and
        ``return_distance`` (default False).
    results_folder : str
        Existing directory to write results into.
    bench_name : str
        Base name for the output files.
    """
    datasets = config["datasets"]
    # NOTE(review): `estimators` and `return_distance` are read from the
    # config but the implementations benchmarked below are hard-coded;
    # confirm whether the config entries should drive them.
    estimators = config["estimators"]
    leaf_sizes = config["leaf_sizes"]
    n_trials = config.get("n_trials", 3)
    return_distance = config.get("return_distance", False)

    one_GB = 1e9  # decimal gigabyte; throughput is reported in GB/s

    env_specs_file = f"{results_folder}/{bench_name}.json"

    # Decode the subprocess output instead of the original
    # str(...).replace("b'", ...) dance.
    commit = (
        subprocess.check_output(["git", "rev-parse", "--short", "HEAD"])
        .decode("ascii")
        .strip()
    )

    env_specs = dict(
        threadpool_info=threadpoolctl.threadpool_info(),
        commit=commit,
        config=config,
    )

    set_config(assume_finite=True)

    with open(env_specs_file, "w") as outfile:
        json.dump(env_specs, outfile)

    # Accumulate rows in a list: DataFrame.append in a loop is quadratic
    # and was removed in pandas 2.0.
    rows = []
    for dataset in datasets:
        for leaf_size in leaf_sizes:
            for trial in range(n_trials):
                # Config values may be YAML strings like "1e3".
                dataset = {k: int(float(v)) for k, v in dataset.items()}
                ns_train, n_features = dataset.values()
                X_train = np.random.rand(ns_train, n_features)
                bytes_processed_data = X_train.nbytes

                for implementation, tree_class in (
                    ("sklearn", KDTree),
                    ("kdtree", kdtree.KDTree),
                ):
                    t0_ = time.perf_counter()
                    # BUG FIX: the original hard-coded leaf_size=256 here,
                    # silently ignoring the leaf_size being swept.
                    tree = tree_class(X_train, leaf_size=leaf_size)
                    t1_ = time.perf_counter()
                    time_elapsed = round(t1_ - t0_, 5)

                    row = dict(
                        trial=trial,
                        implementation=implementation,
                        leaf_size=leaf_size,
                        n_samples_train=ns_train,
                        n_features=n_features,
                        time_elapsed=time_elapsed,
                        throughput=bytes_processed_data / time_elapsed / one_GB,
                    )
                    rows.append(row)
                    pprint(row)
                    print("---")

    benchmarks = pd.DataFrame(rows)
    benchmarks.to_csv(
        f"{results_folder}/{bench_name}.csv",
        mode="w+",
        index=False,
    )

    # Overriding again now that all the dyn. lib. have been loaded
    env_specs["threadpool_info"] = threadpoolctl.threadpool_info()
    with open(env_specs_file, "w") as outfile:
        json.dump(env_specs, outfile)
def report(results_folder, bench_name):
    """Render per-group bar plots of the benchmark CSV and merge them.

    Reads the CSV/JSON produced by ``benchmark`` for this ``bench_name``,
    writes one PDF per (n_samples_train, n_features) group, then unites
    all group PDFs into ``{results_folder}/{bench_name}.pdf`` with the
    external ``pdfunite`` tool (must be on PATH).
    """
    # Read the exact files this benchmark produced; the original used
    # glob(...)[0], which could pick a stale file from another run.
    df = pd.read_csv(f"{results_folder}/{bench_name}.csv")
    with open(f"{results_folder}/{bench_name}.json") as json_file:
        env_specs = json.load(json_file)

    cols = [
        "n_samples_train",
        "n_features",
    ]

    df[cols] = df[cols].astype(np.uint32)
    # String versions for categorical plotting axes.
    df["d"] = df.n_features.apply(str)
    df["leaf"] = df.leaf_size.apply(str)

    # `group` instead of re-binding `df`, which the original shadowed.
    for i, (vals, group) in enumerate(df.groupby(cols)):
        # 16:9 ratio
        fig = plt.figure(figsize=(24, 13.5))
        ax = plt.gca()
        splot = sns.barplot(
            y="leaf", x="throughput", hue="implementation", data=group, ax=ax
        )
        _ = ax.set_xlabel("Throughput (in GB/s)")
        _ = ax.set_ylabel("Leaf Size")
        _ = ax.tick_params(labelrotation=45)

        # Adding the numerical values of "x" to bar
        for p in splot.patches:
            _ = splot.annotate(
                f"{p.get_width():.4e}",
                (p.get_width(), p.get_y() + p.get_height() / 2),
                ha="center",
                va="center",
                size=10,
                xytext=(0, -12),
                textcoords="offset points",
            )

        title = (
            f"KDTree@{env_specs['commit']} - "
            f"Euclidean Distance, dtype=np.float64, {group.trial.max() + 1} trials - Bench. Name: {bench_name}\n"
        )
        title += (
            "n_samples_train=%s - n_features=%s" % vals
        )
        _ = fig.suptitle(title, fontsize=16)
        plt.savefig(f"{results_folder}/{bench_name}_{i}.pdf", bbox_inches="tight")
        # Close the figure so memory does not grow with the group count.
        plt.close(fig)

    # Unifying pdf files into one. Match only the per-group files
    # ({bench_name}_{i}.pdf): the original {bench_name}*.pdf pattern also
    # matched the merged output on re-runs, folding it into itself.
    pdf_files = sorted(glob.glob(f"{results_folder}/{bench_name}_*.pdf"))
    subprocess.check_output(
        ["pdfunite", *pdf_files, f"{results_folder}/{bench_name}.pdf"]
    )
if __name__ == "__main__":
    # CLI: a single positional argument naming this benchmark run; it
    # selects the results sub-folder and the output file base names.
    parser = argparse.ArgumentParser("benchmark")
    parser.add_argument("bench_name")
    args = parser.parse_args()

    bench_name = args.bench_name

    # safe_load is sufficient for this plain-data config and, unlike
    # full_load, cannot construct arbitrary Python objects.
    with open("benchmarks/config.yml") as f:
        config = yaml.safe_load(f)

    results_folder = f"benchmarks/results/{bench_name}"
    os.makedirs(results_folder, exist_ok=True)

    print(f"Benchmarking {bench_name}")
    benchmark(config, results_folder, bench_name)
    print(f"Benchmark results written in {results_folder}")

    print(f"Reporting results for {bench_name}")
    report(results_folder, bench_name)
    print(f"Reporting results written in {results_folder}")
\ No newline at end of file
# Benchmark configuration, read by the benchmark script
# (via yaml load of benchmarks/config.yml).

# Implementations to compare; `estimator` is the dotted import path.
# NOTE(review): the benchmark script currently hard-codes the two
# implementations rather than reading this list — confirm intent.
estimators:
  - name: sklearn
    estimator: sklearn.neighbors.KDTree
  - name: cython+
    estimator: kdtree.KDTree

# Number of repetitions per (dataset, leaf_size) combination.
n_trials: 1

# Each entry is one synthetic dataset; scientific-notation values are
# parsed as floats then cast to int by the benchmark script.
datasets:
  - n_samples_train: 1e3
    n_features: 10
  - n_samples_train: 1e3
    n_features: 50
  - n_samples_train: 1e3
    n_features: 100
  - n_samples_train: 1e3
    n_features: 1000
  - n_samples_train: 1e6
    n_features: 10
  - n_samples_train: 1e6
    n_features: 50
  - n_samples_train: 1e6
    n_features: 100

# KDTree leaf sizes to sweep.
leaf_sizes:
  - 64
  - 128
  - 256
  - 512
  - 1024
  - 2048
\ No newline at end of file
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment