Commit 32e855bc authored by Julien Jerphanion

Update benchmark

With some ugly duplication because we just want a quick test.
parent e8d90ccd
......@@ -27,6 +27,7 @@ def benchmark(config, results_folder, bench_name):
datasets = config["datasets"]
estimators = config["estimators"]
leaf_sizes = config["leaf_sizes"]
n_neighbors = config["n_neighbors"]
n_trials = config.get("n_trials", 3)
return_distance = config.get("return_distance", False)
......@@ -57,23 +58,28 @@ def benchmark(config, results_folder, bench_name):
for leaf_size in leaf_sizes:
for trial in range(n_trials):
dataset = {k: int(float(v)) for k, v in dataset.items()}
ns_train, n_features = dataset.values()
ns_train, ns_test, n_features = dataset.values()
X_train = np.random.rand(ns_train, n_features)
bytes_processed_data = X_train.nbytes
X_test = np.random.rand(ns_test, n_features)
bytes_processed_data_init = X_train.nbytes
bytes_processed_data_query = X_test.nbytes
t0_ = time.perf_counter()
tree = KDTree(X_train, leaf_size=256)
sk_tree = KDTree(X_train, leaf_size=256)
t1_ = time.perf_counter()
time_elapsed = round(t1_ - t0_, 5)
row = dict(
trial=trial,
func="init",
implementation="sklearn",
leaf_size=leaf_size,
n_samples_train=ns_train,
n_samples_test=ns_test,
n_features=n_features,
n_neighbors=np.nan,
time_elapsed=time_elapsed,
throughput=bytes_processed_data / time_elapsed / one_GiB,
throughput=bytes_processed_data_init / time_elapsed / one_GiB,
)
benchmarks = benchmarks.append(row, ignore_index=True)
......@@ -87,12 +93,15 @@ def benchmark(config, results_folder, bench_name):
row = dict(
trial=trial,
func="init",
implementation="kdtree",
leaf_size=leaf_size,
n_samples_train=ns_train,
n_samples_test=ns_test,
n_features=n_features,
n_neighbors=np.nan,
time_elapsed=time_elapsed,
throughput=bytes_processed_data / time_elapsed / one_GiB,
throughput=bytes_processed_data_init / time_elapsed / one_GiB,
)
benchmarks = benchmarks.append(row, ignore_index=True)
......@@ -104,6 +113,61 @@ def benchmark(config, results_folder, bench_name):
mode="w+",
index=False,
)
for k in n_neighbors:
t0_ = time.perf_counter()
sk_tree.query(X_test, k=k, return_distance=False)
t1_ = time.perf_counter()
time_elapsed = round(t1_ - t0_, 5)
row = dict(
trial=trial,
func="query",
implementation="sklearn",
leaf_size=leaf_size,
n_samples_train=ns_train,
n_samples_test=ns_test,
n_features=n_features,
n_neighbors=k,
time_elapsed=time_elapsed,
throughput=bytes_processed_data_query / time_elapsed / one_GiB,
)
benchmarks = benchmarks.append(row, ignore_index=True)
pprint(row)
print("---")
closests = np.zeros((ns_test, k), dtype=np.int32)
t0_ = time.perf_counter()
tree.query(X_test, closests)
t1_ = time.perf_counter()
time_elapsed = round(t1_ - t0_, 5)
row = dict(
trial=trial,
func="query",
implementation="kdtree",
leaf_size=leaf_size,
n_samples_train=ns_train,
n_samples_test=ns_test,
n_features=n_features,
n_neighbors=k,
time_elapsed=time_elapsed,
throughput=bytes_processed_data_query / time_elapsed / one_GiB,
)
benchmarks = benchmarks.append(row, ignore_index=True)
pprint(row)
print("---")
benchmarks.to_csv(
f"{results_folder}/{bench_name}.csv",
mode="w+",
index=False,
)
# Overriding again now that all the dyn. lib. have been loaded
env_specs["threadpool_info"] = threadpoolctl.threadpool_info()
......
......@@ -7,22 +7,15 @@ estimators:
n_trials: 1
datasets:
- n_samples_train: 1e3
n_features: 10
- n_samples_train: 1e3
n_features: 50
- n_samples_train: 1e3
n_features: 100
- n_samples_train: 1e3
n_features: 1000
- n_samples_train: 1e6
n_samples_test: 1e4
n_features: 5
- n_samples_train: 1e6
n_samples_test: 1e4
n_features: 10
- n_samples_train: 1e6
n_samples_test: 1e4
n_features: 50
- n_samples_train: 1e6
n_features: 100
leaf_sizes:
- 64
......@@ -30,4 +23,9 @@ leaf_sizes:
- 256
- 512
- 1024
- 2048
\ No newline at end of file
- 2048
n_neighbors:
- 1
- 10
- 100
\ No newline at end of file
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment