crypto: xor - use ktime for template benchmarking

Currently, we use the jiffies counter as a time source, by staring at it until a HZ period elapses, and then staring at it again and perform as many XOR operations as we can at the same time until another HZ period elapses, so that we can calculate the throughput. This takes longer than necessary, and depends on HZ, which is undesirable, since HZ is system dependent. Let's use the ktime interface instead, and use it to time a fixed number of XOR operations, which can be done much faster, and makes the time spent depend on the performance level of the system itself, which is much more reasonable. To ensure that we have the resolution we need even on systems with 32 kHz time sources, while not spending too much time in the benchmark on a slow CPU, let's switch to 3 attempts of 800 repetitions each: that way, we will only misidentify algorithms that perform within 10% of each other as the fastest if they are faster than 10 GB/s to begin with, which is not expected to occur on systems with such coarse clocks. On ThunderX2, I get the following results: Before: [72625.956765] xor: measuring software checksum speed [72625.993104] 8regs : 10169.000 MB/sec [72626.033099] 32regs : 12050.000 MB/sec [72626.073095] arm64_neon: 11100.000 MB/sec [72626.073097] xor: using function: 32regs (12050.000 MB/sec) After: [72599.650216] xor: measuring software checksum speed [72599.651188] 8regs : 10491 MB/sec [72599.652006] 32regs : 12345 MB/sec [72599.652871] arm64_neon : 11402 MB/sec [72599.652873] xor: using function: 32regs (12345 MB/sec) Link: https://lore.kernel.org/linux-crypto/20200923182230.22715-3-ardb@kernel.org/Signed-off-by: Ard Biesheuvel <ardb@kernel.org> Reviewed-by: Douglas Anderson <dianders@chromium.org> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

crypto: xor - use ktime for template benchmarking
Currently, we use the jiffies counter as a time source, by staring at it until a HZ period elapses, and then staring at it again and perform as many XOR operations as we can at the same time until another HZ period elapses, so that we can calculate the throughput. This takes longer than necessary, and depends on HZ, which is undesirable, since HZ is system dependent. Let's use the ktime interface instead, and use it to time a fixed number of XOR operations, which can be done much faster, and makes the time spent depend on the performance level of the system itself, which is much more reasonable. To ensure that we have the resolution we need even on systems with 32 kHz time sources, while not spending too much time in the benchmark on a slow CPU, let's switch to 3 attempts of 800 repetitions each: that way, we will only misidentify algorithms that perform within 10% of each other as the fastest if they are faster than 10 GB/s to begin with, which is not expected to occur on systems with such coarse clocks. On ThunderX2, I get the following results: Before: [72625.956765] xor: measuring software checksum speed [72625.993104] 8regs : 10169.000 MB/sec [72626.033099] 32regs : 12050.000 MB/sec [72626.073095] arm64_neon: 11100.000 MB/sec [72626.073097] xor: using function: 32regs (12050.000 MB/sec) After: [72599.650216] xor: measuring software checksum speed [72599.651188] 8regs : 10491 MB/sec [72599.652006] 32regs : 12345 MB/sec [72599.652871] arm64_neon : 11402 MB/sec [72599.652873] xor: using function: 32regs (12345 MB/sec) Link: https://lore.kernel.org/linux-crypto/20200923182230.22715-3-ardb@kernel.org/Signed-off-by: Ard Biesheuvel <ardb@kernel.org> Reviewed-by: Douglas Anderson <dianders@chromium.org> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
c055e3ea · Ard Biesheuvel · Herbert Xu · 524ccdbd · c055e3ea
Commit c055e3ea authored Sep 26, 2020 by Ard Biesheuvel Committed by Herbert Xu Oct 02, 2020
Hide whitespace changes
Inline Side-by-side

Showing with 16 additions and 22 deletions

crypto/xor.c crypto/xor.c +16 -22

No files found.
--- a/crypto/xor.c
+++ b/crypto/xor.c
@@ -76,49 +76,43 @@ static int __init register_xor_blocks(void)
 }
 #endif

-#define BENCH_SIZE (PAGE_SIZE)
+#define BENCH_SIZE	4096
+#define REPS		800U

 static void __init
 do_xor_speed(struct xor_block_template *tmpl, void *b1, void *b2)
 {
 	int speed;
-	unsigned long now, j;
-	int i, count, max;
+	int i, j, count;
+	ktime_t min, start, diff;

 	tmpl->next = template_list;
 	template_list = tmpl;

 	preempt_disable();

-	/*
-	 * Count the number of XORs done during a whole jiffy, and use
-	 * this to calculate the speed of checksumming.  We use a 2-page
-	 * allocation to have guaranteed color L1-cache layout.
-	 */
-	max = 0;
-	for (i = 0; i < 5; i++) {
-		j = jiffies;
-		count = 0;
-		while ((now = jiffies) == j)
-			cpu_relax();
-		while (time_before(jiffies, now + 1)) {
+	min = (ktime_t)S64_MAX;
+	for (i = 0; i < 3; i++) {
+		start = ktime_get();
+		for (j = 0; j < REPS; j++) {
 			mb(); /* prevent loop optimzation */
 			tmpl->do_2(BENCH_SIZE, b1, b2);
 			mb();
 			count++;
 			mb();
 		}
-		if (count > max)
-			max = count;
+		diff = ktime_sub(ktime_get(), start);
+		if (diff < min)
+			min = diff;
 	}

 	preempt_enable();

-	speed = max * (HZ * BENCH_SIZE / 1024);
+	// bytes/ns == GB/s, multiply by 1000 to get MB/s [not MiB/s]
+	speed = (1000 * REPS * BENCH_SIZE) / (unsigned int)ktime_to_ns(min);
 	tmpl->speed = speed;

-	printk(KERN_INFO "   %-10s: %5d.%03d MB/sec\n", tmpl->name,
-	       speed / 1000, speed % 1000);
+	pr_info("   %-16s: %5d MB/sec\n", tmpl->name, speed);
 }

 static int __init
@@ -158,8 +152,8 @@ calibrate_xor_blocks(void)
 		if (f->speed > fastest->speed)
 			fastest = f;

-	printk(KERN_INFO "xor: using function: %s (%d.%03d MB/sec)\n",
-	       fastest->name, fastest->speed / 1000, fastest->speed % 1000);
+	pr_info("xor: using function: %s (%d MB/sec)\n",
+	       fastest->name, fastest->speed);

 #undef xor_speed