Commit 47b54fbf authored by Andrew Morton, committed by Linus Torvalds

[PATCH] /dev/urandom scalability improvement

From: David Mosberger <davidm@napali.hpl.hp.com>

Somebody recently pointed out a performance anomaly to me where an unusual
amount of time was being spent reading from /dev/urandom.  The problem
isn't really surprising as it happened only on >= 4-way machines and the
random driver isn't terribly scalable the way it is written today.  If
scalability _really_ mattered, I suppose per-CPU data structures would be
the way to go.  However, I found that at least for 4-way machines,
performance can be improved considerably with the attached patch.  In
particular, I saw the following performance on a 4-way ia64 machine:

Test: 3 tasks running "dd if=/dev/urandom of=/dev/null bs=1024":

			throughput:
			
parent ce334bb8
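For context, the fully scalable design the author alludes to would give every CPU its own pool, so that concurrent /dev/urandom users never touch a shared lock at all. Below is a minimal, purely illustrative sketch of that idea, not part of this patch (struct cpu_pool, urandom_pool, and mix_word are hypothetical names):

#include <linux/percpu.h>
#include <linux/spinlock.h>
#include <linux/types.h>

/* Hypothetical per-CPU pool: each CPU mixes into its own state, so the
 * lock is only ever taken by one CPU and never bounces between caches.
 * (It is still needed because mixing can also happen from interrupts.) */
struct cpu_pool {
	spinlock_t lock;
	__u32 pool[128];
};

static DEFINE_PER_CPU(struct cpu_pool, urandom_pool);

static void mix_word(__u32 word)
{
	struct cpu_pool *p = &get_cpu_var(urandom_pool);
	unsigned long flags;

	spin_lock_irqsave(&p->lock, flags);
	p->pool[word & 127] ^= word;	/* stand-in for the real mixing */
	spin_unlock_irqrestore(&p->lock, flags);
	put_cpu_var(urandom_pool);
}

The patch below stops short of that and instead reduces contention on the existing shared pool.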
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -490,12 +490,15 @@ static inline __u32 int_ln_12bits(__u32 word)
  **********************************************************************/
 
 struct entropy_store {
+	/* mostly-read data: */
+	struct poolinfo poolinfo;
+	__u32 *pool;
+
+	/* read-write data: */
+	spinlock_t lock ____cacheline_aligned_in_smp;
 	unsigned add_ptr;
 	int entropy_count;
 	int input_rotate;
-	struct poolinfo poolinfo;
-	__u32 *pool;
-	spinlock_t lock;
 };
 
 /*
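The struct reorganization above is the core of the patch: the fields readers only load (poolinfo, pool) are grouped separately from the fields every writer dirties (the lock, add_ptr, entropy_count, input_rotate), and ____cacheline_aligned_in_smp starts the write-hot group on its own cache line, so writes no longer invalidate the line holding the read-mostly pointers. A generic sketch of the same idiom, with hypothetical field names:

#include <linux/cache.h>
#include <linux/spinlock.h>

/* Illustrative layout: read-mostly fields first, then the lock and the
 * fields it protects, pushed onto a fresh cache line on SMP builds so
 * frequent writes do not cause false sharing with the read-mostly part. */
struct ring_stats {
	/* read-mostly: set at init time, then only loaded */
	const char *name;
	unsigned int capacity;

	/* write-hot: modified on every operation */
	spinlock_t lock ____cacheline_aligned_in_smp;
	unsigned long enqueued;
	unsigned long dequeued;
};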
@@ -571,38 +574,54 @@ static void add_entropy_words(struct entropy_store *r, const __u32 *in,
 	static __u32 const twist_table[8] = {
 		0, 0x3b6e20c8, 0x76dc4190, 0x4db26158,
 		0xedb88320, 0xd6d6a3e8, 0x9b64c2b0, 0xa00ae278 };
-	unsigned i;
-	int new_rotate;
+	unsigned long i, add_ptr, tap1, tap2, tap3, tap4, tap5;
+	int new_rotate, input_rotate;
 	int wordmask = r->poolinfo.poolwords - 1;
-	__u32 w;
+	__u32 w, next_w;
 	unsigned long flags;
 
+	/* Taps are constant, so we can load them without holding r->lock. */
+	tap1 = r->poolinfo.tap1;
+	tap2 = r->poolinfo.tap2;
+	tap3 = r->poolinfo.tap3;
+	tap4 = r->poolinfo.tap4;
+	tap5 = r->poolinfo.tap5;
+
+	next_w = *in++;
+
 	spin_lock_irqsave(&r->lock, flags);
+	prefetch_range(r->pool, wordmask);
+	input_rotate = r->input_rotate;
+	add_ptr = r->add_ptr;
+
 	while (nwords--) {
-		w = rotate_left(r->input_rotate, *in++);
-		i = r->add_ptr = (r->add_ptr - 1) & wordmask;
+		w = rotate_left(input_rotate, next_w);
+		if (nwords > 0)
+			next_w = *in++;
+		i = add_ptr = (add_ptr - 1) & wordmask;
 
 		/*
 		 * Normally, we add 7 bits of rotation to the pool.
 		 * At the beginning of the pool, add an extra 7 bits
 		 * rotation, so that successive passes spread the
 		 * input bits across the pool evenly.
 		 */
-		new_rotate = r->input_rotate + 14;
+		new_rotate = input_rotate + 14;
 		if (i)
-			new_rotate = r->input_rotate + 7;
-		r->input_rotate = new_rotate & 31;
+			new_rotate = input_rotate + 7;
+		input_rotate = new_rotate & 31;
 
 		/* XOR in the various taps */
-		w ^= r->pool[(i + r->poolinfo.tap1) & wordmask];
-		w ^= r->pool[(i + r->poolinfo.tap2) & wordmask];
-		w ^= r->pool[(i + r->poolinfo.tap3) & wordmask];
-		w ^= r->pool[(i + r->poolinfo.tap4) & wordmask];
-		w ^= r->pool[(i + r->poolinfo.tap5) & wordmask];
+		w ^= r->pool[(i + tap1) & wordmask];
+		w ^= r->pool[(i + tap2) & wordmask];
+		w ^= r->pool[(i + tap3) & wordmask];
+		w ^= r->pool[(i + tap4) & wordmask];
+		w ^= r->pool[(i + tap5) & wordmask];
 		w ^= r->pool[i];
 		r->pool[i] = (w >> 3) ^ twist_table[w & 7];
 	}
+
+	r->input_rotate = input_rotate;
+	r->add_ptr = add_ptr;
+
 	spin_unlock_irqrestore(&r->lock, flags);
 }
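The rewritten mixing loop applies a familiar contention-reduction pattern: loop-invariant values (the taps) are read before the lock is taken, the pool is prefetched as soon as the lock is held, the cursor state (input_rotate, add_ptr) lives in locals for the whole loop and is written back exactly once, and the next input word is fetched one iteration ahead so its load latency overlaps the mixing arithmetic. A stripped-down sketch of the same shape, with hypothetical names (ring, cursor, mix):

#include <linux/spinlock.h>
#include <linux/types.h>

#define RING_WORDS 64			/* illustrative power-of-two size */

static __u32 ring[RING_WORDS];
static unsigned int cursor;
static DEFINE_SPINLOCK(ring_lock);

static __u32 mix(__u32 a, __u32 b)
{
	return a ^ (b << 1);		/* stand-in for the real mixing */
}

static void mix_words(const __u32 *in, int nwords)
{
	unsigned int pos;
	unsigned long flags;

	spin_lock_irqsave(&ring_lock, flags);
	pos = cursor;			/* snapshot shared state once */
	while (nwords--) {
		pos = (pos - 1) & (RING_WORDS - 1);
		ring[pos] = mix(ring[pos], *in++);
	}
	cursor = pos;			/* write shared state back once */
	spin_unlock_irqrestore(&ring_lock, flags);
}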
--- a/include/linux/prefetch.h
+++ b/include/linux/prefetch.h
@@ -10,6 +10,7 @@
 #ifndef _LINUX_PREFETCH_H
 #define _LINUX_PREFETCH_H
 
+#include <linux/types.h>
 #include <asm/processor.h>
 #include <asm/cache.h>
 
@@ -54,4 +55,15 @@ static inline void prefetchw(const void *x) {;}
 #define PREFETCH_STRIDE (4*L1_CACHE_BYTES)
 #endif
 
+static inline void prefetch_range(void *addr, size_t len)
+{
+#ifdef ARCH_HAS_PREFETCH
+	char *cp;
+	char *end = addr + len;
+
+	for (cp = addr; cp < end; cp += PREFETCH_STRIDE)
+		prefetch(cp);
+#endif
+}
+
 #endif
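The new prefetch_range() helper simply walks the given range in PREFETCH_STRIDE steps and issues a prefetch() hint for each step; on architectures without ARCH_HAS_PREFETCH it compiles away to nothing. A hedged usage sketch (checksum_words is a hypothetical caller), mirroring how the random driver now calls it right before the mixing loop walks the pool:

#include <linux/prefetch.h>
#include <linux/types.h>

/* Hypothetical caller: hint the buffer into cache before the loop
 * below walks it, so that the later loads are less likely to stall. */
static __u32 checksum_words(__u32 *buf, size_t nwords)
{
	__u32 sum = 0;
	size_t i;

	prefetch_range(buf, nwords * sizeof(*buf));
	for (i = 0; i < nwords; i++)
		sum ^= buf[i];
	return sum;
}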