Commit 47b54fbf authored by Andrew Morton, committed by Linus Torvalds

[PATCH] /dev/urandom scalability improvement

From: David Mosberger <davidm@napali.hpl.hp.com>

Somebody recently pointed out a performance anomaly to me where an unusual
amount of time was being spent reading from /dev/urandom.  The problem
isn't really surprising as it happened only on >= 4-way machines and the
random driver isn't terribly scalable the way it is written today.  If
scalability _really_ mattered, I suppose per-CPU data structures would be
the way to go.  However, I found that at least for 4-way machines,
performance can be improved considerably with the attached patch.  In
particular, I saw the following performance on a 4-way ia64 machine:

Test: 3 tasks running "dd if=/dev/urandom of=/dev/null bs=1024":

			throughput:
			
parent ce334bb8
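For illustration, the per-CPU approach alluded to in the message (which this
patch deliberately does not implement) might look roughly like the sketch
below, assuming the kernel's DEFINE_PER_CPU/get_cpu_var API and a hypothetical
extract_pool() helper:

	#include <linux/percpu.h>

	/* Hypothetical sketch only -- this patch does NOT do this. */
	static DEFINE_PER_CPU(struct entropy_store, urandom_pool);

	static void read_local_pool(void *buf, size_t nbytes)
	{
		/* get_cpu_var() disables preemption, so this CPU owns its
		 * pool for the duration -- no shared lock to contend on. */
		struct entropy_store *r = &get_cpu_var(urandom_pool);

		extract_pool(r, buf, nbytes);	/* hypothetical helper */
		put_cpu_var(urandom_pool);
	}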
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -490,12 +490,15 @@ static inline __u32 int_ln_12bits(__u32 word)
  **********************************************************************/
 
 struct entropy_store {
+	/* mostly-read data: */
+	struct poolinfo poolinfo;
+	__u32 *pool;
+
+	/* read-write data: */
+	spinlock_t lock ____cacheline_aligned_in_smp;
 	unsigned add_ptr;
 	int entropy_count;
 	int input_rotate;
-	struct poolinfo poolinfo;
-	__u32 *pool;
-	spinlock_t lock;
 };
 
 /*
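The reordering above separates the fields every caller only reads (poolinfo,
pool) from the fields every caller writes, and ____cacheline_aligned_in_smp
starts the write-hot group on its own cache line, so updates to the lock and
counters no longer invalidate the read-mostly line in other CPUs' caches.
A minimal sketch of the same pattern, with hypothetical names not from this
patch:

	struct lookup_cache {
		/* read-mostly: written once at init, then only read */
		unsigned long mask;
		void *table;

		/* write-hot: isolated on its own cache line so updates
		 * here do not evict the read-mostly line on other CPUs */
		spinlock_t lock ____cacheline_aligned_in_smp;
		unsigned long hits;
		unsigned long misses;
	};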
@@ -571,38 +574,54 @@ static void add_entropy_words(struct entropy_store *r, const __u32 *in,
 	static __u32 const twist_table[8] = {
 		         0, 0x3b6e20c8, 0x76dc4190, 0x4db26158,
 		0xedb88320, 0xd6d6a3e8, 0x9b64c2b0, 0xa00ae278 };
-	unsigned i;
-	int new_rotate;
+	unsigned long i, add_ptr, tap1, tap2, tap3, tap4, tap5;
+	int new_rotate, input_rotate;
 	int wordmask = r->poolinfo.poolwords - 1;
-	__u32 w;
+	__u32 w, next_w;
 	unsigned long flags;
 
+	/* Taps are constant, so we can load them without holding r->lock. */
+	tap1 = r->poolinfo.tap1;
+	tap2 = r->poolinfo.tap2;
+	tap3 = r->poolinfo.tap3;
+	tap4 = r->poolinfo.tap4;
+	tap5 = r->poolinfo.tap5;
+
+	next_w = *in++;
+
 	spin_lock_irqsave(&r->lock, flags);
+	prefetch_range(r->pool, wordmask);
+	input_rotate = r->input_rotate;
+	add_ptr = r->add_ptr;
 
 	while (nwords--) {
-		w = rotate_left(r->input_rotate, *in++);
-		i = r->add_ptr = (r->add_ptr - 1) & wordmask;
+		w = rotate_left(input_rotate, next_w);
+		if (nwords > 0)
+			next_w = *in++;
+		i = add_ptr = (add_ptr - 1) & wordmask;
 
 		/*
 		 * Normally, we add 7 bits of rotation to the pool.
 		 * At the beginning of the pool, add an extra 7 bits
 		 * rotation, so that successive passes spread the
 		 * input bits across the pool evenly.
 		 */
-		new_rotate = r->input_rotate + 14;
+		new_rotate = input_rotate + 14;
 		if (i)
-			new_rotate = r->input_rotate + 7;
-		r->input_rotate = new_rotate & 31;
+			new_rotate = input_rotate + 7;
+		input_rotate = new_rotate & 31;
 
 		/* XOR in the various taps */
-		w ^= r->pool[(i + r->poolinfo.tap1) & wordmask];
-		w ^= r->pool[(i + r->poolinfo.tap2) & wordmask];
-		w ^= r->pool[(i + r->poolinfo.tap3) & wordmask];
-		w ^= r->pool[(i + r->poolinfo.tap4) & wordmask];
-		w ^= r->pool[(i + r->poolinfo.tap5) & wordmask];
+		w ^= r->pool[(i + tap1) & wordmask];
+		w ^= r->pool[(i + tap2) & wordmask];
+		w ^= r->pool[(i + tap3) & wordmask];
+		w ^= r->pool[(i + tap4) & wordmask];
+		w ^= r->pool[(i + tap5) & wordmask];
 		w ^= r->pool[i];
 		r->pool[i] = (w >> 3) ^ twist_table[w & 7];
 	}
+
+	r->input_rotate = input_rotate;
+	r->add_ptr = add_ptr;
+
 	spin_unlock_irqrestore(&r->lock, flags);
 }
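Three things shrink the contended section in this hunk: the tap offsets and
the first input word are loaded before the lock is taken, prefetch_range()
warms the pool so the scattered tap reads inside the loop hit cache, and
input_rotate/add_ptr live in locals for the whole loop, touching the shared
cache line only once on entry and once on exit.  A minimal self-contained
sketch of that last pattern, with hypothetical names (assumes
<linux/spinlock.h>):

	struct ring {
		spinlock_t lock;
		unsigned long head;	/* shared, write-hot */
	};

	static void ring_consume(struct ring *rg, unsigned long n)
	{
		unsigned long head, flags;

		spin_lock_irqsave(&rg->lock, flags);
		head = rg->head;		/* one read of the shared field */
		while (n--)
			head = (head - 1) & 31;	/* per-item work on the local */
		rg->head = head;		/* one write back */
		spin_unlock_irqrestore(&rg->lock, flags);
	}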
--- a/include/linux/prefetch.h
+++ b/include/linux/prefetch.h
@@ -10,6 +10,7 @@
 #ifndef _LINUX_PREFETCH_H
 #define _LINUX_PREFETCH_H
 
+#include <linux/types.h>
 #include <asm/processor.h>
 #include <asm/cache.h>
 
@@ -54,4 +55,15 @@ static inline void prefetchw(const void *x) {;}
 #define PREFETCH_STRIDE (4*L1_CACHE_BYTES)
 #endif
 
+static inline void prefetch_range(void *addr, size_t len)
+{
+#ifdef ARCH_HAS_PREFETCH
+	char *cp;
+	char *end = addr + len;
+
+	for (cp = addr; cp < end; cp += PREFETCH_STRIDE)
+		prefetch(cp);
+#endif
+}
 #endif
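prefetch_range() issues one prefetch per PREFETCH_STRIDE bytes across
[addr, addr+len); on architectures without ARCH_HAS_PREFETCH it compiles away
entirely, so callers need no #ifdef.  A hypothetical caller (names invented,
not from this patch) would warm a buffer just before a loop that would
otherwise stall on cache misses:

	#include <linux/prefetch.h>

	/* XOR-fold a word array; hypothetical example. */
	static __u32 xor_fold(__u32 *tbl, unsigned long words)
	{
		unsigned long i;
		__u32 acc = 0;

		/* len is in bytes, per prefetch_range()'s pointer arithmetic */
		prefetch_range(tbl, words * sizeof(*tbl));
		for (i = 0; i < words; i++)
			acc ^= tbl[i];
		return acc;
	}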