Commit bfd24160, authored by Greg Banks, committed by Linus Torvalds

[PATCH] knfsd: make rpc threads pools numa aware

Actually implement multiple pools.  On NUMA machines, allocate a svc_pool per
NUMA node; on SMP a svc_pool per CPU; otherwise a single global pool.  Enqueue
sockets on the svc_pool corresponding to the CPU on which the socket bh is run
(i.e.  the NIC interrupt CPU).  Threads have their cpu mask set to limit them
to the CPUs in the svc_pool that owns them.

This is the patch that allows an Altix to scale NFS traffic linearly
beyond 4 CPUs and 4 NICs.

Incorporates changes and feedback from Neil Brown, Trond Myklebust, and
Christoph Hellwig.
Signed-off-by: Greg Banks <gnb@melbourne.sgi.com>
Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
parent eec09661
...@@ -386,5 +386,6 @@ int svc_process(struct svc_rqst *); ...@@ -386,5 +386,6 @@ int svc_process(struct svc_rqst *);
int svc_register(struct svc_serv *, int, unsigned short); int svc_register(struct svc_serv *, int, unsigned short);
void svc_wake_up(struct svc_serv *); void svc_wake_up(struct svc_serv *);
void svc_reserve(struct svc_rqst *rqstp, int space); void svc_reserve(struct svc_rqst *rqstp, int space);
struct svc_pool * svc_pool_for_cpu(struct svc_serv *serv, int cpu);
#endif /* SUNRPC_SVC_H */ #endif /* SUNRPC_SVC_H */
...@@ -4,6 +4,10 @@ ...@@ -4,6 +4,10 @@
* High-level RPC service routines * High-level RPC service routines
* *
* Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de> * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
*
* Multiple threads pools and NUMAisation
* Copyright (c) 2006 Silicon Graphics, Inc.
* by Greg Banks <gnb@melbourne.sgi.com>
*/ */
#include <linux/linkage.h> #include <linux/linkage.h>
...@@ -24,6 +28,242 @@ ...@@ -24,6 +28,242 @@
#define RPCDBG_FACILITY RPCDBG_SVCDSP #define RPCDBG_FACILITY RPCDBG_SVCDSP
#define RPC_PARANOIA 1 #define RPC_PARANOIA 1
/*
 * Strategies for distributing cpus across thread pools.
 */
enum {
	SVC_POOL_NONE    = -1,	/* not yet decided; one of the modes
				 * below still has to be chosen */
	SVC_POOL_GLOBAL  = 0,	/* no mapping at all: a single pool
				 * shared by everyone (legacy & UP mode) */
	SVC_POOL_PERCPU  = 1,	/* a separate pool for each cpu */
	SVC_POOL_PERNODE = 2	/* a separate pool for each numa node */
};
/*
 * Two-way lookup between pools and cpus (or numa nodes).
 * Populated once during sunrpc initialisation.
 */
struct svc_pool_map {
	/*
	 * Holds one of the SVC_POOL_* values.  Kept as a plain int
	 * rather than the enum to avoid warnings about "enumeration
	 * value not handled in switch".
	 */
	int mode;
	unsigned int npools;	/* pool count stored by svc_pool_map_init() */
	unsigned int *pool_to;	/* indexed by pool id, gives cpu or node */
	unsigned int *to_pool;	/* indexed by cpu or node, gives pool id */
};

/* The single global map; mode stays SVC_POOL_NONE until initialised. */
static struct svc_pool_map svc_pool_map = {
	.mode = SVC_POOL_NONE
};
/*
 * Pick the pool mapping mode that best suits the machine's
 * topology.  This is a heuristic; it returns one of
 * SVC_POOL_PERNODE, SVC_POOL_PERCPU or SVC_POOL_GLOBAL.
 */
static int
svc_pool_map_choose_mode(void)
{
	unsigned int nid;

	/*
	 * More than one NUMA node is online, so the pools
	 * are split along NUMA node boundaries.
	 */
	if (num_online_nodes() > 1)
		return SVC_POOL_PERNODE;

	nid = any_online_node(node_online_map);

	/* Too few cpus on the node to bother: one global pool. */
	if (nr_cpus_node(nid) <= 2)
		return SVC_POOL_GLOBAL;

	/*
	 * Non-trivial SMP, or CONFIG_NUMA running on non-NUMA
	 * hardware (e.g. a generic x86_64 kernel on Xeons).
	 * Here the pools are divided on cpu boundaries.
	 */
	return SVC_POOL_PERCPU;
}
/*
* Allocate the to_pool[] and pool_to[] arrays.
* Returns 0 on success or an errno.
*/
static int
svc_pool_map_alloc_arrays(struct svc_pool_map *m, unsigned int maxpools)
{
m->to_pool = kcalloc(maxpools, sizeof(unsigned int), GFP_KERNEL);
if (!m->to_pool)
goto fail;
m->pool_to = kcalloc(maxpools, sizeof(unsigned int), GFP_KERNEL);
if (!m->pool_to)
goto fail_free;
return 0;
fail_free:
kfree(m->to_pool);
fail:
return -ENOMEM;
}
/*
 * Initialise the pool map for SVC_POOL_PERCPU mode: every cpu that
 * is online now gets its own pool, numbered in iteration order.
 *
 * Returns the number of pools, or <0 (an errno from the array
 * allocation) on error.
 */
static int
svc_pool_map_init_percpu(struct svc_pool_map *m)
{
	unsigned int maxpools = highest_possible_processor_id()+1;
	unsigned int pidx = 0;
	unsigned int cpu;
	int err;

	err = svc_pool_map_alloc_arrays(m, maxpools);
	if (err)
		return err;

	for_each_online_cpu(cpu) {
		/*
		 * pool_to[] has maxpools entries, so the last valid
		 * index is maxpools-1; trap before writing past it.
		 */
		BUG_ON(pidx >= maxpools);
		m->to_pool[cpu] = pidx;
		m->pool_to[pidx] = cpu;
		pidx++;
	}
	/* cpus brought online later all get mapped to pool0, sorry */

	return pidx;
}
/*
 * Initialise the pool map for SVC_POOL_PERNODE mode: every node
 * that has cpus gets its own pool, numbered in iteration order.
 *
 * Returns the number of pools, or <0 (an errno from the array
 * allocation) on error.
 */
static int
svc_pool_map_init_pernode(struct svc_pool_map *m)
{
	unsigned int maxpools = highest_possible_node_id()+1;
	unsigned int pidx = 0;
	unsigned int node;
	int err;

	err = svc_pool_map_alloc_arrays(m, maxpools);
	if (err)
		return err;

	/* some architectures (e.g. SN2) have cpuless nodes; skip those */
	for_each_node_with_cpus(node) {
		/*
		 * pool_to[] has maxpools entries, so the last valid
		 * index is maxpools-1; trap before writing past it.
		 */
		BUG_ON(pidx >= maxpools);
		m->to_pool[node] = pidx;
		m->pool_to[pidx] = node;
		pidx++;
	}
	/* nodes brought online later all get mapped to pool0, sorry */

	return pidx;
}
/*
 * Construct the global cpu <-> pool map, unless that has been
 * done already, and report how many pools it describes.
 */
static unsigned int
svc_pool_map_init(void)
{
	struct svc_pool_map *map = &svc_pool_map;
	int count;

	/* A mode other than NONE means an earlier call did the work. */
	if (map->mode != SVC_POOL_NONE)
		return map->npools;

	map->mode = svc_pool_map_choose_mode();

	if (map->mode == SVC_POOL_PERCPU)
		count = svc_pool_map_init_percpu(map);
	else if (map->mode == SVC_POOL_PERNODE)
		count = svc_pool_map_init_pernode(map);
	else
		count = -1;

	if (count < 0) {
		/*
		 * Either global mode was chosen or an allocation
		 * failed: settle for one global pool.
		 */
		count = 1;
		map->mode = SVC_POOL_GLOBAL;
	}

	map->npools = count;
	return map->npools;
}
/*
 * Restrict the current thread's cpus_allowed mask to the cpus
 * that belong to pool pidx.
 *
 * Returns 1, with the previous mask saved in *oldmask, iff a new
 * cpumask was applied; returns 0 and leaves *oldmask untouched
 * otherwise.
 */
static inline int
svc_pool_map_set_cpumask(unsigned int pidx, cpumask_t *oldmask)
{
	struct svc_pool_map *map = &svc_pool_map;
	unsigned int id;	/* cpu or node number, depending on mode */

	/*
	 * Callers check sv_nrpools > 1 first, which implies the
	 * map has been initialised and its mode is not NONE.
	 */
	BUG_ON(map->mode == SVC_POOL_NONE);

	if (map->mode == SVC_POOL_PERCPU) {
		id = map->pool_to[pidx];
		*oldmask = current->cpus_allowed;
		set_cpus_allowed(current, cpumask_of_cpu(id));
		return 1;
	}

	if (map->mode == SVC_POOL_PERNODE) {
		id = map->pool_to[pidx];
		*oldmask = current->cpus_allowed;
		set_cpus_allowed(current, node_to_cpumask(id));
		return 1;
	}

	/* any other mode: no mask is applied */
	return 0;
}
/*
 * Select the pool that serves a given CPU, according to the
 * mapping mode.  Called when an incoming RPC is enqueued.
 * The result is never NULL.
 */
struct svc_pool *
svc_pool_for_cpu(struct svc_serv *serv, int cpu)
{
	struct svc_pool_map *map = &svc_pool_map;
	unsigned int idx;

	if (map->mode == SVC_POOL_PERCPU) {
		idx = map->to_pool[cpu];
	} else if (map->mode == SVC_POOL_PERNODE) {
		idx = map->to_pool[cpu_to_node(cpu)];
	} else {
		/*
		 * Covers SVC_POOL_GLOBAL and also SVC_POOL_NONE; the
		 * latter occurs in a pure client when lockd is brought
		 * up and is quietly handled like the global case.
		 */
		idx = 0;
	}

	/* the modulo keeps the index inside this serv's own pool array */
	return &serv->sv_pools[idx % serv->sv_nrpools];
}
/* /*
* Create an RPC service * Create an RPC service
*/ */
...@@ -105,8 +345,9 @@ svc_create_pooled(struct svc_program *prog, unsigned int bufsize, ...@@ -105,8 +345,9 @@ svc_create_pooled(struct svc_program *prog, unsigned int bufsize,
svc_thread_fn func, int sig, struct module *mod) svc_thread_fn func, int sig, struct module *mod)
{ {
struct svc_serv *serv; struct svc_serv *serv;
unsigned int npools = svc_pool_map_init();
serv = __svc_create(prog, bufsize, /*npools*/1, shutdown); serv = __svc_create(prog, bufsize, npools, shutdown);
if (serv != NULL) { if (serv != NULL) {
serv->sv_function = func; serv->sv_function = func;
...@@ -209,6 +450,8 @@ svc_release_buffer(struct svc_rqst *rqstp) ...@@ -209,6 +450,8 @@ svc_release_buffer(struct svc_rqst *rqstp)
/* /*
* Create a thread in the given pool. Caller must hold BKL. * Create a thread in the given pool. Caller must hold BKL.
* On a NUMA or SMP machine, with a multi-pool serv, the thread
* will be restricted to run on the cpus belonging to the pool.
*/ */
static int static int
__svc_create_thread(svc_thread_fn func, struct svc_serv *serv, __svc_create_thread(svc_thread_fn func, struct svc_serv *serv,
...@@ -216,6 +459,8 @@ __svc_create_thread(svc_thread_fn func, struct svc_serv *serv, ...@@ -216,6 +459,8 @@ __svc_create_thread(svc_thread_fn func, struct svc_serv *serv,
{ {
struct svc_rqst *rqstp; struct svc_rqst *rqstp;
int error = -ENOMEM; int error = -ENOMEM;
int have_oldmask = 0;
cpumask_t oldmask;
rqstp = kzalloc(sizeof(*rqstp), GFP_KERNEL); rqstp = kzalloc(sizeof(*rqstp), GFP_KERNEL);
if (!rqstp) if (!rqstp)
...@@ -235,7 +480,15 @@ __svc_create_thread(svc_thread_fn func, struct svc_serv *serv, ...@@ -235,7 +480,15 @@ __svc_create_thread(svc_thread_fn func, struct svc_serv *serv,
spin_unlock_bh(&pool->sp_lock); spin_unlock_bh(&pool->sp_lock);
rqstp->rq_server = serv; rqstp->rq_server = serv;
rqstp->rq_pool = pool; rqstp->rq_pool = pool;
if (serv->sv_nrpools > 1)
have_oldmask = svc_pool_map_set_cpumask(pool->sp_id, &oldmask);
error = kernel_thread((int (*)(void *)) func, rqstp, 0); error = kernel_thread((int (*)(void *)) func, rqstp, 0);
if (have_oldmask)
set_cpus_allowed(current, oldmask);
if (error < 0) if (error < 0)
goto out_thread; goto out_thread;
svc_sock_update_bufs(serv); svc_sock_update_bufs(serv);
......
...@@ -151,8 +151,9 @@ static void ...@@ -151,8 +151,9 @@ static void
svc_sock_enqueue(struct svc_sock *svsk) svc_sock_enqueue(struct svc_sock *svsk)
{ {
struct svc_serv *serv = svsk->sk_server; struct svc_serv *serv = svsk->sk_server;
struct svc_pool *pool = &serv->sv_pools[0]; struct svc_pool *pool;
struct svc_rqst *rqstp; struct svc_rqst *rqstp;
int cpu;
if (!(svsk->sk_flags & if (!(svsk->sk_flags &
( (1<<SK_CONN)|(1<<SK_DATA)|(1<<SK_CLOSE)|(1<<SK_DEFERRED)) )) ( (1<<SK_CONN)|(1<<SK_DATA)|(1<<SK_CLOSE)|(1<<SK_DEFERRED)) ))
...@@ -160,6 +161,10 @@ svc_sock_enqueue(struct svc_sock *svsk) ...@@ -160,6 +161,10 @@ svc_sock_enqueue(struct svc_sock *svsk)
if (test_bit(SK_DEAD, &svsk->sk_flags)) if (test_bit(SK_DEAD, &svsk->sk_flags))
return; return;
cpu = get_cpu();
pool = svc_pool_for_cpu(svsk->sk_server, cpu);
put_cpu();
spin_lock_bh(&pool->sp_lock); spin_lock_bh(&pool->sp_lock);
if (!list_empty(&pool->sp_threads) && if (!list_empty(&pool->sp_threads) &&
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment