Commit a054374f authored by Mike Marciniszyn's avatar Mike Marciniszyn Committed by Greg Kroah-Hartman

staging/rdma/hfi1: convert buffers allocated atomic to per cpu

Profiling has shown that the atomic is a performance issue
for the pio hot path.

If multiple cpus allocate an sc's buffer, the cacheline
containing the atomic will bounce from L0 to L0.

Convert the atomic to a percpu variable.
Reviewed-by: Jubin John <jubin.john@intel.com>
Signed-off-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
parent a5a9e8cc
...@@ -660,6 +660,24 @@ void set_pio_integrity(struct send_context *sc) ...@@ -660,6 +660,24 @@ void set_pio_integrity(struct send_context *sc)
write_kctxt_csr(dd, hw_context, SC(CHECK_ENABLE), reg); write_kctxt_csr(dd, hw_context, SC(CHECK_ENABLE), reg);
} }
static u32 get_buffers_allocated(struct send_context *sc)
{
int cpu;
u32 ret = 0;
for_each_possible_cpu(cpu)
ret += *per_cpu_ptr(sc->buffers_allocated, cpu);
return ret;
}
/*
 * Zero the buffers_allocated counter on every possible cpu for this
 * send context. Caller must guarantee no concurrent fast-path
 * allocations are in flight.
 */
static void reset_buffers_allocated(struct send_context *sc)
{
	int i;

	for_each_possible_cpu(i)
		*per_cpu_ptr(sc->buffers_allocated, i) = 0;
}
/* /*
* Allocate a NUMA relative send context structure of the given type along * Allocate a NUMA relative send context structure of the given type along
* with a HW context. * with a HW context.
...@@ -668,7 +686,7 @@ struct send_context *sc_alloc(struct hfi1_devdata *dd, int type, ...@@ -668,7 +686,7 @@ struct send_context *sc_alloc(struct hfi1_devdata *dd, int type,
uint hdrqentsize, int numa) uint hdrqentsize, int numa)
{ {
struct send_context_info *sci; struct send_context_info *sci;
struct send_context *sc; struct send_context *sc = NULL;
dma_addr_t pa; dma_addr_t pa;
unsigned long flags; unsigned long flags;
u64 reg; u64 reg;
...@@ -686,10 +704,20 @@ struct send_context *sc_alloc(struct hfi1_devdata *dd, int type, ...@@ -686,10 +704,20 @@ struct send_context *sc_alloc(struct hfi1_devdata *dd, int type,
if (!sc) if (!sc)
return NULL; return NULL;
sc->buffers_allocated = alloc_percpu(u32);
if (!sc->buffers_allocated) {
kfree(sc);
dd_dev_err(dd,
"Cannot allocate buffers_allocated per cpu counters\n"
);
return NULL;
}
spin_lock_irqsave(&dd->sc_lock, flags); spin_lock_irqsave(&dd->sc_lock, flags);
ret = sc_hw_alloc(dd, type, &sw_index, &hw_context); ret = sc_hw_alloc(dd, type, &sw_index, &hw_context);
if (ret) { if (ret) {
spin_unlock_irqrestore(&dd->sc_lock, flags); spin_unlock_irqrestore(&dd->sc_lock, flags);
free_percpu(sc->buffers_allocated);
kfree(sc); kfree(sc);
return NULL; return NULL;
} }
...@@ -705,7 +733,6 @@ struct send_context *sc_alloc(struct hfi1_devdata *dd, int type, ...@@ -705,7 +733,6 @@ struct send_context *sc_alloc(struct hfi1_devdata *dd, int type,
spin_lock_init(&sc->credit_ctrl_lock); spin_lock_init(&sc->credit_ctrl_lock);
INIT_LIST_HEAD(&sc->piowait); INIT_LIST_HEAD(&sc->piowait);
INIT_WORK(&sc->halt_work, sc_halted); INIT_WORK(&sc->halt_work, sc_halted);
atomic_set(&sc->buffers_allocated, 0);
init_waitqueue_head(&sc->halt_wait); init_waitqueue_head(&sc->halt_wait);
/* grouping is always single context for now */ /* grouping is always single context for now */
...@@ -866,6 +893,7 @@ void sc_free(struct send_context *sc) ...@@ -866,6 +893,7 @@ void sc_free(struct send_context *sc)
spin_unlock_irqrestore(&dd->sc_lock, flags); spin_unlock_irqrestore(&dd->sc_lock, flags);
kfree(sc->sr); kfree(sc->sr);
free_percpu(sc->buffers_allocated);
kfree(sc); kfree(sc);
} }
...@@ -1029,7 +1057,7 @@ int sc_restart(struct send_context *sc) ...@@ -1029,7 +1057,7 @@ int sc_restart(struct send_context *sc)
/* kernel context */ /* kernel context */
loop = 0; loop = 0;
while (1) { while (1) {
count = atomic_read(&sc->buffers_allocated); count = get_buffers_allocated(sc);
if (count == 0) if (count == 0)
break; break;
if (loop > 100) { if (loop > 100) {
...@@ -1197,7 +1225,8 @@ int sc_enable(struct send_context *sc) ...@@ -1197,7 +1225,8 @@ int sc_enable(struct send_context *sc)
sc->sr_head = 0; sc->sr_head = 0;
sc->sr_tail = 0; sc->sr_tail = 0;
sc->flags = 0; sc->flags = 0;
atomic_set(&sc->buffers_allocated, 0); /* the alloc lock insures no fast path allocation */
reset_buffers_allocated(sc);
/* /*
* Clear all per-context errors. Some of these will be set when * Clear all per-context errors. Some of these will be set when
...@@ -1373,7 +1402,8 @@ struct pio_buf *sc_buffer_alloc(struct send_context *sc, u32 dw_len, ...@@ -1373,7 +1402,8 @@ struct pio_buf *sc_buffer_alloc(struct send_context *sc, u32 dw_len,
/* there is enough room */ /* there is enough room */
atomic_inc(&sc->buffers_allocated); preempt_disable();
this_cpu_inc(*sc->buffers_allocated);
/* read this once */ /* read this once */
head = sc->sr_head; head = sc->sr_head;
......
...@@ -130,7 +130,7 @@ struct send_context { ...@@ -130,7 +130,7 @@ struct send_context {
spinlock_t credit_ctrl_lock ____cacheline_aligned_in_smp; spinlock_t credit_ctrl_lock ____cacheline_aligned_in_smp;
u64 credit_ctrl; /* cache for credit control */ u64 credit_ctrl; /* cache for credit control */
u32 credit_intr_count; /* count of credit intr users */ u32 credit_intr_count; /* count of credit intr users */
atomic_t buffers_allocated; /* count of buffers allocated */ u32 __percpu *buffers_allocated;/* count of buffers allocated */
wait_queue_head_t halt_wait; /* wait until kernel sees interrupt */ wait_queue_head_t halt_wait; /* wait until kernel sees interrupt */
}; };
......
...@@ -160,7 +160,8 @@ void pio_copy(struct hfi1_devdata *dd, struct pio_buf *pbuf, u64 pbc, ...@@ -160,7 +160,8 @@ void pio_copy(struct hfi1_devdata *dd, struct pio_buf *pbuf, u64 pbc,
} }
/* finished with this buffer */ /* finished with this buffer */
atomic_dec(&pbuf->sc->buffers_allocated); this_cpu_dec(*pbuf->sc->buffers_allocated);
preempt_enable();
} }
/* USE_SHIFTS is faster in user-space tests on a Xeon X5570 @ 2.93GHz */ /* USE_SHIFTS is faster in user-space tests on a Xeon X5570 @ 2.93GHz */
...@@ -854,5 +855,6 @@ void seg_pio_copy_end(struct pio_buf *pbuf) ...@@ -854,5 +855,6 @@ void seg_pio_copy_end(struct pio_buf *pbuf)
} }
/* finished with this buffer */ /* finished with this buffer */
atomic_dec(&pbuf->sc->buffers_allocated); this_cpu_dec(*pbuf->sc->buffers_allocated);
preempt_enable();
} }
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment