padata: simplify serialization mechanism

We count the number of processed objects on a percpu basis, so we need to go through all the percpu reorder queues to calculate the sequence number of the next object that needs serialization. This patch changes this to count the number of processed objects globally, so we can calculate the sequence number and the percpu reorder queue of the next object that needs serialization without searching through the percpu reorder queues. This avoids some accesses to the memory of foreign cpus.

Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

padata: simplify serialization mechanism
We count the number of processed objects on a percpu basis, so we need to go through all the percpu reorder queues to calculate the sequence number of the next object that needs serialization. This patch changes this to count the number of processed objects globally, so we can calculate the sequence number and the percpu reorder queue of the next object that needs serialization without searching through the percpu reorder queues. This avoids some accesses to the memory of foreign cpus.

Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
5f1a8c1b · Steffen Klassert · Herbert Xu · 83f619f3 · 5f1a8c1b · 5f1a8c1b
Commit 5f1a8c1b authored Jul 07, 2010 by Steffen Klassert Committed by Herbert Xu Jul 14, 2010
Show whitespace changes
Inline Side-by-side

Showing with 22 additions and 55 deletions

include/linux/padata.h include/linux/padata.h +3 -3

kernel/padata.c kernel/padata.c +19 -52

No files found.
--- a/include/linux/padata.h
+++ b/include/linux/padata.h
@@ -67,7 +67,6 @@ struct padata_list {
 * @pwork: work struct for parallelization.
 * @swork: work struct for serialization.
 * @pd: Backpointer to the internal control structure.
- * @num_obj: Number of objects that are processed by this cpu.
 * @cpu_index: Index of the cpu.
 */
 struct padata_queue {
@@ -77,7 +76,6 @@ struct padata_queue {
 	struct work_struct	pwork;
 	struct work_struct	swork;
 	struct parallel_data    *pd;
-	atomic_t		num_obj;
 	int			cpu_index;
 };
@@ -93,6 +91,7 @@ struct padata_queue {
 * @max_seq_nr:  Maximal used sequence number.
 * @cpumask: cpumask in use.
 * @lock: Reorder lock.
+ * @processed: Number of already processed objects.
 * @timer: Reorder timer.
 */
 struct parallel_data {
@@ -103,7 +102,8 @@ struct parallel_data {
 	atomic_t                refcnt;
 	unsigned int		max_seq_nr;
 	cpumask_var_t		cpumask;
-	spinlock_t              lock;
+	spinlock_t              lock ____cacheline_aligned;
+	unsigned int            processed;
 	struct timer_list       timer;
 };

--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -170,79 +170,47 @@ EXPORT_SYMBOL(padata_do_parallel);
 */
 static struct padata_priv *padata_get_next(struct parallel_data *pd)
 {
-	int cpu, num_cpus, empty, calc_seq_nr;
+	int cpu, num_cpus;
-	int seq_nr, next_nr, overrun, next_overrun;
+	int next_nr, next_index;
 	struct padata_queue *queue, *next_queue;
 	struct padata_priv *padata;
 	struct padata_list *reorder;
-	empty = 0;
-	next_nr = -1;
-	next_overrun = 0;
-	next_queue = NULL;
 	num_cpus = cpumask_weight(pd->cpumask);
-	for_each_cpu(cpu, pd->cpumask) {
-		queue = per_cpu_ptr(pd->queue, cpu);
-		reorder = &queue->reorder;
 	/*
-		 * Calculate the seq_nr of the object that should be
+	 * Calculate the percpu reorder queue and the sequence
-		 * next in this reorder queue.
+	 * number of the next object.
 	 */
-		overrun = 0;
+	next_nr = pd->processed;
-		calc_seq_nr = (atomic_read(&queue->num_obj) * num_cpus)
+	next_index = next_nr % num_cpus;
-			       + queue->cpu_index;
+	cpu = padata_index_to_cpu(pd, next_index);
+	next_queue = per_cpu_ptr(pd->queue, cpu);
-		if (unlikely(calc_seq_nr > pd->max_seq_nr)) {
-			calc_seq_nr = calc_seq_nr - pd->max_seq_nr - 1;
+	if (unlikely(next_nr > pd->max_seq_nr)) {
-			overrun = 1;
+		next_nr = next_nr - pd->max_seq_nr - 1;
-		}
+		next_index = next_nr % num_cpus;
+		cpu = padata_index_to_cpu(pd, next_index);
-		if (!list_empty(&reorder->list)) {
+		next_queue = per_cpu_ptr(pd->queue, cpu);
-			padata = list_entry(reorder->list.next,
+		pd->processed = 0;
-					    struct padata_priv, list);
-			seq_nr  = padata->seq_nr;
-			BUG_ON(calc_seq_nr != seq_nr);
-		} else {
-			seq_nr = calc_seq_nr;
-			empty++;
-		}
-		if (next_nr < 0 || seq_nr < next_nr
-		    || (next_overrun && !overrun)) {
-			next_nr = seq_nr;
-			next_overrun = overrun;
-			next_queue = queue;
-		}
 	}
 	padata = NULL;
-	if (empty == num_cpus)
-		goto out;
 	reorder = &next_queue->reorder;
 	if (!list_empty(&reorder->list)) {
 		padata = list_entry(reorder->list.next,
 				    struct padata_priv, list);
-		if (unlikely(next_overrun)) {
+		BUG_ON(next_nr != padata->seq_nr);
-			for_each_cpu(cpu, pd->cpumask) {
-				queue = per_cpu_ptr(pd->queue, cpu);
-				atomic_set(&queue->num_obj, 0);
-			}
-		}
 		spin_lock(&reorder->lock);
 		list_del_init(&padata->list);
 		atomic_dec(&pd->reorder_objects);
 		spin_unlock(&reorder->lock);
-		atomic_inc(&next_queue->num_obj);
+		pd->processed++;
 		goto out;
 	}
@@ -430,7 +398,6 @@ static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst,
 		INIT_WORK(&queue->pwork, padata_parallel_worker);
 		INIT_WORK(&queue->swork, padata_serial_worker);
-		atomic_set(&queue->num_obj, 0);
 	}
 	num_cpus = cpumask_weight(pd->cpumask);