
Commit 096a705

Merge branch 'for-3.1/core' of git://git.kernel.dk/linux-block
* 'for-3.1/core' of git://git.kernel.dk/linux-block: (24 commits)
  block: strict rq_affinity
  backing-dev: use synchronize_rcu_expedited instead of synchronize_rcu
  block: fix patch import error in max_discard_sectors check
  block: reorder request_queue to remove 64 bit alignment padding
  CFQ: add think time check for group
  CFQ: add think time check for service tree
  CFQ: move think time check variables to a separate struct
  fixlet: Remove fs_excl from struct task.
  cfq: Remove special treatment for metadata rqs.
  block: document blk_plug list access
  block: avoid building too big plug list
  compat_ioctl: fix make headers_check regression
  block: eliminate potential for infinite loop in blkdev_issue_discard
  compat_ioctl: fix warning caused by qemu
  block: flush MEDIA_CHANGE from drivers on close(2)
  blk-throttle: Make total_nr_queued unsigned
  block: Add __attribute__((format(printf...) and fix fallout
  fs/partitions/check.c: make local symbols static
  block:remove some spare spaces in genhd.c
  block:fix the comment error in blkdev.h
  ...
2 parents fea8031 + 5757a6d commit 096a705

28 files changed: +229 -208 lines

Documentation/block/queue-sysfs.txt  (+7 -3)

@@ -45,9 +45,13 @@ device.
 
 rq_affinity (RW)
 ----------------
-If this option is enabled, the block layer will migrate request completions
-to the CPU that originally submitted the request. For some workloads
-this provides a significant reduction in CPU cycles due to caching effects.
+If this option is '1', the block layer will migrate request completions to the
+cpu "group" that originally submitted the request. For some workloads this
+provides a significant reduction in CPU cycles due to caching effects.
+
+For storage configurations that need to maximize distribution of completion
+processing setting this option to '2' forces the completion to run on the
+requesting cpu (bypassing the "group" aggregation logic).
 
 scheduler (RW)
 --------------
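
The rq_affinity knob documented above lives with the other queue attributes under /sys/block/<dev>/queue/. As a minimal sketch of how it can be read and changed from userspace (the device name "sda" is only an example, and writing normally requires root):

/* Sketch: read and set /sys/block/<dev>/queue/rq_affinity.
 * "sda" is an example device; substitute your own disk. */
#include <stdio.h>

int main(void)
{
	const char *path = "/sys/block/sda/queue/rq_affinity";
	char buf[8];
	FILE *f;

	f = fopen(path, "r");
	if (!f) {
		perror("open for read");
		return 1;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("current rq_affinity: %s", buf);
	fclose(f);

	/* '2' forces completion on the requesting cpu itself,
	 * '1' steers it to the requesting cpu's group, '0' disables. */
	f = fopen(path, "w");
	if (!f) {
		perror("open for write");	/* usually needs root */
		return 1;
	}
	fputs("2\n", f);
	fclose(f);
	return 0;
}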

block/blk-core.c  (+7 -4)

@@ -1282,10 +1282,8 @@ static int __make_request(struct request_queue *q, struct bio *bio)
 	init_request_from_bio(req, bio);
 
 	if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) ||
-	    bio_flagged(bio, BIO_CPU_AFFINE)) {
-		req->cpu = blk_cpu_to_group(get_cpu());
-		put_cpu();
-	}
+	    bio_flagged(bio, BIO_CPU_AFFINE))
+		req->cpu = smp_processor_id();
 
 	plug = current->plug;
 	if (plug) {
@@ -1305,7 +1303,10 @@ static int __make_request(struct request_queue *q, struct bio *bio)
 			plug->should_sort = 1;
 		}
 		list_add_tail(&req->queuelist, &plug->list);
+		plug->count++;
 		drive_stat_acct(req, 1);
+		if (plug->count >= BLK_MAX_REQUEST_COUNT)
+			blk_flush_plug_list(plug, false);
 	} else {
 		spin_lock_irq(q->queue_lock);
 		add_acct_request(q, req, where);
@@ -2629,6 +2630,7 @@ void blk_start_plug(struct blk_plug *plug)
 	INIT_LIST_HEAD(&plug->list);
 	INIT_LIST_HEAD(&plug->cb_list);
 	plug->should_sort = 0;
+	plug->count = 0;
 
 	/*
	 * If this is a nested plug, don't actually assign it. It will be
@@ -2712,6 +2714,7 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
 		return;
 
 	list_splice_init(&plug->list, &list);
+	plug->count = 0;
 
 	if (plug->should_sort) {
 		list_sort(NULL, &list, plug_rq_cmp);
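
These hunks cap how large a task's plug list can grow: each plugged request bumps plug->count, and once it reaches BLK_MAX_REQUEST_COUNT the list is flushed early instead of growing without bound. The following is a standalone userspace sketch of the same flush-at-threshold pattern; the names and the threshold of 16 are illustrative, not the kernel's:

/* Sketch of the "flush the plug once it gets too big" pattern.
 * MAX_BATCH plays the role of BLK_MAX_REQUEST_COUNT; this is
 * illustrative userspace code, not kernel code. */
#include <stdio.h>

#define MAX_BATCH 16

struct plug {
	int items[MAX_BATCH];
	int count;
};

static void flush_plug(struct plug *p)
{
	/* Hand the whole batch off at once, then reset the counter,
	 * mirroring blk_flush_plug_list() zeroing plug->count. */
	printf("flushing %d queued items\n", p->count);
	p->count = 0;
}

static void queue_item(struct plug *p, int item)
{
	p->items[p->count++] = item;
	if (p->count >= MAX_BATCH)	/* cap the batch size */
		flush_plug(p);
}

int main(void)
{
	struct plug p = { .count = 0 };

	for (int i = 0; i < 40; i++)
		queue_item(&p, i);
	if (p.count)			/* final partial batch */
		flush_plug(&p);
	return 0;
}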

block/blk-ioc.c  (+20 -20)

@@ -82,26 +82,26 @@ void exit_io_context(struct task_struct *task)
 
 struct io_context *alloc_io_context(gfp_t gfp_flags, int node)
 {
-	struct io_context *ret;
+	struct io_context *ioc;
 
-	ret = kmem_cache_alloc_node(iocontext_cachep, gfp_flags, node);
-	if (ret) {
-		atomic_long_set(&ret->refcount, 1);
-		atomic_set(&ret->nr_tasks, 1);
-		spin_lock_init(&ret->lock);
-		ret->ioprio_changed = 0;
-		ret->ioprio = 0;
-		ret->last_waited = 0; /* doesn't matter... */
-		ret->nr_batch_requests = 0; /* because this is 0 */
-		INIT_RADIX_TREE(&ret->radix_root, GFP_ATOMIC | __GFP_HIGH);
-		INIT_HLIST_HEAD(&ret->cic_list);
-		ret->ioc_data = NULL;
+	ioc = kmem_cache_alloc_node(iocontext_cachep, gfp_flags, node);
+	if (ioc) {
+		atomic_long_set(&ioc->refcount, 1);
+		atomic_set(&ioc->nr_tasks, 1);
+		spin_lock_init(&ioc->lock);
+		ioc->ioprio_changed = 0;
+		ioc->ioprio = 0;
+		ioc->last_waited = 0; /* doesn't matter... */
+		ioc->nr_batch_requests = 0; /* because this is 0 */
+		INIT_RADIX_TREE(&ioc->radix_root, GFP_ATOMIC | __GFP_HIGH);
+		INIT_HLIST_HEAD(&ioc->cic_list);
+		ioc->ioc_data = NULL;
 #if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE)
-		ret->cgroup_changed = 0;
+		ioc->cgroup_changed = 0;
 #endif
 	}
 
-	return ret;
+	return ioc;
 }
 
 /*
@@ -139,19 +139,19 @@ struct io_context *current_io_context(gfp_t gfp_flags, int node)
  */
 struct io_context *get_io_context(gfp_t gfp_flags, int node)
 {
-	struct io_context *ret = NULL;
+	struct io_context *ioc = NULL;
 
 	/*
	 * Check for unlikely race with exiting task. ioc ref count is
	 * zero when ioc is being detached.
	 */
 	do {
-		ret = current_io_context(gfp_flags, node);
-		if (unlikely(!ret))
+		ioc = current_io_context(gfp_flags, node);
+		if (unlikely(!ioc))
 			break;
-	} while (!atomic_long_inc_not_zero(&ret->refcount));
+	} while (!atomic_long_inc_not_zero(&ioc->refcount));
 
-	return ret;
+	return ioc;
 }
 EXPORT_SYMBOL(get_io_context);
 

block/blk-lib.c  (+4 -1)

@@ -59,7 +59,10 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 	 * granularity
 	 */
 	max_discard_sectors = min(q->limits.max_discard_sectors, UINT_MAX >> 9);
-	if (q->limits.discard_granularity) {
+	if (unlikely(!max_discard_sectors)) {
+		/* Avoid infinite loop below. Being cautious never hurts. */
+		return -EOPNOTSUPP;
+	} else if (q->limits.discard_granularity) {
 		unsigned int disc_sects = q->limits.discard_granularity >> 9;
 
 		max_discard_sectors &= ~(disc_sects - 1);
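
The new check matters because blkdev_issue_discard() splits the requested range into chunks of at most max_discard_sectors; if a device reports a limit of zero, the split loop never makes progress. A hedged standalone sketch of that failure mode (not the kernel function itself):

/* Illustration of why a zero max_discard_sectors must be rejected:
 * the chunking loop advances by max_chunk sectors per pass, so a
 * zero limit would never terminate. Standalone sketch only. */
#include <stdio.h>

typedef unsigned long long sector_t;

static int issue_discard(sector_t sector, sector_t nr_sects, sector_t max_chunk)
{
	if (!max_chunk)			/* the new bail-out */
		return -1;		/* the kernel returns -EOPNOTSUPP */

	while (nr_sects) {
		sector_t this = nr_sects > max_chunk ? max_chunk : nr_sects;

		printf("discard %llu sectors at %llu\n", this, sector);
		sector += this;
		nr_sects -= this;	/* with max_chunk == 0 this never shrinks */
	}
	return 0;
}

int main(void)
{
	issue_discard(0, 1024, 256);	/* four chunks */
	if (issue_discard(0, 1024, 0))	/* rejected instead of spinning */
		printf("zero discard limit rejected\n");
	return 0;
}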

block/blk-softirq.c  (+7 -4)

@@ -103,22 +103,25 @@ static struct notifier_block __cpuinitdata blk_cpu_notifier = {
 
 void __blk_complete_request(struct request *req)
 {
+	int ccpu, cpu, group_cpu = NR_CPUS;
 	struct request_queue *q = req->q;
 	unsigned long flags;
-	int ccpu, cpu, group_cpu;
 
 	BUG_ON(!q->softirq_done_fn);
 
 	local_irq_save(flags);
 	cpu = smp_processor_id();
-	group_cpu = blk_cpu_to_group(cpu);
 
 	/*
	 * Select completion CPU
	 */
-	if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) && req->cpu != -1)
+	if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) && req->cpu != -1) {
 		ccpu = req->cpu;
-	else
+		if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags)) {
+			ccpu = blk_cpu_to_group(ccpu);
+			group_cpu = blk_cpu_to_group(cpu);
+		}
+	} else
 		ccpu = cpu;
 
 	if (ccpu == cpu || ccpu == group_cpu) {
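
Put together, the selection now works as follows: with rq_affinity=1 the completion may run anywhere in the submitting CPU's group, while rq_affinity=2 (QUEUE_FLAG_SAME_FORCE) insists on the submitting CPU itself. Below is a standalone sketch of that decision; cpu_to_group() stands in for the kernel's blk_cpu_to_group() and the toy grouping is an assumption for illustration:

/* Standalone sketch of the completion-CPU selection in
 * __blk_complete_request() after this change. */
#include <stdio.h>
#include <stdbool.h>

#define NR_CPUS 8

static int cpu_to_group(int cpu)
{
	return cpu & ~1;		/* toy grouping: {0,1}, {2,3}, ... */
}

/* Returns true if the completion can run locally (no IPI needed). */
static bool complete_locally(bool same_comp, bool same_force,
			     int req_cpu, int cur_cpu)
{
	int ccpu, group_cpu = NR_CPUS;

	if (same_comp && req_cpu != -1) {
		ccpu = req_cpu;
		if (!same_force) {		/* rq_affinity == 1 */
			ccpu = cpu_to_group(ccpu);
			group_cpu = cpu_to_group(cur_cpu);
		}
	} else {
		ccpu = cur_cpu;
	}

	return ccpu == cur_cpu || ccpu == group_cpu;
}

int main(void)
{
	/* rq_affinity=1: a request submitted on CPU 2 completing on CPU 3
	 * stays local because both are in the same group (prints 1). */
	printf("group mode, same group:  %d\n",
	       complete_locally(true, false, 2, 3));
	/* rq_affinity=2: the same request must go back to CPU 2 (prints 0). */
	printf("forced mode, other cpu:  %d\n",
	       complete_locally(true, true, 2, 3));
	return 0;
}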

block/blk-sysfs.c  (+9 -4)

@@ -244,8 +244,9 @@ static ssize_t queue_nomerges_store(struct request_queue *q, const char *page,
 static ssize_t queue_rq_affinity_show(struct request_queue *q, char *page)
 {
 	bool set = test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags);
+	bool force = test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags);
 
-	return queue_var_show(set, page);
+	return queue_var_show(set << force, page);
 }
 
 static ssize_t
@@ -257,10 +258,14 @@ queue_rq_affinity_store(struct request_queue *q, const char *page, size_t count)
 
 	ret = queue_var_store(&val, page, count);
 	spin_lock_irq(q->queue_lock);
-	if (val)
+	if (val) {
 		queue_flag_set(QUEUE_FLAG_SAME_COMP, q);
-	else
-		queue_flag_clear(QUEUE_FLAG_SAME_COMP, q);
+		if (val == 2)
+			queue_flag_set(QUEUE_FLAG_SAME_FORCE, q);
+	} else {
+		queue_flag_clear(QUEUE_FLAG_SAME_COMP, q);
+		queue_flag_clear(QUEUE_FLAG_SAME_FORCE, q);
+	}
 	spin_unlock_irq(q->queue_lock);
 #endif
 	return ret;
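
The show path folds the two queue flags into a single value with set << force: 0 when completion affinity is off, 1 for plain group mode, and 2 when QUEUE_FLAG_SAME_FORCE is also set (the store path clears both flags together, so "force without set" never occurs). A small standalone sketch of that encoding:

/* Sketch of the rq_affinity value reported by queue_rq_affinity_show():
 * the two queue flags are folded into one integer as (set << force). */
#include <stdio.h>
#include <stdbool.h>

static unsigned int rq_affinity_value(bool same_comp, bool same_force)
{
	return same_comp << same_force;
}

int main(void)
{
	printf("off:          %u\n", rq_affinity_value(false, false)); /* 0 */
	printf("group mode:   %u\n", rq_affinity_value(true, false));  /* 1 */
	printf("forced mode:  %u\n", rq_affinity_value(true, true));   /* 2 */
	return 0;
}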

block/blk-throttle.c  (+4 -4)

@@ -142,9 +142,9 @@ static inline struct throtl_grp *tg_of_blkg(struct blkio_group *blkg)
 	return NULL;
 }
 
-static inline int total_nr_queued(struct throtl_data *td)
+static inline unsigned int total_nr_queued(struct throtl_data *td)
 {
-	return (td->nr_queued[0] + td->nr_queued[1]);
+	return td->nr_queued[0] + td->nr_queued[1];
 }
 
 static inline struct throtl_grp *throtl_ref_get_tg(struct throtl_grp *tg)
@@ -927,7 +927,7 @@ static int throtl_dispatch(struct request_queue *q)
 
 	bio_list_init(&bio_list_on_stack);
 
-	throtl_log(td, "dispatch nr_queued=%d read=%u write=%u",
+	throtl_log(td, "dispatch nr_queued=%u read=%u write=%u",
 			total_nr_queued(td), td->nr_queued[READ],
 			td->nr_queued[WRITE]);
 
@@ -970,7 +970,7 @@ throtl_schedule_delayed_work(struct throtl_data *td, unsigned long delay)
 	struct delayed_work *dwork = &td->throtl_work;
 
 	/* schedule work if limits changed even if no bio is queued */
-	if (total_nr_queued(td) > 0 || td->limits_changed) {
+	if (total_nr_queued(td) || td->limits_changed) {
 		/*
		 * We might have a work scheduled to be executed in future.
		 * Cancel that and schedule a new one.
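
With total_nr_queued() returning unsigned int, the explicit "> 0" comparison adds nothing (an unsigned value is either zero or positive), and the throtl_log() format switches to %u so it matches the new return type under the printf-format checking added elsewhere in this series. A trivial standalone illustration (not kernel code):

/* Trivial sketch: for an unsigned count, "x > 0" and "x" test the
 * same thing, and %u is the matching printf conversion. */
#include <stdio.h>

static unsigned int total_nr_queued(const unsigned int nr_queued[2])
{
	return nr_queued[0] + nr_queued[1];
}

int main(void)
{
	unsigned int nr_queued[2] = { 3, 5 };
	unsigned int total = total_nr_queued(nr_queued);

	if (total)	/* equivalent to total > 0 for unsigned values */
		printf("dispatch nr_queued=%u\n", total);
	return 0;
}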
