Commit 37c2578c authored by Dexuan Cui, committed by Greg Kroah-Hartman

Drivers: hv: vmbus: Offload the handling of channels to two workqueues

vmbus_process_offer() must not call channel->sc_creation_callback()
directly for sub-channels, because sc_creation_callback() ->
vmbus_open() may never receive the host's response to the
OPEN_CHANNEL message (the host may rescind a channel at any time,
e.g. when a NIC is hot-removed), and vmbus_onoffer_rescind()
may not wake up vmbus_open() because it is itself blocked by the
non-zero vmbus_connection.offer_in_progress; the result is a deadlock.

The same applies to primary channels if the related device drivers
use synchronous probing by default.

Moreover, the handling of primary channels and sub-channels can
depend on each other, so they should be offloaded to different
workqueues to avoid a possible deadlock. For example, in sync-probing
mode NIC1's netvsc_subchan_work() can race with NIC2's netvsc_probe() ->
rtnl_lock() and deadlock: the former takes the rtnl_lock and waits
for all the sub-channels to appear, while the latter cannot take the
rtnl_lock, which blocks the handling of the sub-channels.

The patch fixes the multiple-NIC deadlock described above for v3.x
kernels (e.g. RHEL 7.x), which don't support async-probing of devices,
and for v4.4, v4.9, v4.14 and v4.18, which support async-probing but
don't (yet) enable it for Hyper-V drivers.

The patch also fixes the sub-channel handling hang described above
for all kernel versions, including v4.19 and v4.20-rc4.

So the patch should be applied to all existing kernels, not only
those that contain commit 8195b139.

Fixes: 8195b139 ("hv_netvsc: fix deadlock on hotplug")
Cc: stable@vger.kernel.org
Cc: Stephen Hemminger <sthemmin@microsoft.com>
Cc: K. Y. Srinivasan <kys@microsoft.com>
Cc: Haiyang Zhang <haiyangz@microsoft.com>
Signed-off-by: Dexuan Cui <decui@microsoft.com>
Signed-off-by: K. Y. Srinivasan <kys@microsoft.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
parent 25956467
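
Before the diff, here is a minimal, self-contained sketch of the offloading pattern the patch applies: each channel object embeds a work_struct, and primary-channel vs. sub-channel work items are queued on two dedicated workqueues so that neither class of work can block the other. All of the demo_* names and the "demo_pri"/"demo_sub" queue names below are hypothetical and exist only for illustration; only the workqueue API itself (create_workqueue(), INIT_WORK(), queue_work(), destroy_workqueue()) is real kernel API. The actual vmbus code added by the patch follows in the diff.

/* Hypothetical illustration only -- not the vmbus code itself. */
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

static struct workqueue_struct *pri_wq; /* analogous to handle_primary_chan_wq */
static struct workqueue_struct *sub_wq; /* analogous to handle_sub_chan_wq */

struct demo_channel {                   /* hypothetical stand-in for vmbus_channel */
        bool is_sub_channel;
        struct work_struct add_work;    /* analogous to add_channel_work */
};

static void demo_add_channel_work(struct work_struct *work)
{
        struct demo_channel *chan =
                container_of(work, struct demo_channel, add_work);

        /* Potentially slow or blocking channel setup runs here, off the offer path. */
        pr_info("handling %s channel\n", chan->is_sub_channel ? "sub" : "primary");
        kfree(chan);
}

static void demo_process_offer(bool is_sub_channel)
{
        struct demo_channel *chan = kzalloc(sizeof(*chan), GFP_KERNEL);

        if (!chan)
                return;
        chan->is_sub_channel = is_sub_channel;
        INIT_WORK(&chan->add_work, demo_add_channel_work);
        /*
         * Primary and sub-channel items go to different queues, so one
         * class of work can never block the other the way a single
         * shared queue would allow.
         */
        queue_work(is_sub_channel ? sub_wq : pri_wq, &chan->add_work);
}

static int __init demo_init(void)
{
        pri_wq = create_workqueue("demo_pri");
        sub_wq = create_workqueue("demo_sub");
        if (!pri_wq || !sub_wq) {
                if (pri_wq)
                        destroy_workqueue(pri_wq);
                if (sub_wq)
                        destroy_workqueue(sub_wq);
                return -ENOMEM;
        }
        demo_process_offer(false);      /* a "primary channel" offer */
        demo_process_offer(true);       /* a "sub-channel" offer */
        return 0;
}

static void __exit demo_exit(void)
{
        destroy_workqueue(sub_wq);      /* destroy_workqueue() flushes pending work */
        destroy_workqueue(pri_wq);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");

In the patch below, vmbus_process_offer() plays the role of demo_process_offer() and vmbus_add_channel_work() the role of demo_add_channel_work(); the two real workqueues are created in vmbus_connect() and destroyed in vmbus_disconnect().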
--- a/drivers/hv/channel_mgmt.c
+++ b/drivers/hv/channel_mgmt.c
@@ -435,61 +435,16 @@ void vmbus_free_channels(void)
         }
 }
 
-/*
- * vmbus_process_offer - Process the offer by creating a channel/device
- * associated with this offer
- */
-static void vmbus_process_offer(struct vmbus_channel *newchannel)
+/* Note: the function can run concurrently for primary/sub channels. */
+static void vmbus_add_channel_work(struct work_struct *work)
 {
-        struct vmbus_channel *channel;
-        bool fnew = true;
+        struct vmbus_channel *newchannel =
+                container_of(work, struct vmbus_channel, add_channel_work);
+        struct vmbus_channel *primary_channel = newchannel->primary_channel;
         unsigned long flags;
         u16 dev_type;
         int ret;
 
-        /* Make sure this is a new offer */
-        mutex_lock(&vmbus_connection.channel_mutex);
-
-        /*
-         * Now that we have acquired the channel_mutex,
-         * we can release the potentially racing rescind thread.
-         */
-        atomic_dec(&vmbus_connection.offer_in_progress);
-
-        list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) {
-                if (!uuid_le_cmp(channel->offermsg.offer.if_type,
-                        newchannel->offermsg.offer.if_type) &&
-                        !uuid_le_cmp(channel->offermsg.offer.if_instance,
-                        newchannel->offermsg.offer.if_instance)) {
-                        fnew = false;
-                        break;
-                }
-        }
-
-        if (fnew)
-                list_add_tail(&newchannel->listentry,
-                              &vmbus_connection.chn_list);
-
-        mutex_unlock(&vmbus_connection.channel_mutex);
-
-        if (!fnew) {
-                /*
-                 * Check to see if this is a sub-channel.
-                 */
-                if (newchannel->offermsg.offer.sub_channel_index != 0) {
-                        /*
-                         * Process the sub-channel.
-                         */
-                        newchannel->primary_channel = channel;
-                        spin_lock_irqsave(&channel->lock, flags);
-                        list_add_tail(&newchannel->sc_list, &channel->sc_list);
-                        channel->num_sc++;
-                        spin_unlock_irqrestore(&channel->lock, flags);
-                } else {
-                        goto err_free_chan;
-                }
-        }
-
         dev_type = hv_get_dev_type(newchannel);
 
         init_vp_index(newchannel, dev_type);
@@ -507,27 +462,26 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel)
         /*
          * This state is used to indicate a successful open
          * so that when we do close the channel normally, we
-         * can cleanup properly
+         * can cleanup properly.
          */
         newchannel->state = CHANNEL_OPEN_STATE;
 
-        if (!fnew) {
-                struct hv_device *dev
-                        = newchannel->primary_channel->device_obj;
+        if (primary_channel != NULL) {
+                /* newchannel is a sub-channel. */
+                struct hv_device *dev = primary_channel->device_obj;
 
                 if (vmbus_add_channel_kobj(dev, newchannel))
-                        goto err_free_chan;
+                        goto err_deq_chan;
+
+                if (primary_channel->sc_creation_callback != NULL)
+                        primary_channel->sc_creation_callback(newchannel);
 
-                if (channel->sc_creation_callback != NULL)
-                        channel->sc_creation_callback(newchannel);
                 newchannel->probe_done = true;
                 return;
         }
 
         /*
-         * Start the process of binding this offer to the driver
-         * We need to set the DeviceObject field before calling
-         * vmbus_child_dev_add()
+         * Start the process of binding the primary channel to the driver
          */
         newchannel->device_obj = vmbus_device_create(
                 &newchannel->offermsg.offer.if_type,
@@ -556,13 +510,28 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel)
 err_deq_chan:
         mutex_lock(&vmbus_connection.channel_mutex);
-        list_del(&newchannel->listentry);
+
+        /*
+         * We need to set the flag, otherwise
+         * vmbus_onoffer_rescind() can be blocked.
+         */
+        newchannel->probe_done = true;
+
+        if (primary_channel == NULL) {
+                list_del(&newchannel->listentry);
+        } else {
+                spin_lock_irqsave(&primary_channel->lock, flags);
+                list_del(&newchannel->sc_list);
+                spin_unlock_irqrestore(&primary_channel->lock, flags);
+        }
+
         mutex_unlock(&vmbus_connection.channel_mutex);
 
         if (newchannel->target_cpu != get_cpu()) {
                 put_cpu();
                 smp_call_function_single(newchannel->target_cpu,
-                                         percpu_channel_deq, newchannel, true);
+                                         percpu_channel_deq,
+                                         newchannel, true);
         } else {
                 percpu_channel_deq(newchannel);
                 put_cpu();
@@ -570,14 +539,104 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel)
         vmbus_release_relid(newchannel->offermsg.child_relid);
 
-err_free_chan:
         free_channel(newchannel);
 }
 
+/*
+ * vmbus_process_offer - Process the offer by creating a channel/device
+ * associated with this offer
+ */
+static void vmbus_process_offer(struct vmbus_channel *newchannel)
+{
+        struct vmbus_channel *channel;
+        struct workqueue_struct *wq;
+        unsigned long flags;
+        bool fnew = true;
+
+        mutex_lock(&vmbus_connection.channel_mutex);
+
+        /*
+         * Now that we have acquired the channel_mutex,
+         * we can release the potentially racing rescind thread.
+         */
+        atomic_dec(&vmbus_connection.offer_in_progress);
+
+        list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) {
+                if (!uuid_le_cmp(channel->offermsg.offer.if_type,
+                                 newchannel->offermsg.offer.if_type) &&
+                    !uuid_le_cmp(channel->offermsg.offer.if_instance,
+                                 newchannel->offermsg.offer.if_instance)) {
+                        fnew = false;
+                        break;
+                }
+        }
+
+        if (fnew)
+                list_add_tail(&newchannel->listentry,
+                              &vmbus_connection.chn_list);
+        else {
+                /*
+                 * Check to see if this is a valid sub-channel.
+                 */
+                if (newchannel->offermsg.offer.sub_channel_index == 0) {
+                        mutex_unlock(&vmbus_connection.channel_mutex);
+                        /*
+                         * Don't call free_channel(), because newchannel->kobj
+                         * is not initialized yet.
+                         */
+                        kfree(newchannel);
+                        WARN_ON_ONCE(1);
+                        return;
+                }
+                /*
+                 * Process the sub-channel.
+                 */
+                newchannel->primary_channel = channel;
+                spin_lock_irqsave(&channel->lock, flags);
+                list_add_tail(&newchannel->sc_list, &channel->sc_list);
+                spin_unlock_irqrestore(&channel->lock, flags);
+        }
+
+        mutex_unlock(&vmbus_connection.channel_mutex);
+
+        /*
+         * vmbus_process_offer() mustn't call channel->sc_creation_callback()
+         * directly for sub-channels, because sc_creation_callback() ->
+         * vmbus_open() may never get the host's response to the
+         * OPEN_CHANNEL message (the host may rescind a channel at any time,
+         * e.g. in the case of hot removing a NIC), and vmbus_onoffer_rescind()
+         * may not wake up the vmbus_open() as it's blocked due to a non-zero
+         * vmbus_connection.offer_in_progress, and finally we have a deadlock.
+         *
+         * The above is also true for primary channels, if the related device
+         * drivers use sync probing mode by default.
+         *
+         * And, usually the handling of primary channels and sub-channels can
+         * depend on each other, so we should offload them to different
+         * workqueues to avoid possible deadlock, e.g. in sync-probing mode,
+         * NIC1's netvsc_subchan_work() can race with NIC2's netvsc_probe() ->
+         * rtnl_lock(), and causes deadlock: the former gets the rtnl_lock
+         * and waits for all the sub-channels to appear, but the latter
+         * can't get the rtnl_lock and this blocks the handling of
+         * sub-channels.
+         */
+        INIT_WORK(&newchannel->add_channel_work, vmbus_add_channel_work);
+        wq = fnew ? vmbus_connection.handle_primary_chan_wq :
+                    vmbus_connection.handle_sub_chan_wq;
+        queue_work(wq, &newchannel->add_channel_work);
+}
+
 /*
  * We use this state to statically distribute the channel interrupt load.
  */
 static int next_numa_node_id;
+/*
+ * init_vp_index() accesses global variables like next_numa_node_id, and
+ * it can run concurrently for primary channels and sub-channels: see
+ * vmbus_process_offer(), so we need the lock to protect the global
+ * variables.
+ */
+static DEFINE_SPINLOCK(bind_channel_to_cpu_lock);
 
 /*
  * Starting with Win8, we can statically distribute the incoming
@@ -613,6 +672,8 @@ static void init_vp_index(struct vmbus_channel *channel, u16 dev_type)
                 return;
         }
 
+        spin_lock(&bind_channel_to_cpu_lock);
+
         /*
          * Based on the channel affinity policy, we will assign the NUMA
          * nodes.
@@ -695,6 +756,8 @@ static void init_vp_index(struct vmbus_channel *channel, u16 dev_type)
         channel->target_cpu = cur_cpu;
         channel->target_vp = hv_cpu_number_to_vp_number(cur_cpu);
 
+        spin_unlock(&bind_channel_to_cpu_lock);
+
         free_cpumask_var(available_mask);
 }
--- a/drivers/hv/connection.c
+++ b/drivers/hv/connection.c
@@ -190,6 +190,20 @@ int vmbus_connect(void)
                 goto cleanup;
         }
 
+        vmbus_connection.handle_primary_chan_wq =
+                create_workqueue("hv_pri_chan");
+        if (!vmbus_connection.handle_primary_chan_wq) {
+                ret = -ENOMEM;
+                goto cleanup;
+        }
+
+        vmbus_connection.handle_sub_chan_wq =
+                create_workqueue("hv_sub_chan");
+        if (!vmbus_connection.handle_sub_chan_wq) {
+                ret = -ENOMEM;
+                goto cleanup;
+        }
+
         INIT_LIST_HEAD(&vmbus_connection.chn_msg_list);
         spin_lock_init(&vmbus_connection.channelmsg_lock);
@@ -280,10 +294,14 @@ void vmbus_disconnect(void)
          */
         vmbus_initiate_unload(false);
 
-        if (vmbus_connection.work_queue) {
-                drain_workqueue(vmbus_connection.work_queue);
+        if (vmbus_connection.handle_sub_chan_wq)
+                destroy_workqueue(vmbus_connection.handle_sub_chan_wq);
+
+        if (vmbus_connection.handle_primary_chan_wq)
+                destroy_workqueue(vmbus_connection.handle_primary_chan_wq);
+
+        if (vmbus_connection.work_queue)
                 destroy_workqueue(vmbus_connection.work_queue);
-        }
 
         if (vmbus_connection.int_page) {
                 free_pages((unsigned long)vmbus_connection.int_page, 0);
--- a/drivers/hv/hyperv_vmbus.h
+++ b/drivers/hv/hyperv_vmbus.h
@@ -335,7 +335,14 @@ struct vmbus_connection {
         struct list_head chn_list;
         struct mutex channel_mutex;
 
+        /*
+         * An offer message is handled first on the work_queue, and then
+         * is further handled on handle_primary_chan_wq or
+         * handle_sub_chan_wq.
+         */
         struct workqueue_struct *work_queue;
+        struct workqueue_struct *handle_primary_chan_wq;
+        struct workqueue_struct *handle_sub_chan_wq;
 };
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -905,6 +905,13 @@ struct vmbus_channel {
         bool probe_done;
 
+        /*
+         * We must offload the handling of the primary/sub channels
+         * from the single-threaded vmbus_connection.work_queue to
+         * two different workqueue, otherwise we can block
+         * vmbus_connection.work_queue and hang: see vmbus_process_offer().
+         */
+        struct work_struct add_channel_work;
 };
 
 static inline bool is_hvsock_channel(const struct vmbus_channel *c)