Commit a88eb6be authored by David S. Miller

Merge branch 'tipc-link-starvation'

Jon Maloy says:

====================
tipc: improve interaction socket-link

We fix a very real starvation problem that may occur when a link
encounters send buffer congestion. At the same time we make the
interaction between the socket and link layer simpler and more
consistent.
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents aa276dd7 365ad353
...@@ -174,7 +174,7 @@ static void tipc_bcbase_xmit(struct net *net, struct sk_buff_head *xmitq) ...@@ -174,7 +174,7 @@ static void tipc_bcbase_xmit(struct net *net, struct sk_buff_head *xmitq)
* and to identified node local sockets * and to identified node local sockets
* @net: the applicable net namespace * @net: the applicable net namespace
* @list: chain of buffers containing message * @list: chain of buffers containing message
* Consumes the buffer chain, except when returning -ELINKCONG * Consumes the buffer chain.
* Returns 0 if success, otherwise errno: -ELINKCONG,-EHOSTUNREACH,-EMSGSIZE * Returns 0 if success, otherwise errno: -ELINKCONG,-EHOSTUNREACH,-EMSGSIZE
*/ */
int tipc_bcast_xmit(struct net *net, struct sk_buff_head *list) int tipc_bcast_xmit(struct net *net, struct sk_buff_head *list)
...@@ -197,7 +197,7 @@ int tipc_bcast_xmit(struct net *net, struct sk_buff_head *list) ...@@ -197,7 +197,7 @@ int tipc_bcast_xmit(struct net *net, struct sk_buff_head *list)
tipc_bcast_unlock(net); tipc_bcast_unlock(net);
/* Don't send to local node if adding to link failed */ /* Don't send to local node if adding to link failed */
if (unlikely(rc)) { if (unlikely(rc && (rc != -ELINKCONG))) {
__skb_queue_purge(&rcvq); __skb_queue_purge(&rcvq);
return rc; return rc;
} }
...@@ -206,7 +206,7 @@ int tipc_bcast_xmit(struct net *net, struct sk_buff_head *list) ...@@ -206,7 +206,7 @@ int tipc_bcast_xmit(struct net *net, struct sk_buff_head *list)
tipc_bcbase_xmit(net, &xmitq); tipc_bcbase_xmit(net, &xmitq);
tipc_sk_mcast_rcv(net, &rcvq, &inputq); tipc_sk_mcast_rcv(net, &rcvq, &inputq);
__skb_queue_purge(list); __skb_queue_purge(list);
return 0; return rc;
} }
/* tipc_bcast_rcv - receive a broadcast packet, and deliver to rcv link /* tipc_bcast_rcv - receive a broadcast packet, and deliver to rcv link
......
...@@ -776,60 +776,47 @@ int tipc_link_timeout(struct tipc_link *l, struct sk_buff_head *xmitq) ...@@ -776,60 +776,47 @@ int tipc_link_timeout(struct tipc_link *l, struct sk_buff_head *xmitq)
/** /**
* link_schedule_user - schedule a message sender for wakeup after congestion * link_schedule_user - schedule a message sender for wakeup after congestion
* @link: congested link * @l: congested link
* @list: message that was attempted sent * @hdr: header of message that is being sent
* Create pseudo msg to send back to user when congestion abates * Create pseudo msg to send back to user when congestion abates
* Does not consume buffer list
*/ */
static int link_schedule_user(struct tipc_link *link, struct sk_buff_head *list) static int link_schedule_user(struct tipc_link *l, struct tipc_msg *hdr)
{ {
struct tipc_msg *msg = buf_msg(skb_peek(list)); u32 dnode = tipc_own_addr(l->net);
int imp = msg_importance(msg); u32 dport = msg_origport(hdr);
u32 oport = msg_origport(msg);
u32 addr = tipc_own_addr(link->net);
struct sk_buff *skb; struct sk_buff *skb;
/* This really cannot happen... */
if (unlikely(imp > TIPC_CRITICAL_IMPORTANCE)) {
pr_warn("%s<%s>, send queue full", link_rst_msg, link->name);
return -ENOBUFS;
}
/* Non-blocking sender: */
if (TIPC_SKB_CB(skb_peek(list))->wakeup_pending)
return -ELINKCONG;
/* Create and schedule wakeup pseudo message */ /* Create and schedule wakeup pseudo message */
skb = tipc_msg_create(SOCK_WAKEUP, 0, INT_H_SIZE, 0, skb = tipc_msg_create(SOCK_WAKEUP, 0, INT_H_SIZE, 0,
addr, addr, oport, 0, 0); dnode, l->addr, dport, 0, 0);
if (!skb) if (!skb)
return -ENOBUFS; return -ENOBUFS;
TIPC_SKB_CB(skb)->chain_sz = skb_queue_len(list); msg_set_dest_droppable(buf_msg(skb), true);
TIPC_SKB_CB(skb)->chain_imp = imp; TIPC_SKB_CB(skb)->chain_imp = msg_importance(hdr);
skb_queue_tail(&link->wakeupq, skb); skb_queue_tail(&l->wakeupq, skb);
link->stats.link_congs++; l->stats.link_congs++;
return -ELINKCONG; return -ELINKCONG;
} }
/** /**
* link_prepare_wakeup - prepare users for wakeup after congestion * link_prepare_wakeup - prepare users for wakeup after congestion
* @link: congested link * @l: congested link
* Move a number of waiting users, as permitted by available space in * Wake up a number of waiting users, as permitted by available space
* the send queue, from link wait queue to node wait queue for wakeup * in the send queue
*/ */
void link_prepare_wakeup(struct tipc_link *l) void link_prepare_wakeup(struct tipc_link *l)
{ {
int pnd[TIPC_SYSTEM_IMPORTANCE + 1] = {0,};
int imp, lim;
struct sk_buff *skb, *tmp; struct sk_buff *skb, *tmp;
int imp, i = 0;
skb_queue_walk_safe(&l->wakeupq, skb, tmp) { skb_queue_walk_safe(&l->wakeupq, skb, tmp) {
imp = TIPC_SKB_CB(skb)->chain_imp; imp = TIPC_SKB_CB(skb)->chain_imp;
lim = l->backlog[imp].limit; if (l->backlog[imp].len < l->backlog[imp].limit) {
pnd[imp] += TIPC_SKB_CB(skb)->chain_sz; skb_unlink(skb, &l->wakeupq);
if ((pnd[imp] + l->backlog[imp].len) >= lim) skb_queue_tail(l->inputq, skb);
} else if (i++ > 10) {
break; break;
skb_unlink(skb, &l->wakeupq); }
skb_queue_tail(l->inputq, skb);
} }
} }
...@@ -869,8 +856,7 @@ void tipc_link_reset(struct tipc_link *l) ...@@ -869,8 +856,7 @@ void tipc_link_reset(struct tipc_link *l)
* @list: chain of buffers containing message * @list: chain of buffers containing message
* @xmitq: returned list of packets to be sent by caller * @xmitq: returned list of packets to be sent by caller
* *
* Consumes the buffer chain, except when returning -ELINKCONG, * Consumes the buffer chain.
* since the caller then may want to make more send attempts.
* Returns 0 if success, or errno: -ELINKCONG, -EMSGSIZE or -ENOBUFS * Returns 0 if success, or errno: -ELINKCONG, -EMSGSIZE or -ENOBUFS
* Messages at TIPC_SYSTEM_IMPORTANCE are always accepted * Messages at TIPC_SYSTEM_IMPORTANCE are always accepted
*/ */
...@@ -879,7 +865,7 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list, ...@@ -879,7 +865,7 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list,
{ {
struct tipc_msg *hdr = buf_msg(skb_peek(list)); struct tipc_msg *hdr = buf_msg(skb_peek(list));
unsigned int maxwin = l->window; unsigned int maxwin = l->window;
unsigned int i, imp = msg_importance(hdr); int imp = msg_importance(hdr);
unsigned int mtu = l->mtu; unsigned int mtu = l->mtu;
u16 ack = l->rcv_nxt - 1; u16 ack = l->rcv_nxt - 1;
u16 seqno = l->snd_nxt; u16 seqno = l->snd_nxt;
...@@ -888,19 +874,22 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list, ...@@ -888,19 +874,22 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list,
struct sk_buff_head *backlogq = &l->backlogq; struct sk_buff_head *backlogq = &l->backlogq;
struct sk_buff *skb, *_skb, *bskb; struct sk_buff *skb, *_skb, *bskb;
int pkt_cnt = skb_queue_len(list); int pkt_cnt = skb_queue_len(list);
int rc = 0;
/* Match msg importance against this and all higher backlog limits: */
if (!skb_queue_empty(backlogq)) {
for (i = imp; i <= TIPC_SYSTEM_IMPORTANCE; i++) {
if (unlikely(l->backlog[i].len >= l->backlog[i].limit))
return link_schedule_user(l, list);
}
}
if (unlikely(msg_size(hdr) > mtu)) { if (unlikely(msg_size(hdr) > mtu)) {
skb_queue_purge(list); skb_queue_purge(list);
return -EMSGSIZE; return -EMSGSIZE;
} }
/* Allow oversubscription of one data msg per source at congestion */
if (unlikely(l->backlog[imp].len >= l->backlog[imp].limit)) {
if (imp == TIPC_SYSTEM_IMPORTANCE) {
pr_warn("%s<%s>, link overflow", link_rst_msg, l->name);
return -ENOBUFS;
}
rc = link_schedule_user(l, hdr);
}
if (pkt_cnt > 1) { if (pkt_cnt > 1) {
l->stats.sent_fragmented++; l->stats.sent_fragmented++;
l->stats.sent_fragments += pkt_cnt; l->stats.sent_fragments += pkt_cnt;
...@@ -946,7 +935,7 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list, ...@@ -946,7 +935,7 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list,
skb_queue_splice_tail_init(list, backlogq); skb_queue_splice_tail_init(list, backlogq);
} }
l->snd_nxt = seqno; l->snd_nxt = seqno;
return 0; return rc;
} }
void tipc_link_advance_backlog(struct tipc_link *l, struct sk_buff_head *xmitq) void tipc_link_advance_backlog(struct tipc_link *l, struct sk_buff_head *xmitq)
......
...@@ -98,8 +98,6 @@ struct tipc_skb_cb { ...@@ -98,8 +98,6 @@ struct tipc_skb_cb {
u32 bytes_read; u32 bytes_read;
struct sk_buff *tail; struct sk_buff *tail;
bool validated; bool validated;
bool wakeup_pending;
u16 chain_sz;
u16 chain_imp; u16 chain_imp;
u16 ackers; u16 ackers;
}; };
......
...@@ -608,7 +608,7 @@ u32 tipc_nametbl_translate(struct net *net, u32 type, u32 instance, ...@@ -608,7 +608,7 @@ u32 tipc_nametbl_translate(struct net *net, u32 type, u32 instance,
* Returns non-zero if any off-node ports overlap * Returns non-zero if any off-node ports overlap
*/ */
int tipc_nametbl_mc_translate(struct net *net, u32 type, u32 lower, u32 upper, int tipc_nametbl_mc_translate(struct net *net, u32 type, u32 lower, u32 upper,
u32 limit, struct tipc_plist *dports) u32 limit, struct list_head *dports)
{ {
struct name_seq *seq; struct name_seq *seq;
struct sub_seq *sseq; struct sub_seq *sseq;
...@@ -633,7 +633,7 @@ int tipc_nametbl_mc_translate(struct net *net, u32 type, u32 lower, u32 upper, ...@@ -633,7 +633,7 @@ int tipc_nametbl_mc_translate(struct net *net, u32 type, u32 lower, u32 upper,
info = sseq->info; info = sseq->info;
list_for_each_entry(publ, &info->node_list, node_list) { list_for_each_entry(publ, &info->node_list, node_list) {
if (publ->scope <= limit) if (publ->scope <= limit)
tipc_plist_push(dports, publ->ref); u32_push(dports, publ->ref);
} }
if (info->cluster_list_size != info->node_list_size) if (info->cluster_list_size != info->node_list_size)
...@@ -1022,40 +1022,84 @@ int tipc_nl_name_table_dump(struct sk_buff *skb, struct netlink_callback *cb) ...@@ -1022,40 +1022,84 @@ int tipc_nl_name_table_dump(struct sk_buff *skb, struct netlink_callback *cb)
return skb->len; return skb->len;
} }
void tipc_plist_push(struct tipc_plist *pl, u32 port) struct u32_item {
struct list_head list;
u32 value;
};
bool u32_find(struct list_head *l, u32 value)
{ {
struct tipc_plist *nl; struct u32_item *item;
if (likely(!pl->port)) { list_for_each_entry(item, l, list) {
pl->port = port; if (item->value == value)
return; return true;
} }
if (pl->port == port) return false;
return; }
list_for_each_entry(nl, &pl->list, list) {
if (nl->port == port) bool u32_push(struct list_head *l, u32 value)
return; {
struct u32_item *item;
list_for_each_entry(item, l, list) {
if (item->value == value)
return false;
}
item = kmalloc(sizeof(*item), GFP_ATOMIC);
if (unlikely(!item))
return false;
item->value = value;
list_add(&item->list, l);
return true;
}
u32 u32_pop(struct list_head *l)
{
struct u32_item *item;
u32 value = 0;
if (list_empty(l))
return 0;
item = list_first_entry(l, typeof(*item), list);
value = item->value;
list_del(&item->list);
kfree(item);
return value;
}
bool u32_del(struct list_head *l, u32 value)
{
struct u32_item *item, *tmp;
list_for_each_entry_safe(item, tmp, l, list) {
if (item->value != value)
continue;
list_del(&item->list);
kfree(item);
return true;
} }
nl = kmalloc(sizeof(*nl), GFP_ATOMIC); return false;
if (nl) { }
nl->port = port;
list_add(&nl->list, &pl->list); void u32_list_purge(struct list_head *l)
{
struct u32_item *item, *tmp;
list_for_each_entry_safe(item, tmp, l, list) {
list_del(&item->list);
kfree(item);
} }
} }
u32 tipc_plist_pop(struct tipc_plist *pl) int u32_list_len(struct list_head *l)
{ {
struct tipc_plist *nl; struct u32_item *item;
u32 port = 0; int i = 0;
if (likely(list_empty(&pl->list))) { list_for_each_entry(item, l, list) {
port = pl->port; i++;
pl->port = 0;
return port;
} }
nl = list_first_entry(&pl->list, typeof(*nl), list); return i;
port = nl->port;
list_del(&nl->list);
kfree(nl);
return port;
} }
...@@ -99,7 +99,7 @@ int tipc_nl_name_table_dump(struct sk_buff *skb, struct netlink_callback *cb); ...@@ -99,7 +99,7 @@ int tipc_nl_name_table_dump(struct sk_buff *skb, struct netlink_callback *cb);
u32 tipc_nametbl_translate(struct net *net, u32 type, u32 instance, u32 *node); u32 tipc_nametbl_translate(struct net *net, u32 type, u32 instance, u32 *node);
int tipc_nametbl_mc_translate(struct net *net, u32 type, u32 lower, u32 upper, int tipc_nametbl_mc_translate(struct net *net, u32 type, u32 lower, u32 upper,
u32 limit, struct tipc_plist *dports); u32 limit, struct list_head *dports);
struct publication *tipc_nametbl_publish(struct net *net, u32 type, u32 lower, struct publication *tipc_nametbl_publish(struct net *net, u32 type, u32 lower,
u32 upper, u32 scope, u32 port_ref, u32 upper, u32 scope, u32 port_ref,
u32 key); u32 key);
...@@ -116,18 +116,11 @@ void tipc_nametbl_unsubscribe(struct tipc_subscription *s); ...@@ -116,18 +116,11 @@ void tipc_nametbl_unsubscribe(struct tipc_subscription *s);
int tipc_nametbl_init(struct net *net); int tipc_nametbl_init(struct net *net);
void tipc_nametbl_stop(struct net *net); void tipc_nametbl_stop(struct net *net);
struct tipc_plist { bool u32_push(struct list_head *l, u32 value);
struct list_head list; u32 u32_pop(struct list_head *l);
u32 port; bool u32_find(struct list_head *l, u32 value);
}; bool u32_del(struct list_head *l, u32 value);
void u32_list_purge(struct list_head *l);
static inline void tipc_plist_init(struct tipc_plist *pl) int u32_list_len(struct list_head *l);
{
INIT_LIST_HEAD(&pl->list);
pl->port = 0;
}
void tipc_plist_push(struct tipc_plist *pl, u32 port);
u32 tipc_plist_pop(struct tipc_plist *pl);
#endif #endif
...@@ -1167,7 +1167,7 @@ static int __tipc_nl_add_node(struct tipc_nl_msg *msg, struct tipc_node *node) ...@@ -1167,7 +1167,7 @@ static int __tipc_nl_add_node(struct tipc_nl_msg *msg, struct tipc_node *node)
* @list: chain of buffers containing message * @list: chain of buffers containing message
* @dnode: address of destination node * @dnode: address of destination node
* @selector: a number used for deterministic link selection * @selector: a number used for deterministic link selection
* Consumes the buffer chain, except when returning -ELINKCONG * Consumes the buffer chain.
* Returns 0 if success, otherwise: -ELINKCONG,-EHOSTUNREACH,-EMSGSIZE,-ENOBUFS * Returns 0 if success, otherwise: -ELINKCONG,-EHOSTUNREACH,-EMSGSIZE,-ENOBUFS
*/ */
int tipc_node_xmit(struct net *net, struct sk_buff_head *list, int tipc_node_xmit(struct net *net, struct sk_buff_head *list,
...@@ -1206,10 +1206,10 @@ int tipc_node_xmit(struct net *net, struct sk_buff_head *list, ...@@ -1206,10 +1206,10 @@ int tipc_node_xmit(struct net *net, struct sk_buff_head *list,
spin_unlock_bh(&le->lock); spin_unlock_bh(&le->lock);
tipc_node_read_unlock(n); tipc_node_read_unlock(n);
if (likely(rc == 0)) if (unlikely(rc == -ENOBUFS))
tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr);
else if (rc == -ENOBUFS)
tipc_node_link_down(n, bearer_id, false); tipc_node_link_down(n, bearer_id, false);
else
tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr);
tipc_node_put(n); tipc_node_put(n);
...@@ -1221,20 +1221,15 @@ int tipc_node_xmit(struct net *net, struct sk_buff_head *list, ...@@ -1221,20 +1221,15 @@ int tipc_node_xmit(struct net *net, struct sk_buff_head *list,
* messages, which will not be rejected * messages, which will not be rejected
* The only exception is datagram messages rerouted after secondary * The only exception is datagram messages rerouted after secondary
* lookup, which are rare and safe to dispose of anyway. * lookup, which are rare and safe to dispose of anyway.
* TODO: Return real return value, and let callers use
* tipc_wait_for_sendpkt() where applicable
*/ */
int tipc_node_xmit_skb(struct net *net, struct sk_buff *skb, u32 dnode, int tipc_node_xmit_skb(struct net *net, struct sk_buff *skb, u32 dnode,
u32 selector) u32 selector)
{ {
struct sk_buff_head head; struct sk_buff_head head;
int rc;
skb_queue_head_init(&head); skb_queue_head_init(&head);
__skb_queue_tail(&head, skb); __skb_queue_tail(&head, skb);
rc = tipc_node_xmit(net, &head, dnode, selector); tipc_node_xmit(net, &head, dnode, selector);
if (rc == -ELINKCONG)
kfree_skb(skb);
return 0; return 0;
} }
......
This diff is collapsed.
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment