Commit 9dc30648 authored by David S. Miller

Merge branch 'per-route-dctcp-receive-side'

Daniel Borkmann says:

====================
tcp: receive-side per route dctcp handling

Original cover letter:

  Currently, the following case doesn't use DCTCP, even if it should:

    - responder has e.g. cubic as system-wide default
    - 'ip route congctl dctcp $src' was set

  Then, DCTCP is NOT used if a DCTCP sender attempts to connect from a
  host in the $src range: ECT(0) is set, but listen_sk is not dctcp, so
  we fail the INET_ECN_is_not_ect sanity check.

  We also have to examine the dst used for the SYN/ACK reply to make
  this case work.

  In order to minimize additional cost, store the 'ecn is must have'
  information in the dst_features field.

  This set targets -next instead of -net since this doesn't seem to be a
  serious bug and to give the change more soak time until it hits Linus'
  tree.

v1 -> v2:
 - Addressed Dave's feedback, not exposing any bits to user space
 - Added patch 3 to reject incorrect configurations
 - Rest as is, rebased and retested
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents 87583ebb c3a8d947
......@@ -207,6 +207,12 @@ static inline void dst_metric_set(struct dst_entry *dst, int metric, u32 val)
p[metric-1] = val;
}
/*
 * Kernel-internal feature bits that are unallocated in user space.
 *
 * DST_FEATURE_ECN_CA uses bit 31, so the constant must be unsigned:
 * shifting a signed 1 into the sign bit is undefined behaviour and would
 * produce a negative int rather than a plain u32 flag.
 *
 * DST_FEATURE_MASK collects every kernel-internal bit (currently only
 * DST_FEATURE_ECN_CA); DST_FEATURE_ECN_MASK additionally includes the
 * user-visible RTAX_FEATURE_ECN bit.
 */
#define DST_FEATURE_ECN_CA (1U << 31)
#define DST_FEATURE_MASK (DST_FEATURE_ECN_CA)
#define DST_FEATURE_ECN_MASK (DST_FEATURE_ECN_CA | RTAX_FEATURE_ECN)
static inline u32
dst_feature(const struct dst_entry *dst, u32 feature)
{
......
......@@ -888,7 +888,7 @@ void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked);
extern struct tcp_congestion_ops tcp_reno;
struct tcp_congestion_ops *tcp_ca_find_key(u32 key);
u32 tcp_ca_get_key_by_name(const char *name);
u32 tcp_ca_get_key_by_name(const char *name, bool *ecn_ca);
#ifdef CONFIG_INET
char *tcp_ca_get_name_by_key(u32 key, char *buffer);
#else
......
......@@ -418,10 +418,13 @@ enum {
#define RTAX_MAX (__RTAX_MAX - 1)
#define RTAX_FEATURE_ECN 0x00000001
#define RTAX_FEATURE_SACK 0x00000002
#define RTAX_FEATURE_TIMESTAMP 0x00000004
#define RTAX_FEATURE_ALLFRAG 0x00000008
#define RTAX_FEATURE_ECN (1 << 0)
#define RTAX_FEATURE_SACK (1 << 1)
#define RTAX_FEATURE_TIMESTAMP (1 << 2)
#define RTAX_FEATURE_ALLFRAG (1 << 3)
#define RTAX_FEATURE_MASK (RTAX_FEATURE_ECN | RTAX_FEATURE_SACK | \
RTAX_FEATURE_TIMESTAMP | RTAX_FEATURE_ALLFRAG)
struct rta_session {
__u8 proto;
......
......@@ -678,6 +678,12 @@ int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics)
continue;
if (nla_put_string(skb, i + 1, name))
goto nla_put_failure;
} else if (i == RTAX_FEATURES - 1) {
u32 user_features = metrics[i] & RTAX_FEATURE_MASK;
BUILD_BUG_ON(RTAX_FEATURE_MASK & DST_FEATURE_MASK);
if (nla_put_u32(skb, i + 1, user_features))
goto nla_put_failure;
} else {
if (nla_put_u32(skb, i + 1, metrics[i]))
goto nla_put_failure;
......
......@@ -876,6 +876,50 @@ static bool fib_valid_prefsrc(struct fib_config *cfg, __be32 fib_prefsrc)
return true;
}
/*
 * fib_convert_metrics - copy the metrics attributes in cfg->fc_mx into
 * fi->fib_metrics.
 *
 * @fi:  fib_info whose fib_metrics[] array is filled in.
 * @cfg: route configuration; fc_mx/fc_mx_len describe the nested metric
 *       attributes (fc_mx may be NULL, in which case nothing is done).
 *
 * Returns 0 on success, or -EINVAL when
 *   - an attribute type is larger than RTAX_MAX,
 *   - a numeric metric does not carry exactly a u32 payload,
 *   - the RTAX_CC_ALGO name does not resolve to a congestion control key,
 *   - RTAX_FEATURES contains bits outside RTAX_FEATURE_MASK.
 *
 * On error, metrics processed before the offending attribute have already
 * been written to fi->fib_metrics.
 */
static int
fib_convert_metrics(struct fib_info *fi, const struct fib_config *cfg)
{
	bool ecn_ca = false;
	struct nlattr *nla;
	int remaining;

	if (!cfg->fc_mx)
		return 0;

	nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
		int type = nla_type(nla);
		u32 val;

		/* Type 0 carries no metric; skip it. */
		if (!type)
			continue;
		if (type > RTAX_MAX)
			return -EINVAL;

		if (type == RTAX_CC_ALGO) {
			char tmp[TCP_CA_NAME_MAX];

			/* The algorithm is given by name; store its key. */
			nla_strlcpy(tmp, nla, sizeof(tmp));
			val = tcp_ca_get_key_by_name(tmp, &ecn_ca);
			if (val == TCP_CA_UNSPEC)
				return -EINVAL;
		} else {
			/*
			 * The attribute comes from user space; make sure it
			 * really holds a u32 before reading one out of it.
			 */
			if (nla_len(nla) != sizeof(u32))
				return -EINVAL;
			val = nla_get_u32(nla);
		}

		/* Clamp oversized ADVMSS / MTU values to their ceilings. */
		if (type == RTAX_ADVMSS && val > 65535 - 40)
			val = 65535 - 40;
		if (type == RTAX_MTU && val > 65535 - 15)
			val = 65535 - 15;

		/* User space may only set the publicly defined feature bits. */
		if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
			return -EINVAL;

		fi->fib_metrics[type - 1] = val;
	}

	/*
	 * Applied after the loop so that a later RTAX_FEATURES attribute
	 * cannot overwrite the kernel-internal bit.
	 */
	if (ecn_ca)
		fi->fib_metrics[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;

	return 0;
}
struct fib_info *fib_create_info(struct fib_config *cfg)
{
int err;
......@@ -948,36 +992,9 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
goto failure;
} endfor_nexthops(fi)
if (cfg->fc_mx) {
struct nlattr *nla;
int remaining;
nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
int type = nla_type(nla);
if (type) {
u32 val;
if (type > RTAX_MAX)
goto err_inval;
if (type == RTAX_CC_ALGO) {
char tmp[TCP_CA_NAME_MAX];
nla_strlcpy(tmp, nla, sizeof(tmp));
val = tcp_ca_get_key_by_name(tmp);
if (val == TCP_CA_UNSPEC)
goto err_inval;
} else {
val = nla_get_u32(nla);
}
if (type == RTAX_ADVMSS && val > 65535 - 40)
val = 65535 - 40;
if (type == RTAX_MTU && val > 65535 - 15)
val = 65535 - 15;
fi->fib_metrics[type - 1] = val;
}
}
}
err = fib_convert_metrics(fi, cfg);
if (err)
goto failure;
if (cfg->fc_mp) {
#ifdef CONFIG_IP_ROUTE_MULTIPATH
......
......@@ -114,16 +114,19 @@ void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca)
}
EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control);
u32 tcp_ca_get_key_by_name(const char *name)
u32 tcp_ca_get_key_by_name(const char *name, bool *ecn_ca)
{
const struct tcp_congestion_ops *ca;
u32 key;
u32 key = TCP_CA_UNSPEC;
might_sleep();
rcu_read_lock();
ca = __tcp_ca_find_autoload(name);
key = ca ? ca->key : TCP_CA_UNSPEC;
if (ca) {
key = ca->key;
*ecn_ca = ca->flags & TCP_CONG_NEEDS_ECN;
}
rcu_read_unlock();
return key;
......
......@@ -6003,14 +6003,17 @@ static void tcp_ecn_create_request(struct request_sock *req,
const struct net *net = sock_net(listen_sk);
bool th_ecn = th->ece && th->cwr;
bool ect, ecn_ok;
u32 ecn_ok_dst;
if (!th_ecn)
return;
ect = !INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield);
ecn_ok = net->ipv4.sysctl_tcp_ecn || dst_feature(dst, RTAX_FEATURE_ECN);
ecn_ok_dst = dst_feature(dst, DST_FEATURE_ECN_MASK);
ecn_ok = net->ipv4.sysctl_tcp_ecn || ecn_ok_dst;
if ((!ect && ecn_ok) || tcp_ca_needs_ecn(listen_sk))
if ((!ect && ecn_ok) || tcp_ca_needs_ecn(listen_sk) ||
(ecn_ok_dst & DST_FEATURE_ECN_CA))
inet_rsk(req)->ecn_ok = 1;
}
......
......@@ -1698,6 +1698,7 @@ static int ip6_dst_gc(struct dst_ops *ops)
static int ip6_convert_metrics(struct mx6_config *mxc,
const struct fib6_config *cfg)
{
bool ecn_ca = false;
struct nlattr *nla;
int remaining;
u32 *mp;
......@@ -1711,30 +1712,36 @@ static int ip6_convert_metrics(struct mx6_config *mxc,
nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
int type = nla_type(nla);
if (type) {
u32 val;
if (!type)
continue;
if (unlikely(type > RTAX_MAX))
goto err;
if (type == RTAX_CC_ALGO) {
char tmp[TCP_CA_NAME_MAX];
nla_strlcpy(tmp, nla, sizeof(tmp));
val = tcp_ca_get_key_by_name(tmp);
val = tcp_ca_get_key_by_name(tmp, &ecn_ca);
if (val == TCP_CA_UNSPEC)
goto err;
} else {
val = nla_get_u32(nla);
}
if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
goto err;
mp[type - 1] = val;
__set_bit(type - 1, mxc->mx_valid);
}
if (ecn_ca) {
__set_bit(RTAX_FEATURES - 1, mxc->mx_valid);
mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
}
mxc->mx = mp;
return 0;
err:
kfree(mp);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment