Merge branch 'icmp-avoid-possible-side-channels-attacks'

Eric Dumazet says: ==================== icmp: avoid possible side-channels attacks Keyu Man reminded us that linux ICMP rate limiting was still allowing side-channels attacks. Quoting the fine document [1]: 4.4 Private Source Port Scan Method ... We can then use the same global ICMP rate limit as a side channel to infer if such an ICMP message has been triggered. At first glance, this method can work but at a low speed of one port per second, due to the per-IP rate limit on ICMP messages. Surprisingly, after we analyze the source code of the ICMP rate limit implementation, we find that the global rate limit is checked prior to the per-IP rate limit. This means that even if the per-IP rate limit may eventually determine that no ICMP reply should be sent, a packet is still subjected to the global rate limit check and one token is deducted. Ironically, such a decision is consciously made by Linux developers to avoid invoking the expensive check of the per-IP rate limit [ 22], involving a search process to locate the per-IP data structure. This effectively means that the per-IP rate limit can be disre- garded for the purpose of our side channel based scan, as it only determines if the final ICMP reply is generated but has nothing to do with the global rate limit counter decrement. As a result, we can continue to use roughly the same scan method as efficient as before, achieving 1,000 ports per second ... This series : 1) Changes the order of the two rate limiters to fix the issue. 2-3) Make the 'host-wide' rate limiter a per-netns one. [1] Link: https://dl.acm.org/doi/pdf/10.1145/3372297.3417280 ==================== Link: https://patch.msgid.link/20240829144641.3880376-1-edumazet@google.comSigned-off-by: Jakub Kicinski <kuba@kernel.org>

Merge branch 'icmp-avoid-possible-side-channels-attacks'
Eric Dumazet says: ==================== icmp: avoid possible side-channels attacks Keyu Man reminded us that linux ICMP rate limiting was still allowing side-channels attacks. Quoting the fine document [1]: 4.4 Private Source Port Scan Method ... We can then use the same global ICMP rate limit as a side channel to infer if such an ICMP message has been triggered. At first glance, this method can work but at a low speed of one port per second, due to the per-IP rate limit on ICMP messages. Surprisingly, after we analyze the source code of the ICMP rate limit implementation, we find that the global rate limit is checked prior to the per-IP rate limit. This means that even if the per-IP rate limit may eventually determine that no ICMP reply should be sent, a packet is still subjected to the global rate limit check and one token is deducted. Ironically, such a decision is consciously made by Linux developers to avoid invoking the expensive check of the per-IP rate limit [ 22], involving a search process to locate the per-IP data structure. This effectively means that the per-IP rate limit can be disre- garded for the purpose of our side channel based scan, as it only determines if the final ICMP reply is generated but has nothing to do with the global rate limit counter decrement. As a result, we can continue to use roughly the same scan method as efficient as before, achieving 1,000 ports per second ... This series : 1) Changes the order of the two rate limiters to fix the issue. 2-3) Make the 'host-wide' rate limiter a per-netns one. [1] Link: https://dl.acm.org/doi/pdf/10.1145/3372297.3417280 ==================== Link: https://patch.msgid.link/20240829144641.3880376-1-edumazet@google.comSigned-off-by: Jakub Kicinski <kuba@kernel.org>
789ed80a · Jakub Kicinski · b26b6449 · f17bf505 · 789ed80a · 789ed80a
Commit 789ed80a authored Aug 30, 2024 by Jakub Kicinski
5 changed files
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -794,9 +794,8 @@ static inline void ip_cmsg_recv(struct msghdr *msg, struct sk_buff *skb)
 	ip_cmsg_recv_offset(msg, skb->sk, skb, 0, 0);
 }
-bool icmp_global_allow(void);
+bool icmp_global_allow(struct net *net);
-extern int sysctl_icmp_msgs_per_sec;
+void icmp_global_consume(struct net *net);
-extern int sysctl_icmp_msgs_burst;
 #ifdef CONFIG_PROC_FS
 int ip_misc_proc_init(void);

--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -122,7 +122,10 @@ struct netns_ipv4 {
 	u8 sysctl_icmp_errors_use_inbound_ifaddr;
 	int sysctl_icmp_ratelimit;
 	int sysctl_icmp_ratemask;
+	int sysctl_icmp_msgs_per_sec;
+	int sysctl_icmp_msgs_burst;
+	atomic_t icmp_global_credit;
+	u32 icmp_global_stamp;
 	u32 ip_rt_min_pmtu;
 	int ip_rt_mtu_expires;
 	int ip_rt_min_advmss;

--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -220,61 +220,56 @@ static inline void icmp_xmit_unlock(struct sock *sk)
 	spin_unlock(&sk->sk_lock.slock);
 }
-int sysctl_icmp_msgs_per_sec __read_mostly = 1000;
-int sysctl_icmp_msgs_burst __read_mostly = 50;
-static struct {
-	spinlock_t	lock;
-	u32		credit;
-	u32		stamp;
-} icmp_global = {
-	.lock		= __SPIN_LOCK_UNLOCKED(icmp_global.lock),
-};
 /**
 * icmp_global_allow - Are we allowed to send one more ICMP message ?
+ * @net: network namespace
 *
 * Uses a token bucket to limit our ICMP messages to ~sysctl_icmp_msgs_per_sec.
 * Returns false if we reached the limit and can not send another packet.
- * Note: called with BH disabled
+ * Works in tandem with icmp_global_consume().
 */
-bool icmp_global_allow(void)
+bool icmp_global_allow(struct net *net)
 {
-	u32 credit, delta, incr = 0, now = (u32)jiffies;
+	u32 delta, now, oldstamp;
-	bool rc = false;
+	int incr, new, old;
-	/* Check if token bucket is empty and cannot be refilled
+	/* Note: many cpus could find this condition true.
-	 * without taking the spinlock. The READ_ONCE() are paired
+	 * Then later icmp_global_consume() could consume more credits,
-	 * with the following WRITE_ONCE() in this same function.
+	 * this is an acceptable race.
 	 */
-	if (!READ_ONCE(icmp_global.credit)) {
+	if (atomic_read(&net->ipv4.icmp_global_credit) > 0)
-		delta = min_t(u32, now - READ_ONCE(icmp_global.stamp), HZ);
+		return true;
-		if (delta < HZ / 50)
-			return false;
-	}
-	spin_lock(&icmp_global.lock);
+	now = jiffies;
-	delta = min_t(u32, now - icmp_global.stamp, HZ);
+	oldstamp = READ_ONCE(net->ipv4.icmp_global_stamp);
-	if (delta >= HZ / 50) {
+	delta = min_t(u32, now - oldstamp, HZ);
-		incr = READ_ONCE(sysctl_icmp_msgs_per_sec) * delta / HZ;
+	if (delta < HZ / 50)
-		if (incr)
+		return false;
-			WRITE_ONCE(icmp_global.stamp, now);
-	}
+	incr = READ_ONCE(net->ipv4.sysctl_icmp_msgs_per_sec) * delta / HZ;
-	credit = min_t(u32, icmp_global.credit + incr,
+	if (!incr)
-		       READ_ONCE(sysctl_icmp_msgs_burst));
+		return false;
-	if (credit) {
-		/* We want to use a credit of one in average, but need to randomize
+	if (cmpxchg(&net->ipv4.icmp_global_stamp, oldstamp, now) == oldstamp) {
-		 * it for security reasons.
+		old = atomic_read(&net->ipv4.icmp_global_credit);
-		 */
+		do {
-		credit = max_t(int, credit - get_random_u32_below(3), 0);
+			new = min(old + incr, READ_ONCE(net->ipv4.sysctl_icmp_msgs_burst));
-		rc = true;
+		} while (!atomic_try_cmpxchg(&net->ipv4.icmp_global_credit, &old, new));
 	}
-	WRITE_ONCE(icmp_global.credit, credit);
+	return true;
-	spin_unlock(&icmp_global.lock);
-	return rc;
 }
 EXPORT_SYMBOL(icmp_global_allow);
+void icmp_global_consume(struct net *net)
+{
+	int credits = get_random_u32_below(3);
+	/* Note: this might make icmp_global.credit negative. */
+	if (credits)
+		atomic_sub(credits, &net->ipv4.icmp_global_credit);
+}
+EXPORT_SYMBOL(icmp_global_consume);
 static bool icmpv4_mask_allow(struct net *net, int type, int code)
 {
 	if (type > NR_ICMP_TYPES)
@@ -291,14 +286,16 @@ static bool icmpv4_mask_allow(struct net *net, int type, int code)
 	return false;
 }
-static bool icmpv4_global_allow(struct net *net, int type, int code)
+static bool icmpv4_global_allow(struct net *net, int type, int code,
+				bool *apply_ratelimit)
 {
 	if (icmpv4_mask_allow(net, type, code))
 		return true;
-	if (icmp_global_allow())
+	if (icmp_global_allow(net)) {
+		*apply_ratelimit = true;
 		return true;
+	}
 	__ICMP_INC_STATS(net, ICMP_MIB_RATELIMITGLOBAL);
 	return false;
 }
@@ -308,15 +305,16 @@ static bool icmpv4_global_allow(struct net *net, int type, int code)
 */
 static bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
-			       struct flowi4 *fl4, int type, int code)
+			       struct flowi4 *fl4, int type, int code,
+			       bool apply_ratelimit)
 {
 	struct dst_entry *dst = &rt->dst;
 	struct inet_peer *peer;
 	bool rc = true;
 	int vif;
-	if (icmpv4_mask_allow(net, type, code))
+	if (!apply_ratelimit)
-		goto out;
+		return true;
 	/* No rate limit on loopback */
 	if (dst->dev && (dst->dev->flags&IFF_LOOPBACK))
@@ -331,6 +329,8 @@ static bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
 out:
 	if (!rc)
 		__ICMP_INC_STATS(net, ICMP_MIB_RATELIMITHOST);
+	else
+		icmp_global_consume(net);
 	return rc;
 }
@@ -402,6 +402,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
 	struct ipcm_cookie ipc;
 	struct rtable *rt = skb_rtable(skb);
 	struct net *net = dev_net(rt->dst.dev);
+	bool apply_ratelimit = false;
 	struct flowi4 fl4;
 	struct sock *sk;
 	struct inet_sock *inet;
@@ -413,11 +414,11 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
 	if (ip_options_echo(net, &icmp_param->replyopts.opt.opt, skb))
 		return;
-	/* Needed by both icmp_global_allow and icmp_xmit_lock */
+	/* Needed by both icmpv4_global_allow and icmp_xmit_lock */
 	local_bh_disable();
-	/* global icmp_msgs_per_sec */
+	/* is global icmp_msgs_per_sec exhausted ? */
-	if (!icmpv4_global_allow(net, type, code))
+	if (!icmpv4_global_allow(net, type, code, &apply_ratelimit))
 		goto out_bh_enable;
 	sk = icmp_xmit_lock(net);
@@ -450,7 +451,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
 	rt = ip_route_output_key(net, &fl4);
 	if (IS_ERR(rt))
 		goto out_unlock;
-	if (icmpv4_xrlim_allow(net, rt, &fl4, type, code))
+	if (icmpv4_xrlim_allow(net, rt, &fl4, type, code, apply_ratelimit))
 		icmp_push_reply(sk, icmp_param, &fl4, &ipc, &rt);
 	ip_rt_put(rt);
 out_unlock:
@@ -596,6 +597,7 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info,
 	int room;
 	struct icmp_bxm icmp_param;
 	struct rtable *rt = skb_rtable(skb_in);
+	bool apply_ratelimit = false;
 	struct ipcm_cookie ipc;
 	struct flowi4 fl4;
 	__be32 saddr;
@@ -677,7 +679,7 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info,
 		}
 	}
-	/* Needed by both icmp_global_allow and icmp_xmit_lock */
+	/* Needed by both icmpv4_global_allow and icmp_xmit_lock */
 	local_bh_disable();
 	/* Check global sysctl_icmp_msgs_per_sec ratelimit, unless
@@ -685,7 +687,7 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info,
 	 * loopback, then peer ratelimit still work (in icmpv4_xrlim_allow)
 	 */
 	if (!(skb_in->dev && (skb_in->dev->flags&IFF_LOOPBACK)) &&
-	      !icmpv4_global_allow(net, type, code))
+	      !icmpv4_global_allow(net, type, code, &apply_ratelimit))
 		goto out_bh_enable;
 	sk = icmp_xmit_lock(net);
@@ -744,7 +746,7 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info,
 		goto out_unlock;
 	/* peer icmp_ratelimit */
-	if (!icmpv4_xrlim_allow(net, rt, &fl4, type, code))
+	if (!icmpv4_xrlim_allow(net, rt, &fl4, type, code, apply_ratelimit))
 		goto ende;
 	/* RFC says return as much as we can without exceeding 576 bytes. */
@@ -1487,6 +1489,8 @@ static int __net_init icmp_sk_init(struct net *net)
 	net->ipv4.sysctl_icmp_ratelimit = 1 * HZ;
 	net->ipv4.sysctl_icmp_ratemask = 0x1818;
 	net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr = 0;
+	net->ipv4.sysctl_icmp_msgs_per_sec = 1000;
+	net->ipv4.sysctl_icmp_msgs_burst = 50;
 	return 0;
 }

--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -600,22 +600,6 @@ static struct ctl_table ipv4_table[] = {
 		.mode		= 0444,
 		.proc_handler   = proc_tcp_available_ulp,
 	},
-	{
-		.procname	= "icmp_msgs_per_sec",
-		.data		= &sysctl_icmp_msgs_per_sec,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= SYSCTL_ZERO,
-	},
-	{
-		.procname	= "icmp_msgs_burst",
-		.data		= &sysctl_icmp_msgs_burst,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= SYSCTL_ZERO,
-	},
 	{
 		.procname	= "udp_mem",
 		.data		= &sysctl_udp_mem,
@@ -701,6 +685,22 @@ static struct ctl_table ipv4_net_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec
 	},
+	{
+		.procname	= "icmp_msgs_per_sec",
+		.data		= &init_net.ipv4.sysctl_icmp_msgs_per_sec,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ZERO,
+	},
+	{
+		.procname	= "icmp_msgs_burst",
+		.data		= &init_net.ipv4.sysctl_icmp_msgs_burst,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ZERO,
+	},
 	{
 		.procname	= "ping_group_range",
 		.data		= &init_net.ipv4.ping_group_range.range,

--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -175,14 +175,16 @@ static bool icmpv6_mask_allow(struct net *net, int type)
 	return false;
 }
-static bool icmpv6_global_allow(struct net *net, int type)
+static bool icmpv6_global_allow(struct net *net, int type,
+				bool *apply_ratelimit)
 {
 	if (icmpv6_mask_allow(net, type))
 		return true;
-	if (icmp_global_allow())
+	if (icmp_global_allow(net)) {
+		*apply_ratelimit = true;
 		return true;
+	}
 	__ICMP_INC_STATS(net, ICMP_MIB_RATELIMITGLOBAL);
 	return false;
 }
@@ -191,13 +193,13 @@ static bool icmpv6_global_allow(struct net *net, int type)
 * Check the ICMP output rate limit
 */
 static bool icmpv6_xrlim_allow(struct sock *sk, u8 type,
-			       struct flowi6 *fl6)
+			       struct flowi6 *fl6, bool apply_ratelimit)
 {
 	struct net *net = sock_net(sk);
 	struct dst_entry *dst;
 	bool res = false;
-	if (icmpv6_mask_allow(net, type))
+	if (!apply_ratelimit)
 		return true;
 	/*
@@ -228,6 +230,8 @@ static bool icmpv6_xrlim_allow(struct sock *sk, u8 type,
 	if (!res)
 		__ICMP6_INC_STATS(net, ip6_dst_idev(dst),
 				  ICMP6_MIB_RATELIMITHOST);
+	else
+		icmp_global_consume(net);
 	dst_release(dst);
 	return res;
 }
@@ -452,6 +456,7 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
 	struct net *net;
 	struct ipv6_pinfo *np;
 	const struct in6_addr *saddr = NULL;
+	bool apply_ratelimit = false;
 	struct dst_entry *dst;
 	struct icmp6hdr tmp_hdr;
 	struct flowi6 fl6;
@@ -533,11 +538,12 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
 		return;
 	}
-	/* Needed by both icmp_global_allow and icmpv6_xmit_lock */
+	/* Needed by both icmpv6_global_allow and icmpv6_xmit_lock */
 	local_bh_disable();
 	/* Check global sysctl_icmp_msgs_per_sec ratelimit */
-	if (!(skb->dev->flags & IFF_LOOPBACK) && !icmpv6_global_allow(net, type))
+	if (!(skb->dev->flags & IFF_LOOPBACK) &&
+	    !icmpv6_global_allow(net, type, &apply_ratelimit))
 		goto out_bh_enable;
 	mip6_addr_swap(skb, parm);
@@ -575,7 +581,7 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
 	np = inet6_sk(sk);
-	if (!icmpv6_xrlim_allow(sk, type, &fl6))
+	if (!icmpv6_xrlim_allow(sk, type, &fl6, apply_ratelimit))
 		goto out;
 	tmp_hdr.icmp6_type = type;
@@ -717,6 +723,7 @@ static enum skb_drop_reason icmpv6_echo_reply(struct sk_buff *skb)
 	struct ipv6_pinfo *np;
 	const struct in6_addr *saddr = NULL;
 	struct icmp6hdr *icmph = icmp6_hdr(skb);
+	bool apply_ratelimit = false;
 	struct icmp6hdr tmp_hdr;
 	struct flowi6 fl6;
 	struct icmpv6_msg msg;
@@ -781,8 +788,9 @@ static enum skb_drop_reason icmpv6_echo_reply(struct sk_buff *skb)
 		goto out;
 	/* Check the ratelimit */
-	if ((!(skb->dev->flags & IFF_LOOPBACK) && !icmpv6_global_allow(net, ICMPV6_ECHO_REPLY)) ||
+	if ((!(skb->dev->flags & IFF_LOOPBACK) &&
-	    !icmpv6_xrlim_allow(sk, ICMPV6_ECHO_REPLY, &fl6))
+	    !icmpv6_global_allow(net, ICMPV6_ECHO_REPLY, &apply_ratelimit)) ||
+	    !icmpv6_xrlim_allow(sk, ICMPV6_ECHO_REPLY, &fl6, apply_ratelimit))
 		goto out_dst_release;
 	idev = __in6_dev_get(skb->dev);