1. 07 Dec, 2023 11 commits
    • Ido Schimmel's avatar
      drop_monitor: Require 'CAP_SYS_ADMIN' when joining "events" group · e0378187
      Ido Schimmel authored
      The "NET_DM" generic netlink family notifies drop locations over the
      "events" multicast group. This is problematic since by default generic
      netlink allows non-root users to listen to these notifications.
      
      Fix by adding a new field to the generic netlink multicast group
      structure that when set prevents non-root users or root without the
      'CAP_SYS_ADMIN' capability (in the user namespace owning the network
      namespace) from joining the group. Set this field for the "events"
      group. Use 'CAP_SYS_ADMIN' rather than 'CAP_NET_ADMIN' because of the
      nature of the information that is shared over this group.
      
      Note that the capability check in this case will always be performed
      against the initial user namespace since the family is not netns aware
      and only operates in the initial network namespace.
      
      A new field is added to the structure rather than using the "flags"
      field because the existing field uses uAPI flags and it is inappropriate
      to add a new uAPI flag for an internal kernel check. In net-next we can
      rework the "flags" field to use internal flags and fold the new field
      into it. But for now, in order to reduce the amount of changes, add a
      new field.
      
      Since the information can only be consumed by root, mark the control
      plane operations that start and stop the tracing as root-only using the
      'GENL_ADMIN_PERM' flag.
      
      Tested using [1].
      
      Before:
      
       # capsh -- -c ./dm_repo
       # capsh --drop=cap_sys_admin -- -c ./dm_repo
      
      After:
      
       # capsh -- -c ./dm_repo
       # capsh --drop=cap_sys_admin -- -c ./dm_repo
       Failed to join "events" multicast group
      
      [1]
       $ cat dm.c
       #include <stdio.h>
       #include <netlink/genl/ctrl.h>
       #include <netlink/genl/genl.h>
       #include <netlink/socket.h>
      
       int main(int argc, char **argv)
       {
       	struct nl_sock *sk;
       	int grp, err;
      
       	sk = nl_socket_alloc();
       	if (!sk) {
       		fprintf(stderr, "Failed to allocate socket\n");
       		return -1;
       	}
      
       	err = genl_connect(sk);
       	if (err) {
       		fprintf(stderr, "Failed to connect socket\n");
       		return err;
       	}
      
       	grp = genl_ctrl_resolve_grp(sk, "NET_DM", "events");
       	if (grp < 0) {
       		fprintf(stderr,
       			"Failed to resolve \"events\" multicast group\n");
       		return grp;
       	}
      
       	err = nl_socket_add_memberships(sk, grp, NFNLGRP_NONE);
       	if (err) {
       		fprintf(stderr, "Failed to join \"events\" multicast group\n");
       		return err;
       	}
      
       	return 0;
       }
       $ gcc -I/usr/include/libnl3 -lnl-3 -lnl-genl-3 -o dm_repo dm.c
      
      Fixes: 9a8afc8d ("Network Drop Monitor: Adding drop monitor implementation & Netlink protocol")
      Reported-by: "The UK's National Cyber Security Centre (NCSC)" <security@ncsc.gov.uk>
      Signed-off-by: Ido Schimmel <idosch@nvidia.com>
      Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
      Reviewed-by: Jiri Pirko <jiri@nvidia.com>
      Link: https://lore.kernel.org/r/20231206213102.1824398-3-idosch@nvidia.com
      Signed-off-by: Jakub Kicinski <kuba@kernel.org>
      e0378187
    • Ido Schimmel's avatar
      psample: Require 'CAP_NET_ADMIN' when joining "packets" group · 44ec98ea
      Ido Schimmel authored
      The "psample" generic netlink family notifies sampled packets over the
      "packets" multicast group. This is problematic since by default generic
      netlink allows non-root users to listen to these notifications.
      
      Fix by marking the group with the 'GENL_UNS_ADMIN_PERM' flag. This will
      prevent non-root users or root without the 'CAP_NET_ADMIN' capability
      (in the user namespace owning the network namespace) from joining the
      group.
      
      Tested using [1].
      
      Before:
      
       # capsh -- -c ./psample_repo
       # capsh --drop=cap_net_admin -- -c ./psample_repo
      
      After:
      
       # capsh -- -c ./psample_repo
       # capsh --drop=cap_net_admin -- -c ./psample_repo
       Failed to join "packets" multicast group
      
      [1]
       $ cat psample.c
       #include <stdio.h>
       #include <netlink/genl/ctrl.h>
       #include <netlink/genl/genl.h>
       #include <netlink/socket.h>
      
       int join_grp(struct nl_sock *sk, const char *grp_name)
       {
       	int grp, err;
      
       	grp = genl_ctrl_resolve_grp(sk, "psample", grp_name);
       	if (grp < 0) {
       		fprintf(stderr, "Failed to resolve \"%s\" multicast group\n",
       			grp_name);
       		return grp;
       	}
      
       	err = nl_socket_add_memberships(sk, grp, NFNLGRP_NONE);
       	if (err) {
       		fprintf(stderr, "Failed to join \"%s\" multicast group\n",
       			grp_name);
       		return err;
       	}
      
       	return 0;
       }
      
       int main(int argc, char **argv)
       {
       	struct nl_sock *sk;
       	int err;
      
       	sk = nl_socket_alloc();
       	if (!sk) {
       		fprintf(stderr, "Failed to allocate socket\n");
       		return -1;
       	}
      
       	err = genl_connect(sk);
       	if (err) {
       		fprintf(stderr, "Failed to connect socket\n");
       		return err;
       	}
      
       	err = join_grp(sk, "config");
       	if (err)
       		return err;
      
       	err = join_grp(sk, "packets");
       	if (err)
       		return err;
      
       	return 0;
       }
       $ gcc -I/usr/include/libnl3 -lnl-3 -lnl-genl-3 -o psample_repo psample.c
      
      Fixes: 6ae0a628 ("net: Introduce psample, a new genetlink channel for packet sampling")
      Reported-by: "The UK's National Cyber Security Centre (NCSC)" <security@ncsc.gov.uk>
      Signed-off-by: Ido Schimmel <idosch@nvidia.com>
      Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
      Reviewed-by: Jiri Pirko <jiri@nvidia.com>
      Link: https://lore.kernel.org/r/20231206213102.1824398-2-idosch@nvidia.com
      Signed-off-by: Jakub Kicinski <kuba@kernel.org>
      44ec98ea
    • Jakub Kicinski's avatar
      Merge branch 'fixes-for-ktls' · 4a02609d
      Jakub Kicinski authored
      John Fastabend says:
      
      ====================
      Couple fixes for TLS and BPF interactions.
      ====================
      
      Link: https://lore.kernel.org/r/20231206232706.374377-1-john.fastabend@gmail.com
      Signed-off-by: Jakub Kicinski <kuba@kernel.org>
      4a02609d
    • John Fastabend's avatar
      bpf: sockmap, updating the sg structure should also update curr · bb9aefde
      John Fastabend authored
      Curr pointer should be updated when the sg structure is shifted.
      
      Fixes: 7246d8ed ("bpf: helper to pop data from messages")
      Signed-off-by: John Fastabend <john.fastabend@gmail.com>
      Link: https://lore.kernel.org/r/20231206232706.374377-3-john.fastabend@gmail.com
      Signed-off-by: Jakub Kicinski <kuba@kernel.org>
      bb9aefde
    • John Fastabend's avatar
      net: tls, update curr on splice as well · c5a59500
      John Fastabend authored
      The curr pointer must also be updated on the splice similar to how
      we do this for other copy types.
      
      Fixes: d829e9c4 ("tls: convert to generic sk_msg interface")
      Signed-off-by: John Fastabend <john.fastabend@gmail.com>
      Reported-by: Jann Horn <jannh@google.com>
      Link: https://lore.kernel.org/r/20231206232706.374377-2-john.fastabend@gmail.com
      Signed-off-by: Jakub Kicinski <kuba@kernel.org>
      c5a59500
    • Jakub Kicinski's avatar
      Merge tag 'nf-23-12-06' of git://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf · 4de75d3e
      Jakub Kicinski authored
      Pablo Neira Ayuso says:
      
      ====================
      Netfilter fixes for net
      
      The following patchset contains Netfilter fixes for net:
      
      1) Incorrect nf_defrag registration for bpf link infra, from D. Wythe.
      
      2) Skip inactive elements in pipapo set backend walk to avoid double
         deactivation, from Florian Westphal.
      
      3) Fix NFT_*_F_PRESENT check with big endian arch, also from Florian.
      
      4) Bail out if number of expressions in NFTA_DYNSET_EXPRESSIONS mismatch
         stateful expressions in set declaration.
      
      5) Honor family in table lookup by handle. Broken since 4.16.
      
      6) Use sk_callback_lock to protect access to sk->sk_socket in xt_owner.
         sock_orphan() might zap this pointer, from Phil Sutter.
      
      All of these fixes address broken stuff for several releases.
      
      * tag 'nf-23-12-06' of git://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf:
        netfilter: xt_owner: Fix for unsafe access of sk->sk_socket
        netfilter: nf_tables: validate family when identifying table via handle
        netfilter: nf_tables: bail out on mismatching dynset and set expressions
        netfilter: nf_tables: fix 'exist' matching on bigendian arches
        netfilter: nft_set_pipapo: skip inactive elements during set walk
        netfilter: bpf: fix bad registration on nf_defrag
      ====================
      
      Link: https://lore.kernel.org/r/20231206180357.959930-1-pablo@netfilter.org
      Signed-off-by: Jakub Kicinski <kuba@kernel.org>
      4de75d3e
    • Jakub Kicinski's avatar
      Merge tag 'for-netdev' of https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf · c85e5594
      Jakub Kicinski authored
      Daniel Borkmann says:
      
      ====================
      pull-request: bpf 2023-12-06
      
      We've added 4 non-merge commits during the last 6 day(s) which contain
      a total of 7 files changed, 185 insertions(+), 55 deletions(-).
      
      The main changes are:
      
      1) Fix race found by syzkaller on prog_array_map_poke_run when
         a BPF program's kallsym symbols were still missing, from Jiri Olsa.
      
      2) Fix BPF verifier's branch offset comparison for BPF_JMP32 | BPF_JA,
         from Yonghong Song.
      
      3) Fix xsk's poll handling to only set mask on bound xsk sockets,
         from Yewon Choi.
      
      * tag 'for-netdev' of https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf:
        selftests/bpf: Add test for early update in prog_array_map_poke_run
        bpf: Fix prog_array_map_poke_run map poke update
        xsk: Skip polling event check for unbound socket
        bpf: Fix a verifier bug due to incorrect branch offset comparison with cpu=v4
      ====================
      
      Link: https://lore.kernel.org/r/20231206220528.12093-1-daniel@iogearbox.net
      Signed-off-by: Jakub Kicinski <kuba@kernel.org>
      c85e5594
    • Hui Zhou's avatar
      nfp: flower: fix for take a mutex lock in soft irq context and rcu lock · 0ad722bd
      Hui Zhou authored
      The neighbour event callback calls the function nfp_tun_write_neigh,
      which takes a mutex lock while running in soft irq context. Use a
      work queue to process the neighbour event instead.
      
      Move the nfp_tun_write_neigh call out of the rcu_read_lock/unlock()
      section in the functions nfp_tunnel_request_route_v4 and
      nfp_tunnel_request_route_v6.
      
      Fixes: abc21095 ("nfp: flower: tunnel neigh support bond offload")
      CC: stable@vger.kernel.org # 6.2+
      Signed-off-by: Hui Zhou <hui.zhou@corigine.com>
      Signed-off-by: Louis Peens <louis.peens@corigine.com>
      Signed-off-by: David S. Miller <davem@davemloft.net>
      0ad722bd
    • Jakub Kicinski's avatar
      Merge branch '100GbE' of git://git.kernel.org/pub/scm/linux/kernel/git/tnguy/net-queue · 803a809d
      Jakub Kicinski authored
      Tony Nguyen says:
      
      ====================
      Intel Wired LAN Driver Updates 2023-12-05 (ice, i40e, iavf)
      
      This series contains updates to ice, i40e and iavf drivers.
      
      Michal fixes incorrect usage of VF MSIX value and index calculation for
      ice.
      
      Marcin restores disabling of Rx VLAN filtering which was inadvertently
      removed for ice.
      
      Ivan Vecera corrects improper messaging of MFS port for i40e.
      
      Jake fixes incorrect checking of coalesce values on iavf.
      
      * '100GbE' of git://git.kernel.org/pub/scm/linux/kernel/git/tnguy/net-queue:
        iavf: validate tx_coalesce_usecs even if rx_coalesce_usecs is zero
        i40e: Fix unexpected MFS warning message
        ice: Restore fix disabling RX VLAN filtering
        ice: change vfs.num_msix_per to vf->num_msix
      ====================
      
      Link: https://lore.kernel.org/r/20231205211918.2123019-1-anthony.l.nguyen@intel.com
      Signed-off-by: Jakub Kicinski <kuba@kernel.org>
      803a809d
    • Tobias Waldekranz's avatar
      net: dsa: mv88e6xxx: Restore USXGMII support for 6393X · 0c7ed1f9
      Tobias Waldekranz authored
      In 4a562127, USXGMII support was added for 6393X, but this was
      lost in the PCS conversion (the blamed commit), most likely because
      these efforts were more or less done in parallel.
      
      Restore this feature by porting Michal's patch to fit the new
      implementation.
      Reviewed-by: Florian Fainelli <florian.fainelli@broadcom.com>
      Tested-by: Michal Smulski <michal.smulski@ooma.com>
      Reviewed-by: Vladimir Oltean <vladimir.oltean@nxp.com>
      Fixes: e5b732a2 ("net: dsa: mv88e6xxx: convert 88e639x to phylink_pcs")
      Signed-off-by: Tobias Waldekranz <tobias@waldekranz.com>
      Link: https://lore.kernel.org/r/20231205221359.3926018-1-tobias@waldekranz.com
      Signed-off-by: Jakub Kicinski <kuba@kernel.org>
      0c7ed1f9
    • Eric Dumazet's avatar
      tcp: do not accept ACK of bytes we never sent · 3d501dd3
      Eric Dumazet authored
      This patch is based on a detailed report and ideas from Yepeng Pan
      and Christian Rossow.
      
      ACK seq validation is currently following RFC 5961 5.2 guidelines:
      
         The ACK value is considered acceptable only if
         it is in the range of ((SND.UNA - MAX.SND.WND) <= SEG.ACK <=
         SND.NXT).  All incoming segments whose ACK value doesn't satisfy the
         above condition MUST be discarded and an ACK sent back.  It needs to
         be noted that RFC 793 on page 72 (fifth check) says: "If the ACK is a
         duplicate (SEG.ACK < SND.UNA), it can be ignored.  If the ACK
         acknowledges something not yet sent (SEG.ACK > SND.NXT) then send an
         ACK, drop the segment, and return".  The "ignored" above implies that
         the processing of the incoming data segment continues, which means
         the ACK value is treated as acceptable.  This mitigation makes the
         ACK check more stringent since any ACK < SND.UNA wouldn't be
         accepted, instead only ACKs that are in the range ((SND.UNA -
         MAX.SND.WND) <= SEG.ACK <= SND.NXT) get through.
      
      This can be refined for new (and possibly spoofed) flows,
      by not accepting ACK for bytes that were never sent.
      
      This greatly improves TCP security at a little cost.
      
      I added a Fixes: tag to make sure this patch will reach stable trees,
      even if the 'blamed' patch was adhering to the RFC.
      
      tp->bytes_acked was added in linux-4.2
      
      Following packetdrill test (courtesy of Yepeng Pan) shows
      the issue at hand:
      
      0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
      +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
      +0 bind(3, ..., ...) = 0
      +0 listen(3, 1024) = 0
      
      // ---------------- Handshake ------------------- //
      
      // When window scale is set to 14 the window size can be extended to
      // 65535 * (2^14) = 1073725440. Linux would accept an ACK packet
      // with ack number in (Server_ISN+1-1073725440, Server_ISN+1),
      // even though this ack number acknowledges some data never
      // sent by the server.
      
      +0 < S 0:0(0) win 65535 <mss 1400,nop,wscale 14>
      +0 > S. 0:0(0) ack 1 <...>
      +0 < . 1:1(0) ack 1 win 65535
      +0 accept(3, ..., ...) = 4
      
      // For the established connection, we send an ACK packet,
      // the ack packet uses ack number 1 - 1073725300 + 2^32,
      // where 2^32 is used to wrap around.
      // Note: we used 1073725300 instead of 1073725440 to avoid possible
      // edge cases.
      // 1 - 1073725300 + 2^32 = 3221241997
      
      // Oops, old kernels happily accept this packet.
      +0 < . 1:1001(1000) ack 3221241997 win 65535
      
      // After the kernel fix the following will be replaced by a challenge ACK,
      // and prior malicious frame would be dropped.
      +0 > . 1:1(0) ack 1001
      
      Fixes: 354e4aa3 ("tcp: RFC 5961 5.2 Blind Data Injection Attack Mitigation")
      Signed-off-by: Eric Dumazet <edumazet@google.com>
      Reported-by: Yepeng Pan <yepeng.pan@cispa.de>
      Reported-by: Christian Rossow <rossow@cispa.de>
      Acked-by: Neal Cardwell <ncardwell@google.com>
      Link: https://lore.kernel.org/r/20231205161841.2702925-1-edumazet@google.com
      Signed-off-by: Jakub Kicinski <kuba@kernel.org>
      3d501dd3
  2. 06 Dec, 2023 29 commits