1. 21 Oct, 2017 1 commit
    • Eric Dumazet's avatar
      tcp/dccp: fix ireq->opt races · c92e8c02
      Eric Dumazet authored
      syzkaller found another bug in DCCP/TCP stacks [1]
      
      For the reasons explained in commit ce105008 ("tcp/dccp: fix
      ireq->pktopts race"), we need to make sure we do not access
      ireq->opt unless we own the request sock.
      
      Note the opt field is renamed to ireq_opt to ease grep games.
      
      [1]
      BUG: KASAN: use-after-free in ip_queue_xmit+0x1687/0x18e0 net/ipv4/ip_output.c:474
      Read of size 1 at addr ffff8801c951039c by task syz-executor5/3295
      
      CPU: 1 PID: 3295 Comm: syz-executor5 Not tainted 4.14.0-rc4+ #80
      Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
      Call Trace:
       __dump_stack lib/dump_stack.c:16 [inline]
       dump_stack+0x194/0x257 lib/dump_stack.c:52
       print_address_description+0x73/0x250 mm/kasan/report.c:252
       kasan_report_error mm/kasan/report.c:351 [inline]
       kasan_report+0x25b/0x340 mm/kasan/report.c:409
       __asan_report_load1_noabort+0x14/0x20 mm/kasan/report.c:427
       ip_queue_xmit+0x1687/0x18e0 net/ipv4/ip_output.c:474
       tcp_transmit_skb+0x1ab7/0x3840 net/ipv4/tcp_output.c:1135
       tcp_send_ack.part.37+0x3bb/0x650 net/ipv4/tcp_output.c:3587
       tcp_send_ack+0x49/0x60 net/ipv4/tcp_output.c:3557
       __tcp_ack_snd_check+0x2c6/0x4b0 net/ipv4/tcp_input.c:5072
       tcp_ack_snd_check net/ipv4/tcp_input.c:5085 [inline]
       tcp_rcv_state_process+0x2eff/0x4850 net/ipv4/tcp_input.c:6071
       tcp_child_process+0x342/0x990 net/ipv4/tcp_minisocks.c:816
       tcp_v4_rcv+0x1827/0x2f80 net/ipv4/tcp_ipv4.c:1682
       ip_local_deliver_finish+0x2e2/0xba0 net/ipv4/ip_input.c:216
       NF_HOOK include/linux/netfilter.h:249 [inline]
       ip_local_deliver+0x1ce/0x6e0 net/ipv4/ip_input.c:257
       dst_input include/net/dst.h:464 [inline]
       ip_rcv_finish+0x887/0x19a0 net/ipv4/ip_input.c:397
       NF_HOOK include/linux/netfilter.h:249 [inline]
       ip_rcv+0xc3f/0x1820 net/ipv4/ip_input.c:493
       __netif_receive_skb_core+0x1a3e/0x34b0 net/core/dev.c:4476
       __netif_receive_skb+0x2c/0x1b0 net/core/dev.c:4514
       netif_receive_skb_internal+0x10b/0x670 net/core/dev.c:4587
       netif_receive_skb+0xae/0x390 net/core/dev.c:4611
       tun_rx_batched.isra.50+0x5ed/0x860 drivers/net/tun.c:1372
       tun_get_user+0x249c/0x36d0 drivers/net/tun.c:1766
       tun_chr_write_iter+0xbf/0x160 drivers/net/tun.c:1792
       call_write_iter include/linux/fs.h:1770 [inline]
       new_sync_write fs/read_write.c:468 [inline]
       __vfs_write+0x68a/0x970 fs/read_write.c:481
       vfs_write+0x18f/0x510 fs/read_write.c:543
       SYSC_write fs/read_write.c:588 [inline]
       SyS_write+0xef/0x220 fs/read_write.c:580
       entry_SYSCALL_64_fastpath+0x1f/0xbe
      RIP: 0033:0x40c341
      RSP: 002b:00007f469523ec10 EFLAGS: 00000293 ORIG_RAX: 0000000000000001
      RAX: ffffffffffffffda RBX: 0000000000718000 RCX: 000000000040c341
      RDX: 0000000000000037 RSI: 0000000020004000 RDI: 0000000000000015
      RBP: 0000000000000086 R08: 0000000000000000 R09: 0000000000000000
      R10: 00000000000f4240 R11: 0000000000000293 R12: 00000000004b7fd1
      R13: 00000000ffffffff R14: 0000000020000000 R15: 0000000000025000
      
      Allocated by task 3295:
       save_stack_trace+0x16/0x20 arch/x86/kernel/stacktrace.c:59
       save_stack+0x43/0xd0 mm/kasan/kasan.c:447
       set_track mm/kasan/kasan.c:459 [inline]
       kasan_kmalloc+0xad/0xe0 mm/kasan/kasan.c:551
       __do_kmalloc mm/slab.c:3725 [inline]
       __kmalloc+0x162/0x760 mm/slab.c:3734
       kmalloc include/linux/slab.h:498 [inline]
       tcp_v4_save_options include/net/tcp.h:1962 [inline]
       tcp_v4_init_req+0x2d3/0x3e0 net/ipv4/tcp_ipv4.c:1271
       tcp_conn_request+0xf6d/0x3410 net/ipv4/tcp_input.c:6283
       tcp_v4_conn_request+0x157/0x210 net/ipv4/tcp_ipv4.c:1313
       tcp_rcv_state_process+0x8ea/0x4850 net/ipv4/tcp_input.c:5857
       tcp_v4_do_rcv+0x55c/0x7d0 net/ipv4/tcp_ipv4.c:1482
       tcp_v4_rcv+0x2d10/0x2f80 net/ipv4/tcp_ipv4.c:1711
       ip_local_deliver_finish+0x2e2/0xba0 net/ipv4/ip_input.c:216
       NF_HOOK include/linux/netfilter.h:249 [inline]
       ip_local_deliver+0x1ce/0x6e0 net/ipv4/ip_input.c:257
       dst_input include/net/dst.h:464 [inline]
       ip_rcv_finish+0x887/0x19a0 net/ipv4/ip_input.c:397
       NF_HOOK include/linux/netfilter.h:249 [inline]
       ip_rcv+0xc3f/0x1820 net/ipv4/ip_input.c:493
       __netif_receive_skb_core+0x1a3e/0x34b0 net/core/dev.c:4476
       __netif_receive_skb+0x2c/0x1b0 net/core/dev.c:4514
       netif_receive_skb_internal+0x10b/0x670 net/core/dev.c:4587
       netif_receive_skb+0xae/0x390 net/core/dev.c:4611
       tun_rx_batched.isra.50+0x5ed/0x860 drivers/net/tun.c:1372
       tun_get_user+0x249c/0x36d0 drivers/net/tun.c:1766
       tun_chr_write_iter+0xbf/0x160 drivers/net/tun.c:1792
       call_write_iter include/linux/fs.h:1770 [inline]
       new_sync_write fs/read_write.c:468 [inline]
       __vfs_write+0x68a/0x970 fs/read_write.c:481
       vfs_write+0x18f/0x510 fs/read_write.c:543
       SYSC_write fs/read_write.c:588 [inline]
       SyS_write+0xef/0x220 fs/read_write.c:580
       entry_SYSCALL_64_fastpath+0x1f/0xbe
      
      Freed by task 3306:
       save_stack_trace+0x16/0x20 arch/x86/kernel/stacktrace.c:59
       save_stack+0x43/0xd0 mm/kasan/kasan.c:447
       set_track mm/kasan/kasan.c:459 [inline]
       kasan_slab_free+0x71/0xc0 mm/kasan/kasan.c:524
       __cache_free mm/slab.c:3503 [inline]
       kfree+0xca/0x250 mm/slab.c:3820
       inet_sock_destruct+0x59d/0x950 net/ipv4/af_inet.c:157
       __sk_destruct+0xfd/0x910 net/core/sock.c:1560
       sk_destruct+0x47/0x80 net/core/sock.c:1595
       __sk_free+0x57/0x230 net/core/sock.c:1603
       sk_free+0x2a/0x40 net/core/sock.c:1614
       sock_put include/net/sock.h:1652 [inline]
       inet_csk_complete_hashdance+0xd5/0xf0 net/ipv4/inet_connection_sock.c:959
       tcp_check_req+0xf4d/0x1620 net/ipv4/tcp_minisocks.c:765
       tcp_v4_rcv+0x17f6/0x2f80 net/ipv4/tcp_ipv4.c:1675
       ip_local_deliver_finish+0x2e2/0xba0 net/ipv4/ip_input.c:216
       NF_HOOK include/linux/netfilter.h:249 [inline]
       ip_local_deliver+0x1ce/0x6e0 net/ipv4/ip_input.c:257
       dst_input include/net/dst.h:464 [inline]
       ip_rcv_finish+0x887/0x19a0 net/ipv4/ip_input.c:397
       NF_HOOK include/linux/netfilter.h:249 [inline]
       ip_rcv+0xc3f/0x1820 net/ipv4/ip_input.c:493
       __netif_receive_skb_core+0x1a3e/0x34b0 net/core/dev.c:4476
       __netif_receive_skb+0x2c/0x1b0 net/core/dev.c:4514
       netif_receive_skb_internal+0x10b/0x670 net/core/dev.c:4587
       netif_receive_skb+0xae/0x390 net/core/dev.c:4611
       tun_rx_batched.isra.50+0x5ed/0x860 drivers/net/tun.c:1372
       tun_get_user+0x249c/0x36d0 drivers/net/tun.c:1766
       tun_chr_write_iter+0xbf/0x160 drivers/net/tun.c:1792
       call_write_iter include/linux/fs.h:1770 [inline]
       new_sync_write fs/read_write.c:468 [inline]
       __vfs_write+0x68a/0x970 fs/read_write.c:481
       vfs_write+0x18f/0x510 fs/read_write.c:543
       SYSC_write fs/read_write.c:588 [inline]
       SyS_write+0xef/0x220 fs/read_write.c:580
       entry_SYSCALL_64_fastpath+0x1f/0xbe
      
      Fixes: e994b2f0 ("tcp: do not lock listener to process SYN packets")
      Fixes: 079096f1 ("tcp/dccp: install syn_recv requests into ehash table")
      Signed-off-by: default avatarEric Dumazet <edumazet@google.com>
      Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
      c92e8c02
  2. 20 Oct, 2017 7 commits
  3. 19 Oct, 2017 9 commits
    • Xin Long's avatar
      sctp: do not peel off an assoc from one netns to another one · df80cd9b
      Xin Long authored
      Now when peeling off an association to the sock in another netns, all
      transports in this assoc are not to be rehashed and keep use the old
      key in hashtable.
      
      As a transport uses sk->net as the hash key to insert into hashtable,
      it would miss removing these transports from hashtable due to the new
      netns when closing the sock and all transports are being freeed, then
      later an use-after-free issue could be caused when looking up an asoc
      and dereferencing those transports.
      
      This is a very old issue since very beginning, ChunYu found it with
      syzkaller fuzz testing with this series:
      
        socket$inet6_sctp()
        bind$inet6()
        sendto$inet6()
        unshare(0x40000000)
        getsockopt$inet_sctp6_SCTP_GET_ASSOC_ID_LIST()
        getsockopt$inet_sctp6_SCTP_SOCKOPT_PEELOFF()
      
      This patch is to block this call when peeling one assoc off from one
      netns to another one, so that the netns of all transport would not
      go out-sync with the key in hashtable.
      
      Note that this patch didn't fix it by rehashing transports, as it's
      difficult to handle the situation when the tuple is already in use
      in the new netns. Besides, no one would like to peel off one assoc
      to another netns, considering ipaddrs, ifaces, etc. are usually
      different.
      Reported-by: default avatarChunYu Wang <chunwang@redhat.com>
      Signed-off-by: default avatarXin Long <lucien.xin@gmail.com>
      Acked-by: default avatarMarcelo Ricardo Leitner <marcelo.leitner@gmail.com>
      Acked-by: default avatarNeil Horman <nhorman@tuxdriver.com>
      Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
      df80cd9b
    • David S. Miller's avatar
      Merge branch 'bpf-Fix-for-BPF-devmap-percpu-allocation-splat' · 4bbb5083
      David S. Miller authored
      Daniel Borkmann says:
      
      ====================
      bpf: Fix for BPF devmap percpu allocation splat
      
      The set fixes a splat in devmap percpu allocation when we alloc
      the flush bitmap. Patch 1 is a prerequisite for the fix in patch 2,
      patch 1 is rather small, so if this could be routed via -net, for
      example, with Tejun's Ack that would be good. Patch 3 gets rid of
      remaining PCPU_MIN_UNIT_SIZE checks, which are percpu allocator
      internals and should not be used.
      ====================
      Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
      4bbb5083
    • Daniel Borkmann's avatar
      bpf: do not test for PCPU_MIN_UNIT_SIZE before percpu allocations · bc6d5031
      Daniel Borkmann authored
      PCPU_MIN_UNIT_SIZE is an implementation detail of the percpu
      allocator. Given we support __GFP_NOWARN now, lets just let
      the allocation request fail naturally instead. The two call
      sites from BPF mistakenly assumed __GFP_NOWARN would work, so
      no changes needed to their actual __alloc_percpu_gfp() calls
      which use the flag already.
      Signed-off-by: default avatarDaniel Borkmann <daniel@iogearbox.net>
      Acked-by: default avatarAlexei Starovoitov <ast@kernel.org>
      Acked-by: default avatarJohn Fastabend <john.fastabend@gmail.com>
      Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
      bc6d5031
    • Daniel Borkmann's avatar
      bpf: fix splat for illegal devmap percpu allocation · 82f8dd28
      Daniel Borkmann authored
      It was reported that syzkaller was able to trigger a splat on
      devmap percpu allocation due to illegal/unsupported allocation
      request size passed to __alloc_percpu():
      
        [   70.094249] illegal size (32776) or align (8) for percpu allocation
        [   70.094256] ------------[ cut here ]------------
        [   70.094259] WARNING: CPU: 3 PID: 3451 at mm/percpu.c:1365 pcpu_alloc+0x96/0x630
        [...]
        [   70.094325] Call Trace:
        [   70.094328]  __alloc_percpu_gfp+0x12/0x20
        [   70.094330]  dev_map_alloc+0x134/0x1e0
        [   70.094331]  SyS_bpf+0x9bc/0x1610
        [   70.094333]  ? selinux_task_setrlimit+0x5a/0x60
        [   70.094334]  ? security_task_setrlimit+0x43/0x60
        [   70.094336]  entry_SYSCALL_64_fastpath+0x1a/0xa5
      
      This was due to too large max_entries for the map such that we
      surpassed the upper limit of PCPU_MIN_UNIT_SIZE. It's fine to
      fail naturally here, so switch to __alloc_percpu_gfp() and pass
      __GFP_NOWARN instead.
      
      Fixes: 11393cc9 ("xdp: Add batching support to redirect map")
      Reported-by: default avatarMark Rutland <mark.rutland@arm.com>
      Reported-by: default avatarShankara Pailoor <sp3485@columbia.edu>
      Reported-by: default avatarRichard Weinberger <richard@nod.at>
      Signed-off-by: default avatarDaniel Borkmann <daniel@iogearbox.net>
      Cc: John Fastabend <john.fastabend@gmail.com>
      Acked-by: default avatarAlexei Starovoitov <ast@kernel.org>
      Acked-by: default avatarJohn Fastabend <john.fastabend@gmail.com>
      Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
      82f8dd28
    • Daniel Borkmann's avatar
      mm, percpu: add support for __GFP_NOWARN flag · 0ea7eeec
      Daniel Borkmann authored
      Add an option for pcpu_alloc() to support __GFP_NOWARN flag.
      Currently, we always throw a warning when size or alignment
      is unsupported (and also dump stack on failed allocation
      requests). The warning itself is harmless since we return
      NULL anyway for any failed request, which callers are
      required to handle anyway. However, it becomes harmful when
      panic_on_warn is set.
      
      The rationale for the WARN() in pcpu_alloc() is that it can
      be tracked when larger than supported allocation requests are
      made such that allocations limits can be tweaked if warranted.
      This makes sense for in-kernel users, however, there are users
      of pcpu allocator where allocation size is derived from user
      space requests, e.g. when creating BPF maps. In these cases,
      the requests should fail gracefully without throwing a splat.
      
      The current work-around was to check allocation size against
      the upper limit of PCPU_MIN_UNIT_SIZE from call-sites for
      bailing out prior to a call to pcpu_alloc() in order to
      avoid throwing the WARN(). This is bad in multiple ways since
      PCPU_MIN_UNIT_SIZE is an implementation detail, and having
      the checks on call-sites only complicates the code for no
      good reason. Thus, lets fix it generically by supporting the
      __GFP_NOWARN flag that users can then use with calling the
      __alloc_percpu_gfp() helper instead.
      Signed-off-by: default avatarDaniel Borkmann <daniel@iogearbox.net>
      Cc: Tejun Heo <tj@kernel.org>
      Cc: Mark Rutland <mark.rutland@arm.com>
      Acked-by: default avatarAlexei Starovoitov <ast@kernel.org>
      Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
      0ea7eeec
    • David S. Miller's avatar
      Merge branch 'ena-fixes' · 3fd3b03b
      David S. Miller authored
      Netanel Belgazal says:
      
      ====================
      ENA ethernet driver bug fixes
      
      Some fixes for ENA ethernet driver
      ====================
      Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
      3fd3b03b
    • Netanel Belgazal's avatar
      net: ena: fix wrong max Tx/Rx queues on ethtool · a59df396
      Netanel Belgazal authored
      ethtool ena_get_channels() expose the max number of queues as the max
      number of queues ENA supports (128 queues) and not the actual number
      of created queues.
      Signed-off-by: default avatarNetanel Belgazal <netanel@amazon.com>
      Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
      a59df396
    • Netanel Belgazal's avatar
      net: ena: fix rare kernel crash when bar memory remap fails · 411838e7
      Netanel Belgazal authored
      This failure is rare and only found on testing where deliberately fail
      devm_ioremap()
      
      [  451.170464] ena 0000:04:00.0: failed to remap regs bar
      451.170549] Workqueue: pciehp-1 pciehp_power_thread
      [  451.170551] task: ffff88085a5f2d00 task.stack: ffffc9000756c000
      [  451.170552] RIP: 0010:devm_iounmap+0x2d/0x40
      [  451.170553] RSP: 0018:ffffc9000756fac0 EFLAGS: 00010282
      [  451.170554] RAX: 00000000fffffffe RBX: 0000000000000000 RCX:
      0000000000000000
      [  451.170555] RDX: ffffffff813a7e00 RSI: 0000000000000282 RDI:
      0000000000000282
      [  451.170556] RBP: ffffc9000756fac8 R08: 00000000fffffffe R09:
      00000000000009b7
      [  451.170557] R10: 0000000000000005 R11: 00000000000009b6 R12:
      ffff880856c9d0a0
      [  451.170558] R13: ffffc9000f5c90c0 R14: ffff880856c9d0a0 R15:
      0000000000000028
      [  451.170559] FS:  0000000000000000(0000) GS:ffff88085f400000(0000)
      knlGS:0000000000000000
      [  451.170560] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
      [  451.170561] CR2: 00007f169038b000 CR3: 0000000001c09000 CR4:
      00000000003406f0
      [  451.170562] DR0: 0000000000000000 DR1: 0000000000000000 DR2:
      0000000000000000
      [  451.170562] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7:
      0000000000000400
      [  451.170563] Call Trace:
      [  451.170572]  ena_release_bars.isra.48+0x34/0x60 [ena]
      [  451.170574]  ena_probe+0x144/0xd90 [ena]
      [  451.170579]  ? ida_simple_get+0x98/0x100
      [  451.170585]  ? kernfs_next_descendant_post+0x40/0x50
      [  451.170591]  local_pci_probe+0x45/0xa0
      [  451.170592]  pci_device_probe+0x157/0x180
      [  451.170599]  driver_probe_device+0x2a8/0x460
      [  451.170600]  __device_attach_driver+0x7e/0xe0
      [  451.170602]  ? driver_allows_async_probing+0x30/0x30
      [  451.170603]  bus_for_each_drv+0x68/0xb0
      [  451.170605]  __device_attach+0xdd/0x160
      [  451.170607]  device_attach+0x10/0x20
      [  451.170610]  pci_bus_add_device+0x4f/0xa0
      [  451.170611]  pci_bus_add_devices+0x39/0x70
      [  451.170613]  pciehp_configure_device+0x96/0x120
      [  451.170614]  pciehp_enable_slot+0x1b3/0x290
      [  451.170616]  pciehp_power_thread+0x3b/0xb0
      [  451.170622]  process_one_work+0x149/0x360
      [  451.170623]  worker_thread+0x4d/0x3c0
      [  451.170626]  kthread+0x109/0x140
      [  451.170627]  ? rescuer_thread+0x380/0x380
      [  451.170628]  ? kthread_park+0x60/0x60
      [  451.170632]  ret_from_fork+0x25/0x30
      Signed-off-by: default avatarNetanel Belgazal <netanel@amazon.com>
      Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
      411838e7
    • Netanel Belgazal's avatar
      net: ena: reduce the severity of some printouts · cd7aea18
      Netanel Belgazal authored
      Decrease log level of checksum errors as these messages can be
      triggered remotely by bad packets.
      Signed-off-by: default avatarNetanel Belgazal <netanel@amazon.com>
      Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
      cd7aea18
  4. 18 Oct, 2017 4 commits
    • Jakub Kicinski's avatar
      bpf: disallow arithmetic operations on context pointer · 28e33f9d
      Jakub Kicinski authored
      Commit f1174f77 ("bpf/verifier: rework value tracking")
      removed the crafty selection of which pointer types are
      allowed to be modified.  This is OK for most pointer types
      since adjust_ptr_min_max_vals() will catch operations on
      immutable pointers.  One exception is PTR_TO_CTX which is
      now allowed to be offseted freely.
      
      The intent of aforementioned commit was to allow context
      access via modified registers.  The offset passed to
      ->is_valid_access() verifier callback has been adjusted
      by the value of the variable offset.
      
      What is missing, however, is taking the variable offset
      into account when the context register is used.  Or in terms
      of the code adding the offset to the value passed to the
      ->convert_ctx_access() callback.  This leads to the following
      eBPF user code:
      
           r1 += 68
           r0 = *(u32 *)(r1 + 8)
           exit
      
      being translated to this in kernel space:
      
         0: (07) r1 += 68
         1: (61) r0 = *(u32 *)(r1 +180)
         2: (95) exit
      
      Offset 8 is corresponding to 180 in the kernel, but offset
      76 is valid too.  Verifier will "accept" access to offset
      68+8=76 but then "convert" access to offset 8 as 180.
      Effective access to offset 248 is beyond the kernel context.
      (This is a __sk_buff example on a debug-heavy kernel -
      packet mark is 8 -> 180, 76 would be data.)
      
      Dereferencing the modified context pointer is not as easy
      as dereferencing other types, because we have to translate
      the access to reading a field in kernel structures which is
      usually at a different offset and often of a different size.
      To allow modifying the pointer we would have to make sure
      that given eBPF instruction will always access the same
      field or the fields accessed are "compatible" in terms of
      offset and size...
      
      Disallow dereferencing modified context pointers and add
      to selftests the test case described here.
      
      Fixes: f1174f77 ("bpf/verifier: rework value tracking")
      Signed-off-by: default avatarJakub Kicinski <jakub.kicinski@netronome.com>
      Acked-by: default avatarDaniel Borkmann <daniel@iogearbox.net>
      Acked-by: default avatarAlexei Starovoitov <ast@kernel.org>
      Acked-by: default avatarEdward Cree <ecree@solarflare.com>
      Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
      28e33f9d
    • Johannes Berg's avatar
      netlink: fix netlink_ack() extack race · 48044eb4
      Johannes Berg authored
      It seems that it's possible to toggle NETLINK_F_EXT_ACK
      through setsockopt() while another thread/CPU is building
      a message inside netlink_ack(), which could then trigger
      the WARN_ON()s I added since if it goes from being turned
      off to being turned on between allocating and filling the
      message, the skb could end up being too small.
      
      Avoid this whole situation by storing the value of this
      flag in a separate variable and using that throughout the
      function instead.
      
      Fixes: 2d4bc933 ("netlink: extended ACK reporting")
      Signed-off-by: default avatarJohannes Berg <johannes.berg@intel.com>
      Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
      48044eb4
    • Thomas Falcon's avatar
      ibmvnic: Fix calculation of number of TX header descriptors · 2de09681
      Thomas Falcon authored
      This patch correctly sets the number of additional header descriptors
      that will be sent in an indirect SCRQ entry.
      Signed-off-by: default avatarThomas Falcon <tlfalcon@linux.vnet.ibm.com>
      Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
      2de09681
    • Ido Schimmel's avatar
      mlxsw: core: Fix possible deadlock · d965465b
      Ido Schimmel authored
      When an EMAD is transmitted, a timeout work item is scheduled with a
      delay of 200ms, so that another EMAD will be retried until a maximum of
      five retries.
      
      In certain situations, it's possible for the function waiting on the
      EMAD to be associated with a work item that is queued on the same
      workqueue (`mlxsw_core`) as the timeout work item. This results in
      flushing a work item on the same workqueue.
      
      According to commit e159489b ("workqueue: relax lockdep annotation
      on flush_work()") the above may lead to a deadlock in case the workqueue
      has only one worker active or if the system in under memory pressure and
      the rescue worker is in use. The latter explains the very rare and
      random nature of the lockdep splats we have been seeing:
      
      [   52.730240] ============================================
      [   52.736179] WARNING: possible recursive locking detected
      [   52.742119] 4.14.0-rc3jiri+ #4 Not tainted
      [   52.746697] --------------------------------------------
      [   52.752635] kworker/1:3/599 is trying to acquire lock:
      [   52.758378]  (mlxsw_core_driver_name){+.+.}, at: [<ffffffff811c4fa4>] flush_work+0x3a4/0x5e0
      [   52.767837]
                     but task is already holding lock:
      [   52.774360]  (mlxsw_core_driver_name){+.+.}, at: [<ffffffff811c65c4>] process_one_work+0x7d4/0x12f0
      [   52.784495]
                     other info that might help us debug this:
      [   52.791794]  Possible unsafe locking scenario:
      [   52.798413]        CPU0
      [   52.801144]        ----
      [   52.803875]   lock(mlxsw_core_driver_name);
      [   52.808556]   lock(mlxsw_core_driver_name);
      [   52.813236]
                      *** DEADLOCK ***
      [   52.819857]  May be due to missing lock nesting notation
      [   52.827450] 3 locks held by kworker/1:3/599:
      [   52.832221]  #0:  (mlxsw_core_driver_name){+.+.}, at: [<ffffffff811c65c4>] process_one_work+0x7d4/0x12f0
      [   52.842846]  #1:  ((&(&bridge->fdb_notify.dw)->work)){+.+.}, at: [<ffffffff811c65c4>] process_one_work+0x7d4/0x12f0
      [   52.854537]  #2:  (rtnl_mutex){+.+.}, at: [<ffffffff822ad8e7>] rtnl_lock+0x17/0x20
      [   52.863021]
                     stack backtrace:
      [   52.867890] CPU: 1 PID: 599 Comm: kworker/1:3 Not tainted 4.14.0-rc3jiri+ #4
      [   52.875773] Hardware name: Mellanox Technologies Ltd. "MSN2100-CB2F"/"SA001017", BIOS 5.6.5 06/07/2016
      [   52.886267] Workqueue: mlxsw_core mlxsw_sp_fdb_notify_work [mlxsw_spectrum]
      [   52.894060] Call Trace:
      [   52.909122]  __lock_acquire+0xf6f/0x2a10
      [   53.025412]  lock_acquire+0x158/0x440
      [   53.047557]  flush_work+0x3c4/0x5e0
      [   53.087571]  __cancel_work_timer+0x3ca/0x5e0
      [   53.177051]  cancel_delayed_work_sync+0x13/0x20
      [   53.182142]  mlxsw_reg_trans_bulk_wait+0x12d/0x7a0 [mlxsw_core]
      [   53.194571]  mlxsw_core_reg_access+0x586/0x990 [mlxsw_core]
      [   53.225365]  mlxsw_reg_query+0x10/0x20 [mlxsw_core]
      [   53.230882]  mlxsw_sp_fdb_notify_work+0x2a3/0x9d0 [mlxsw_spectrum]
      [   53.237801]  process_one_work+0x8f1/0x12f0
      [   53.321804]  worker_thread+0x1fd/0x10c0
      [   53.435158]  kthread+0x28e/0x370
      [   53.448703]  ret_from_fork+0x2a/0x40
      [   53.453017] mlxsw_spectrum 0000:01:00.0: EMAD retries (2/5) (tid=bf4549b100000774)
      [   53.453119] mlxsw_spectrum 0000:01:00.0: EMAD retries (5/5) (tid=bf4549b100000770)
      [   53.453132] mlxsw_spectrum 0000:01:00.0: EMAD reg access failed (tid=bf4549b100000770,reg_id=200b(sfn),type=query,status=0(operation performed))
      [   53.453143] mlxsw_spectrum 0000:01:00.0: Failed to get FDB notifications
      
      Fix this by creating another workqueue for EMAD timeouts, thereby
      preventing the situation of a work item trying to flush a work item
      queued on the same workqueue.
      
      Fixes: caf7297e ("mlxsw: core: Introduce support for asynchronous EMAD register access")
      Signed-off-by: default avatarIdo Schimmel <idosch@mellanox.com>
      Reported-by: default avatarJiri Pirko <jiri@mellanox.com>
      Signed-off-by: default avatarJiri Pirko <jiri@mellanox.com>
      Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
      d965465b
  5. 16 Oct, 2017 13 commits
  6. 15 Oct, 2017 6 commits