Commit fd3c040b authored by David S. Miller's avatar David S. Miller

Merge git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next

Daniel Borkmann says:

====================
pull-request: bpf-next 2018-09-01

The following pull-request contains BPF updates for your *net-next* tree.

The main changes are:

1) Add AF_XDP zero-copy support for i40e driver (!), from Björn and Magnus.

2) BPF verifier improvements by giving each register its own liveness
   chain which allows to simplify and getting rid of skip_callee() logic,
   from Edward.

3) Add bpf fs pretty print support for percpu arraymap, percpu hashmap
   and percpu lru hashmap. Also add generic percpu formatted print on
   bpftool so the same can be dumped there, from Yonghong.

4) Add bpf_{set,get}sockopt() helper support for TCP_SAVE_SYN and
   TCP_SAVED_SYN options to allow reflection of tos/tclass from received
   SYN packet, from Nikita.

5) Misc improvements to the BPF sockmap test cases in terms of cgroup v2
   interaction and removal of incorrect shutdown() calls, from John.

6) Few cleanups in xdp_umem_assign_dev() and xdpsock samples, from Prashant.
====================
Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parents ee713b6d 93ee30f3
......@@ -22,6 +22,7 @@ i40e-objs := i40e_main.o \
i40e_txrx.o \
i40e_ptp.o \
i40e_client.o \
i40e_virtchnl_pf.o
i40e_virtchnl_pf.o \
i40e_xsk.o
i40e-$(CONFIG_I40E_DCB) += i40e_dcb.o i40e_dcb_nl.o
......@@ -786,6 +786,11 @@ struct i40e_vsi {
/* VSI specific handlers */
irqreturn_t (*irq_handler)(int irq, void *data);
/* AF_XDP zero-copy */
struct xdp_umem **xsk_umems;
u16 num_xsk_umems_used;
u16 num_xsk_umems;
} ____cacheline_internodealigned_in_smp;
struct i40e_netdev_priv {
......@@ -1090,6 +1095,20 @@ static inline bool i40e_enabled_xdp_vsi(struct i40e_vsi *vsi)
return !!vsi->xdp_prog;
}
static inline struct xdp_umem *i40e_xsk_umem(struct i40e_ring *ring)
{
bool xdp_on = i40e_enabled_xdp_vsi(ring->vsi);
int qid = ring->queue_index;
if (ring_is_xdp(ring))
qid -= ring->vsi->alloc_queue_pairs;
if (!ring->vsi->xsk_umems || !ring->vsi->xsk_umems[qid] || !xdp_on)
return NULL;
return ring->vsi->xsk_umems[qid];
}
int i40e_create_queue_channel(struct i40e_vsi *vsi, struct i40e_channel *ch);
int i40e_set_bw_limit(struct i40e_vsi *vsi, u16 seid, u64 max_tx_rate);
int i40e_add_del_cloud_filter(struct i40e_vsi *vsi,
......
......@@ -9,7 +9,9 @@
/* Local includes */
#include "i40e.h"
#include "i40e_diag.h"
#include "i40e_xsk.h"
#include <net/udp_tunnel.h>
#include <net/xdp_sock.h>
/* All i40e tracepoints are defined by the include below, which
* must be included exactly once across the whole kernel with
* CREATE_TRACE_POINTS defined
......@@ -3074,6 +3076,9 @@ static int i40e_configure_tx_ring(struct i40e_ring *ring)
i40e_status err = 0;
u32 qtx_ctl = 0;
if (ring_is_xdp(ring))
ring->xsk_umem = i40e_xsk_umem(ring);
/* some ATR related tx ring init */
if (vsi->back->flags & I40E_FLAG_FD_ATR_ENABLED) {
ring->atr_sample_rate = vsi->back->atr_sample_rate;
......@@ -3183,13 +3188,46 @@ static int i40e_configure_rx_ring(struct i40e_ring *ring)
struct i40e_hw *hw = &vsi->back->hw;
struct i40e_hmc_obj_rxq rx_ctx;
i40e_status err = 0;
bool ok;
int ret;
bitmap_zero(ring->state, __I40E_RING_STATE_NBITS);
/* clear the context structure first */
memset(&rx_ctx, 0, sizeof(rx_ctx));
ring->rx_buf_len = vsi->rx_buf_len;
if (ring->vsi->type == I40E_VSI_MAIN)
xdp_rxq_info_unreg_mem_model(&ring->xdp_rxq);
ring->xsk_umem = i40e_xsk_umem(ring);
if (ring->xsk_umem) {
ring->rx_buf_len = ring->xsk_umem->chunk_size_nohr -
XDP_PACKET_HEADROOM;
/* For AF_XDP ZC, we disallow packets to span on
* multiple buffers, thus letting us skip that
* handling in the fast-path.
*/
chain_len = 1;
ring->zca.free = i40e_zca_free;
ret = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq,
MEM_TYPE_ZERO_COPY,
&ring->zca);
if (ret)
return ret;
dev_info(&vsi->back->pdev->dev,
"Registered XDP mem model MEM_TYPE_ZERO_COPY on Rx ring %d\n",
ring->queue_index);
} else {
ring->rx_buf_len = vsi->rx_buf_len;
if (ring->vsi->type == I40E_VSI_MAIN) {
ret = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq,
MEM_TYPE_PAGE_SHARED,
NULL);
if (ret)
return ret;
}
}
rx_ctx.dbuff = DIV_ROUND_UP(ring->rx_buf_len,
BIT_ULL(I40E_RXQ_CTX_DBUFF_SHIFT));
......@@ -3245,7 +3283,15 @@ static int i40e_configure_rx_ring(struct i40e_ring *ring)
ring->tail = hw->hw_addr + I40E_QRX_TAIL(pf_q);
writel(0, ring->tail);
i40e_alloc_rx_buffers(ring, I40E_DESC_UNUSED(ring));
ok = ring->xsk_umem ?
i40e_alloc_rx_buffers_zc(ring, I40E_DESC_UNUSED(ring)) :
!i40e_alloc_rx_buffers(ring, I40E_DESC_UNUSED(ring));
if (!ok) {
dev_info(&vsi->back->pdev->dev,
"Failed allocate some buffers on %sRx ring %d (pf_q %d)\n",
ring->xsk_umem ? "UMEM enabled " : "",
ring->queue_index, pf_q);
}
return 0;
}
......@@ -11850,6 +11896,256 @@ static int i40e_xdp_setup(struct i40e_vsi *vsi,
return 0;
}
/**
* i40e_enter_busy_conf - Enters busy config state
* @vsi: vsi
*
* Returns 0 on success, <0 for failure.
**/
static int i40e_enter_busy_conf(struct i40e_vsi *vsi)
{
struct i40e_pf *pf = vsi->back;
int timeout = 50;
while (test_and_set_bit(__I40E_CONFIG_BUSY, pf->state)) {
timeout--;
if (!timeout)
return -EBUSY;
usleep_range(1000, 2000);
}
return 0;
}
/**
* i40e_exit_busy_conf - Exits busy config state
* @vsi: vsi
**/
static void i40e_exit_busy_conf(struct i40e_vsi *vsi)
{
struct i40e_pf *pf = vsi->back;
clear_bit(__I40E_CONFIG_BUSY, pf->state);
}
/**
* i40e_queue_pair_reset_stats - Resets all statistics for a queue pair
* @vsi: vsi
* @queue_pair: queue pair
**/
static void i40e_queue_pair_reset_stats(struct i40e_vsi *vsi, int queue_pair)
{
memset(&vsi->rx_rings[queue_pair]->rx_stats, 0,
sizeof(vsi->rx_rings[queue_pair]->rx_stats));
memset(&vsi->tx_rings[queue_pair]->stats, 0,
sizeof(vsi->tx_rings[queue_pair]->stats));
if (i40e_enabled_xdp_vsi(vsi)) {
memset(&vsi->xdp_rings[queue_pair]->stats, 0,
sizeof(vsi->xdp_rings[queue_pair]->stats));
}
}
/**
* i40e_queue_pair_clean_rings - Cleans all the rings of a queue pair
* @vsi: vsi
* @queue_pair: queue pair
**/
static void i40e_queue_pair_clean_rings(struct i40e_vsi *vsi, int queue_pair)
{
i40e_clean_tx_ring(vsi->tx_rings[queue_pair]);
if (i40e_enabled_xdp_vsi(vsi))
i40e_clean_tx_ring(vsi->xdp_rings[queue_pair]);
i40e_clean_rx_ring(vsi->rx_rings[queue_pair]);
}
/**
* i40e_queue_pair_toggle_napi - Enables/disables NAPI for a queue pair
* @vsi: vsi
* @queue_pair: queue pair
* @enable: true for enable, false for disable
**/
static void i40e_queue_pair_toggle_napi(struct i40e_vsi *vsi, int queue_pair,
bool enable)
{
struct i40e_ring *rxr = vsi->rx_rings[queue_pair];
struct i40e_q_vector *q_vector = rxr->q_vector;
if (!vsi->netdev)
return;
/* All rings in a qp belong to the same qvector. */
if (q_vector->rx.ring || q_vector->tx.ring) {
if (enable)
napi_enable(&q_vector->napi);
else
napi_disable(&q_vector->napi);
}
}
/**
* i40e_queue_pair_toggle_rings - Enables/disables all rings for a queue pair
* @vsi: vsi
* @queue_pair: queue pair
* @enable: true for enable, false for disable
*
* Returns 0 on success, <0 on failure.
**/
static int i40e_queue_pair_toggle_rings(struct i40e_vsi *vsi, int queue_pair,
bool enable)
{
struct i40e_pf *pf = vsi->back;
int pf_q, ret = 0;
pf_q = vsi->base_queue + queue_pair;
ret = i40e_control_wait_tx_q(vsi->seid, pf, pf_q,
false /*is xdp*/, enable);
if (ret) {
dev_info(&pf->pdev->dev,
"VSI seid %d Tx ring %d %sable timeout\n",
vsi->seid, pf_q, (enable ? "en" : "dis"));
return ret;
}
i40e_control_rx_q(pf, pf_q, enable);
ret = i40e_pf_rxq_wait(pf, pf_q, enable);
if (ret) {
dev_info(&pf->pdev->dev,
"VSI seid %d Rx ring %d %sable timeout\n",
vsi->seid, pf_q, (enable ? "en" : "dis"));
return ret;
}
/* Due to HW errata, on Rx disable only, the register can
* indicate done before it really is. Needs 50ms to be sure
*/
if (!enable)
mdelay(50);
if (!i40e_enabled_xdp_vsi(vsi))
return ret;
ret = i40e_control_wait_tx_q(vsi->seid, pf,
pf_q + vsi->alloc_queue_pairs,
true /*is xdp*/, enable);
if (ret) {
dev_info(&pf->pdev->dev,
"VSI seid %d XDP Tx ring %d %sable timeout\n",
vsi->seid, pf_q, (enable ? "en" : "dis"));
}
return ret;
}
/**
* i40e_queue_pair_enable_irq - Enables interrupts for a queue pair
* @vsi: vsi
* @queue_pair: queue_pair
**/
static void i40e_queue_pair_enable_irq(struct i40e_vsi *vsi, int queue_pair)
{
struct i40e_ring *rxr = vsi->rx_rings[queue_pair];
struct i40e_pf *pf = vsi->back;
struct i40e_hw *hw = &pf->hw;
/* All rings in a qp belong to the same qvector. */
if (pf->flags & I40E_FLAG_MSIX_ENABLED)
i40e_irq_dynamic_enable(vsi, rxr->q_vector->v_idx);
else
i40e_irq_dynamic_enable_icr0(pf);
i40e_flush(hw);
}
/**
* i40e_queue_pair_disable_irq - Disables interrupts for a queue pair
* @vsi: vsi
* @queue_pair: queue_pair
**/
static void i40e_queue_pair_disable_irq(struct i40e_vsi *vsi, int queue_pair)
{
struct i40e_ring *rxr = vsi->rx_rings[queue_pair];
struct i40e_pf *pf = vsi->back;
struct i40e_hw *hw = &pf->hw;
/* For simplicity, instead of removing the qp interrupt causes
* from the interrupt linked list, we simply disable the interrupt, and
* leave the list intact.
*
* All rings in a qp belong to the same qvector.
*/
if (pf->flags & I40E_FLAG_MSIX_ENABLED) {
u32 intpf = vsi->base_vector + rxr->q_vector->v_idx;
wr32(hw, I40E_PFINT_DYN_CTLN(intpf - 1), 0);
i40e_flush(hw);
synchronize_irq(pf->msix_entries[intpf].vector);
} else {
/* Legacy and MSI mode - this stops all interrupt handling */
wr32(hw, I40E_PFINT_ICR0_ENA, 0);
wr32(hw, I40E_PFINT_DYN_CTL0, 0);
i40e_flush(hw);
synchronize_irq(pf->pdev->irq);
}
}
/**
* i40e_queue_pair_disable - Disables a queue pair
* @vsi: vsi
* @queue_pair: queue pair
*
* Returns 0 on success, <0 on failure.
**/
int i40e_queue_pair_disable(struct i40e_vsi *vsi, int queue_pair)
{
int err;
err = i40e_enter_busy_conf(vsi);
if (err)
return err;
i40e_queue_pair_disable_irq(vsi, queue_pair);
err = i40e_queue_pair_toggle_rings(vsi, queue_pair, false /* off */);
i40e_queue_pair_toggle_napi(vsi, queue_pair, false /* off */);
i40e_queue_pair_clean_rings(vsi, queue_pair);
i40e_queue_pair_reset_stats(vsi, queue_pair);
return err;
}
/**
* i40e_queue_pair_enable - Enables a queue pair
* @vsi: vsi
* @queue_pair: queue pair
*
* Returns 0 on success, <0 on failure.
**/
int i40e_queue_pair_enable(struct i40e_vsi *vsi, int queue_pair)
{
int err;
err = i40e_configure_tx_ring(vsi->tx_rings[queue_pair]);
if (err)
return err;
if (i40e_enabled_xdp_vsi(vsi)) {
err = i40e_configure_tx_ring(vsi->xdp_rings[queue_pair]);
if (err)
return err;
}
err = i40e_configure_rx_ring(vsi->rx_rings[queue_pair]);
if (err)
return err;
err = i40e_queue_pair_toggle_rings(vsi, queue_pair, true /* on */);
i40e_queue_pair_toggle_napi(vsi, queue_pair, true /* on */);
i40e_queue_pair_enable_irq(vsi, queue_pair);
i40e_exit_busy_conf(vsi);
return err;
}
/**
* i40e_xdp - implements ndo_bpf for i40e
* @dev: netdevice
......@@ -11870,6 +12166,12 @@ static int i40e_xdp(struct net_device *dev,
case XDP_QUERY_PROG:
xdp->prog_id = vsi->xdp_prog ? vsi->xdp_prog->aux->id : 0;
return 0;
case XDP_QUERY_XSK_UMEM:
return i40e_xsk_umem_query(vsi, &xdp->xsk.umem,
xdp->xsk.queue_id);
case XDP_SETUP_XSK_UMEM:
return i40e_xsk_umem_setup(vsi, xdp->xsk.umem,
xdp->xsk.queue_id);
default:
return -EINVAL;
}
......@@ -11909,6 +12211,7 @@ static const struct net_device_ops i40e_netdev_ops = {
.ndo_bridge_setlink = i40e_ndo_bridge_setlink,
.ndo_bpf = i40e_xdp,
.ndo_xdp_xmit = i40e_xdp_xmit,
.ndo_xsk_async_xmit = i40e_xsk_async_xmit,
};
/**
......
This diff is collapsed.
......@@ -296,13 +296,17 @@ struct i40e_tx_buffer {
struct i40e_rx_buffer {
dma_addr_t dma;
struct page *page;
#if (BITS_PER_LONG > 32) || (PAGE_SIZE >= 65536)
__u32 page_offset;
#else
__u16 page_offset;
#endif
__u16 pagecnt_bias;
union {
struct {
struct page *page;
__u32 page_offset;
__u16 pagecnt_bias;
};
struct {
void *addr;
u64 handle;
};
};
};
struct i40e_queue_stats {
......@@ -414,6 +418,8 @@ struct i40e_ring {
struct i40e_channel *ch;
struct xdp_rxq_info xdp_rxq;
struct xdp_umem *xsk_umem;
struct zero_copy_allocator zca; /* ZC allocator anchor */
} ____cacheline_internodealigned_in_smp;
static inline bool ring_uses_build_skb(struct i40e_ring *ring)
......
/* SPDX-License-Identifier: GPL-2.0 */
/* Copyright(c) 2018 Intel Corporation. */
#ifndef I40E_TXRX_COMMON_
#define I40E_TXRX_COMMON_
void i40e_fd_handle_status(struct i40e_ring *rx_ring,
union i40e_rx_desc *rx_desc, u8 prog_id);
int i40e_xmit_xdp_tx_ring(struct xdp_buff *xdp, struct i40e_ring *xdp_ring);
struct i40e_rx_buffer *i40e_clean_programming_status(
struct i40e_ring *rx_ring,
union i40e_rx_desc *rx_desc,
u64 qw);
void i40e_process_skb_fields(struct i40e_ring *rx_ring,
union i40e_rx_desc *rx_desc, struct sk_buff *skb,
u8 rx_ptype);
void i40e_receive_skb(struct i40e_ring *rx_ring,
struct sk_buff *skb, u16 vlan_tag);
void i40e_xdp_ring_update_tail(struct i40e_ring *xdp_ring);
void i40e_update_rx_stats(struct i40e_ring *rx_ring,
unsigned int total_rx_bytes,
unsigned int total_rx_packets);
void i40e_finalize_xdp_rx(struct i40e_ring *rx_ring, unsigned int xdp_res);
void i40e_release_rx_desc(struct i40e_ring *rx_ring, u32 val);
#define I40E_XDP_PASS 0
#define I40E_XDP_CONSUMED BIT(0)
#define I40E_XDP_TX BIT(1)
#define I40E_XDP_REDIR BIT(2)
/**
* build_ctob - Builds the Tx descriptor (cmd, offset and type) qword
**/
static inline __le64 build_ctob(u32 td_cmd, u32 td_offset, unsigned int size,
u32 td_tag)
{
return cpu_to_le64(I40E_TX_DESC_DTYPE_DATA |
((u64)td_cmd << I40E_TXD_QW1_CMD_SHIFT) |
((u64)td_offset << I40E_TXD_QW1_OFFSET_SHIFT) |
((u64)size << I40E_TXD_QW1_TX_BUF_SZ_SHIFT) |
((u64)td_tag << I40E_TXD_QW1_L2TAG1_SHIFT));
}
/**
* i40e_update_tx_stats - Update the egress statistics for the Tx ring
* @tx_ring: Tx ring to update
* @total_packet: total packets sent
* @total_bytes: total bytes sent
**/
static inline void i40e_update_tx_stats(struct i40e_ring *tx_ring,
unsigned int total_packets,
unsigned int total_bytes)
{
u64_stats_update_begin(&tx_ring->syncp);
tx_ring->stats.bytes += total_bytes;
tx_ring->stats.packets += total_packets;
u64_stats_update_end(&tx_ring->syncp);
tx_ring->q_vector->tx.total_bytes += total_bytes;
tx_ring->q_vector->tx.total_packets += total_packets;
}
#define WB_STRIDE 4
/**
* i40e_arm_wb - (Possibly) arms Tx write-back
* @tx_ring: Tx ring to update
* @vsi: the VSI
* @budget: the NAPI budget left
**/
static inline void i40e_arm_wb(struct i40e_ring *tx_ring,
struct i40e_vsi *vsi,
int budget)
{
if (tx_ring->flags & I40E_TXR_FLAGS_WB_ON_ITR) {
/* check to see if there are < 4 descriptors
* waiting to be written back, then kick the hardware to force
* them to be written back in case we stay in NAPI.
* In this mode on X722 we do not enable Interrupt.
*/
unsigned int j = i40e_get_tx_pending(tx_ring, false);
if (budget &&
((j / WB_STRIDE) == 0) && j > 0 &&
!test_bit(__I40E_VSI_DOWN, vsi->state) &&
(I40E_DESC_UNUSED(tx_ring) != tx_ring->count))
tx_ring->arm_wb = true;
}
}
#endif /* I40E_TXRX_COMMON_ */
This diff is collapsed.
/* SPDX-License-Identifier: GPL-2.0 */
/* Copyright(c) 2018 Intel Corporation. */
#ifndef _I40E_XSK_H_
#define _I40E_XSK_H_
struct i40e_vsi;
struct xdp_umem;
struct zero_copy_allocator;
int i40e_queue_pair_disable(struct i40e_vsi *vsi, int queue_pair);
int i40e_queue_pair_enable(struct i40e_vsi *vsi, int queue_pair);
int i40e_xsk_umem_query(struct i40e_vsi *vsi, struct xdp_umem **umem,
u16 qid);
int i40e_xsk_umem_setup(struct i40e_vsi *vsi, struct xdp_umem *umem,
u16 qid);
void i40e_zca_free(struct zero_copy_allocator *alloc, unsigned long handle);
bool i40e_alloc_rx_buffers_zc(struct i40e_ring *rx_ring, u16 cleaned_count);
int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget);
bool i40e_clean_xdp_tx_irq(struct i40e_vsi *vsi,
struct i40e_ring *tx_ring, int napi_budget);
int i40e_xsk_async_xmit(struct net_device *dev, u32 queue_id);
#endif /* _I40E_XSK_H_ */
......@@ -41,6 +41,7 @@ enum bpf_reg_liveness {
};
struct bpf_reg_state {
/* Ordering of fields matters. See states_equal() */
enum bpf_reg_type type;
union {
/* valid when type == PTR_TO_PACKET */
......@@ -59,7 +60,6 @@ struct bpf_reg_state {
* came from, when one is tested for != NULL.
*/
u32 id;
/* Ordering of fields matters. See states_equal() */
/* For scalar types (SCALAR_VALUE), this represents our knowledge of
* the actual value.
* For pointer types, this represents the variable part of the offset
......@@ -76,15 +76,15 @@ struct bpf_reg_state {
s64 smax_value; /* maximum possible (s64)value */
u64 umin_value; /* minimum possible (u64)value */
u64 umax_value; /* maximum possible (u64)value */
/* parentage chain for liveness checking */
struct bpf_reg_state *parent;
/* Inside the callee two registers can be both PTR_TO_STACK like
* R1=fp-8 and R2=fp-8, but one of them points to this function stack
* while another to the caller's stack. To differentiate them 'frameno'
* is used which is an index in bpf_verifier_state->frame[] array
* pointing to bpf_func_state.
* This field must be second to last, for states_equal() reasons.
*/
u32 frameno;
/* This field must be last, for states_equal() reasons. */
enum bpf_reg_liveness live;
};
......@@ -107,7 +107,6 @@ struct bpf_stack_state {
*/
struct bpf_func_state {
struct bpf_reg_state regs[MAX_BPF_REG];
struct bpf_verifier_state *parent;
/* index of call instruction that called into this func */
int callsite;
/* stack frame number of this function state from pov of
......@@ -129,7 +128,6 @@ struct bpf_func_state {
struct bpf_verifier_state {
/* call stack tracking */
struct bpf_func_state *frame[MAX_CALL_FRAMES];
struct bpf_verifier_state *parent;
u32 curframe;
};
......
......@@ -535,6 +535,32 @@ static inline void napi_synchronize(const struct napi_struct *n)
barrier();
}
/**
* napi_if_scheduled_mark_missed - if napi is running, set the
* NAPIF_STATE_MISSED
* @n: NAPI context
*
* If napi is running, set the NAPIF_STATE_MISSED, and return true if
* NAPI is scheduled.
**/
static inline bool napi_if_scheduled_mark_missed(struct napi_struct *n)
{
unsigned long val, new;
do {
val = READ_ONCE(n->state);
if (val & NAPIF_STATE_DISABLE)
return true;
if (!(val & NAPIF_STATE_SCHED))
return false;
new = val | NAPIF_STATE_MISSED;
} while (cmpxchg(&n->state, val, new) != val);
return true;
}
enum netdev_queue_state_t {
__QUEUE_STATE_DRV_XOFF,
__QUEUE_STATE_STACK_XOFF,
......
......@@ -91,6 +91,8 @@ static inline void xdp_scrub_frame(struct xdp_frame *frame)
frame->dev_rx = NULL;
}
struct xdp_frame *xdp_convert_zc_to_xdp_frame(struct xdp_buff *xdp);
/* Convert xdp_buff to xdp_frame */
static inline
struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp)
......@@ -99,9 +101,8 @@ struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp)
int metasize;
int headroom;
/* TODO: implement clone, copy, use "native" MEM_TYPE */
if (xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY)
return NULL;
return xdp_convert_zc_to_xdp_frame(xdp);
/* Assure headroom is available for storing info */
headroom = xdp->data - xdp->data_hard_start;
......@@ -135,6 +136,7 @@ void xdp_rxq_info_unused(struct xdp_rxq_info *xdp_rxq);
bool xdp_rxq_info_is_reg(struct xdp_rxq_info *xdp_rxq);
int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq,
enum xdp_mem_type type, void *allocator);
void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq);
/* Drivers not supporting XDP metadata can use this helper, which
* rejects any room expansion for metadata as a result.
......
......@@ -16,11 +16,6 @@
struct net_device;
struct xsk_queue;
struct xdp_umem_props {
u64 chunk_mask;
u64 size;
};
struct xdp_umem_page {
void *addr;
dma_addr_t dma;
......@@ -30,7 +25,8 @@ struct xdp_umem {
struct xsk_queue *fq;
struct xsk_queue *cq;
struct xdp_umem_page *pages;
struct xdp_umem_props props;
u64 chunk_mask;
u64 size;
u32 headroom;
u32 chunk_size_nohr;
struct user_struct *user;
......@@ -79,6 +75,16 @@ void xsk_umem_discard_addr(struct xdp_umem *umem);
void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries);
bool xsk_umem_consume_tx(struct xdp_umem *umem, dma_addr_t *dma, u32 *len);
void xsk_umem_consume_tx_done(struct xdp_umem *umem);
static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr)
{
return umem->pages[addr >> PAGE_SHIFT].addr + (addr & (PAGE_SIZE - 1));
}
static inline dma_addr_t xdp_umem_get_dma(struct xdp_umem *umem, u64 addr)
{
return umem->pages[addr >> PAGE_SHIFT].dma + (addr & (PAGE_SIZE - 1));
}
#else
static inline int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
{
......@@ -98,6 +104,39 @@ static inline bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs)
{
return false;
}
static inline u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr)
{
return NULL;
}
static inline void xsk_umem_discard_addr(struct xdp_umem *umem)
{
}
static inline void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries)
{
}
static inline bool xsk_umem_consume_tx(struct xdp_umem *umem, dma_addr_t *dma,
u32 *len)
{
return false;
}
static inline void xsk_umem_consume_tx_done(struct xdp_umem *umem)
{
}
static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr)
{
return NULL;
}
static inline dma_addr_t xdp_umem_get_dma(struct xdp_umem *umem, u64 addr)
{
return 0;
}
#endif /* CONFIG_XDP_SOCKETS */
#endif /* _LINUX_XDP_SOCK_H */
......@@ -358,6 +358,29 @@ static void array_map_seq_show_elem(struct bpf_map *map, void *key,
rcu_read_unlock();
}
static void percpu_array_map_seq_show_elem(struct bpf_map *map, void *key,
struct seq_file *m)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
u32 index = *(u32 *)key;
void __percpu *pptr;
int cpu;
rcu_read_lock();
seq_printf(m, "%u: {\n", *(u32 *)key);
pptr = array->pptrs[index & array->index_mask];
for_each_possible_cpu(cpu) {
seq_printf(m, "\tcpu%d: ", cpu);
btf_type_seq_show(map->btf, map->btf_value_type_id,
per_cpu_ptr(pptr, cpu), m);
seq_puts(m, "\n");
}
seq_puts(m, "}\n");
rcu_read_unlock();
}
static int array_map_check_btf(const struct bpf_map *map,
const struct btf_type *key_type,
const struct btf_type *value_type)
......@@ -398,6 +421,7 @@ const struct bpf_map_ops percpu_array_map_ops = {
.map_lookup_elem = percpu_array_map_lookup_elem,
.map_update_elem = array_map_update_elem,
.map_delete_elem = array_map_delete_elem,
.map_seq_show_elem = percpu_array_map_seq_show_elem,
.map_check_btf = array_map_check_btf,
};
......
......@@ -1285,6 +1285,35 @@ int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value,
return ret;
}
static void htab_percpu_map_seq_show_elem(struct bpf_map *map, void *key,
struct seq_file *m)
{
struct htab_elem *l;
void __percpu *pptr;
int cpu;
rcu_read_lock();
l = __htab_map_lookup_elem(map, key);
if (!l) {
rcu_read_unlock();
return;
}
btf_type_seq_show(map->btf, map->btf_key_type_id, key, m);
seq_puts(m, ": {\n");
pptr = htab_elem_get_ptr(l, map->key_size);
for_each_possible_cpu(cpu) {
seq_printf(m, "\tcpu%d: ", cpu);
btf_type_seq_show(map->btf, map->btf_value_type_id,
per_cpu_ptr(pptr, cpu), m);
seq_puts(m, "\n");
}
seq_puts(m, "}\n");
rcu_read_unlock();
}
const struct bpf_map_ops htab_percpu_map_ops = {
.map_alloc_check = htab_map_alloc_check,
.map_alloc = htab_map_alloc,
......@@ -1293,6 +1322,7 @@ const struct bpf_map_ops htab_percpu_map_ops = {
.map_lookup_elem = htab_percpu_map_lookup_elem,
.map_update_elem = htab_percpu_map_update_elem,
.map_delete_elem = htab_map_delete_elem,
.map_seq_show_elem = htab_percpu_map_seq_show_elem,
};
const struct bpf_map_ops htab_lru_percpu_map_ops = {
......@@ -1303,6 +1333,7 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = {
.map_lookup_elem = htab_lru_percpu_map_lookup_elem,
.map_update_elem = htab_lru_percpu_map_update_elem,
.map_delete_elem = htab_lru_map_delete_elem,
.map_seq_show_elem = htab_percpu_map_seq_show_elem,
};
static int fd_htab_map_alloc_check(union bpf_attr *attr)
......
......@@ -30,7 +30,6 @@
#include <linux/cred.h>
#include <linux/timekeeping.h>
#include <linux/ctype.h>
#include <linux/btf.h>
#include <linux/nospec.h>
#define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY || \
......
This diff is collapsed.
......@@ -4007,6 +4007,12 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock,
tp->snd_ssthresh = val;
}
break;
case TCP_SAVE_SYN:
if (val < 0 || val > 1)
ret = -EINVAL;
else
tp->save_syn = val;
break;
default:
ret = -EINVAL;
}
......@@ -4032,21 +4038,32 @@ static const struct bpf_func_proto bpf_setsockopt_proto = {
BPF_CALL_5(bpf_getsockopt, struct bpf_sock_ops_kern *, bpf_sock,
int, level, int, optname, char *, optval, int, optlen)
{
struct inet_connection_sock *icsk;
struct sock *sk = bpf_sock->sk;
struct tcp_sock *tp;
if (!sk_fullsock(sk))
goto err_clear;
#ifdef CONFIG_INET
if (level == SOL_TCP && sk->sk_prot->getsockopt == tcp_getsockopt) {
if (optname == TCP_CONGESTION) {
struct inet_connection_sock *icsk = inet_csk(sk);
switch (optname) {
case TCP_CONGESTION:
icsk = inet_csk(sk);
if (!icsk->icsk_ca_ops || optlen <= 1)
goto err_clear;
strncpy(optval, icsk->icsk_ca_ops->name, optlen);
optval[optlen - 1] = 0;
} else {
break;
case TCP_SAVED_SYN:
tp = tcp_sk(sk);
if (optlen <= 0 || !tp->saved_syn ||
optlen > tp->saved_syn[0])
goto err_clear;
memcpy(optval, tp->saved_syn + 1, optlen);
break;
default:
goto err_clear;
}
} else if (level == SOL_IP) {
......
......@@ -94,11 +94,21 @@ static void __xdp_mem_allocator_rcu_free(struct rcu_head *rcu)
kfree(xa);
}
static void __xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq)
void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq)
{
struct xdp_mem_allocator *xa;
int id = xdp_rxq->mem.id;
if (xdp_rxq->reg_state != REG_STATE_REGISTERED) {
WARN(1, "Missing register, driver bug");
return;
}
if (xdp_rxq->mem.type != MEM_TYPE_PAGE_POOL &&
xdp_rxq->mem.type != MEM_TYPE_ZERO_COPY) {
return;
}
if (id == 0)
return;
......@@ -110,6 +120,7 @@ static void __xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq)
mutex_unlock(&mem_id_lock);
}
EXPORT_SYMBOL_GPL(xdp_rxq_info_unreg_mem_model);
void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq)
{
......@@ -119,7 +130,7 @@ void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq)
WARN(!(xdp_rxq->reg_state == REG_STATE_REGISTERED), "Driver BUG");
__xdp_rxq_info_unreg_mem_model(xdp_rxq);
xdp_rxq_info_unreg_mem_model(xdp_rxq);
xdp_rxq->reg_state = REG_STATE_UNREGISTERED;
xdp_rxq->dev = NULL;
......@@ -398,3 +409,41 @@ void xdp_attachment_setup(struct xdp_attachment_info *info,
info->flags = bpf->flags;
}
EXPORT_SYMBOL_GPL(xdp_attachment_setup);
struct xdp_frame *xdp_convert_zc_to_xdp_frame(struct xdp_buff *xdp)
{
unsigned int metasize, totsize;
void *addr, *data_to_copy;
struct xdp_frame *xdpf;
struct page *page;
/* Clone into a MEM_TYPE_PAGE_ORDER0 xdp_frame. */
metasize = xdp_data_meta_unsupported(xdp) ? 0 :
xdp->data - xdp->data_meta;
totsize = xdp->data_end - xdp->data + metasize;
if (sizeof(*xdpf) + totsize > PAGE_SIZE)
return NULL;
page = dev_alloc_page();
if (!page)
return NULL;
addr = page_to_virt(page);
xdpf = addr;
memset(xdpf, 0, sizeof(*xdpf));
addr += sizeof(*xdpf);
data_to_copy = metasize ? xdp->data_meta : xdp->data;
memcpy(addr, data_to_copy, totsize);
xdpf->data = addr + metasize;
xdpf->len = totsize - metasize;
xdpf->headroom = 0;
xdpf->metasize = metasize;
xdpf->mem.type = MEM_TYPE_PAGE_ORDER0;
xdp_return_buff(xdp);
return xdpf;
}
EXPORT_SYMBOL_GPL(xdp_convert_zc_to_xdp_frame);
......@@ -76,8 +76,6 @@ int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev,
if (!dev->netdev_ops->ndo_bpf || !dev->netdev_ops->ndo_xsk_async_xmit)
return force_zc ? -EOPNOTSUPP : 0; /* fail or fallback */
bpf.command = XDP_QUERY_XSK_UMEM;
rtnl_lock();
err = xdp_umem_query(dev, queue_id);
if (err) {
......@@ -314,8 +312,8 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
umem->pid = get_task_pid(current, PIDTYPE_PID);
umem->address = (unsigned long)addr;
umem->props.chunk_mask = ~((u64)chunk_size - 1);
umem->props.size = size;
umem->chunk_mask = ~((u64)chunk_size - 1);
umem->size = size;
umem->headroom = headroom;
umem->chunk_size_nohr = chunk_size - headroom;
umem->npgs = size / PAGE_SIZE;
......
......@@ -8,16 +8,6 @@
#include <net/xdp_sock.h>
static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr)
{
return umem->pages[addr >> PAGE_SHIFT].addr + (addr & (PAGE_SIZE - 1));
}
static inline dma_addr_t xdp_umem_get_dma(struct xdp_umem *umem, u64 addr)
{
return umem->pages[addr >> PAGE_SHIFT].dma + (addr & (PAGE_SIZE - 1));
}
int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev,
u32 queue_id, u16 flags);
bool xdp_umem_validate_queues(struct xdp_umem *umem);
......
/* SPDX-License-Identifier: GPL-2.0 */
/* XDP user-space packet buffer
* Copyright(c) 2018 Intel Corporation.
*/
#ifndef XDP_UMEM_PROPS_H_
#define XDP_UMEM_PROPS_H_
struct xdp_umem_props {
u64 chunk_mask;
u64 size;
};
#endif /* XDP_UMEM_PROPS_H_ */
......@@ -55,20 +55,30 @@ EXPORT_SYMBOL(xsk_umem_discard_addr);
static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
{
void *buffer;
void *to_buf, *from_buf;
u32 metalen;
u64 addr;
int err;
if (!xskq_peek_addr(xs->umem->fq, &addr) ||
len > xs->umem->chunk_size_nohr) {
len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) {
xs->rx_dropped++;
return -ENOSPC;
}
addr += xs->umem->headroom;
buffer = xdp_umem_get_data(xs->umem, addr);
memcpy(buffer, xdp->data, len);
if (unlikely(xdp_data_meta_unsupported(xdp))) {
from_buf = xdp->data;
metalen = 0;
} else {
from_buf = xdp->data_meta;
metalen = xdp->data - xdp->data_meta;
}
to_buf = xdp_umem_get_data(xs->umem, addr);
memcpy(to_buf, from_buf, len + metalen);
addr += metalen;
err = xskq_produce_batch_desc(xs->rx, addr, len);
if (!err) {
xskq_discard_addr(xs->umem->fq);
......@@ -111,6 +121,7 @@ void xsk_flush(struct xdp_sock *xs)
int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
{
u32 metalen = xdp->data - xdp->data_meta;
u32 len = xdp->data_end - xdp->data;
void *buffer;
u64 addr;
......@@ -120,7 +131,7 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
return -EINVAL;
if (!xskq_peek_addr(xs->umem->fq, &addr) ||
len > xs->umem->chunk_size_nohr) {
len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) {
xs->rx_dropped++;
return -ENOSPC;
}
......@@ -128,7 +139,8 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
addr += xs->umem->headroom;
buffer = xdp_umem_get_data(xs->umem, addr);
memcpy(buffer, xdp->data, len);
memcpy(buffer, xdp->data_meta, len + metalen);
addr += metalen;
err = xskq_produce_batch_desc(xs->rx, addr, len);
if (!err) {
xskq_discard_addr(xs->umem->fq);
......@@ -458,8 +470,10 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
goto out_unlock;
} else {
/* This xsk has its own umem. */
xskq_set_umem(xs->umem->fq, &xs->umem->props);
xskq_set_umem(xs->umem->cq, &xs->umem->props);
xskq_set_umem(xs->umem->fq, xs->umem->size,
xs->umem->chunk_mask);
xskq_set_umem(xs->umem->cq, xs->umem->size,
xs->umem->chunk_mask);
err = xdp_umem_assign_dev(xs->umem, dev, qid, flags);
if (err)
......@@ -469,8 +483,8 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
xs->dev = dev;
xs->zc = xs->umem->zc;
xs->queue_id = qid;
xskq_set_umem(xs->rx, &xs->umem->props);
xskq_set_umem(xs->tx, &xs->umem->props);
xskq_set_umem(xs->rx, xs->umem->size, xs->umem->chunk_mask);
xskq_set_umem(xs->tx, xs->umem->size, xs->umem->chunk_mask);
xdp_add_sk_umem(xs->umem, xs);
out_unlock:
......
......@@ -7,12 +7,13 @@
#include "xsk_queue.h"
void xskq_set_umem(struct xsk_queue *q, struct xdp_umem_props *umem_props)
void xskq_set_umem(struct xsk_queue *q, u64 size, u64 chunk_mask)
{
if (!q)
return;
q->umem_props = *umem_props;
q->size = size;
q->chunk_mask = chunk_mask;
}
static u32 xskq_umem_get_ring_size(struct xsk_queue *q)
......
......@@ -31,7 +31,8 @@ struct xdp_umem_ring {
};
struct xsk_queue {
struct xdp_umem_props umem_props;
u64 chunk_mask;
u64 size;
u32 ring_mask;
u32 nentries;
u32 prod_head;
......@@ -78,7 +79,7 @@ static inline u32 xskq_nb_free(struct xsk_queue *q, u32 producer, u32 dcnt)
static inline bool xskq_is_valid_addr(struct xsk_queue *q, u64 addr)
{
if (addr >= q->umem_props.size) {
if (addr >= q->size) {
q->invalid_descs++;
return false;
}
......@@ -92,7 +93,7 @@ static inline u64 *xskq_validate_addr(struct xsk_queue *q, u64 *addr)
struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring;
unsigned int idx = q->cons_tail & q->ring_mask;
*addr = READ_ONCE(ring->desc[idx]) & q->umem_props.chunk_mask;
*addr = READ_ONCE(ring->desc[idx]) & q->chunk_mask;
if (xskq_is_valid_addr(q, *addr))
return addr;
......@@ -173,8 +174,8 @@ static inline bool xskq_is_valid_desc(struct xsk_queue *q, struct xdp_desc *d)
if (!xskq_is_valid_addr(q, d->addr))
return false;
if (((d->addr + d->len) & q->umem_props.chunk_mask) !=
(d->addr & q->umem_props.chunk_mask)) {
if (((d->addr + d->len) & q->chunk_mask) !=
(d->addr & q->chunk_mask)) {
q->invalid_descs++;
return false;
}
......@@ -253,7 +254,7 @@ static inline bool xskq_empty_desc(struct xsk_queue *q)
return xskq_nb_free(q, q->prod_tail, q->nentries) == q->nentries;
}
void xskq_set_umem(struct xsk_queue *q, struct xdp_umem_props *umem_props);
void xskq_set_umem(struct xsk_queue *q, u64 size, u64 chunk_mask);
struct xsk_queue *xskq_create(u32 nentries, bool umem_queue);
void xskq_destroy(struct xsk_queue *q_ops);
......
......@@ -153,6 +153,7 @@ always += tcp_cong_kern.o
always += tcp_iw_kern.o
always += tcp_clamp_kern.o
always += tcp_basertt_kern.o
always += tcp_tos_reflect_kern.o
always += xdp_redirect_kern.o
always += xdp_redirect_map_kern.o
always += xdp_redirect_cpu_kern.o
......
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (c) 2018 Facebook
*
* BPF program to automatically reflect TOS option from received syn packet
*
* Use load_sock_ops to load this BPF program.
*/
#include <uapi/linux/bpf.h>
#include <uapi/linux/tcp.h>
#include <uapi/linux/if_ether.h>
#include <uapi/linux/if_packet.h>
#include <uapi/linux/ip.h>
#include <uapi/linux/ipv6.h>
#include <uapi/linux/in.h>
#include <linux/socket.h>
#include "bpf_helpers.h"
#include "bpf_endian.h"
#define DEBUG 1
#define bpf_printk(fmt, ...) \
({ \
char ____fmt[] = fmt; \
bpf_trace_printk(____fmt, sizeof(____fmt), \
##__VA_ARGS__); \
})
SEC("sockops")
int bpf_basertt(struct bpf_sock_ops *skops)
{
char header[sizeof(struct ipv6hdr)];
struct ipv6hdr *hdr6;
struct iphdr *hdr;
int hdr_size = 0;
int save_syn = 1;
int tos = 0;
int rv = 0;
int op;
op = (int) skops->op;
#ifdef DEBUG
bpf_printk("BPF command: %d\n", op);
#endif
switch (op) {
case BPF_SOCK_OPS_TCP_LISTEN_CB:
rv = bpf_setsockopt(skops, SOL_TCP, TCP_SAVE_SYN,
&save_syn, sizeof(save_syn));
break;
case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
if (skops->family == AF_INET)
hdr_size = sizeof(struct iphdr);
else
hdr_size = sizeof(struct ipv6hdr);
rv = bpf_getsockopt(skops, SOL_TCP, TCP_SAVED_SYN,
header, hdr_size);
if (!rv) {
if (skops->family == AF_INET) {
hdr = (struct iphdr *) header;
tos = hdr->tos;
if (tos != 0)
bpf_setsockopt(skops, SOL_IP, IP_TOS,
&tos, sizeof(tos));
} else {
hdr6 = (struct ipv6hdr *) header;
tos = ((hdr6->priority) << 4 |
(hdr6->flow_lbl[0]) >> 4);
if (tos)
bpf_setsockopt(skops, SOL_IPV6,
IPV6_TCLASS,
&tos, sizeof(tos));
}
rv = 0;
}
break;
default:
rv = -1;
}
#ifdef DEBUG
bpf_printk("Returning %d\n", rv);
#endif
skops->reply = rv;
return 1;
}
char _license[] SEC("license") = "GPL";
......@@ -16,7 +16,7 @@ struct bpf_map_def SEC("maps") xsks_map = {
.type = BPF_MAP_TYPE_XSKMAP,
.key_size = sizeof(int),
.value_size = sizeof(int),
.max_entries = 4,
.max_entries = MAX_SOCKS,
};
struct bpf_map_def SEC("maps") rr_map = {
......
......@@ -118,7 +118,6 @@ struct xdpsock {
unsigned long prev_tx_npkts;
};
#define MAX_SOCKS 4
static int num_socks;
struct xdpsock *xsks[MAX_SOCKS];
......@@ -596,7 +595,7 @@ static void dump_stats(void)
prev_time = now;
for (i = 0; i < num_socks; i++) {
for (i = 0; i < num_socks && xsks[i]; i++) {
char *fmt = "%-15s %'-11.0f %'-11lu\n";
double rx_pps, tx_pps;
......@@ -649,6 +648,8 @@ static struct option long_options[] = {
{"xdp-skb", no_argument, 0, 'S'},
{"xdp-native", no_argument, 0, 'N'},
{"interval", required_argument, 0, 'n'},
{"zero-copy", no_argument, 0, 'z'},
{"copy", no_argument, 0, 'c'},
{0, 0, 0, 0}
};
......@@ -667,6 +668,8 @@ static void usage(const char *prog)
" -S, --xdp-skb=n Use XDP skb-mod\n"
" -N, --xdp-native=n Enfore XDP native mode\n"
" -n, --interval=n Specify statistics update interval (default 1 sec).\n"
" -z, --zero-copy Force zero-copy mode.\n"
" -c, --copy Force copy mode.\n"
"\n";
fprintf(stderr, str, prog);
exit(EXIT_FAILURE);
......@@ -679,7 +682,7 @@ static void parse_command_line(int argc, char **argv)
opterr = 0;
for (;;) {
c = getopt_long(argc, argv, "rtli:q:psSNn:", long_options,
c = getopt_long(argc, argv, "rtli:q:psSNn:cz", long_options,
&option_index);
if (c == -1)
break;
......@@ -716,6 +719,12 @@ static void parse_command_line(int argc, char **argv)
case 'n':
opt_interval = atoi(optarg);
break;
case 'z':
opt_xdp_bind_flags |= XDP_ZEROCOPY;
break;
case 'c':
opt_xdp_bind_flags |= XDP_COPY;
break;
default:
usage(basename(argv[0]));
}
......
......@@ -169,9 +169,28 @@ static int do_dump_btf(const struct btf_dumper *d,
if (ret)
goto err_end_obj;
jsonw_name(d->jw, "value");
if (!map_is_per_cpu(map_info->type)) {
jsonw_name(d->jw, "value");
ret = btf_dumper_type(d, map_info->btf_value_type_id, value);
} else {
unsigned int i, n, step;
ret = btf_dumper_type(d, map_info->btf_value_type_id, value);
jsonw_name(d->jw, "values");
jsonw_start_array(d->jw);
n = get_possible_cpus();
step = round_up(map_info->value_size, 8);
for (i = 0; i < n; i++) {
jsonw_start_object(d->jw);
jsonw_int_field(d->jw, "cpu", i);
jsonw_name(d->jw, "value");
ret = btf_dumper_type(d, map_info->btf_value_type_id,
value + i * step);
jsonw_end_object(d->jw);
if (ret)
break;
}
jsonw_end_array(d->jw);
}
err_end_obj:
/* end of key-value pair */
......@@ -298,6 +317,16 @@ static void print_entry_json(struct bpf_map_info *info, unsigned char *key,
jsonw_end_object(json_wtr);
}
jsonw_end_array(json_wtr);
if (btf) {
struct btf_dumper d = {
.btf = btf,
.jw = json_wtr,
.is_plain_text = false,
};
jsonw_name(json_wtr, "formatted");
do_dump_btf(&d, info, key, value);
}
}
jsonw_end_object(json_wtr);
......
......@@ -4,6 +4,7 @@
#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/err.h>
#include <linux/kernel.h>
#include <bpf/bpf.h>
#include <sys/resource.h>
#include <libelf.h>
......@@ -45,7 +46,6 @@ static int count_result(int err)
return err;
}
#define min(a, b) ((a) < (b) ? (a) : (b))
#define __printf(a, b) __attribute__((format(printf, a, b)))
__printf(1, 2)
......@@ -130,6 +130,7 @@ struct btf_raw_test {
bool map_create_err;
bool ordered_map;
bool lossless_map;
bool percpu_map;
int hdr_len_delta;
int type_off_delta;
int str_off_delta;
......@@ -2157,6 +2158,7 @@ static struct btf_pprint_test_meta {
const char *map_name;
bool ordered_map;
bool lossless_map;
bool percpu_map;
} pprint_tests_meta[] = {
{
.descr = "BTF pretty print array",
......@@ -2164,6 +2166,7 @@ static struct btf_pprint_test_meta {
.map_name = "pprint_test_array",
.ordered_map = true,
.lossless_map = true,
.percpu_map = false,
},
{
......@@ -2172,6 +2175,7 @@ static struct btf_pprint_test_meta {
.map_name = "pprint_test_hash",
.ordered_map = false,
.lossless_map = true,
.percpu_map = false,
},
{
......@@ -2180,30 +2184,83 @@ static struct btf_pprint_test_meta {
.map_name = "pprint_test_lru_hash",
.ordered_map = false,
.lossless_map = false,
.percpu_map = false,
},
{
.descr = "BTF pretty print percpu array",
.map_type = BPF_MAP_TYPE_PERCPU_ARRAY,
.map_name = "pprint_test_percpu_array",
.ordered_map = true,
.lossless_map = true,
.percpu_map = true,
},
{
.descr = "BTF pretty print percpu hash",
.map_type = BPF_MAP_TYPE_PERCPU_HASH,
.map_name = "pprint_test_percpu_hash",
.ordered_map = false,
.lossless_map = true,
.percpu_map = true,
},
{
.descr = "BTF pretty print lru percpu hash",
.map_type = BPF_MAP_TYPE_LRU_PERCPU_HASH,
.map_name = "pprint_test_lru_percpu_hash",
.ordered_map = false,
.lossless_map = false,
.percpu_map = true,
},
};
static void set_pprint_mapv(struct pprint_mapv *v, uint32_t i)
static void set_pprint_mapv(struct pprint_mapv *v, uint32_t i,
int num_cpus, int rounded_value_size)
{
v->ui32 = i;
v->si32 = -i;
v->unused_bits2a = 3;
v->bits28 = i;
v->unused_bits2b = 3;
v->ui64 = i;
v->aenum = i & 0x03;
int cpu;
for (cpu = 0; cpu < num_cpus; cpu++) {
v->ui32 = i + cpu;
v->si32 = -i;
v->unused_bits2a = 3;
v->bits28 = i;
v->unused_bits2b = 3;
v->ui64 = i;
v->aenum = i & 0x03;
v = (void *)v + rounded_value_size;
}
}
static int check_line(const char *expected_line, int nexpected_line,
int expected_line_len, const char *line)
{
if (CHECK(nexpected_line == expected_line_len,
"expected_line is too long"))
return -1;
if (strcmp(expected_line, line)) {
fprintf(stderr, "unexpected pprint output\n");
fprintf(stderr, "expected: %s", expected_line);
fprintf(stderr, " read: %s", line);
return -1;
}
return 0;
}
static int do_test_pprint(void)
{
const struct btf_raw_test *test = &pprint_test_template;
struct bpf_create_map_attr create_attr = {};
bool ordered_map, lossless_map, percpu_map;
int err, ret, num_cpus, rounded_value_size;
struct pprint_mapv *mapv = NULL;
unsigned int key, nr_read_elems;
bool ordered_map, lossless_map;
int map_fd = -1, btf_fd = -1;
struct pprint_mapv mapv = {};
unsigned int raw_btf_size;
char expected_line[255];
FILE *pin_file = NULL;
......@@ -2212,7 +2269,6 @@ static int do_test_pprint(void)
char *line = NULL;
uint8_t *raw_btf;
ssize_t nread;
int err, ret;
fprintf(stderr, "%s......", test->descr);
raw_btf = btf_raw_create(&hdr_tmpl, test->raw_types,
......@@ -2261,9 +2317,18 @@ static int do_test_pprint(void)
if (CHECK(err, "bpf_obj_pin(%s): errno:%d.", pin_path, errno))
goto done;
percpu_map = test->percpu_map;
num_cpus = percpu_map ? bpf_num_possible_cpus() : 1;
rounded_value_size = round_up(sizeof(struct pprint_mapv), 8);
mapv = calloc(num_cpus, rounded_value_size);
if (CHECK(!mapv, "mapv allocation failure")) {
err = -1;
goto done;
}
for (key = 0; key < test->max_entries; key++) {
set_pprint_mapv(&mapv, key);
bpf_map_update_elem(map_fd, &key, &mapv, 0);
set_pprint_mapv(mapv, key, num_cpus, rounded_value_size);
bpf_map_update_elem(map_fd, &key, mapv, 0);
}
pin_file = fopen(pin_path, "r");
......@@ -2286,33 +2351,74 @@ static int do_test_pprint(void)
ordered_map = test->ordered_map;
lossless_map = test->lossless_map;
do {
struct pprint_mapv *cmapv;
ssize_t nexpected_line;
unsigned int next_key;
int cpu;
next_key = ordered_map ? nr_read_elems : atoi(line);
set_pprint_mapv(&mapv, next_key);
nexpected_line = snprintf(expected_line, sizeof(expected_line),
"%u: {%u,0,%d,0x%x,0x%x,0x%x,{%lu|[%u,%u,%u,%u,%u,%u,%u,%u]},%s}\n",
next_key,
mapv.ui32, mapv.si32,
mapv.unused_bits2a, mapv.bits28, mapv.unused_bits2b,
mapv.ui64,
mapv.ui8a[0], mapv.ui8a[1], mapv.ui8a[2], mapv.ui8a[3],
mapv.ui8a[4], mapv.ui8a[5], mapv.ui8a[6], mapv.ui8a[7],
pprint_enum_str[mapv.aenum]);
if (CHECK(nexpected_line == sizeof(expected_line),
"expected_line is too long")) {
err = -1;
goto done;
set_pprint_mapv(mapv, next_key, num_cpus, rounded_value_size);
cmapv = mapv;
for (cpu = 0; cpu < num_cpus; cpu++) {
if (percpu_map) {
/* for percpu map, the format looks like:
* <key>: {
* cpu0: <value_on_cpu0>
* cpu1: <value_on_cpu1>
* ...
* cpun: <value_on_cpun>
* }
*
* let us verify the line containing the key here.
*/
if (cpu == 0) {
nexpected_line = snprintf(expected_line,
sizeof(expected_line),
"%u: {\n",
next_key);
err = check_line(expected_line, nexpected_line,
sizeof(expected_line), line);
if (err == -1)
goto done;
}
/* read value@cpu */
nread = getline(&line, &line_len, pin_file);
if (nread < 0)
break;
}
nexpected_line = snprintf(expected_line, sizeof(expected_line),
"%s%u: {%u,0,%d,0x%x,0x%x,0x%x,"
"{%lu|[%u,%u,%u,%u,%u,%u,%u,%u]},%s}\n",
percpu_map ? "\tcpu" : "",
percpu_map ? cpu : next_key,
cmapv->ui32, cmapv->si32,
cmapv->unused_bits2a,
cmapv->bits28,
cmapv->unused_bits2b,
cmapv->ui64,
cmapv->ui8a[0], cmapv->ui8a[1],
cmapv->ui8a[2], cmapv->ui8a[3],
cmapv->ui8a[4], cmapv->ui8a[5],
cmapv->ui8a[6], cmapv->ui8a[7],
pprint_enum_str[cmapv->aenum]);
err = check_line(expected_line, nexpected_line,
sizeof(expected_line), line);
if (err == -1)
goto done;
cmapv = (void *)cmapv + rounded_value_size;
}
if (strcmp(expected_line, line)) {
err = -1;
fprintf(stderr, "unexpected pprint output\n");
fprintf(stderr, "expected: %s", expected_line);
fprintf(stderr, " read: %s", line);
goto done;
if (percpu_map) {
/* skip the last bracket for the percpu map */
nread = getline(&line, &line_len, pin_file);
if (nread < 0)
break;
}
nread = getline(&line, &line_len, pin_file);
......@@ -2334,6 +2440,8 @@ static int do_test_pprint(void)
err = 0;
done:
if (mapv)
free(mapv);
if (!err)
fprintf(stderr, "OK");
if (*btf_log_buf && (err || args.always_log))
......@@ -2361,6 +2469,7 @@ static int test_pprint(void)
pprint_test_template.map_name = pprint_tests_meta[i].map_name;
pprint_test_template.ordered_map = pprint_tests_meta[i].ordered_map;
pprint_test_template.lossless_map = pprint_tests_meta[i].lossless_map;
pprint_test_template.percpu_map = pprint_tests_meta[i].percpu_map;
err |= count_result(do_test_pprint());
}
......
......@@ -469,8 +469,6 @@ static int sendmsg_test(struct sockmap_options *opt)
fprintf(stderr,
"msg_loop_rx: iov_count %i iov_buf %i cnt %i err %i\n",
iov_count, iov_buf, cnt, err);
shutdown(p2, SHUT_RDWR);
shutdown(p1, SHUT_RDWR);
if (s.end.tv_sec - s.start.tv_sec) {
sent_Bps = sentBps(s);
recvd_Bps = recvdBps(s);
......@@ -500,7 +498,6 @@ static int sendmsg_test(struct sockmap_options *opt)
fprintf(stderr,
"msg_loop_tx: iov_count %i iov_buf %i cnt %i err %i\n",
iov_count, iov_buf, cnt, err);
shutdown(c1, SHUT_RDWR);
if (s.end.tv_sec - s.start.tv_sec) {
sent_Bps = sentBps(s);
recvd_Bps = recvdBps(s);
......@@ -1348,9 +1345,9 @@ static int populate_progs(char *bpf_file)
return 0;
}
static int __test_suite(char *bpf_file)
static int __test_suite(int cg_fd, char *bpf_file)
{
int cg_fd, err;
int err, cleanup = cg_fd;
err = populate_progs(bpf_file);
if (err < 0) {
......@@ -1358,22 +1355,24 @@ static int __test_suite(char *bpf_file)
return err;
}
if (setup_cgroup_environment()) {
fprintf(stderr, "ERROR: cgroup env failed\n");
return -EINVAL;
}
cg_fd = create_and_get_cgroup(CG_PATH);
if (cg_fd < 0) {
fprintf(stderr,
"ERROR: (%i) open cg path failed: %s\n",
cg_fd, optarg);
return cg_fd;
}
if (setup_cgroup_environment()) {
fprintf(stderr, "ERROR: cgroup env failed\n");
return -EINVAL;
}
cg_fd = create_and_get_cgroup(CG_PATH);
if (cg_fd < 0) {
fprintf(stderr,
"ERROR: (%i) open cg path failed: %s\n",
cg_fd, optarg);
return cg_fd;
}
if (join_cgroup(CG_PATH)) {
fprintf(stderr, "ERROR: failed to join cgroup\n");
return -EINVAL;
if (join_cgroup(CG_PATH)) {
fprintf(stderr, "ERROR: failed to join cgroup\n");
return -EINVAL;
}
}
/* Tests basic commands and APIs with range of iov values */
......@@ -1394,20 +1393,24 @@ static int __test_suite(char *bpf_file)
out:
printf("Summary: %i PASSED %i FAILED\n", passed, failed);
cleanup_cgroup_environment();
close(cg_fd);
if (cleanup < 0) {
cleanup_cgroup_environment();
close(cg_fd);
}
return err;
}
static int test_suite(void)
static int test_suite(int cg_fd)
{
int err;
err = __test_suite(BPF_SOCKMAP_FILENAME);
err = __test_suite(cg_fd, BPF_SOCKMAP_FILENAME);
if (err)
goto out;
err = __test_suite(BPF_SOCKHASH_FILENAME);
err = __test_suite(cg_fd, BPF_SOCKHASH_FILENAME);
out:
if (cg_fd > -1)
close(cg_fd);
return err;
}
......@@ -1420,7 +1423,7 @@ int main(int argc, char **argv)
int test = PING_PONG;
if (argc < 2)
return test_suite();
return test_suite(-1);
while ((opt = getopt_long(argc, argv, ":dhvc:r:i:l:t:",
long_options, &longindex)) != -1) {
......@@ -1486,6 +1489,9 @@ int main(int argc, char **argv)
}
}
if (argc <= 3 && cg_fd)
return test_suite(cg_fd);
if (!cg_fd) {
fprintf(stderr, "%s requires cgroup option: --cgroup <path>\n",
argv[0]);
......
......@@ -5,6 +5,7 @@
#include <linux/if_ether.h>
#include <linux/if_packet.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/tcp.h>
......@@ -17,6 +18,13 @@ struct bpf_map_def SEC("maps") global_map = {
.type = BPF_MAP_TYPE_ARRAY,
.key_size = sizeof(__u32),
.value_size = sizeof(struct tcpbpf_globals),
.max_entries = 4,
};
struct bpf_map_def SEC("maps") sockopt_results = {
.type = BPF_MAP_TYPE_ARRAY,
.key_size = sizeof(__u32),
.value_size = sizeof(int),
.max_entries = 2,
};
......@@ -45,11 +53,14 @@ int _version SEC("version") = 1;
SEC("sockops")
int bpf_testcb(struct bpf_sock_ops *skops)
{
int rv = -1;
int bad_call_rv = 0;
char header[sizeof(struct ipv6hdr) + sizeof(struct tcphdr)];
struct tcphdr *thdr;
int good_call_rv = 0;
int op;
int bad_call_rv = 0;
int save_syn = 1;
int rv = -1;
int v = 0;
int op;
op = (int) skops->op;
......@@ -82,6 +93,21 @@ int bpf_testcb(struct bpf_sock_ops *skops)
v = 0xff;
rv = bpf_setsockopt(skops, SOL_IPV6, IPV6_TCLASS, &v,
sizeof(v));
if (skops->family == AF_INET6) {
v = bpf_getsockopt(skops, IPPROTO_TCP, TCP_SAVED_SYN,
header, (sizeof(struct ipv6hdr) +
sizeof(struct tcphdr)));
if (!v) {
int offset = sizeof(struct ipv6hdr);
thdr = (struct tcphdr *)(header + offset);
v = thdr->syn;
__u32 key = 1;
bpf_map_update_elem(&sockopt_results, &key, &v,
BPF_ANY);
}
}
break;
case BPF_SOCK_OPS_RTO_CB:
break;
......@@ -111,6 +137,12 @@ int bpf_testcb(struct bpf_sock_ops *skops)
break;
case BPF_SOCK_OPS_TCP_LISTEN_CB:
bpf_sock_ops_cb_flags_set(skops, BPF_SOCK_OPS_STATE_CB_FLAG);
v = bpf_setsockopt(skops, IPPROTO_TCP, TCP_SAVE_SYN,
&save_syn, sizeof(save_syn));
/* Update global map w/ result of setsock opt */
__u32 key = 0;
bpf_map_update_elem(&sockopt_results, &key, &v, BPF_ANY);
break;
default:
rv = -1;
......
......@@ -54,6 +54,26 @@ int verify_result(const struct tcpbpf_globals *result)
return -1;
}
int verify_sockopt_result(int sock_map_fd)
{
__u32 key = 0;
int res;
int rv;
/* check setsockopt for SAVE_SYN */
rv = bpf_map_lookup_elem(sock_map_fd, &key, &res);
EXPECT_EQ(0, rv, "d");
EXPECT_EQ(0, res, "d");
key = 1;
/* check getsockopt for SAVED_SYN */
rv = bpf_map_lookup_elem(sock_map_fd, &key, &res);
EXPECT_EQ(0, rv, "d");
EXPECT_EQ(1, res, "d");
return 0;
err:
return -1;
}
static int bpf_find_map(const char *test, struct bpf_object *obj,
const char *name)
{
......@@ -70,11 +90,11 @@ static int bpf_find_map(const char *test, struct bpf_object *obj,
int main(int argc, char **argv)
{
const char *file = "test_tcpbpf_kern.o";
int prog_fd, map_fd, sock_map_fd;
struct tcpbpf_globals g = {0};
const char *cg_path = "/foo";
int error = EXIT_FAILURE;
struct bpf_object *obj;
int prog_fd, map_fd;
int cg_fd = -1;
__u32 key = 0;
int rv;
......@@ -110,6 +130,10 @@ int main(int argc, char **argv)
if (map_fd < 0)
goto err;
sock_map_fd = bpf_find_map(__func__, obj, "sockopt_results");
if (sock_map_fd < 0)
goto err;
rv = bpf_map_lookup_elem(map_fd, &key, &g);
if (rv != 0) {
printf("FAILED: bpf_map_lookup_elem returns %d\n", rv);
......@@ -121,6 +145,11 @@ int main(int argc, char **argv)
goto err;
}
if (verify_sockopt_result(sock_map_fd)) {
printf("FAILED: Wrong sockopt stats\n");
goto err;
}
printf("PASSED!\n");
error = 0;
err:
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment