Commit 9879a381 authored by Jiong Wang's avatar Jiong Wang Committed by Daniel Borkmann

nfp: bpf: implement memory bulk copy for length within 32-bytes

For NFP, we want to re-group a sequence of load/store pairs lowered from
memcpy/memmove into single memory bulk operation which then could be
accelerated using NFP CPP bus.

This patch extends the existing load/store auxiliary information by adding
two new fields:

	struct bpf_insn *paired_st;
	s16 ldst_gather_len;

Both fields are supposed to be carried by the the load instruction at the
head of the sequence. "paired_st" is the corresponding store instruction at
the head and "ldst_gather_len" is the gathered length.

If "ldst_gather_len" is negative, then the sequence is doing memory
load/store in descending order, otherwise it is in ascending order. We need
this information to detect overlapped memory access.

This patch then optimize memory bulk copy when the copy length is within
32-bytes.

The strategy of read/write used is:

  * Read.
    Use read32 (direct_ref), always.

  * Write.
    - length <= 8-bytes
      write8 (direct_ref).
    - length <= 32-bytes and is 4-byte aligned
      write32 (direct_ref).
    - length <= 32-bytes but is not 4-byte aligned
      write8 (indirect_ref).

NOTE: the optimization should not change program semantics. The destination
register of the last load instruction should contain the same value before
and after this optimization.
Signed-off-by: default avatarJiong Wang <jiong.wang@netronome.com>
Reviewed-by: default avatarJakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: default avatarDaniel Borkmann <daniel@iogearbox.net>
parent 5e4d6d20
...@@ -154,6 +154,13 @@ emit_cmd(struct nfp_prog *nfp_prog, enum cmd_tgt_map op, u8 mode, u8 xfer, ...@@ -154,6 +154,13 @@ emit_cmd(struct nfp_prog *nfp_prog, enum cmd_tgt_map op, u8 mode, u8 xfer,
emit_cmd_any(nfp_prog, op, mode, xfer, lreg, rreg, size, sync, false); emit_cmd_any(nfp_prog, op, mode, xfer, lreg, rreg, size, sync, false);
} }
static void
emit_cmd_indir(struct nfp_prog *nfp_prog, enum cmd_tgt_map op, u8 mode, u8 xfer,
swreg lreg, swreg rreg, u8 size, bool sync)
{
emit_cmd_any(nfp_prog, op, mode, xfer, lreg, rreg, size, sync, true);
}
static void static void
__emit_br(struct nfp_prog *nfp_prog, enum br_mask mask, enum br_ev_pip ev_pip, __emit_br(struct nfp_prog *nfp_prog, enum br_mask mask, enum br_ev_pip ev_pip,
enum br_ctx_signal_state css, u16 addr, u8 defer) enum br_ctx_signal_state css, u16 addr, u8 defer)
...@@ -515,6 +522,109 @@ static void wrp_reg_mov(struct nfp_prog *nfp_prog, u16 dst, u16 src) ...@@ -515,6 +522,109 @@ static void wrp_reg_mov(struct nfp_prog *nfp_prog, u16 dst, u16 src)
wrp_mov(nfp_prog, reg_both(dst), reg_b(src)); wrp_mov(nfp_prog, reg_both(dst), reg_b(src));
} }
/* wrp_reg_subpart() - load @field_len bytes from @offset of @src, write the
* result to @dst from low end.
*/
static void
wrp_reg_subpart(struct nfp_prog *nfp_prog, swreg dst, swreg src, u8 field_len,
u8 offset)
{
enum shf_sc sc = offset ? SHF_SC_R_SHF : SHF_SC_NONE;
u8 mask = (1 << field_len) - 1;
emit_ld_field_any(nfp_prog, dst, mask, src, sc, offset * 8, true);
}
/* NFP has Command Push Pull bus which supports bluk memory operations. */
static int nfp_cpp_memcpy(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
bool descending_seq = meta->ldst_gather_len < 0;
s16 len = abs(meta->ldst_gather_len);
swreg src_base, off;
unsigned int i;
u8 xfer_num;
if (WARN_ON_ONCE(len > 32))
return -EOPNOTSUPP;
off = re_load_imm_any(nfp_prog, meta->insn.off, imm_b(nfp_prog));
src_base = reg_a(meta->insn.src_reg * 2);
xfer_num = round_up(len, 4) / 4;
/* Memory read from source addr into transfer-in registers. */
emit_cmd(nfp_prog, CMD_TGT_READ32_SWAP, CMD_MODE_32b, 0, src_base, off,
xfer_num - 1, true);
/* Move from transfer-in to transfer-out. */
for (i = 0; i < xfer_num; i++)
wrp_mov(nfp_prog, reg_xfer(i), reg_xfer(i));
off = re_load_imm_any(nfp_prog, meta->paired_st->off, imm_b(nfp_prog));
if (len <= 8) {
/* Use single direct_ref write8. */
emit_cmd(nfp_prog, CMD_TGT_WRITE8_SWAP, CMD_MODE_32b, 0,
reg_a(meta->paired_st->dst_reg * 2), off, len - 1,
true);
} else if (IS_ALIGNED(len, 4)) {
/* Use single direct_ref write32. */
emit_cmd(nfp_prog, CMD_TGT_WRITE32_SWAP, CMD_MODE_32b, 0,
reg_a(meta->paired_st->dst_reg * 2), off, xfer_num - 1,
true);
} else {
/* Use single indirect_ref write8. */
wrp_immed(nfp_prog, reg_none(),
CMD_OVE_LEN | FIELD_PREP(CMD_OV_LEN, len - 1));
emit_cmd_indir(nfp_prog, CMD_TGT_WRITE8_SWAP, CMD_MODE_32b, 0,
reg_a(meta->paired_st->dst_reg * 2), off,
len - 1, true);
}
/* TODO: The following extra load is to make sure data flow be identical
* before and after we do memory copy optimization.
*
* The load destination register is not guaranteed to be dead, so we
* need to make sure it is loaded with the value the same as before
* this transformation.
*
* These extra loads could be removed once we have accurate register
* usage information.
*/
if (descending_seq)
xfer_num = 0;
else if (BPF_SIZE(meta->insn.code) != BPF_DW)
xfer_num = xfer_num - 1;
else
xfer_num = xfer_num - 2;
switch (BPF_SIZE(meta->insn.code)) {
case BPF_B:
wrp_reg_subpart(nfp_prog, reg_both(meta->insn.dst_reg * 2),
reg_xfer(xfer_num), 1,
IS_ALIGNED(len, 4) ? 3 : (len & 3) - 1);
break;
case BPF_H:
wrp_reg_subpart(nfp_prog, reg_both(meta->insn.dst_reg * 2),
reg_xfer(xfer_num), 2, (len & 3) ^ 2);
break;
case BPF_W:
wrp_mov(nfp_prog, reg_both(meta->insn.dst_reg * 2),
reg_xfer(0));
break;
case BPF_DW:
wrp_mov(nfp_prog, reg_both(meta->insn.dst_reg * 2),
reg_xfer(xfer_num));
wrp_mov(nfp_prog, reg_both(meta->insn.dst_reg * 2 + 1),
reg_xfer(xfer_num + 1));
break;
}
if (BPF_SIZE(meta->insn.code) != BPF_DW)
wrp_immed(nfp_prog, reg_both(meta->insn.dst_reg * 2 + 1), 0);
return 0;
}
static int static int
data_ld(struct nfp_prog *nfp_prog, swreg offset, u8 dst_gpr, int size) data_ld(struct nfp_prog *nfp_prog, swreg offset, u8 dst_gpr, int size)
{ {
...@@ -1490,6 +1600,9 @@ static int ...@@ -1490,6 +1600,9 @@ static int
mem_ldx(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta, mem_ldx(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
unsigned int size) unsigned int size)
{ {
if (meta->ldst_gather_len)
return nfp_cpp_memcpy(nfp_prog, meta);
if (meta->ptr.type == PTR_TO_CTX) { if (meta->ptr.type == PTR_TO_CTX) {
if (nfp_prog->type == BPF_PROG_TYPE_XDP) if (nfp_prog->type == BPF_PROG_TYPE_XDP)
return mem_ldx_xdp(nfp_prog, meta, size); return mem_ldx_xdp(nfp_prog, meta, size);
......
...@@ -95,6 +95,8 @@ typedef int (*instr_cb_t)(struct nfp_prog *, struct nfp_insn_meta *); ...@@ -95,6 +95,8 @@ typedef int (*instr_cb_t)(struct nfp_prog *, struct nfp_insn_meta *);
* struct nfp_insn_meta - BPF instruction wrapper * struct nfp_insn_meta - BPF instruction wrapper
* @insn: BPF instruction * @insn: BPF instruction
* @ptr: pointer type for memory operations * @ptr: pointer type for memory operations
* @ldst_gather_len: memcpy length gathered from load/store sequence
* @paired_st: the paired store insn at the head of the sequence
* @ptr_not_const: pointer is not always constant * @ptr_not_const: pointer is not always constant
* @jmp_dst: destination info for jump instructions * @jmp_dst: destination info for jump instructions
* @off: index of first generated machine instruction (in nfp_prog.prog) * @off: index of first generated machine instruction (in nfp_prog.prog)
...@@ -109,6 +111,8 @@ struct nfp_insn_meta { ...@@ -109,6 +111,8 @@ struct nfp_insn_meta {
union { union {
struct { struct {
struct bpf_reg_state ptr; struct bpf_reg_state ptr;
struct bpf_insn *paired_st;
s16 ldst_gather_len;
bool ptr_not_const; bool ptr_not_const;
}; };
struct nfp_insn_meta *jmp_dst; struct nfp_insn_meta *jmp_dst;
......
...@@ -41,6 +41,7 @@ ...@@ -41,6 +41,7 @@
const struct cmd_tgt_act cmd_tgt_act[__CMD_TGT_MAP_SIZE] = { const struct cmd_tgt_act cmd_tgt_act[__CMD_TGT_MAP_SIZE] = {
[CMD_TGT_WRITE8_SWAP] = { 0x02, 0x42 }, [CMD_TGT_WRITE8_SWAP] = { 0x02, 0x42 },
[CMD_TGT_WRITE32_SWAP] = { 0x02, 0x5f },
[CMD_TGT_READ8] = { 0x01, 0x43 }, [CMD_TGT_READ8] = { 0x01, 0x43 },
[CMD_TGT_READ32] = { 0x00, 0x5c }, [CMD_TGT_READ32] = { 0x00, 0x5c },
[CMD_TGT_READ32_LE] = { 0x01, 0x5c }, [CMD_TGT_READ32_LE] = { 0x01, 0x5c },
......
...@@ -220,6 +220,7 @@ struct cmd_tgt_act { ...@@ -220,6 +220,7 @@ struct cmd_tgt_act {
enum cmd_tgt_map { enum cmd_tgt_map {
CMD_TGT_READ8, CMD_TGT_READ8,
CMD_TGT_WRITE8_SWAP, CMD_TGT_WRITE8_SWAP,
CMD_TGT_WRITE32_SWAP,
CMD_TGT_READ32, CMD_TGT_READ32,
CMD_TGT_READ32_LE, CMD_TGT_READ32_LE,
CMD_TGT_READ32_SWAP, CMD_TGT_READ32_SWAP,
...@@ -241,6 +242,9 @@ enum cmd_ctx_swap { ...@@ -241,6 +242,9 @@ enum cmd_ctx_swap {
CMD_CTX_NO_SWAP = 3, CMD_CTX_NO_SWAP = 3,
}; };
#define CMD_OVE_LEN BIT(7)
#define CMD_OV_LEN GENMASK(12, 8)
#define OP_LCSR_BASE 0x0fc00000000ULL #define OP_LCSR_BASE 0x0fc00000000ULL
#define OP_LCSR_A_SRC 0x000000003ffULL #define OP_LCSR_A_SRC 0x000000003ffULL
#define OP_LCSR_B_SRC 0x000000ffc00ULL #define OP_LCSR_B_SRC 0x000000ffc00ULL
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment