Commit 2df03a50 authored by Jakub Kicinski, committed by David S. Miller

nfp: bpf: support accessing the stack beyond 64 bytes

To access beyond the 64th byte of the stack we need to set a new
stack pointer register (LMEM is accessed indirectly through
those pointers).  Add a function for encoding the local CSR access
instruction.  Use stack pointer number 3.

Note that stack pointer registers allow us to index into 32
bytes of LMEM (with shift operations, i.e. when operands are
restricted).  This means that if an access crosses a 32-byte boundary
we must not use offsetting; we have to set the pointer to the
exact address and move it with post-increments.

We depend on the datapath placing the stack base address in
GPR A22 for our use.
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent d3488480
...@@ -427,6 +427,48 @@ emit_ld_field(struct nfp_prog *nfp_prog, swreg dst, u8 bmask, swreg src, ...@@ -427,6 +427,48 @@ emit_ld_field(struct nfp_prog *nfp_prog, swreg dst, u8 bmask, swreg src,
emit_ld_field_any(nfp_prog, dst, bmask, src, sc, shift, false); emit_ld_field_any(nfp_prog, dst, bmask, src, sc, shift, false);
} }
/* Encode one local CSR access instruction and append it to the program.
 *
 * The instruction word starts from OP_LCSR_BASE and every argument is packed
 * into its matching OP_LCSR_* field:
 *   @areg, @breg  - A and B source operand encodings
 *   @wr           - value of the OP_LCSR_WRITE field
 *   @addr         - value of the OP_LCSR_ADDR field, stored as given (any
 *                   scaling has to be done by the caller)
 *   @dst_lmextn   - value of the OP_LCSR_DST_LMEXTN field
 *   @src_lmextn   - value of the OP_LCSR_SRC_LMEXTN field
 */
static void
__emit_lcsr(struct nfp_prog *nfp_prog, u16 areg, u16 breg, bool wr, u16 addr,
	    bool dst_lmextn, bool src_lmextn)
{
	u64 insn = OP_LCSR_BASE;

	/* Operands */
	insn |= FIELD_PREP(OP_LCSR_A_SRC, areg);
	insn |= FIELD_PREP(OP_LCSR_B_SRC, breg);

	/* Access direction and target CSR */
	insn |= FIELD_PREP(OP_LCSR_WRITE, wr);
	insn |= FIELD_PREP(OP_LCSR_ADDR, addr);

	/* LM extension flags */
	insn |= FIELD_PREP(OP_LCSR_SRC_LMEXTN, src_lmextn);
	insn |= FIELD_PREP(OP_LCSR_DST_LMEXTN, dst_lmextn);

	nfp_prog_push(nfp_prog, insn);
}
/* Emit a local CSR write of @src to the CSR identified by @addr.
 *
 * @addr is divided by 4 before being encoded (presumably converting a byte
 * address into a word index - confirm against the CSR address map).
 * If the operands cannot be encoded, the error code is stored in
 * nfp_prog->error and no instruction is emitted.
 */
static void emit_csr_wr(struct nfp_prog *nfp_prog, swreg src, u16 addr)
{
	bool src_is_imm = swreg_type(src) == NN_REG_IMM;
	struct nfp_insn_ur_regs reg;
	int err;

	/* The ignored operand of this instruction is expected to be an immed
	 * rather than reg_none(), but the regular swreg machinery cannot
	 * encode two immeds in a single instruction.  So when the source is
	 * itself an immed, ask for reg_none() as the other operand and
	 * duplicate the encoded immed into both operand slots afterwards.
	 */
	err = swreg_to_unrestricted(reg_none(), src,
				    src_is_imm ? reg_none() : reg_imm(0),
				    &reg);
	if (err) {
		nfp_prog->error = err;
		return;
	}

	if (src_is_imm)
		reg.breg = reg.areg;

	__emit_lcsr(nfp_prog, reg.areg, reg.breg, true, addr / 4,
		    false, reg.src_lmextn);
}
static void emit_nop(struct nfp_prog *nfp_prog) static void emit_nop(struct nfp_prog *nfp_prog)
{ {
__emit_immed(nfp_prog, UR_REG_IMM, UR_REG_IMM, 0, 0, 0, 0, 0, 0, 0); __emit_immed(nfp_prog, UR_REG_IMM, UR_REG_IMM, 0, 0, 0, 0, 0, 0, 0);
...@@ -644,12 +686,15 @@ data_st_host_order(struct nfp_prog *nfp_prog, u8 dst_gpr, swreg offset, ...@@ -644,12 +686,15 @@ data_st_host_order(struct nfp_prog *nfp_prog, u8 dst_gpr, swreg offset,
typedef int typedef int
(*lmem_step)(struct nfp_prog *nfp_prog, u8 gpr, u8 gpr_byte, s32 off, (*lmem_step)(struct nfp_prog *nfp_prog, u8 gpr, u8 gpr_byte, s32 off,
unsigned int size, bool first, bool new_gpr, bool last); unsigned int size, bool first, bool new_gpr, bool last, bool lm3,
bool needs_inc);
static int static int
wrp_lmem_load(struct nfp_prog *nfp_prog, u8 dst, u8 dst_byte, s32 off, wrp_lmem_load(struct nfp_prog *nfp_prog, u8 dst, u8 dst_byte, s32 off,
unsigned int size, bool first, bool new_gpr, bool last) unsigned int size, bool first, bool new_gpr, bool last, bool lm3,
bool needs_inc)
{ {
bool should_inc = needs_inc && new_gpr && !last;
u32 idx, src_byte; u32 idx, src_byte;
enum shf_sc sc; enum shf_sc sc;
swreg reg; swreg reg;
...@@ -663,10 +708,14 @@ wrp_lmem_load(struct nfp_prog *nfp_prog, u8 dst, u8 dst_byte, s32 off, ...@@ -663,10 +708,14 @@ wrp_lmem_load(struct nfp_prog *nfp_prog, u8 dst, u8 dst_byte, s32 off,
/* Move the entire word */ /* Move the entire word */
if (size == 4) { if (size == 4) {
wrp_mov(nfp_prog, reg_both(dst), reg_lm(0, idx)); wrp_mov(nfp_prog, reg_both(dst),
should_inc ? reg_lm_inc(3) : reg_lm(lm3 ? 3 : 0, idx));
return 0; return 0;
} }
if (WARN_ON_ONCE(lm3 && idx > RE_REG_LM_IDX_MAX))
return -EOPNOTSUPP;
src_byte = off % 4; src_byte = off % 4;
mask = (1 << size) - 1; mask = (1 << size) - 1;
...@@ -689,7 +738,7 @@ wrp_lmem_load(struct nfp_prog *nfp_prog, u8 dst, u8 dst_byte, s32 off, ...@@ -689,7 +738,7 @@ wrp_lmem_load(struct nfp_prog *nfp_prog, u8 dst, u8 dst_byte, s32 off,
* Because we RMV twice we waste 2 cycles on unaligned 8 byte writes. * Because we RMV twice we waste 2 cycles on unaligned 8 byte writes.
*/ */
if (idx <= RE_REG_LM_IDX_MAX) { if (idx <= RE_REG_LM_IDX_MAX) {
reg = reg_lm(0, idx); reg = reg_lm(lm3 ? 3 : 0, idx);
} else { } else {
reg = imm_a(nfp_prog); reg = imm_a(nfp_prog);
/* If it's not the first part of the load and we start a new GPR /* If it's not the first part of the load and we start a new GPR
...@@ -703,13 +752,18 @@ wrp_lmem_load(struct nfp_prog *nfp_prog, u8 dst, u8 dst_byte, s32 off, ...@@ -703,13 +752,18 @@ wrp_lmem_load(struct nfp_prog *nfp_prog, u8 dst, u8 dst_byte, s32 off,
emit_ld_field_any(nfp_prog, reg_both(dst), mask, reg, sc, shf, new_gpr); emit_ld_field_any(nfp_prog, reg_both(dst), mask, reg, sc, shf, new_gpr);
if (should_inc)
wrp_mov(nfp_prog, reg_none(), reg_lm_inc(3));
return 0; return 0;
} }
static int static int
wrp_lmem_store(struct nfp_prog *nfp_prog, u8 src, u8 src_byte, s32 off, wrp_lmem_store(struct nfp_prog *nfp_prog, u8 src, u8 src_byte, s32 off,
unsigned int size, bool first, bool new_gpr, bool last) unsigned int size, bool first, bool new_gpr, bool last, bool lm3,
bool needs_inc)
{ {
bool should_inc = needs_inc && new_gpr && !last;
u32 idx, dst_byte; u32 idx, dst_byte;
enum shf_sc sc; enum shf_sc sc;
swreg reg; swreg reg;
...@@ -723,10 +777,15 @@ wrp_lmem_store(struct nfp_prog *nfp_prog, u8 src, u8 src_byte, s32 off, ...@@ -723,10 +777,15 @@ wrp_lmem_store(struct nfp_prog *nfp_prog, u8 src, u8 src_byte, s32 off,
/* Move the entire word */ /* Move the entire word */
if (size == 4) { if (size == 4) {
wrp_mov(nfp_prog, reg_lm(0, idx), reg_b(src)); wrp_mov(nfp_prog,
should_inc ? reg_lm_inc(3) : reg_lm(lm3 ? 3 : 0, idx),
reg_b(src));
return 0; return 0;
} }
if (WARN_ON_ONCE(lm3 && idx > RE_REG_LM_IDX_MAX))
return -EOPNOTSUPP;
dst_byte = off % 4; dst_byte = off % 4;
mask = (1 << size) - 1; mask = (1 << size) - 1;
...@@ -749,7 +808,7 @@ wrp_lmem_store(struct nfp_prog *nfp_prog, u8 src, u8 src_byte, s32 off, ...@@ -749,7 +808,7 @@ wrp_lmem_store(struct nfp_prog *nfp_prog, u8 src, u8 src_byte, s32 off,
* Because we RMV twice we waste 2 cycles on unaligned 8 byte writes. * Because we RMV twice we waste 2 cycles on unaligned 8 byte writes.
*/ */
if (idx <= RE_REG_LM_IDX_MAX) { if (idx <= RE_REG_LM_IDX_MAX) {
reg = reg_lm(0, idx); reg = reg_lm(lm3 ? 3 : 0, idx);
} else { } else {
reg = imm_a(nfp_prog); reg = imm_a(nfp_prog);
/* Only first and last LMEM locations are going to need RMW, /* Only first and last LMEM locations are going to need RMW,
...@@ -764,6 +823,8 @@ wrp_lmem_store(struct nfp_prog *nfp_prog, u8 src, u8 src_byte, s32 off, ...@@ -764,6 +823,8 @@ wrp_lmem_store(struct nfp_prog *nfp_prog, u8 src, u8 src_byte, s32 off,
if (new_gpr || last) { if (new_gpr || last) {
if (idx > RE_REG_LM_IDX_MAX) if (idx > RE_REG_LM_IDX_MAX)
wrp_mov(nfp_prog, reg_lm(0, idx), reg); wrp_mov(nfp_prog, reg_lm(0, idx), reg);
if (should_inc)
wrp_mov(nfp_prog, reg_none(), reg_lm_inc(3));
} }
return 0; return 0;
...@@ -776,10 +837,44 @@ mem_op_stack(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta, ...@@ -776,10 +837,44 @@ mem_op_stack(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
{ {
s32 off = nfp_prog->stack_depth + meta->insn.off + ptr_off; s32 off = nfp_prog->stack_depth + meta->insn.off + ptr_off;
bool first = true, last; bool first = true, last;
bool needs_inc = false;
swreg stack_off_reg;
u8 prev_gpr = 255; u8 prev_gpr = 255;
u32 gpr_byte = 0; u32 gpr_byte = 0;
bool lm3 = true;
int ret; int ret;
if (off + size <= 64) {
/* We can reach bottom 64B with LMaddr0 */
lm3 = false;
} else if (round_down(off, 32) == round_down(off + size - 1, 32)) {
/* We have to set up a new pointer. If we know the offset
* and the entire access falls into a single 32 byte aligned
* window we won't have to increment the LM pointer.
* The 32 byte alignment is important because offset is ORed in
* not added when doing *l$indexN[off].
*/
stack_off_reg = ur_load_imm_any(nfp_prog, round_down(off, 32),
stack_imm(nfp_prog));
emit_alu(nfp_prog, imm_b(nfp_prog),
stack_reg(nfp_prog), ALU_OP_ADD, stack_off_reg);
off %= 32;
} else {
stack_off_reg = ur_load_imm_any(nfp_prog, round_down(off, 4),
stack_imm(nfp_prog));
emit_alu(nfp_prog, imm_b(nfp_prog),
stack_reg(nfp_prog), ALU_OP_ADD, stack_off_reg);
needs_inc = true;
}
if (lm3) {
emit_csr_wr(nfp_prog, imm_b(nfp_prog), NFP_CSR_ACT_LM_ADDR3);
/* For size < 4 one slot will be filled by zeroing of upper. */
wrp_nops(nfp_prog, clr_gpr && size < 8 ? 2 : 3);
}
if (clr_gpr && size < 8) if (clr_gpr && size < 8)
wrp_immed(nfp_prog, reg_both(gpr + 1), 0); wrp_immed(nfp_prog, reg_both(gpr + 1), 0);
...@@ -793,8 +888,11 @@ mem_op_stack(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta, ...@@ -793,8 +888,11 @@ mem_op_stack(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
last = slice_size == size; last = slice_size == size;
if (needs_inc)
off %= 4;
ret = step(nfp_prog, gpr, gpr_byte, off, slice_size, ret = step(nfp_prog, gpr, gpr_byte, off, slice_size,
first, gpr != prev_gpr, last); first, gpr != prev_gpr, last, lm3, needs_inc);
if (ret) if (ret)
return ret; return ret;
......
...@@ -168,12 +168,6 @@ nfp_net_bpf_offload_prepare(struct nfp_net *nn, ...@@ -168,12 +168,6 @@ nfp_net_bpf_offload_prepare(struct nfp_net *nn,
start_off = nn_readw(nn, NFP_NET_CFG_BPF_START); start_off = nn_readw(nn, NFP_NET_CFG_BPF_START);
done_off = nn_readw(nn, NFP_NET_CFG_BPF_DONE); done_off = nn_readw(nn, NFP_NET_CFG_BPF_DONE);
if (cls_bpf->prog->aux->stack_depth > 64) {
nn_info(nn, "large stack not supported: program %dB > 64B\n",
cls_bpf->prog->aux->stack_depth);
return -EOPNOTSUPP;
}
stack_size = nn_readb(nn, NFP_NET_CFG_BPF_STACK_SZ) * 64; stack_size = nn_readb(nn, NFP_NET_CFG_BPF_STACK_SZ) * 64;
if (cls_bpf->prog->aux->stack_depth > stack_size) { if (cls_bpf->prog->aux->stack_depth > stack_size) {
nn_info(nn, "stack too large: program %dB > FW stack %dB\n", nn_info(nn, "stack too large: program %dB > FW stack %dB\n",
......
...@@ -257,6 +257,11 @@ enum lcsr_wr_src { ...@@ -257,6 +257,11 @@ enum lcsr_wr_src {
#define OP_CARB_BASE 0x0e000000000ULL #define OP_CARB_BASE 0x0e000000000ULL
#define OP_CARB_OR 0x00000010000ULL #define OP_CARB_OR 0x00000010000ULL
/* Local CSR addresses of the four LM address (LMEM pointer) registers.
 * NOTE(review): "ACT" presumably stands for "active" - confirm against the
 * NFP databook.  The BPF JIT writes NFP_CSR_ACT_LM_ADDR3 to reach stack
 * beyond the first 64 bytes.
 */
#define NFP_CSR_ACT_LM_ADDR0 0x64
#define NFP_CSR_ACT_LM_ADDR1 0x6c
#define NFP_CSR_ACT_LM_ADDR2 0x94
#define NFP_CSR_ACT_LM_ADDR3 0x9c
/* Software register representation, independent of operand type */ /* Software register representation, independent of operand type */
#define NN_REG_TYPE GENMASK(31, 24) #define NN_REG_TYPE GENMASK(31, 24)
#define NN_REG_LM_IDX GENMASK(23, 22) #define NN_REG_LM_IDX GENMASK(23, 22)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment