Commit 2eecf81e authored by Alexei Starovoitov's avatar Alexei Starovoitov

Merge branch 'bpf: Fix to preserve reg parent/live fields when copying range info'

Eduard Zingerman says:

====================

Struct bpf_reg_state is copied directly in several places including:
- check_stack_write_fixed_off() (via save_register_state());
- check_stack_read_fixed_off();
- find_equal_scalars().

However, a literal copy of this struct also copies the following fields:

struct bpf_reg_state {
	...
	struct bpf_reg_state *parent;
	...
	enum bpf_reg_liveness live;
	...
};

This breaks register parentage chain and liveness marking logic.
The commit message for the first patch has a detailed example.
This patch-set replaces direct copies with a call to a function
copy_register_state(dst,src), which preserves 'parent' and 'live'
fields of the 'dst'.

The fix comes with a significant verifier runtime penalty for some
selftest binaries listed in tools/testing/selftests/bpf/veristat.cfg
and cilium BPF binaries (see [1]):

$ ./veristat -e file,prog,states -C -f 'states_diff>10' master-baseline.log current.log
File                        Program                           States (A)  States (B)  States   (DIFF)
--------------------------  --------------------------------  ----------  ----------  ---------------
bpf_host.o                  tail_handle_ipv4_from_host               231         299    +68 (+29.44%)
bpf_host.o                  tail_handle_nat_fwd_ipv4                1088        1320   +232 (+21.32%)
bpf_host.o                  tail_handle_nat_fwd_ipv6                 716         729     +13 (+1.82%)
bpf_host.o                  tail_nodeport_nat_ingress_ipv4           281         314    +33 (+11.74%)
bpf_host.o                  tail_nodeport_nat_ingress_ipv6           245         256     +11 (+4.49%)
bpf_lxc.o                   tail_handle_nat_fwd_ipv4                1088        1320   +232 (+21.32%)
bpf_lxc.o                   tail_handle_nat_fwd_ipv6                 716         729     +13 (+1.82%)
bpf_lxc.o                   tail_ipv4_ct_egress                      239         262     +23 (+9.62%)
bpf_lxc.o                   tail_ipv4_ct_ingress                     239         262     +23 (+9.62%)
bpf_lxc.o                   tail_ipv4_ct_ingress_policy_only         239         262     +23 (+9.62%)
bpf_lxc.o                   tail_ipv6_ct_egress                      181         195     +14 (+7.73%)
bpf_lxc.o                   tail_ipv6_ct_ingress                     181         195     +14 (+7.73%)
bpf_lxc.o                   tail_ipv6_ct_ingress_policy_only         181         195     +14 (+7.73%)
bpf_lxc.o                   tail_nodeport_nat_ingress_ipv4           281         314    +33 (+11.74%)
bpf_lxc.o                   tail_nodeport_nat_ingress_ipv6           245         256     +11 (+4.49%)
bpf_overlay.o               tail_handle_nat_fwd_ipv4                 799         829     +30 (+3.75%)
bpf_overlay.o               tail_nodeport_nat_ingress_ipv4           281         314    +33 (+11.74%)
bpf_overlay.o               tail_nodeport_nat_ingress_ipv6           245         256     +11 (+4.49%)
bpf_sock.o                  cil_sock4_connect                         47          70    +23 (+48.94%)
bpf_sock.o                  cil_sock4_sendmsg                         45          68    +23 (+51.11%)
bpf_sock.o                  cil_sock6_post_bind                       31          42    +11 (+35.48%)
bpf_xdp.o                   tail_lb_ipv4                            4413        6457  +2044 (+46.32%)
bpf_xdp.o                   tail_lb_ipv6                            6876        7249    +373 (+5.42%)
test_cls_redirect.bpf.o     cls_redirect                            4704        4799     +95 (+2.02%)
test_tcp_hdr_options.bpf.o  estab                                    180         206    +26 (+14.44%)
xdp_synproxy_kern.bpf.o     syncookie_tc                           21059       21485    +426 (+2.02%)
xdp_synproxy_kern.bpf.o     syncookie_xdp                          21857       23122   +1265 (+5.79%)
--------------------------  --------------------------------  ----------  ----------  ---------------

I looked through verification log for bpf_xdp.o tail_lb_ipv4 program in
order to identify the reason for ~50% visited states increase.
The slowdown is triggered by a difference in handling of three stack slots:
fp-56, fp-72 and fp-80, with the main difference coming from fp-72.
In fact the following change removes all the difference:

@@ -3256,7 +3256,10 @@ static void save_register_state(struct bpf_func_state *state,
 {
        int i;

-       copy_register_state(&state->stack[spi].spilled_ptr, reg);
+       if ((spi == 6 /*56*/ || spi == 8 /*72*/ || spi == 9 /*80*/) && size != BPF_REG_SIZE)
+               state->stack[spi].spilled_ptr = *reg;
+       else
+               copy_register_state(&state->stack[spi].spilled_ptr, reg);

For fp-56 I found the following pattern for divergences between
verification logs with and w/o this patch:

- At some point insn 1862 is reached and checkpoint is created;
- At some other point insn 1862 is reached again:
  - with this patch:
    - the current state is considered *not* equivalent to the old checkpoint;
    - the reason for mismatch is the state of fp-56:
      - current state: fp-56=????mmmm
      - checkpoint: fp-56_rD=mmmmmmmm
  - without this patch the current state is considered equivalent to the
    checkpoint, the fp-56 is not present in the checkpoint.

Here is a fragment of the verification log for when the checkpoint in
question created at insn 1862:

checkpoint 1862:  ... fp-56=mmmmmmmm ...
1862: ...
1863: ...
1864: (61) r1 = *(u32 *)(r0 +0)
1865: ...
1866: (63) *(u32 *)(r10 -56) = r1     ; R1_w=scalar(...) R10=fp0 fp-56=
1867: (bf) r2 = r10                   ; R2_w=fp0 R10=fp0
1868: (07) r2 += -56                  ; R2_w=fp-56
; return map_lookup_elem(&LB4_BACKEND_MAP_V2, &backend_id);
1869: (18) r1 = 0xffff888100286000    ; R1_w=map_ptr(off=0,ks=4,vs=8,imm=0)
1871: (85) call bpf_map_lookup_elem#1

- Without this patch:
  - at insn 1864 r1 liveness is set to REG_LIVE_WRITTEN;
  - at insn 1866 fp-56 liveness is set REG_LIVE_WRITTEN mark because
    of the direct r1 copy in save_register_state();
  - at insn 1871 REG_LIVE_READ is not propagated to fp-56 at
    checkpoint 1862 because of the REG_LIVE_WRITTEN mark;
  - eventually fp-56 is pruned from checkpoint at 1862 in
    clean_func_state().
- With this patch:
  - at insn 1864 r1 liveness is set to REG_LIVE_WRITTEN;
  - at insn 1866 fp-56 liveness is *not* set to REG_LIVE_WRITTEN mark
    because write size is not equal to BPF_REG_SIZE;
  - at insn 1871 REG_LIVE_READ is propagated to fp-56 at checkpoint 1862.

Hence more states have to be visited by verifier with this patch compared
to current master.

Similar patterns could be found for both fp-72 and fp-80, although these
are harder to track trough the log because of a big number of insns between
slot write and bpf_map_lookup_elem() call triggering read mark, boils down
to the following C code:

	struct ipv4_frag_id frag_id = {
		.daddr = ip4->daddr,
		.saddr = ip4->saddr,
		.id = ip4->id,
		.proto = ip4->protocol,
		.pad = 0,
	};
    ...
    map_lookup_elem(..., &frag_id);

Where:
- .id is mapped to fp-72, write of size u16;
- .saddr is mapped to fp-80, write of size u32.

This patch-set is a continuation of discussion from [2].

Changes v1 -> v2 (no changes in the code itself):
- added analysis for the tail_lb_ipv4 verification slowdown;
- rebase against fresh master branch.

[1] git@github.com:anakryiko/cilium.git
[2] https://lore.kernel.org/bpf/517af2c57ee4b9ce2d96a8cf33f7295f2d2dfe13.camel@gmail.com/
====================
Acked-by: default avatarAndrii Nakryiko <andrii@kernel.org>
Signed-off-by: default avatarAlexei Starovoitov <ast@kernel.org>
parents bdb7fdb0 b9fa9bc8
......@@ -3243,13 +3243,24 @@ static bool __is_pointer_value(bool allow_ptr_leaks,
return reg->type != SCALAR_VALUE;
}
/* Copy src state preserving dst->parent and dst->live fields */
static void copy_register_state(struct bpf_reg_state *dst, const struct bpf_reg_state *src)
{
struct bpf_reg_state *parent = dst->parent;
enum bpf_reg_liveness live = dst->live;
*dst = *src;
dst->parent = parent;
dst->live = live;
}
static void save_register_state(struct bpf_func_state *state,
int spi, struct bpf_reg_state *reg,
int size)
{
int i;
state->stack[spi].spilled_ptr = *reg;
copy_register_state(&state->stack[spi].spilled_ptr, reg);
if (size == BPF_REG_SIZE)
state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
......@@ -3577,7 +3588,7 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
*/
s32 subreg_def = state->regs[dst_regno].subreg_def;
state->regs[dst_regno] = *reg;
copy_register_state(&state->regs[dst_regno], reg);
state->regs[dst_regno].subreg_def = subreg_def;
} else {
for (i = 0; i < size; i++) {
......@@ -3598,7 +3609,7 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
if (dst_regno >= 0) {
/* restore register state from stack */
state->regs[dst_regno] = *reg;
copy_register_state(&state->regs[dst_regno], reg);
/* mark reg as written since spilled pointer state likely
* has its liveness marks cleared by is_state_visited()
* which resets stack/reg liveness for state transitions
......@@ -9592,7 +9603,7 @@ static int sanitize_ptr_alu(struct bpf_verifier_env *env,
*/
if (!ptr_is_dst_reg) {
tmp = *dst_reg;
*dst_reg = *ptr_reg;
copy_register_state(dst_reg, ptr_reg);
}
ret = sanitize_speculative_path(env, NULL, env->insn_idx + 1,
env->insn_idx);
......@@ -10845,7 +10856,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
* to propagate min/max range.
*/
src_reg->id = ++env->id_gen;
*dst_reg = *src_reg;
copy_register_state(dst_reg, src_reg);
dst_reg->live |= REG_LIVE_WRITTEN;
dst_reg->subreg_def = DEF_NOT_SUBREG;
} else {
......@@ -10856,7 +10867,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
insn->src_reg);
return -EACCES;
} else if (src_reg->type == SCALAR_VALUE) {
*dst_reg = *src_reg;
copy_register_state(dst_reg, src_reg);
/* Make sure ID is cleared otherwise
* dst_reg min/max could be incorrectly
* propagated into src_reg by find_equal_scalars()
......@@ -11655,7 +11666,7 @@ static void find_equal_scalars(struct bpf_verifier_state *vstate,
bpf_for_each_reg_in_vstate(vstate, state, reg, ({
if (reg->type == SCALAR_VALUE && reg->id == known_reg->id)
*reg = *known_reg;
copy_register_state(reg, known_reg);
}));
}
......
......@@ -225,3 +225,39 @@
.result_unpriv = ACCEPT,
.insn_processed = 15,
},
/* The test performs a conditional 64-bit write to a stack location
* fp[-8], this is followed by an unconditional 8-bit write to fp[-8],
* then data is read from fp[-8]. This sequence is unsafe.
*
* The test would be mistakenly marked as safe w/o dst register parent
* preservation in verifier.c:copy_register_state() function.
*
* Note the usage of BPF_F_TEST_STATE_FREQ to force creation of the
* checkpoint state after conditional 64-bit assignment.
*/
{
"write tracking and register parent chain bug",
.insns = {
/* r6 = ktime_get_ns() */
BPF_EMIT_CALL(BPF_FUNC_ktime_get_ns),
BPF_MOV64_REG(BPF_REG_6, BPF_REG_0),
/* r0 = ktime_get_ns() */
BPF_EMIT_CALL(BPF_FUNC_ktime_get_ns),
/* if r0 > r6 goto +1 */
BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_6, 1),
/* *(u64 *)(r10 - 8) = 0xdeadbeef */
BPF_ST_MEM(BPF_DW, BPF_REG_FP, -8, 0xdeadbeef),
/* r1 = 42 */
BPF_MOV64_IMM(BPF_REG_1, 42),
/* *(u8 *)(r10 - 8) = r1 */
BPF_STX_MEM(BPF_B, BPF_REG_FP, BPF_REG_1, -8),
/* r2 = *(u64 *)(r10 - 8) */
BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_FP, -8),
/* exit(0) */
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
},
.flags = BPF_F_TEST_STATE_FREQ,
.errstr = "invalid read from stack off -8+1 size 8",
.result = REJECT,
},
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment