暂无图片
暂无图片
暂无图片
暂无图片
暂无图片

项目经验分享:Linux内核eBPF模块源码分析——verifier与jit部分

开源之夏 2021-08-23
1108

暑期2021项目研发正在火热进行中,开源之夏公众号面向广大社区及项目承担学生征稿,欢迎大家热情分享:

发送投稿文章至官方联络邮箱:

summer@iscas.ac.cn

添加公众号小编微信投稿:

本期分享来自openEuler社区的郭子仪同学带来的项目(提供通用的软件故障注入工具)经验分享。

本文主要是梳理eBPF模块在verifier与jit阶段的主要流程。

对于理解eBPF程序在内核中的加载,检查,编译阶段等很有意义。

verifier:eBPF的一个验证器,实现了一个本模块下的CFI/CFG(控制流完整性)机制。

jit:Just-In-Time,即时编译,eBPF汇编会在内核中被规则替换成真正的x64的指令。


01

前情提要

eBPF汇编中有r0至r10一共11个寄存器,作用如下:

R0(rax),函数返回值

R1(rdi),arg1

R2(rsi),arg2

R3(rdx),arg3

R4(rcx),arg4

R5(r8),arg5

R6(rbx),callee保存

R7(r13),callee保存

R8(r14),callee保存

R9(r15),callee保存

R10(rbp),栈帧寄存器

所有的eBPF汇编在内核中定义为一个 struct bpf_insn ,当我们需要写的时候一般将连续的指令布置成一个结构体数组:

struct bpf_insn {
__u8 code; /* opcode */
__u8 dst_reg:4; /* dest register */
__u8 src_reg:4; /* source register */
__s16 off; /* signed offset */
__s32 imm; /* signed immediate constant */
};


struct bpf_insn insn[] = {
BPF_LD_MAP_FD(BPF_REG_1,3),
......


};


然后通过内核态的:bpf_prog_load 载入,编译,运行。

更具体的,在用户态可以定义辅助函数:

static int bpf_prog_load(enum bpf_prog_type prog_type,
const struct bpf_insn *insns, int prog_len,
const char *license, int kern_version){


union bpf_attr attr = {
.prog_type = prog_type,
.insns = (uint64_t)insns,
.insn_cnt = prog_len / sizeof(struct bpf_insn),
.license = (uint64_t)license,
.log_buf = (uint64_t)bpf_log_buf,
.log_size = LOG_BUF_SIZE,
.log_level = 1,
};
attr.kern_version = kern_version;
bpf_log_buf[0] = 0;
return syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
}

而内核中对应的处理函数就是:

  case BPF_PROG_LOAD:
err = bpf_prog_load(&attr, uattr);
break;

相应的,struct bpf_prog 此结构体用于维护一个内核中的eBPF程序:

struct bpf_prog {
u16 pages; /* Number of allocated pages */
u16 jited:1, /* Is our filter JIT'ed? */
jit_requested:1,/* archs need to JIT the prog */
gpl_compatible:1, /* Is filter GPL compatible? */
cb_access:1, /* Is control block accessed? */
dst_needed:1, /* Do we need dst entry? */
blinded:1, /* Was blinded */
is_func:1, /* program is a bpf function */
kprobe_override:1, /* Do we override a kprobe? */
has_callchain_buf:1, /* callchain buffer allocated? */
enforce_expected_attach_type:1; /* Enforce expected_attach_type checking at attach time */
enum bpf_prog_type type; /* Type of BPF program */
enum bpf_attach_type expected_attach_type; /* For some prog types */
u32 len; /* Number of filter blocks */
u32 jited_len; /* Size of jited insns in bytes */
u8 tag[BPF_TAG_SIZE];
struct bpf_prog_aux *aux; /* Auxiliary fields */
struct sock_fprog_kern *orig_prog; /* Original BPF program */
unsigned int (*bpf_func)(const void *ctx,
const struct bpf_insn *insn);
/* Instructions for interpreter */
union {
struct sock_filter insns[0];
struct bpf_insn insnsi[0];
};
};

02

bpf_prog_load

此函数中主要做了如下关键的事情:

首先有一系列的检查。(这部分我贴上来的代码中才剪掉了,主要涉及一些权限和标识)

通过 bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER) 为我们的prog分配空间,分配大小为struct bpf_prog加指令长度。返回一个 struct bpf_prog *

检测LSM hook。

将程序长度赋值为指令长度:prog->len = attr->insn_cnt;

将用户态的insn拷贝到内核态分配的空间。

通过prog的类型赋值 ops 虚表,里面定义了对应的绑定好的函数。

调用verifier检测控制流完整性等。(这一部分也很重要,但是本篇文章主要重点放在jit编译阶段)

单bpf函数调用

bpf_prog_select_runtime(prog,&err) jit编译prog。多bpf函数的prog调用jit_subprog。两者都会统一到针对do_jit的调用

为编译后的prog分配一个唯一的id,bpftools会用到这个id。

static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
{
enum bpf_prog_type type = attr->prog_type;
struct bpf_prog *prog;
int err;
char license[128];
bool is_gpl;


......


/* plain bpf_prog allocation */
prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER);
if (!prog)
return -ENOMEM;


......




prog->len = attr->insn_cnt;


err = -EFAULT;
if (copy_from_user(prog->insns, u64_to_user_ptr(attr->insns),
bpf_prog_insn_size(prog)) != 0)
goto free_prog;


prog->orig_prog = NULL;
prog->jited = 0; //还没有jit编译


......


/* find program type: socket_filter vs tracing_filter */
/*

type = array_index_nospec(type, ARRAY_SIZE(bpf_prog_types));
ops = bpf_prog_types[type];

*/
err = find_prog_type(type, prog);
if (err < 0)
goto free_prog;


......


/* run eBPF verifier */
err = bpf_check(&prog, attr, uattr);
if (err < 0)
goto free_used_maps;


prog = bpf_prog_select_runtime(prog, &err);
if (err < 0)
goto free_used_maps;


err = bpf_prog_alloc_id(prog);
if (err)
goto free_used_maps;

......
}


03

verifier部分

入口位置在:

  /* run eBPF verifier */
err = bpf_check(&prog, attr, uattr);


bpf_check

本函数是verifier的主要函数,可以看作一个eBPF-扩展CFI机制

主要做了如下操作:

1. 首先对 struct bpf_verifier_env *env 进行一些设置。

2. 调用 replace_map_fd_with_map_ptr(env); 将对应的map_fd转换为map_ptr。

int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
union bpf_attr __user *uattr)
{
u64 start_time = ktime_get_ns();
struct bpf_verifier_env *env;
struct bpf_verifier_log *log;
int i, len, ret = -EINVAL;
bool is_priv;


......


len = (*prog)->len;
env->insn_aux_data =
vzalloc(array_size(sizeof(struct bpf_insn_aux_data), len));
ret = -ENOMEM;
if (!env->insn_aux_data)
goto err_free_env;
for (i = 0; i < len; i++)
env->insn_aux_data[i].orig_idx = i;
env->prog = *prog;
env->ops = bpf_verifier_ops[env->prog->type];
is_priv = capable(CAP_SYS_ADMIN);


......


/* 设置对齐 */
env->strict_alignment = !!(attr->prog_flags & BPF_F_STRICT_ALIGNMENT);
if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
env->strict_alignment = true;
if (attr->prog_flags & BPF_F_ANY_ALIGNMENT)
env->strict_alignment = false;


env->allow_ptr_leaks = is_priv;


if (is_priv)
env->test_state_freq = attr->prog_flags & BPF_F_TEST_STATE_FREQ;


ret = replace_map_fd_with_map_ptr(env);
if (ret < 0)
goto skip_full_check;


if (bpf_prog_is_dev_bound(env->prog->aux)) {
ret = bpf_prog_offload_verifier_prep(env->prog);
if (ret)
goto skip_full_check;
}


env->explored_states = kvcalloc(state_htab_size(env),
sizeof(struct bpf_verifier_state_list *),
GFP_USER);
ret = -ENOMEM;
if (!env->explored_states)
goto skip_full_check;


ret = check_subprogs(env);
if (ret < 0)
goto skip_full_check;


ret = check_btf_info(env, attr, uattr);
if (ret < 0)
goto skip_full_check;


ret = check_attach_btf_id(env);
if (ret)
goto skip_full_check;


ret = check_cfg(env);
if (ret < 0)
goto skip_full_check;


ret = do_check_subprogs(env);
ret = ret ?: do_check_main(env);


if (ret == 0 && bpf_prog_is_dev_bound(env->prog->aux))
ret = bpf_prog_offload_finalize(env);


skip_full_check:
kvfree(env->explored_states);


if (ret == 0)
ret = check_max_stack_depth(env);


/* instruction rewrites happen after this point */
if (is_priv) {
if (ret == 0)
opt_hard_wire_dead_code_branches(env);
if (ret == 0)
ret = opt_remove_dead_code(env);
if (ret == 0)
ret = opt_remove_nops(env);
} else {
if (ret == 0)
sanitize_dead_code(env);
}


if (ret == 0)
/* program is valid, convert *(u32*)(ctx + off) accesses */
ret = convert_ctx_accesses(env);


if (ret == 0)
ret = fixup_bpf_calls(env);


/* do 32-bit optimization after insn patching has done so those patched
* insns could be handled correctly.
*/
if (ret == 0 && !bpf_prog_is_dev_bound(env->prog->aux)) {
ret = opt_subreg_zext_lo32_rnd_hi32(env, attr);
env->prog->aux->verifier_zext = bpf_jit_needs_zext() ? !ret
: false;
}


if (ret == 0)
ret = fixup_call_args(env);


env->verification_time = ktime_get_ns() - start_time;
print_verification_stats(env);


if (log->level && bpf_verifier_log_full(log))
ret = -ENOSPC;
if (log->level && !log->ubuf) {
ret = -EFAULT;
goto err_release_maps;
}


if (ret == 0 && env->used_map_cnt) {
/* if program passed verifier, update used_maps in bpf_prog_info */
env->prog->aux->used_maps = kmalloc_array(env->used_map_cnt,
sizeof(env->used_maps[0]),
GFP_KERNEL);


if (!env->prog->aux->used_maps) {
ret = -ENOMEM;
goto err_release_maps;
}


memcpy(env->prog->aux->used_maps, env->used_maps,
sizeof(env->used_maps[0]) * env->used_map_cnt);
env->prog->aux->used_map_cnt = env->used_map_cnt;


/* program is valid. Convert pseudo bpf_ld_imm64 into generic
* bpf_ld_imm64 instructions
*/
convert_pseudo_ld_imm64(env);
}


if (ret == 0)
adjust_btf_func(env);


err_release_maps:
if (!env->prog->aux->used_maps)
/* if we didn't copy map pointers into bpf_prog_info, release
* them now. Otherwise free_used_maps() will release them.
*/
release_maps(env);
*prog = env->prog;
err_unlock:
if (!is_priv)
mutex_unlock(&bpf_verifier_lock);
vfree(env->insn_aux_data);
err_free_env:
kfree(env);
return ret;
}


replace_map_fd_with_map_ptr

函数主要目的:

将map_fd,转换为map的地址。

将加载后的map_addr分高低32位在上下两条指令中存储。

简单的校验非加载map指令的操作码合法性。

主要流程如下:

1. 首先取出env中保存的insn指令和长度。

2. 调用 bpf_prog_calc_tag ,在其中算了一下SHA1,然后把算好的摘要放进env->prog->tag 中。值得注意的是函数中有这么几行:

int bpf_prog_calc_tag(struct bpf_prog *fp)
{
......
bool was_ld_map;
struct bpf_insn *dst;
u8 *raw, *todo;
......
raw = vmalloc(raw_size);
if (!raw)
return -ENOMEM;
......
/* We need to take out the map fd for the digest calculation
* since they are unstable from user space side.
*/
dst = (void *)raw;
for (i = 0, was_ld_map = false; i < fp->len; i++) {
dst[i] = fp->insnsi[i];
if (!was_ld_map &&
dst[i].code == (BPF_LD | BPF_IMM | BPF_DW) &&
(dst[i].src_reg == BPF_PSEUDO_MAP_FD ||
dst[i].src_reg == BPF_PSEUDO_MAP_VALUE)) {
was_ld_map = true;
dst[i].imm = 0;
} else if (was_ld_map &&
dst[i].code == 0 &&
dst[i].dst_reg == 0 &&
dst[i].src_reg == 0 &&
dst[i].off == 0) {
was_ld_map = false;
dst[i].imm = 0;
} else {
was_ld_map = false;
}
}


psize = bpf_prog_insn_size(fp);
memset(&raw[psize], 0, raw_size - psize);
raw[psize++] = 0x80;
......
}

a. 扫描指令,将对应的指令(即insn结构体放入)dst[i]

b. 如果是加载map的指令,那么设置 was_ld_map = true ,设置立即数为0: dst[i].imm = 0

c. 如果检测到was_ld_map = true 已经设置,且对应的insn结构体都为空,即加载map指令的下一条指令。重新设置 was_ld_map = false 立即数为0 。

d. 也就是说,加载map指令和其下一条指令实际上是构成了一个整体,我们可以看一下对应的eBPF汇编:

/* pseudo BPF_LD_IMM64 insn used to refer to process-local map_fd */
#define BPF_LD_MAP_FD(DST, MAP_FD) \
BPF_LD_IMM64_RAW(DST, BPF_PSEUDO_MAP_FD, MAP_FD)




#define BPF_LD_IMM64_RAW(DST, SRC, IMM) \
((struct bpf_insn) { \
.code = BPF_LD | BPF_DW | BPF_IMM, \
.dst_reg = DST, \
.src_reg = SRC, \
.off = 0, \
.imm = (__u32) (IMM) }), \
((struct bpf_insn) { \
.code = 0, /* zero is reserved opcode */ \
.dst_reg = 0, \
.src_reg = 0, \
.off = 0, \
.imm = ((__u64) (IMM)) >> 32 })

也就是说,加载map的指令后面是跟了一条空指令的。作为padding。

3. for循环中扫描每一条指令。

1)如果是内存加载(BPF_LDX),必须保证带有BPF_MEM,并且立即数为0 。

2)如果是内存存储(BPF_STX),必须保证带有BPF_MEM,并且立即数为0 。

3)如果是:BPF_LD_IMM64_RAW ,标志为:.code = BPF_LD | BPF_DW | BPF_IMMBPF_LD_MAP_FD ,此时的code是加载map的操作。

如果此时是最后一条指令 ||  下一条指令不是空指令,abort。

如果当前load_map指令的src为0,即map_fd为0,跳过。

如果当前指令的src_reg,不是BPF_PSEUDO_MAP_FD,不是BPF_PSEUDO_MAP_VALUE;或者是BPF_PSEUDO_MAP_FD但下一条指令imm不为空。abort,其中BPF_PSEUDO_MAP_FD被条件define为1 。

check结束,通过 insn[0].imm 拿到fd,然__bpf_map_get转换为对应的bpf_map的地址。判断地址合法性。赋值给map。

check_map_prog_compatibility 检测与相应环境的兼容性。

如果设置了BPF_PSEUDO_MAP_FD标志,那么直接将刚刚iv.中取到的map赋值给addr。

否则,通过下一条指令的imm作为off,配合虚表中的 map_direct_value_addr  。

主要实现在:array_map_direct_value_addr ,被用于bpf_array中获取到第一个value。value的地址赋值给addr。然后addr+=off。

以上实际就是为了取addr。

重新设置当前指令的imm为addr的低32位。而addr的高32位存储在下一条指令中。

检查当前map是否处于used状态。

使用 bpf_map_inc(map) ,当前map引用计数+1 。主要是为了拿住map,如果此map被verifier拒绝,那么调用release_maps释放,或者此map被其他有效的程序使用,直到此map被卸载(所有的map都在free_used_maps()中被释放)

将此map标记为used,添加进 env->used_maps表中。同时used_map_cnt计数器+1 。

如果当前的map是一个cgroup存储。进行检测,是否每个type只有一个对应的cgroup stroge。

4)insn++,i++,开始扫描下一条指令。

4. 如果当前指令不是加载map的指令。调用 bpf_opcode_in_insntable 进行基本的指令unkown判断。

bool bpf_opcode_in_insntable(u8 code)
{
#define BPF_INSN_2_TBL(x, y) [BPF_##x | BPF_##y] = true
#define BPF_INSN_3_TBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = true
static const bool public_insntable[256] = {
[0 ... 255] = false,
/* Now overwrite non-defaults ... */
BPF_INSN_MAP(BPF_INSN_2_TBL, BPF_INSN_3_TBL),
/* UAPI exposed, but rewritten opcodes. cBPF carry-over. */
[BPF_LD | BPF_ABS | BPF_B] = true,
[BPF_LD | BPF_ABS | BPF_H] = true,
[BPF_LD | BPF_ABS | BPF_W] = true,
[BPF_LD | BPF_IND | BPF_B] = true,
[BPF_LD | BPF_IND | BPF_H] = true,
[BPF_LD | BPF_IND | BPF_W] = true,
};
#undef BPF_INSN_3_TBL
#undef BPF_INSN_2_TBL
return public_insntable[code];
}

首先初始化一个16*16的bool矩阵 public_insntable

BPF_INSN_MAP中存储的是合法的指令。通过

BPF_INSN_MAP(BPF_INSN_2_TBL, BPF_INSN_3_TBL) 配合,相当于将合法的指令映射到public_insntable 矩阵(向量)的某个位置,然后将其设置为true。

然后再补充6条额外的合法指令,设置其在BPF_INSN_MAP上的位置位true。

返回当前的指令code,在BPF_INSN_MAP位置上的bool标志。如果为true则通过检查,如果为false就abort。

注意,此时进行的只是一个最简单的指令(操作码)合法性检查。

5. 返回。

check_subprogs

函数主要目的:

生成 env->subprog_info[] 数组,存储要跳转的函数的id。

检测JMP指令的跳转范围是否在subprog中。

函数主要流程:

1. 调用 add_subprog(env,0) ,首先在 env->subprog_info 数组中二分查找off=0,此时查找失败,return -ENOENT,因为此时数组中还没有任何元素。接下来将新的off=0,插入到数组第一项的start成员中:env->subprog_info[env-。>subprog_cnt++].start = off ,最后进行排序。

2. 通过for循环扫描insn[]。

找到所有的BPF_JMP、BPF_CALL指令,或者src_reg为BPF_PSEUDO_CALL的指令。如果设置了allow_ptr_leaks标志,那么abort,防止指针泄漏。

找到对应的JMP、CALL指令后,调用:add_subprog(env, i + insn[i].imm + 1) 。此时insn[i].imm存储的是对应的函数id。off = i + insn[i].imm + 1 ,我们将其进行二分查找。如果此时要插入off的是已经存在的,那么直接return 0 。否则将off插入到 env->subprog_info[env->subprog_cnt++].start 中,subprog_cnt计数加一。最后再对subprog_info中的元素进行排序。

此时已经通过add_subprog 插入完了所有的insn中的需要调用的函数信息(id)到env->subprog_info 数组中。

3. 在subprog[] 数组的最后一项补充一个insn_cnt。

subprog[env->subprog_cnt].start = insn_cnt;

4. 接下来检测所有的跳转指令都在同一个 subprog中。

/* now check that all jumps are within the same subprog */
subprog_start = subprog[cur_subprog].start;
subprog_end = subprog[cur_subprog + 1].start;
for (i = 0; i < insn_cnt; i++) {
u8 code = insn[i].code;


if (BPF_CLASS(code) != BPF_JMP && BPF_CLASS(code) != BPF_JMP32)
goto next;
if (BPF_OP(code) == BPF_EXIT || BPF_OP(code) == BPF_CALL)
goto next;
off = i + insn[i].off + 1;
if (off < subprog_start || off >= subprog_end) {
verbose(env, "jump out of range from insn %d to %d\n", i, off);
return -EINVAL;
}
next:
if (i == subprog_end - 1) {
if (code != (BPF_JMP | BPF_EXIT) &&
code != (BPF_JMP | BPF_JA)) {
verbose(env, "last insn is not an exit or jmp\n");
return -EINVAL;
}
subprog_start = subprog_end;
cur_subprog++;
if (cur_subprog < env->subprog_cnt)
subprog_end = subprog[cur_subprog + 1].start;
}
}

首先保证class必须是:BPF_JMP或者BPF_JMP32 。

op不是EXIT或者CALL,也就是不能是:BPF_JMP | BPF_CALL这样的。主要针对的是:

BPF_JMP_REG,BPF_JMP32_REG,BPF_JMP_IMM,BPF_JMP32_IMM

这样的。

获取到对应的off作为跳转的距离,检测off在当前的subprog之内。(主要是相对insn指令的距离)

如果已经是最后一条指令了,要保证最后一条是一个JMP或者ExiT。

ps:根据别的师傅的解释:

bpf指令支持两种call调用,一种bpf函数对bpf函数的调用,一种是bpf中对内核helper func的调用。前者是指令class为BPF_JMP,指令opcode位BPF_CALL,并且src寄存器为BPF_PSEUDO_CALL,指令imm为callee函数到本指令的距离。另外一种是对对内核helper func的调用,指令class为特征是BPF_JMP,指令opcode为BPF_CALL,并且src_reg=0,指令imm在jit之前位内核helper func id,jit之后为对应的func到本指令的距离。

我这里遇到的主要是第二种。

check_cfg

函数主要目的:

非递归的深度优先遍历检测是否存在有向无环图(循环)

检测控制流合法性。

  struct {
int *insn_state;
int *insn_stack;
int cur_stack;
} cfg;

函数流程:

enum {
DISCOVERED = 0x10,
EXPLORED = 0x20,
FALLTHROUGH = 1, //顺序执行
BRANCH = 2, //条件跳转
};

本函数中,首先通过DFS转换成执行流tree。在这里图上的边edge被分成四种情况:

tree edge:最正常的顺序执行的边。

forward edge:在同一侧前向跨越的边,可能由jmp指令导致。

cross edge:跨越左右两侧的边。

back edge:后向边,检测到他代表出现了循环。

同样的,图中的点也有几种情况:

explored,代表一个已经被DFS完毕所有(出)边的结点。

discovered,代表一个结点w,在上一结点t通过边e可达。

fallthrough,代表一个结点w为顺序执行的可达点。

branch,代表一个结点是条件跳转的分支可达点。

非递归的DFS大致流程如下:

1. 首先标记第一个点v,压入栈S。

2. 如果S不空,进入while循环。

1)弹出栈顶第一个元素t。如果t是我们需要找的直接return。

2)如果t不是,那么通过for循环扫描t结点的每一个边edge。

如果边e已经被标记了,那么跳过,扫描t的下一个边。

扫描到t通过e到达的临近顶点w。

如果w未被标记。标记e为tree-edge。标记w为已经发现的点(discovered)。

压栈w,重新对w进行while循环。直到:

1. 发现w又是当前路径下一个已发现的点,此时则寻找到一条back-edge,说明存在loop

2. w已经是叶子结点或标记为探索完成explored。标记为cross或者forawrd。

此时t结点所有的边DFS完毕,标记结点t为explored。弹出栈。

函数实现如下:

/* non-recursive depth-first-search to detect loops in BPF program
* loop == back-edge in directed graph
*/
static int check_cfg(struct bpf_verifier_env *env)
{
struct bpf_insn *insns = env->prog->insnsi;
int insn_cnt = env->prog->len;
int *insn_stack, *insn_state;
int ret = 0;
int i, t;


//分配空间
//insn_state 用来记录指令的状态
insn_state = env->cfg.insn_state = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL);
if (!insn_state)
return -ENOMEM;

//insn_stack 是存储指令的栈
insn_stack = env->cfg.insn_stack = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL);
if (!insn_stack) {
kvfree(insn_state);
return -ENOMEM;
}


//首先将第一个结点的状态标记为 discovered
insn_state[0] = DISCOVERED; /* mark 1st insn as discovered */
//第一条指令为0
insn_stack[0] = 0; /* 0 is the first instruction */
//cfg_cur_stack指向指令栈中当前指令的位置(index)
env->cfg.cur_stack = 1;


peek_stack:
//如果栈中已经没有元素,全部退栈,检查所有指令的状态是否都为explored。
//若有不是explored说明不可达。
if (env->cfg.cur_stack == 0)
goto check_state;

//取当前的指令为结点t。
t = insn_stack[env->cfg.cur_stack - 1];


if (BPF_CLASS(insns[t].code) == BPF_JMP ||
BPF_CLASS(insns[t].code) == BPF_JMP32)
{
u8 opcode = BPF_OP(insns[t].code);


if (opcode == BPF_EXIT) {
//如果是exit,则标记为explored。
//因为exit不可能再有出边了。
goto mark_explored;
} else if (opcode == BPF_CALL) {
//当前指令如果是call调用函数
//将下一条指令压栈,在push中做了loop detect
ret = push_insn(t, t + 1, FALLTHROUGH, env, false);
//如果压栈成功,ret=1
if (ret == 1)
goto peek_stack;
//如果压栈失败,ret<0
else if (ret < 0)
goto err_free;
if (t + 1 < insn_cnt)
init_explored_state(env, t + 1);

//这里对应两种BPF_CALL
//若果是bpf函数调用,将被调用函数入栈,标记结点为branch
//如果src_reg = 0,说明是kernel func,不需要对被调用函数入栈的操作
if (insns[t].src_reg == BPF_PSEUDO_CALL) {
init_explored_state(env, t);
ret = push_insn(t, t + insns[t].imm + 1, BRANCH,
env, false);
if (ret == 1)
goto peek_stack;
else if (ret < 0)
goto err_free;
}
} else if (opcode == BPF_JA) {
//如果是无条件跳转
if (BPF_SRC(insns[t].code) != BPF_K) {
ret = -EINVAL;
goto err_free;
}
/* unconditional jump with single edge */
//将跳转的对应跳转分支结点入栈
ret = push_insn(t, t + insns[t].off + 1,
FALLTHROUGH, env, true);
if (ret == 1)
goto peek_stack;
else if (ret < 0)
goto err_free;
/* unconditional jmp is not a good pruning point,
* but it's marked, since backtracking needs
* to record jmp history in is_state_visited().
*/
init_explored_state(env, t + insns[t].off + 1);
/* tell verifier to check for equivalent states
* after every call and jump
*/
if (t + 1 < insn_cnt)
init_explored_state(env, t + 1);
} else {
/* conditional jump with two edges */
//有双分支结点的条件跳转指令
//两个分支都进行压栈,模拟执行
init_explored_state(env, t);
//先压fasle分支
ret = push_insn(t, t + 1, FALLTHROUGH, env, true);
if (ret == 1)
goto peek_stack;
else if (ret < 0)
goto err_free;
//再压true分支
ret = push_insn(t, t + insns[t].off + 1, BRANCH, env, true);
if (ret == 1)
goto peek_stack;
else if (ret < 0)
goto err_free;
}
} else {
/* all other non-branch instructions with single
* fall-through edge
*/
//不存在分支的正常指令入栈
ret = push_insn(t, t + 1, FALLTHROUGH, env, false);
if (ret == 1)
goto peek_stack;
else if (ret < 0)
goto err_free;
}


//exit指令,标记当前结点为explored状态。栈指针减一,退栈。
mark_explored:
insn_state[t] = EXPLORED;
if (env->cfg.cur_stack-- <= 0) {
verbose(env, "pop stack internal bug\n");
ret = -EFAULT;
goto err_free;
}
goto peek_stack;


//检测是否最终所有指令结点都被扫描完了
check_state:
for (i = 0; i < insn_cnt; i++) {
if (insn_state[i] != EXPLORED) {
verbose(env, "unreachable insn %d\n", i);
ret = -EINVAL;
goto err_free;
}
}
ret = 0; /* cfg looks good */


err_free:
kvfree(insn_state);
kvfree(insn_stack);
env->cfg.insn_state = env->cfg.insn_stack = NULL;
return ret;
}

值得注意的是其中有一个一直出现的函数:init_explored_state

我这里找到了一个commit。

https://zh.osdn.net/projects/android-x86/scm/git/kernel/commits/5762a20b11ef261ae8436868555fab4340cb3ca0

Convert explored_states array into hash table and use simple hash to reduce verifier peak memory consumption for programs with bpf2bpf calls. More details in patch 3.

为了减少bpf到bpf的函数调用的峰值内存消耗,设置了对应结点的prune_point为true。

push_insn

函数主要目的:

对指令结点t与t+1入栈(t+1也可能是其他的分支或者函数结点),模拟执行。

检测控制流合法性。

返回0,代表下一条指令w不压栈,当前为退栈流程。返回1代表成果压栈

函数实现如下:

/* t, w, e - match pseudo-code above:
* t - index of current instruction
* w - next instruction
* e - edge
*/
static int push_insn(int t, int w, int e, struct bpf_verifier_env *env,
bool loop_ok)
{


int *insn_stack = env->cfg.insn_stack;
int *insn_state = env->cfg.insn_state;

//e是fallthrough代表顺序执行(t到w)
if (e == FALLTHROUGH && insn_state[t] >= (DISCOVERED | FALLTHROUGH))
return 0;
//如果遇见分支跳转指令
if (e == BRANCH && insn_state[t] >= (DISCOVERED | BRANCH))
return 0;


//判断下一条指令的范围
if (w < 0 || w >= env->prog->len) {
verbose_linfo(env, t, "%d: ", t);
verbose(env, "jump out of range from insn %d to %d\n", t, w);
return -EINVAL;
}


//如果当前的边是branch,分支。
if (e == BRANCH)
/* mark branch target for state pruning */
init_explored_state(env, w);


//如果下一条指令还没有入栈,没有设置标志
if (insn_state[w] == 0) {
//对下一条指令进行压栈操作。
/* tree-edge */
insn_state[t] = DISCOVERED | e; //补充标记当前结点状态
insn_state[w] = DISCOVERED; //标记下一条结点状态为discovered

//保证栈大小合法
if (env->cfg.cur_stack >= env->prog->len)
return -E2BIG;

//下一条指令进入insn_stack栈中,栈指针后移。
insn_stack[env->cfg.cur_stack++] = w;
//返回1,压栈成功
return 1;
}
//如果下一条指令已经处于discovered状态
else if ((insn_state[w] & 0xF0) == DISCOVERED) {
//如果设置了允许loop,且允许ptr leak,返回0
if (loop_ok && env->allow_ptr_leaks)
return 0;
//否则,检测到back-edge,提示存在loop
verbose_linfo(env, t, "%d: ", t);
verbose_linfo(env, w, "%d: ", w);
verbose(env, "back-edge from insn %d to %d\n", t, w);
return -EINVAL;
}
//如果下一条指令结点已经是explored状态,说明是cross或者forward,正常标记当前结点
else if (insn_state[w] == EXPLORED) {
/* forward- or cross-edge */
insn_state[t] = DISCOVERED | e;
}
else {
verbose(env, "insn state internal bug\n");
return -EFAULT;
}
return 0;
}

do_check_common

do_check_subprogs中会对每个没有设置 BTF_FUNC_GLOBAL 的subprog调用do_check_common进行检查。


函数主要目的:

verifier通过调用此函数来检测每一个subprog的合法性。

函数中主要涉及的结构:

bpf_verifier_state

#define MAX_CALL_FRAMES 8
struct bpf_verifier_state {
struct bpf_func_state *frame[MAX_CALL_FRAMES]; //保存调用栈
struct bpf_verifier_state *parent;
/*
* 'branches' field is the number of branches left to explore:
* 0 - all possible paths from this state reached bpf_exit or
* were safely pruned
* 1 - at least one path is being explored.
* This state hasn't reached bpf_exit
* 2 - at least two paths are being explored.
* This state is an immediate parent of two children.
* One is fallthrough branch with branches==1 and another
* state is pushed into stack (to be explored later) also with
* branches==1. The parent of this state has branches==1.
* The verifier state tree connected via 'parent' pointer looks like:
* 1
* 1
* 2 -> 1 (first 'if' pushed into stack)
* 1
* 2 -> 1 (second 'if' pushed into stack)
* 1
* 1
* 1 bpf_exit.
*
* Once do_check() reaches bpf_exit, it calls update_branch_counts()
* and the verifier state tree will look:
* 1
* 1
* 2 -> 1 (first 'if' pushed into stack)
* 1
* 1 -> 1 (second 'if' pushed into stack)
* 0
* 0
* 0 bpf_exit.
* After pop_stack() the do_check() will resume at second 'if'.
*
* If is_state_visited() sees a state with branches > 0 it means
* there is a loop. If such state is exactly equal to the current state
* it's an infinite loop. Note states_equal() checks for states
* equvalency, so two states being 'states_equal' does not mean
* infinite loop. The exact comparison is provided by
* states_maybe_looping() function. It's a stronger pre-check and
* much faster than states_equal().
*
* This algorithm may not find all possible infinite loops or
* loop iteration count may be too high.
* In such cases BPF_COMPLEXITY_LIMIT_INSNS limit kicks in.
*/
u32 branches;
u32 insn_idx;
u32 curframe;
u32 active_spin_lock;
bool speculative;


/* first and last insn idx of this verifier state */
u32 first_insn_idx;
u32 last_insn_idx;
/* jmp history recorded from first to last.
* backtracking is using it to go from last to first.
* For most states jmp_history_cnt is [0-3].
* For loops can go up to ~40.
*/
struct bpf_idx_pair *jmp_history;
u32 jmp_history_cnt;
};


branch字段代表的是剩余探索的分支数量。

branch = 0,从这个状态出发的所有可能路径都达到了bpf_exit,或者已经被修剪了。

branch = 1,至少有一条路径正在被探索,这个状态还没有达到bpf_exit。

branch = 2,至少有两条路径正在被探索,这个状态是两个子节点的直接父节点。

其中一个是FALLTHROUGH也就是顺序执行的状态树边。另一个(稍后将被explored的)状态也被压入栈中,也有branch=1。他的父状态结点也有branch=1。

举个例子如果我们通过if的一条路径走到bpf_exit了,会调用update_branch_counts() 回溯更新每个状态树节点branches的值。if的分支节点之间通过 struct bpf_verifier_state *parent; 指针相连。

bpf_reg_state:维护了BPF寄存器的状态。

struct bpf_reg_state {
/* Ordering of fields matters. See states_equal() */
enum bpf_reg_type type;
union {
/* valid when type == PTR_TO_PACKET */
u16 range;


/* valid when type == CONST_PTR_TO_MAP | PTR_TO_MAP_VALUE |
* PTR_TO_MAP_VALUE_OR_NULL
*/
struct bpf_map *map_ptr;


u32 btf_id; /* for PTR_TO_BTF_ID */


/* Max size from any of the above. */
unsigned long raw;
};
/* Fixed part of pointer offset, pointer types only */
s32 off;
/* For PTR_TO_PACKET, used to find other pointers with the same variable
* offset, so they can share range knowledge.
* For PTR_TO_MAP_VALUE_OR_NULL this is used to share which map value we
* came from, when one is tested for != NULL.
* For PTR_TO_SOCKET this is used to share which pointers retain the
* same reference to the socket, to determine proper reference freeing.
*/
u32 id;
/* PTR_TO_SOCKET and PTR_TO_TCP_SOCK could be a ptr returned
* from a pointer-cast helper, bpf_sk_fullsock() and
* bpf_tcp_sock().
*
* Consider the following where "sk" is a reference counted
* pointer returned from "sk = bpf_sk_lookup_tcp();":
*
* 1: sk = bpf_sk_lookup_tcp();
* 2: if (!sk) { return 0; }
* 3: fullsock = bpf_sk_fullsock(sk);
* 4: if (!fullsock) { bpf_sk_release(sk); return 0; }
* 5: tp = bpf_tcp_sock(fullsock);
* 6: if (!tp) { bpf_sk_release(sk); return 0; }
* 7: bpf_sk_release(sk);
* 8: snd_cwnd = tp->snd_cwnd; // verifier will complain
*
* After bpf_sk_release(sk) at line 7, both "fullsock" ptr and
* "tp" ptr should be invalidated also. In order to do that,
* the reg holding "fullsock" and "sk" need to remember
* the original refcounted ptr id (i.e. sk_reg->id) in ref_obj_id
* such that the verifier can reset all regs which have
* ref_obj_id matching the sk_reg->id.
*
* sk_reg->ref_obj_id is set to sk_reg->id at line 1.
* sk_reg->id will stay as NULL-marking purpose only.
* After NULL-marking is done, sk_reg->id can be reset to 0.
*
* After "fullsock = bpf_sk_fullsock(sk);" at line 3,
* fullsock_reg->ref_obj_id is set to sk_reg->ref_obj_id.
*
* After "tp = bpf_tcp_sock(fullsock);" at line 5,
* tp_reg->ref_obj_id is set to fullsock_reg->ref_obj_id
* which is the same as sk_reg->ref_obj_id.
*
* From the verifier perspective, if sk, fullsock and tp
* are not NULL, they are the same ptr with different
* reg->type. In particular, bpf_sk_release(tp) is also
* allowed and has the same effect as bpf_sk_release(sk).
*/
u32 ref_obj_id;
/* For scalar types (SCALAR_VALUE), this represents our knowledge of
* the actual value.
* For pointer types, this represents the variable part of the offset
* from the pointed-to object, and is shared with all bpf_reg_states
* with the same id as us.
*/
struct tnum var_off;
/* Used to determine if any memory access using this register will
* result in a bad access.
* These refer to the same value as var_off, not necessarily the actual
* contents of the register.
*/
s64 smin_value; /* minimum possible (s64)value */
s64 smax_value; /* maximum possible (s64)value */
u64 umin_value; /* minimum possible (u64)value */
u64 umax_value; /* maximum possible (u64)value */
/* parentage chain for liveness checking */
struct bpf_reg_state *parent;
/* Inside the callee two registers can be both PTR_TO_STACK like
* R1=fp-8 and R2=fp-8, but one of them points to this function stack
* while another to the caller's stack. To differentiate them 'frameno'
* is used which is an index in bpf_verifier_state->frame[] array
* pointing to bpf_func_state.
*/
u32 frameno;
/* Tracks subreg definition. The stored value is the insn_idx of the
* writing insn. This is safe because subreg_def is used before any insn
* patching which only happens after main verification finished.
*/
s32 subreg_def;
enum bpf_reg_liveness live;
/* if (!precise && SCALAR_VALUE) min/max/tnum don't affect safety */
bool precise;
};

bpf_reg_type

/* types of values stored in eBPF registers */
/* Pointer types represent:
* pointer
* pointer + imm
* pointer + (u16) var
* pointer + (u16) var + imm
* if (range > 0) then [ptr, ptr + range - off) is safe to access
* if (id > 0) means that some 'var' was added
* if (off > 0) means that 'imm' was added
*/
enum bpf_reg_type {
NOT_INIT = 0, /* nothing was written into register */
SCALAR_VALUE, /* reg doesn't contain a valid pointer */
PTR_TO_CTX, /* reg points to bpf_context */
CONST_PTR_TO_MAP, /* reg points to struct bpf_map */
PTR_TO_MAP_VALUE, /* reg points to map element value */
PTR_TO_MAP_VALUE_OR_NULL,/* points to map elem value or NULL */
PTR_TO_STACK, /* reg == frame_pointer + offset */
PTR_TO_PACKET_META, /* skb->data - meta_len */
PTR_TO_PACKET, /* reg points to skb->data */
PTR_TO_PACKET_END, /* skb->data + headlen */
PTR_TO_FLOW_KEYS, /* reg points to bpf_flow_keys */
PTR_TO_SOCKET, /* reg points to struct bpf_sock */
PTR_TO_SOCKET_OR_NULL, /* reg points to struct bpf_sock or NULL */
PTR_TO_SOCK_COMMON, /* reg points to sock_common */
PTR_TO_SOCK_COMMON_OR_NULL, /* reg points to sock_common or NULL */
PTR_TO_TCP_SOCK, /* reg points to struct tcp_sock */
PTR_TO_TCP_SOCK_OR_NULL, /* reg points to struct tcp_sock or NULL */
PTR_TO_TP_BUFFER, /* reg points to a writable raw tp's buffer */
PTR_TO_XDP_SOCK, /* reg points to struct xdp_sock */
PTR_TO_BTF_ID, /* reg points to kernel struct */
};

struct tnum

当reg是一个具体的数值(范围值),本结构代表真正的值。

当reg是一个指针,这代表了到被指向对象的偏移量。

struct tnum {
u64 value;
u64 mask;
};


#define TNUM(_v, _m) (struct tnum){.value = _v, .mask = _m}


/* A completely unknown value */
const struct tnum tnum_unknown = { .value = 0, .mask = -1 };

bpf_stack_slot_type

enum bpf_stack_slot_type {
STACK_INVALID, /* nothing was stored in this stack slot */
STACK_SPILL, /* register spilled into stack */
STACK_MISC, /* BPF program wrote some data into this slot */
STACK_ZERO, /* BPF program wrote constant zero */
};


函数的主要流程:

1. 初始化当前state信息:

  state = kzalloc(sizeof(struct bpf_verifier_state), GFP_KERNEL);
state->curframe = 0; //当前在调用栈的最上层
state->speculative = false;
state->branches = 1; //当前节点branches = 1
state->frame[0] = kzalloc(sizeof(struct bpf_func_state), GFP_KERNEL); //记录调用栈


env->cur_state = state; //记录当前初始的state节点

2. 初始化函数状态与寄存器。

init_func_state(env, state->frame[0],BPF_MAIN_FUNC /* callsite */, 0 /* frameno */, subprog);


在 init_func_state 中,调用了 init_reg_state 来初始化寄存器。

static void init_reg_state(struct bpf_verifier_env *env,
struct bpf_func_state *state)
{
struct bpf_reg_state *regs = state->regs;
int i;
//挨个扫描当前state中的寄存器
for (i = 0; i < MAX_BPF_REG; i++) {
//将一个寄存器标识为unknown
mark_reg_not_init(env, regs, i);
regs[i].live = REG_LIVE_NONE; //生存期?
regs[i].parent = NULL;
regs[i].subreg_def = DEF_NOT_SUBREG;
}


/* frame pointer */
regs[BPF_REG_FP].type = PTR_TO_STACK; //设置栈帧寄存器r10,指向BPF栈数据
mark_reg_known_zero(env, regs, BPF_REG_FP); //将reg的信息都设置为0
regs[BPF_REG_FP].frameno = state->frameno; //当前r10的栈帧号等于初始化的state栈帧号。
}

3. 如果设置了 env->prog->type == BPF_PROG_TYPE_EXT 调用btf_prepare_func_args 对func的参数类型进行转换与检查。转换成PTR_TO_CTX or SCALAR_VALUE

转换后,如果参数是PTR_TO_CTX(指向bpf_context),标志regs为0 。否则如果是SCALAR_VALUE,标记reg为unknown。

4. 否则,直接标记r1的类型为PTR_TO_CTX,且标0 。然后检测参数类型。

5. 调用 do_check(env);

do_check

函数的主要目的:

刚刚那张图的分支检测等。

check寄存器,stack,bpf context,map等的可读写性。

更新栈内存,寄存器的状态。

1. 当遇见BPF_ALU、BPF_ALU64的指令。对多种可能的ALU操作,比如neg、add、xor、rsh等进行可能的合法性校验(源、目的操作数、寄存器等)。过度细节的校验就不贴上来了。

检验完之后调用 adjust_reg_min_max_vals(env, insn)  计算新的min/max范围与var_off

2. 如果是class==BPF_LDX。即加载指令。

a. 先调用 check_reg_arg(env, insn->src_reg, SRC_OP) 检测src_reg是否可读。

b.再调用 check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK) 检测dst_reg是否可写。

c. 调用 check_mem_access(env, env->insn_idx, insn->src_reg,insn->off, BPF_SIZE(insn->code),BPF_READ, insn->dst_reg, false); 检测真正要读取的位置:src_reg + off 是否可读。

3. 除了BPF_LDX以外。本函数还对其他的类型的指令进行了对应的检查。在每个子检查中都是使用的:

check_reg_arg check_mem_access 等进行组合检查。

check_mem_access

这个函数是check中用来检查内存访问非常重要的一个函数。

/* check whether memory at (regno + off) is accessible for t = (read | write)
* if t==write, value_regno is a register which value is stored into memory
* if t==read, value_regno is a register which will receive the value from memory
* if t==write && value_regno==-1, some unknown value is stored into memory
* if t==read && value_regno==-1, don't care what we read from memory
*/
static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regno,
int off, int bpf_size, enum bpf_access_type t,
int value_regno, bool strict_alignment_once)
{
struct bpf_reg_state *regs = cur_regs(env);
struct bpf_reg_state *reg = regs + regno;
struct bpf_func_state *state;
int size, err = 0;


size = bpf_size_to_bytes(bpf_size);
if (size < 0)
return size;


/* alignment checks will add in reg->off themselves */
//检查指针对齐
err = check_ptr_alignment(env, reg, off, size, strict_alignment_once);
if (err)
return err;


/* for access checks, reg->off is just part of off */


off += reg->off;


//如果reg指向的是map中的值
if (reg->type == PTR_TO_MAP_VALUE) {

......
}
//如果reg指向bpf_context
else if (reg->type == PTR_TO_CTX) {
......
}
//如果reg指向stack
else if (reg->type == PTR_TO_STACK) {
off += reg->var_off.value; //
err = check_stack_access(env, reg, off, size);
if (err)
return err;


state = func(env, reg);
err = update_stack_depth(env, state, off);
if (err)
return err;


if (t == BPF_WRITE)
err = check_stack_write(env, state, off, size,
value_regno, insn_idx);
else
err = check_stack_read(env, state, off, size,
value_regno);
}
......


我们主要看一个典型的,当reg是一个指向栈的指针时的情况。

首先我们通过 check_stack_access 检测栈的可访问状况。

static int check_stack_access(struct bpf_verifier_env *env,
const struct bpf_reg_state *reg,
int off, int size)
{
/* Stack accesses must be at a fixed offset, so that we
* can determine what type of data were returned. See
* check_stack_read().
*/
//tnum_is_const检测我们要访问的offset必须是一个常量
if (!tnum_is_const(reg->var_off)) {
char tn_buf[48];


tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
verbose(env, "variable stack access var_off=%s off=%d size=%d\n",
tn_buf, off, size);
return -EACCES;
}
//我们访问的位置必须在合法范围内,最大offset不超过-512字节
if (off >= 0 || off < -MAX_BPF_STACK) {
verbose(env, "invalid stack off=%d size=%d\n", off, size);
return -EACCES;
}


return 0;
}

接下来通过func函数查找当前的调用栈。

然后通过 update_stack_depth 对每个函数的栈消耗进行维护。如果当前访问的地址超出当前函数的栈范围,那么对当前函数进行栈扩充。

接下来判断读or写操作。


写操作,调用 check_stack_write 检测栈的可写性。

未设置 allow_ptr_leaks不允许部分写STACK_SPILL类型的栈数据。

reg是STACK_SPILL,部分写不允许。

reg是非STACK_SPILL,根据reg和写入字节的情况设置栈相应的slot类型,支持部分写。

state->stack[spi].slot_type[]中低地址保存实际栈高地址的数据类型

/* check_stack_read/write functions track spill/fill of registers,
* stack boundary and alignment are checked in check_mem_access()
*/
static int check_stack_write(struct bpf_verifier_env *env,
struct bpf_func_state *state, /* func where register points to */
int off, int size, int value_regno, int insn_idx)
{
struct bpf_func_state *cur; /* state of the current function */
int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err;
u32 dst_reg = env->prog->insnsi[insn_idx].dst_reg;
struct bpf_reg_state *reg = NULL;


//由于栈扩展导致的需要重新申请空间放state跟踪stack状态
err = realloc_func_state(state, round_up(slot + 1, BPF_REG_SIZE),
state->acquired_refs, true);
if (err)
return err;
/* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0,
* so it's aligned access and [off, off + size) are within stack limits
*/
//不允许非特权用户由于部分写破坏栈上合法指针
if (!env->allow_ptr_leaks &&
state->stack[spi].slot_type[0] == STACK_SPILL &&
size != BPF_REG_SIZE) {
verbose(env, "attempt to corrupt spilled pointer on stack\n");
return -EACCES;
}


cur = env->cur_state->frame[env->cur_state->curframe];
if (value_regno >= 0)
reg = &cur->regs[value_regno];


//优化回溯
if (reg && size == BPF_REG_SIZE && register_is_const(reg) &&
!register_is_null(reg) && env->allow_ptr_leaks) {
if (dst_reg != BPF_REG_FP) {
/* The backtracking logic can only recognize explicit
* stack slot address like [fp - 8]. Other spill of
* scalar via different register has to be conervative.
* Backtrack from here and mark all registers as precise
* that contributed into 'reg' being a constant.
*/
err = mark_chain_precision(env, value_regno);
if (err)
return err;
}
save_register_state(state, spi, reg);
}
else if (reg && is_spillable_regtype(reg->type)) {
/* register containing pointer is being spilled into stack */
//源寄存器是STACK_SPILL,部分写不允许
if (size != BPF_REG_SIZE) {
verbose_linfo(env, insn_idx, "; ");
verbose(env, "invalid size of register spill\n");
return -EACCES;
}


if (state != cur && reg->type == PTR_TO_STACK) {
verbose(env, "cannot spill pointers to stack into stack frame of the caller\n");
return -EINVAL;
}


//引入sanitize,防止stack的重用,防范侧信道攻击?
if (!env->allow_ptr_leaks)
{
bool sanitize = false;


if (state->stack[spi].slot_type[0] == STACK_SPILL &&
register_is_const(&state->stack[spi].spilled_ptr))
sanitize = true;
for (i = 0; i < BPF_REG_SIZE; i++)
if (state->stack[spi].slot_type[i] == STACK_MISC) {
sanitize = true;
break;
}
if (sanitize) {
int *poff = &env->insn_aux_data[insn_idx].sanitize_stack_off;
int soff = (-spi - 1) * BPF_REG_SIZE;


/* detected reuse of integer stack slot with a pointer
* which means either llvm is reusing stack slot or
* an attacker is trying to exploit CVE-2018-3639
* (speculative store bypass)
* Have to sanitize that slot with preemptive
* store of zero.
*/
if (*poff && *poff != soff) {
/* disallow programs where single insn stores
* into two different stack slots, since verifier
* cannot sanitize them
*/
verbose(env,
"insn %d cannot access two stack slots fp%d and fp%d",
insn_idx, *poff, soff);
return -EINVAL;
}
*poff = soff;
}
}
save_register_state(state, spi, reg);
} else {
//reg是非有效指针
u8 type = STACK_MISC;


/* regular write of data into stack destroys any spilled ptr */
state->stack[spi].spilled_ptr.type = NOT_INIT;

//标记栈为STACK_SPILL, 代表包含非指针、非0变量等
/* Mark slots as STACK_MISC if they belonged to spilled ptr. */
if (state->stack[spi].slot_type[0] == STACK_SPILL)
for (i = 0; i < BPF_REG_SIZE; i++)
state->stack[spi].slot_type[i] = STACK_MISC;


/* only mark the slot as written if all 8 bytes were written
* otherwise read propagation may incorrectly stop too soon
* when stack slots are partially written.
* This heuristic means that read propagation will be
* conservative, since it will add reg_live_read marks
* to stack slots all the way to first state when programs
* writes+reads less than 8 bytes
*/
if (size == BPF_REG_SIZE)
state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;


/* when we zero initialize stack slots mark them as such */
//reg为0
if (reg && register_is_null(reg)) {
/* backtracking doesn't work for STACK_ZERO yet. */
err = mark_chain_precision(env, value_regno);
if (err)
return err;
type = STACK_ZERO; //设置为STACK_ZERO
}


/* Mark slots affected by this stack write. */
//reg为0,标记所写栈的slot类型为设置为STACK_ZERO,否则仍然是一开始的MISC
for (i = 0; i < size; i++)
state->stack[spi].slot_type[(slot - i) % BPF_REG_SIZE] =
type;
}
return 0;
}

读操作,检测可读性,这里就不展开了,比可写性简单一些。

check_max_stack_depth

我们重新回到上层的 bpf_check 中。

函数目的:

保证函数调用深度不超过 MAX_BPF_STACK

挑选出code为BPF_JMP , BPF_CALL ;src_reg为:BPF_PSEUDO_CALL,即eBPF2eBPF的函数调用。记录对应的深度与返回地址,调用完返回上层subprog。在这之间如果调用深度超过8或者最大栈消耗超过512字节,abort。

/* starting from main bpf function walk all instructions of the function
* and recursively walk all callees that given function can call.
* Ignore jump and exit insns.
* Since recursion is prevented by check_cfg() this algorithm
* only needs a local stack of MAX_CALL_FRAMES to remember callsites
*/
static int check_max_stack_depth(struct bpf_verifier_env *env)
{
int depth = 0, frame = 0, idx = 0, i = 0, subprog_end;
struct bpf_subprog_info *subprog = env->subprog_info;
struct bpf_insn *insn = env->prog->insnsi;
int ret_insn[MAX_CALL_FRAMES]; //函数返回地址
int ret_prog[MAX_CALL_FRAMES]; //函数返回地址的index


process_func:
/* round up to 32-bytes, since this is granularity
* of interpreter stack size
*/
depth += round_up(max_t(u32, subprog[idx].stack_depth, 1), 32);
//栈不能不超过512
if (depth > MAX_BPF_STACK) {
verbose(env, "combined stack size of %d calls is %d. Too large\n",
frame + 1, depth);
return -EACCES;
}
//subprog[0]为main函数,取下一个函数
continue_func:
subprog_end = subprog[idx + 1].start;
for (; i < subprog_end; i++) {
if (insn[i].code != (BPF_JMP | BPF_CALL))
continue;
if (insn[i].src_reg != BPF_PSEUDO_CALL)
continue;
/* remember insn and function to return to */
//bpf fun之间的调用
ret_insn[frame] = i + 1;//记录返回地址
ret_prog[frame] = idx; //记录idx


/* find the callee */
i = i + insn[i].imm + 1;//找到callee的地址
idx = find_subprog(env, i);//查找callee的idx
if (idx < 0) {
WARN_ONCE(1, "verifier bug. No program starts at insn %d\n",
i);
return -EFAULT;
}
frame++;//深度++
//调用深度限制小于8
if (frame >= MAX_CALL_FRAMES) {
verbose(env, "the call stack of %d frames is too deep !\n",
frame);
return -E2BIG;
}
goto process_func;
}
/* end of for() loop means the last insn of the 'subprog'
* was reached. Doesn't matter whether it was JA or EXIT
*/
//深度为0说明之行结束,返回
if (frame == 0)
return 0;
depth -= round_up(max_t(u32, subprog[idx].stack_depth, 1), 32);
frame--;
i = ret_insn[frame]; //弹出返回地址
idx = ret_prog[frame]; //弹出上层函数idx
goto continue_func; //返回
}


fixup_bpf_calls

本函数也很长,不过相对好理解,我们挑一部分关键的来说。

函数目的:

修复bpf_call指令的insn->imm字段,并将符合条件的helper func内联为明确的BPF指令序列。

首先,本函数对指令做了patch,修复一些不合法的指令。处理尾调用等。

{
if (insn->code == (BPF_ALU64 | BPF_MOD | BPF_X) ||
insn->code == (BPF_ALU64 | BPF_DIV | BPF_X) ||
insn->code == (BPF_ALU | BPF_MOD | BPF_X) ||
insn->code == (BPF_ALU | BPF_DIV | BPF_X)) {
bool is64 = BPF_CLASS(insn->code) == BPF_ALU64;
//将div指令扩展为4个指令。
//如果源操作数为0,清零dst
//跳转到下一条指令执行
//类似一个security hook?
struct bpf_insn mask_and_div[] = {
BPF_MOV32_REG(insn->src_reg, insn->src_reg),
/* Rx div 0 -> 0 */
BPF_JMP_IMM(BPF_JNE, insn->src_reg, 0, 2),
BPF_ALU32_REG(BPF_XOR, insn->dst_reg, insn->dst_reg),
BPF_JMP_IMM(BPF_JA, 0, 0, 1),
*insn,
};

//将mod指令扩展为两个指令
//如果源操作数为0,目的操作数不变,跳到原指令的下一条执行
struct bpf_insn mask_and_mod[] = {
BPF_MOV32_REG(insn->src_reg, insn->src_reg),
/* Rx mod 0 -> Rx */
BPF_JMP_IMM(BPF_JEQ, insn->src_reg, 0, 1),
*insn,
};

//用patch指令替换源指令。
struct bpf_insn *patchlet;


if (insn->code == (BPF_ALU64 | BPF_DIV | BPF_X) ||
insn->code == (BPF_ALU | BPF_DIV | BPF_X)) {
patchlet = mask_and_div + (is64 ? 1 : 0);
cnt = ARRAY_SIZE(mask_and_div) - (is64 ? 1 : 0);
} else {
patchlet = mask_and_mod + (is64 ? 1 : 0);
cnt = ARRAY_SIZE(mask_and_mod) - (is64 ? 1 : 0);
}


new_prog = bpf_patch_insn_data(env, i + delta, patchlet, cnt);
if (!new_prog)
return -ENOMEM;


delta += cnt - 1;
env->prog = prog = new_prog;
insn = new_prog->insnsi + i + delta;
continue;
}
......
//修正跳转距离,imm为调用函数到当前指令的距离。
//通过BPF_CAST_CALL获取该函数的地址直接减去内核的__bpf_call_base,做的实际只是部分修正。
switch (insn->imm) {
case BPF_FUNC_map_lookup_elem:
insn->imm = BPF_CAST_CALL(ops->map_lookup_elem) -
__bpf_call_base;
continue;
case BPF_FUNC_map_update_elem:
insn->imm = BPF_CAST_CALL(ops->map_update_elem) -
__bpf_call_base;
continue;
case BPF_FUNC_map_delete_elem:
insn->imm = BPF_CAST_CALL(ops->map_delete_elem) -
__bpf_call_base;
continue;
case BPF_FUNC_map_push_elem:
insn->imm = BPF_CAST_CALL(ops->map_push_elem) -
__bpf_call_base;
continue;
case BPF_FUNC_map_pop_elem:
insn->imm = BPF_CAST_CALL(ops->map_pop_elem) -
__bpf_call_base;
continue;
case BPF_FUNC_map_peek_elem:
insn->imm = BPF_CAST_CALL(ops->map_peek_elem) -
__bpf_call_base;
continue;
}


goto patch_call_imm;
}


04

jit部分

fixup_call_args 调用了 jit_subprogs

jit_subprogs

函数主要目的:

扫描prog,找到bpf2bpf函数调用,找到对应的subprog,存储。

为每个要jit的subprog申请空间。

bpf_int_jit_compile 每个subprog

修正bpf2bpf调用的函数距离(bpf_int_jit_compile)

这里实际上对于函数地址的修正经过了多轮jit,这里看了别的大佬的一种解释。

由于第一次不完全修正函数距离时

函数流程:

static int jit_subprogs(struct bpf_verifier_env *env)
{
struct bpf_prog *prog = env->prog, **func, *tmp;
int i, j, subprog_start, subprog_end = 0, len, subprog;
struct bpf_insn *insn;
void *old_bpf_func;
int err;


//判断subprog数量
if (env->subprog_cnt <= 1)
return 0;


//扫描指令,找到bpf2bpf的调用
for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
if (insn->code != (BPF_JMP | BPF_CALL) ||
insn->src_reg != BPF_PSEUDO_CALL)
continue;
/* Upon error here we cannot fall back to interpreter but
* need a hard reject of the program. Thus -EFAULT is
* propagated in any case.
*/
//根据调用地址找到被调用subprog的索引,保存索引到insn->off中。
subprog = find_subprog(env, i + insn->imm + 1);
if (subprog < 0) {
WARN_ONCE(1, "verifier bug. No program starts at insn %d\n",
i + insn->imm + 1);
return -EFAULT;
}
/* temporarily remember subprog id inside insn instead of
* aux_data, since next loop will split up all insns into funcs
*/
insn->off = subprog;
/* remember original imm in case JIT fails and fallback
* to interpreter will be needed
*/
env->insn_aux_data[i].call_imm = insn->imm;
/* point imm to __bpf_call_base+1 from JITs point of view */
insn->imm = 1;
}

//分配空间
err = bpf_prog_alloc_jited_linfo(prog);
if (err)
goto out_undo_insn;


err = -ENOMEM;
func = kcalloc(env->subprog_cnt, sizeof(prog), GFP_KERNEL);
if (!func)
goto out_undo_insn;


//为了每个subprog申请空间,然后根据prog来做初始化
for (i = 0; i < env->subprog_cnt; i++) {
subprog_start = subprog_end;
subprog_end = env->subprog_info[i + 1].start;


len = subprog_end - subprog_start;
/* BPF_PROG_RUN doesn't call subprogs directly,
* hence main prog stats include the runtime of subprogs.
* subprogs don't have IDs and not reachable via prog_get_next_id
* func[i]->aux->stats will never be accessed and stays NULL
*/
func[i] = bpf_prog_alloc_no_stats(bpf_prog_size(len), GFP_USER);
if (!func[i])
goto out_free;
memcpy(func[i]->insnsi, &prog->insnsi[subprog_start],
len * sizeof(struct bpf_insn));
func[i]->type = prog->type;
func[i]->len = len;
if (bpf_prog_calc_tag(func[i]))
goto out_free;
func[i]->is_func = 1;
func[i]->aux->func_idx = i;
/* the btf and func_info will be freed only at prog->aux */
func[i]->aux->btf = prog->aux->btf;
func[i]->aux->func_info = prog->aux->func_info;


/* Use bpf_prog_F_tag to indicate functions in stack traces.
* Long term would need debug info to populate names
*/
func[i]->aux->name[0] = 'F';
func[i]->aux->stack_depth = env->subprog_info[i].stack_depth;
func[i]->jit_requested = 1; //开启jit
func[i]->aux->linfo = prog->aux->linfo;
func[i]->aux->nr_linfo = prog->aux->nr_linfo;
func[i]->aux->jited_linfo = prog->aux->jited_linfo;
func[i]->aux->linfo_idx = env->subprog_info[i].linfo_idx;
func[i] = bpf_int_jit_compile(func[i]); //对该subprog做jit
if (!func[i]->jited) { //如果没有jit成功
err = -ENOTSUPP;
goto out_free;
}
cond_resched();
}
/* at this point all bpf functions were successfully JITed
* now populate all bpf_calls with correct addresses and
* run last pass of JIT
*/
//修正bpf2bpf调用的函数距离
for (i = 0; i < env->subprog_cnt; i++) {
insn = func[i]->insnsi;
for (j = 0; j < func[i]->len; j++, insn++) {
if (insn->code != (BPF_JMP | BPF_CALL) ||
insn->src_reg != BPF_PSEUDO_CALL)
continue;
subprog = insn->off;
//这里也只是部分修复,最后真正的修复在bpf_int_jit_compile中
insn->imm = BPF_CAST_CALL(func[subprog]->bpf_func) -
__bpf_call_base;
}


/* we use the aux data to keep a list of the start addresses
* of the JITed images for each function in the program
*
* for some architectures, such as powerpc64, the imm field
* might not be large enough to hold the offset of the start
* address of the callee's JITed image from __bpf_call_base
*
* in such cases, we can lookup the start address of a callee
* by using its subprog id, available from the off field of
* the call instruction, as an index for this list
*/
func[i]->aux->func = func;
func[i]->aux->func_cnt = env->subprog_cnt;
}
for (i = 0; i < env->subprog_cnt; i++) {
old_bpf_func = func[i]->bpf_func;
tmp = bpf_int_jit_compile(func[i]); //真正的修复bpf2bpf函数的调用距离
if (tmp != func[i] || func[i]->bpf_func != old_bpf_func) {
verbose(env, "JIT doesn't support bpf-to-bpf calls\n");
err = -ENOTSUPP;
goto out_free;
}
cond_resched();
}


/* finally lock prog and jit images for all functions and
* populate kallsysm
*/
for (i = 0; i < env->subprog_cnt; i++) {
bpf_prog_lock_ro(func[i]); //jit之后子函数内存只读
bpf_prog_kallsyms_add(func[i]); //添加到kallsyms
}


/* Last step: make now unused interpreter insns from main
* prog consistent for later dump requests, so they can
* later look the same as if they were interpreted only.
*/
for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
if (insn->code != (BPF_JMP | BPF_CALL) ||
insn->src_reg != BPF_PSEUDO_CALL)
continue;
insn->off = env->insn_aux_data[i].call_imm;
subprog = find_subprog(env, i + insn->off + 1);
insn->imm = subprog;
}


prog->jited = 1; //设置jit完成
prog->bpf_func = func[0]->bpf_func; //prog在内核里的可执行地址
prog->aux->func = func;
prog->aux->func_cnt = env->subprog_cnt;
bpf_prog_free_unused_jited_linfo(prog);
return 0;
out_free:
for (i = 0; i < env->subprog_cnt; i++)
if (func[i])
bpf_jit_free(func[i]);
kfree(func);
out_undo_insn:
/* cleanup main prog to be interpreted */
prog->jit_requested = 0;
for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
if (insn->code != (BPF_JMP | BPF_CALL) ||
insn->src_reg != BPF_PSEUDO_CALL)
continue;
insn->off = 0;
insn->imm = env->insn_aux_data[i].call_imm;
}
bpf_prog_free_jited_linfo(prog);
return err;
}


do_jit/bpf_int_jit_compile

这是最主要的jit函数。

内核中实际上有两条可以到达这里的有效jit路径:

第一条是jit_subprogs(这里涉及到bpf2bpf函数调用等,所以是多函数的bpf jit路径)。

另一条是通过bpf_prog_select_runtime,即当bpf_check结束后,调用bpf_prog_select_runtime。

do_jit

调用:static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,int oldproglen, struct jit_context *ctx)

1. 首先通过 emit_prologue 构建栈初始化环境。为了符合x64的abi,主要涉及rsp,rbp的操作等。

/*
* Emit x86-64 prologue code for BPF program and check its size.
* bpf_tail_call helper will skip it while jumping into another program
*/
static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf)
{
u8 *prog = *pprog;
int cnt = X86_PATCH_SIZE;


/* BPF trampoline can be made to work without these nops,
* but let's waste 5 bytes for now and optimize later
*/
memcpy(prog, ideal_nops[NOP_ATOMIC5], cnt);
prog += cnt;
EMIT1(0x55); /* push rbp */
EMIT3(0x48, 0x89, 0xE5); /* mov rbp, rsp */
/* sub rsp, rounded_stack_depth */
EMIT3_off32(0x48, 0x81, 0xEC, round_up(stack_depth, 8));
EMIT1(0x53); /* push rbx */
EMIT2(0x41, 0x55); /* push r13 */
EMIT2(0x41, 0x56); /* push r14 */
EMIT2(0x41, 0x57); /* push r15 */
if (!ebpf_from_cbpf) {
/* zero init tail_call_cnt */
EMIT2(0x6a, 0x00);
BUILD_BUG_ON(cnt != PROLOGUE_SIZE);
}
*pprog = prog;
}

2. 针对类似的普通指令,可以一对一的进行翻译,opcode、eBPF、寄存器相匹配:

/* ALU */
case BPF_ALU | BPF_ADD | BPF_X:
case BPF_ALU | BPF_SUB | BPF_X:
case BPF_ALU | BPF_AND | BPF_X:
case BPF_ALU | BPF_OR | BPF_X:
case BPF_ALU | BPF_XOR | BPF_X:
case BPF_ALU64 | BPF_ADD | BPF_X:
case BPF_ALU64 | BPF_SUB | BPF_X:
case BPF_ALU64 | BPF_AND | BPF_X:
case BPF_ALU64 | BPF_OR | BPF_X:
case BPF_ALU64 | BPF_XOR | BPF_X:
switch (BPF_OP(insn->code)) {
case BPF_ADD: b2 = 0x01; break;
case BPF_SUB: b2 = 0x29; break;
case BPF_AND: b2 = 0x21; break;
case BPF_OR: b2 = 0x09; break;
case BPF_XOR: b2 = 0x31; break;
}
if (BPF_CLASS(insn->code) == BPF_ALU64)
EMIT1(add_2mod(0x48, dst_reg, src_reg));
else if (is_ereg(dst_reg) || is_ereg(src_reg))
EMIT1(add_2mod(0x40, dst_reg, src_reg));
EMIT2(b2, add_2reg(0xC0, dst_reg, src_reg));
break;

3.修正BPF_CALL,当前的imm32 = func_addr-__bpf_call_base

/* call */
case BPF_JMP | BPF_CALL:
func = (u8 *) __bpf_call_base + imm32;
if (!imm32 || emit_call(&prog, func, image + addrs[i - 1])) /*!*/
return -EINVAL;
break;

最后调用:

static int emit_patch(u8 **pprog, void *func, void *ip, u8 opcode)
{
u8 *prog = *pprog;
int cnt = 0;
s64 offset;


offset = func - (ip + X86_PATCH_SIZE); /*!*/
if (!is_simm32(offset)) {
pr_err("Target call %p is out of range\n", func);
return -ERANGE;
}
EMIT1_off32(opcode, offset);
*pprog = prog;
return 0;
}

计算出真正的bpf2bpf的调用地址。

4. 如果是exit指令,那么按照x64的方式回复栈空间。leave销毁局部栈

  ......
case BPF_JMP | BPF_EXIT:
if (seen_exit) {
jmp_offset = ctx->cleanup_addr - addrs[i];
goto emit_jmp;
}
seen_exit = true;
/* Update cleanup_addr */
ctx->cleanup_addr = proglen;
if (!bpf_prog_was_classic(bpf_prog))
EMIT1(0x5B); /* get rid of tail_call_cnt */
EMIT2(0x41, 0x5F); /* pop r15 */
EMIT2(0x41, 0x5E); /* pop r14 */
EMIT2(0x41, 0x5D); /* pop r13 */
EMIT1(0x5B); /* pop rbx */
EMIT1(0xC9); /* leave */
EMIT1(0xC3); /* ret */
break;

5. 如果涉及到尾调用的情况:

    case BPF_JMP | BPF_TAIL_CALL:
if (imm32)
emit_bpf_tail_call_direct(&bpf_prog->aux->poke_tab[imm32 - 1],
&prog, addrs[i], image);
else
emit_bpf_tail_call_indirect(&prog);
break;

当imm修正好,深度允许时(小于MAX_TAIL_CALL_CNT),可以直接跳转到callee上(ip+adj_off)

static void emit_bpf_tail_call_direct(struct bpf_jit_poke_descriptor *poke,
u8 **pprog, int addr, u8 *image)
{
u8 *prog = *pprog;
int cnt = 0;


/*
* if (tail_call_cnt > MAX_TAIL_CALL_CNT)
* goto out;
*/
EMIT2_off32(0x8B, 0x85, -36 - MAX_BPF_STACK); /* mov eax, dword ptr [rbp - 548] */
EMIT3(0x83, 0xF8, MAX_TAIL_CALL_CNT); /* cmp eax, MAX_TAIL_CALL_CNT */
EMIT2(X86_JA, 14); /* ja out */
EMIT3(0x83, 0xC0, 0x01); /* add eax, 1 */
EMIT2_off32(0x89, 0x85, -36 - MAX_BPF_STACK); /* mov dword ptr [rbp -548], eax */


poke->ip = image + (addr - X86_PATCH_SIZE);
poke->adj_off = PROLOGUE_SIZE;


memcpy(prog, ideal_nops[NOP_ATOMIC5], X86_PATCH_SIZE);
prog += X86_PATCH_SIZE;
/* out: */


*pprog = prog;
}

否则,可以通过emit_bpf_tail_call_indirect ,通过indirect的一些检查后,可以实现goto *(prog->bpf_func + prologue_size); ,进而jmp到另外的bpf prog上。

......
/* goto *(prog->bpf_func + prologue_size); */
EMIT4(0x48, 0x8B, 0x40, /* mov rax, qword ptr [rax + 32] */
offsetof(struct bpf_prog, bpf_func));
EMIT4(0x48, 0x83, 0xC0, PROLOGUE_SIZE); /* add rax, prologue_size */


/*
* Wow we're ready to jump into next BPF program
* rdi == ctx (1st arg)
* rax == prog->bpf_func + prologue_size
*/
RETPOLINE_RAX_BPF_JIT();

6. 最终在 prog->bpf_func 存放编译后的可执行代码。

unsigned int(*bpf_func)(const void *ctx, const struct bpf_insn *insn);


而在jit阶段很多指令都是基于EMIT来做转换的。

#define EMIT(bytes, len) \
do { prog = emit_code(prog, bytes, len); cnt += len; } while (0)


#define EMIT1(b1) EMIT(b1, 1)
#define EMIT2(b1, b2) EMIT((b1) + ((b2) << 8), 2)
#define EMIT3(b1, b2, b3) EMIT((b1) + ((b2) << 8) + ((b3) << 16), 3)
#define EMIT4(b1, b2, b3, b4) EMIT((b1) + ((b2) << 8) + ((b3) << 16) + ((b4) << 24), 4)

举个例子。

/* mov r11, src_reg */
EMIT_mov(AUX_REG, src_reg);


/* mov dst, src */
#define EMIT_mov(DST, SRC)

在EMIT_mov中首先校验了DST不等于SRC。之后调用了:

EMIT3(add_2mod(0x48, DST, SRC), 0x89, add_2reg(0xC0, DST, SRC));

在add_2mod中,根据DST,SRC的的值对于byte(指令字节码)做了编码修正。返回修正后的指令编码。

在add_2reg中基于如下的寄存器映射表继续编码修正传递的byte指令。

/* Encode 'dst_reg' and 'src_reg' registers into x86-64 opcode 'byte' */
static u8 add_2reg(u8 byte, u32 dst_reg, u32 src_reg)
{
return byte + reg2hex[dst_reg] + (reg2hex[src_reg] << 3);
}
/*
* The following table maps BPF registers to x86-64 registers.
*
* x86-64 register R12 is unused, since if used as base address
* register in load/store instructions, it always needs an
* extra byte of encoding and is callee saved.
*
* x86-64 register R9 is not used by BPF programs, but can be used by BPF
* trampoline. x86-64 register R10 is used for blinding (if enabled).
*/
static const int reg2hex[] = {
[BPF_REG_0] = 0, /* RAX */
[BPF_REG_1] = 7, /* RDI */
[BPF_REG_2] = 6, /* RSI */
[BPF_REG_3] = 2, /* RDX */
[BPF_REG_4] = 1, /* RCX */
[BPF_REG_5] = 0, /* R8 */
[BPF_REG_6] = 3, /* RBX callee saved */
[BPF_REG_7] = 5, /* R13 callee saved */
[BPF_REG_8] = 6, /* R14 callee saved */
[BPF_REG_9] = 7, /* R15 callee saved */
[BPF_REG_FP] = 5, /* RBP readonly */
[BPF_REG_AX] = 2, /* R10 temp register */
[AUX_REG] = 3, /* R11 temp register */
[X86_REG_R9] = 1, /* R9 register, 6th function argument */
};

EMIT3 的三条指令编码全部修正完毕后,EMIT((b1) + ((b2) << 8) + ((b3) << 16), 3) 通过位移操作将三条指令整合起来作为参数。

最终调用 emit_code(prog, bytes, len); 直接将对应编码写入对应prog的内存中。



static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len)
{
if (len == 1)
*ptr = bytes;
else if (len == 2)
*(u16 *)ptr = bytes;
else {
*(u32 *)ptr = bytes;
barrier();
}
return ptr + len;
}

bpf_int_jit_compile

1. 首先申请addrs数组用来存储BPF指令翻译后的x64指令地址

addrs = kmalloc_array(prog->len + 1, sizeof(*addrs), GFP_KERNEL);
if (!addrs) {
prog = orig_prog;
goto out_addrs;
}


/*
* Before first pass, make a rough estimation of addrs[]
* each BPF instruction is translated to less than 64 bytes
*/
for (proglen = 0, i = 0; i <= prog->len; i++) {
proglen += 64;
addrs[i] = proglen;
}

2. 对一个函数进行多轮pass,每轮pass都做jit,这是因为我们一开始jit的时候,很多后面的指令都没有生成,从而导致jmp跳转无法捕捉到最准确的距离,只能是按照上一步,先预留64bytes(偏大)。第一轮pass过后,addr中会储存每一条指令的偏移地址,但是由于jmp指令不准确,所以此时的地址不是完全正确的,指令长度(jmp)可能也有问题,而通过多轮pass来jit,指令长度不断收敛,直到 (proglen == oldproglen) 才得到准确的位置信息

/*
* JITed image shrinks with every pass and the loop iterates
* until the image stops shrinking. Very large BPF programs
* may converge on the last pass. In such case do one more
* pass to emit the final image.
*/
for (pass = 0; pass < 20 || image; pass++) {
proglen = do_jit(prog, addrs, image, oldproglen, &ctx);
......
if (proglen == oldproglen) {
/*
* The number of entries in extable is the number of BPF_LDX
* insns that access kernel memory via "pointer to BTF type".
* The verifier changed their opcode from LDX|MEM|size
* to LDX|PROBE_MEM|size to make JITing easier.
*/
u32 align = __alignof__(struct exception_table_entry);
u32 extable_size = prog->aux->num_exentries *
sizeof(struct exception_table_entry);


/* allocate module memory for x86 insns and extable */
header = bpf_jit_binary_alloc(roundup(proglen, align) + extable_size,
&image, align, jit_fill_hole);
if (!header) {
prog = orig_prog;
goto out_addrs;
}
prog->aux->extable = (void *) image + roundup(proglen, align);
}
oldproglen = proglen;
cond_resched();
}

3.当正确收敛之后,我们会分配空间保存再过最后一轮pass收敛后的jit结果。至此,jit完成。

ref

https://blog.csdn.net/m0_37921080/article/details/82530191

https://blog.csdn.net/liukuan73/article/details/102705963

https://www.cnblogs.com/LittleHann/p/4134939.html

https://www.cnblogs.com/bsauce/p/11583304.html

https://blog.csdn.net/s2603898260/article/details/79371024

https://blog.csdn.net/hjkfcz/article/details/104916719


本文为郭子仪同学原创文章,欢迎更多同学投稿。


文章转载自开源之夏,如果涉嫌侵权,请发送邮件至:contact@modb.pro进行举报,并提供相关证据,一经查实,墨天轮将立刻删除相关内容。

评论