jail: add support for cgroup devices as in OCI run-time spec
[project/procd.git] / jail / cgroups-bpf.c
1 /*
2 * Copyright (C) 2021 Daniel Golle <daniel@makrotopia.org>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU Lesser General Public License version 2.1
6 * as published by the Free Software Foundation
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 * somehow emulate devices.allow/devices.deny using eBPF
14 *
15 * OCI run-time spec defines the syntax for allowing/denying access
16 * to devices according to the definition of cgroup-v1 in the Kernel
17 * as described in Documentation/admin-guide/cgroup-v1.
18 */
19
#include <assert.h>
#include <linux/bpf.h>
#include <sys/reg.h>
#include <sys/syscall.h>
#include <unistd.h>

#include <libubox/blobmsg.h>
#include <libubox/blobmsg_json.h>
#include <libubox/list.h>

#include "cgroups.h"
#include "cgroups-bpf.h"
#include "log.h"
32
/*
 * State shared between parseOCIlinuxcgroups_devices(), which generates the
 * program, and attach_cgroups_ebpf(), which loads and attaches it.
 */
/* generated instruction array; NULL until a program has been generated */
static struct bpf_insn *program = NULL;
/* number of instructions stored in 'program' */
static int bpf_total_insn = 0;
/* license string handed to the kernel together with the program */
static const char *license = "GPL";
36
/*
 * Thin wrapper around the raw bpf system call.
 *
 * cmd:  BPF command number, passed through unchanged
 * attr: command-specific attribute union
 * size: number of bytes of *attr handed to the kernel
 *
 * Returns the result of syscall() narrowed to int.
 */
static int
syscall_bpf (int cmd, union bpf_attr *attr, unsigned int size)
{
	long rc = syscall (__NR_bpf, cmd, attr, size);

	return (int) rc;
}
42
/*
 * Helpers which build one struct bpf_insn as a compound literal.
 * from crun/src/libcrun/ebpf.c
 */

/* ALU-class operation OP applied to DST with immediate operand IMM */
#define BPF_ALU32_IMM(OP, DST, IMM) \
	((struct bpf_insn){ \
		.code = BPF_ALU | BPF_OP (OP) | BPF_K, \
		.dst_reg = (DST), \
		.src_reg = 0, \
		.off = 0, \
		.imm = (IMM) })

/* load a SIZE-wide value from memory at SRC + OFF into DST */
#define BPF_LDX_MEM(SIZE, DST, SRC, OFF) \
	((struct bpf_insn){ \
		.code = BPF_LDX | BPF_SIZE (SIZE) | BPF_MEM, \
		.dst_reg = (DST), \
		.src_reg = (SRC), \
		.off = (OFF), \
		.imm = 0 })

/* ALU64-class register-to-register move: DST = SRC */
#define BPF_MOV64_REG(DST, SRC) \
	((struct bpf_insn){ \
		.code = BPF_ALU64 | BPF_MOV | BPF_X, \
		.dst_reg = (DST), \
		.src_reg = (SRC), \
		.off = 0, \
		.imm = 0 })

/* unconditional jump by OFF instructions */
#define BPF_JMP_A(OFF) \
	((struct bpf_insn){ \
		.code = BPF_JMP | BPF_JA, \
		.dst_reg = 0, \
		.src_reg = 0, \
		.off = (OFF), \
		.imm = 0 })

/* conditional jump by OFF: compares DST against immediate IMM using OP */
#define BPF_JMP_IMM(OP, DST, IMM, OFF) \
	((struct bpf_insn){ \
		.code = BPF_JMP | BPF_OP (OP) | BPF_K, \
		.dst_reg = (DST), \
		.src_reg = 0, \
		.off = (OFF), \
		.imm = (IMM) })

/* conditional jump by OFF: compares DST against register SRC using OP */
#define BPF_JMP_REG(OP, DST, SRC, OFF) \
	((struct bpf_insn){ \
		.code = BPF_JMP | BPF_OP (OP) | BPF_X, \
		.dst_reg = (DST), \
		.src_reg = (SRC), \
		.off = (OFF), \
		.imm = 0 })

/* ALU64-class move of immediate IMM into DST */
#define BPF_MOV64_IMM(DST, IMM) \
	((struct bpf_insn){ \
		.code = BPF_ALU64 | BPF_MOV | BPF_K, \
		.dst_reg = (DST), \
		.src_reg = 0, \
		.off = 0, \
		.imm = (IMM) })

/* ALU-class register-to-register move: DST = SRC */
#define BPF_MOV32_REG(DST, SRC) \
	((struct bpf_insn){ \
		.code = BPF_ALU | BPF_MOV | BPF_X, \
		.dst_reg = (DST), \
		.src_reg = (SRC), \
		.off = 0, \
		.imm = 0 })

/* leave the program */
#define BPF_EXIT_INSN() \
	((struct bpf_insn){ \
		.code = BPF_JMP | BPF_EXIT, \
		.dst_reg = 0, \
		.src_reg = 0, \
		.off = 0, \
		.imm = 0 })
71
/*
 * Prologue copied to the start of every generated program.
 * taken from systemd.
 *
 * R1 is used as the base pointer for all loads. The prologue unpacks the
 * words found there into the registers later tested by the rule checks:
 *   R2 = low 16 bits of the 32-bit word at offset 0 (device type)
 *   R3 = that same word shifted right by 16 bits (requested access)
 *   R4 = 32-bit word at offset 4 (major)
 *   R5 = 32-bit word at offset 8 (minor)
 * NOTE(review): the offsets presumably mirror struct bpf_cgroup_dev_ctx
 * from <linux/bpf.h> -- confirm against the kernel header.
 */
static const struct bpf_insn pre_insn[] = {
	/* type -> R2. */
	BPF_LDX_MEM (BPF_W, BPF_REG_2, BPF_REG_1, 0),
	BPF_ALU32_IMM (BPF_AND, BPF_REG_2, 0xFFFF),
	/* access -> R3. */
	BPF_LDX_MEM (BPF_W, BPF_REG_3, BPF_REG_1, 0),
	BPF_ALU32_IMM (BPF_RSH, BPF_REG_3, 16),
	/* major -> R4. */
	BPF_LDX_MEM (BPF_W, BPF_REG_4, BPF_REG_1, 4),
	/* minor -> R5. */
	BPF_LDX_MEM (BPF_W, BPF_REG_5, BPF_REG_1, 8),
};
85
/* indices into the attribute table filled by blobmsg_parse() for one device rule */
enum {
	OCI_LINUX_CGROUPS_DEVICES_ALLOW,
	OCI_LINUX_CGROUPS_DEVICES_TYPE,
	OCI_LINUX_CGROUPS_DEVICES_MAJOR,
	OCI_LINUX_CGROUPS_DEVICES_MINOR,
	OCI_LINUX_CGROUPS_DEVICES_ACCESS,
	/* number of fields; used to size the attribute table */
	__OCI_LINUX_CGROUPS_DEVICES_MAX,
};

/*
 * blobmsg policy for a single device rule: maps each index above to the
 * key name looked up in the rule and the value type expected for it.
 */
static const struct blobmsg_policy oci_linux_cgroups_devices_policy[] = {
	[OCI_LINUX_CGROUPS_DEVICES_ALLOW] = { "allow", BLOBMSG_TYPE_BOOL },
	[OCI_LINUX_CGROUPS_DEVICES_TYPE] = { "type", BLOBMSG_TYPE_STRING },
	[OCI_LINUX_CGROUPS_DEVICES_MAJOR] = { "major", BLOBMSG_CAST_INT64 },
	[OCI_LINUX_CGROUPS_DEVICES_MINOR] = { "minor", BLOBMSG_CAST_INT64 },
	[OCI_LINUX_CGROUPS_DEVICES_ACCESS] = { "access", BLOBMSG_TYPE_STRING },
};
102
/*
 * cgroup-v1 devices got a (default) behaviour and a list of exceptions.
 * define datatypes similar to the legacy kernel code.
 */
/* device type mask covering both block and character devices */
#define DEVCG_DEV_ALL (BPF_DEVCG_DEV_BLOCK | BPF_DEVCG_DEV_CHAR)
/* access mask covering read, write and mknod */
#define DEVCG_ACC_ALL (BPF_DEVCG_ACC_READ | BPF_DEVCG_ACC_WRITE | BPF_DEVCG_ACC_MKNOD)
109
/*
 * Verdict used for device accesses which match none of the exceptions.
 */
enum devcg_behavior {
	/* not referenced anywhere in the code visible in this file */
	DEVCG_DEFAULT_NONE,
	/* unmatched accesses are allowed (initial setting of the parser) */
	DEVCG_DEFAULT_ALLOW,
	/* unmatched accesses are denied */
	DEVCG_DEFAULT_DENY,
};
115
/*
 * One device rule (exception to the default behaviour).
 */
struct dev_exception_item {
	/* device numbers to match; ~0 acts as wildcard (no check is generated) */
	uint32_t major, minor;
	/* BPF_DEVCG_DEV_* value compared against the device type */
	short type;
	/* mask of BPF_DEVCG_ACC_* bits covered by this rule */
	short access;
	/* linkage into the list of exceptions */
	struct list_head list;
	/* verdict returned by the program when the rule matches */
	bool allow;
};
123
124 /*
125 * add a bunch of default rules
126 */
127 static int add_default_exceptions(struct list_head *exceptions)
128 {
129 int i, ret = 0;
130 struct dev_exception_item *cur;
131 /* from crun/src/libcrun/cgroup.c */
132 const struct dev_exception_item defrules[] = {
133 /* always allow mknod */
134 { .allow = true, .type = BPF_DEVCG_DEV_CHAR, .major = ~0, .minor = ~0, .access = BPF_DEVCG_ACC_MKNOD },
135 { .allow = true, .type = BPF_DEVCG_DEV_BLOCK, .major = ~0, .minor = ~0, .access = BPF_DEVCG_ACC_MKNOD },
136 /* /dev/null */
137 { .allow = true, .type = BPF_DEVCG_DEV_CHAR, .major = 1, .minor = 3, .access = DEVCG_ACC_ALL },
138 /* /dev/random */
139 { .allow = true, .type = BPF_DEVCG_DEV_CHAR, .major = 1, .minor = 8, .access = DEVCG_ACC_ALL },
140 /* /dev/full */
141 { .allow = true, .type = BPF_DEVCG_DEV_CHAR, .major = 1, .minor = 7, .access = DEVCG_ACC_ALL },
142 /* /dev/tty */
143 { .allow = true, .type = BPF_DEVCG_DEV_CHAR, .major = 5, .minor = 0, .access = DEVCG_ACC_ALL },
144 /* /dev/zero */
145 { .allow = true, .type = BPF_DEVCG_DEV_CHAR, .major = 1, .minor = 5, .access = DEVCG_ACC_ALL },
146 /* /dev/urandom */
147 { .allow = true, .type = BPF_DEVCG_DEV_CHAR, .major = 1, .minor = 9, .access = DEVCG_ACC_ALL },
148 /* /dev/console */
149 { .allow = true, .type = BPF_DEVCG_DEV_CHAR, .major = 5, .minor = 1, .access = DEVCG_ACC_ALL },
150 /* /dev/pts/[0-255] */
151 { .allow = true, .type = BPF_DEVCG_DEV_CHAR, .major = 136, .minor = ~0, .access = DEVCG_ACC_ALL },
152 /* /dev/ptmx */
153 { .allow = true, .type = BPF_DEVCG_DEV_CHAR, .major = 5, .minor = 2, .access = DEVCG_ACC_ALL },
154 /* /dev/net/tun */
155 { .allow = true, .type = BPF_DEVCG_DEV_CHAR, .major = 10, .minor = 200, .access = DEVCG_ACC_ALL },
156 };
157
158 for (i = 0; i < (sizeof(defrules) / sizeof(struct dev_exception_item)); ++i) {
159 cur = malloc(sizeof(struct dev_exception_item));
160 if (!cur) {
161 ret = ENOMEM;
162 break;
163 }
164 /* add defaults to list in reverse order (last item will be first in list) */
165 memcpy(cur, &defrules[i], sizeof(struct dev_exception_item));
166 list_add(&cur->list, exceptions);
167 }
168
169 return ret;
170 }
171
172 /*
173 * free all exceptions in the list
174 */
175 static void flush_exceptions(struct list_head *freelist)
176 {
177 struct dev_exception_item *dl, *dln;
178
179 if (!list_empty(freelist))
180 list_for_each_entry_safe(dl, dln, freelist, list) {
181 list_del(&dl->list);
182 free(dl);
183 }
184 }
185
186 /*
187 * parse OCI cgroups devices and translate into cgroups-v2 eBPF program
188 */
189 int parseOCIlinuxcgroups_devices(struct blob_attr *msg)
190 {
191 struct blob_attr *tb[__OCI_LINUX_CGROUPS_DEVICES_MAX];
192 struct blob_attr *cur;
193 int rem, ret = 0;
194 int bpf_type, bpf_access;
195 unsigned char acidx;
196 bool allow = false,
197 has_access = false,
198 has_type = false,
199 has_major = false,
200 has_minor = false;
201 int total_ins = 0,
202 cur_ins = 0,
203 pre_insn_len = sizeof(pre_insn) / sizeof(struct bpf_insn),
204 next_ins;
205 char *access, *devtype;
206 uint32_t devmajor, devminor;
207 struct dev_exception_item *dl;
208 struct list_head exceptions;
209 enum devcg_behavior behavior = DEVCG_DEFAULT_ALLOW;
210 INIT_LIST_HEAD(&exceptions);
211
212 /* parse according to OCI spec */
213 blobmsg_for_each_attr(cur, msg, rem) {
214 blobmsg_parse(oci_linux_cgroups_devices_policy, __OCI_LINUX_CGROUPS_DEVICES_MAX,
215 tb, blobmsg_data(cur), blobmsg_len(cur));
216
217 if (!tb[OCI_LINUX_CGROUPS_DEVICES_ALLOW]) {
218 ret = EINVAL;
219 goto out;
220 }
221
222 allow = blobmsg_get_bool(tb[OCI_LINUX_CGROUPS_DEVICES_ALLOW]);
223
224 bpf_access = 0;
225 if (tb[OCI_LINUX_CGROUPS_DEVICES_ACCESS]) {
226 access = blobmsg_get_string(tb[OCI_LINUX_CGROUPS_DEVICES_ACCESS]);
227 if ((strlen(access) > 3) || (strlen(access) == 0)) {
228 ret = EINVAL;
229 goto out;
230 }
231
232 for (acidx = 0; acidx < strlen(access); ++acidx) {
233 switch (access[acidx]) {
234 case 'r':
235 bpf_access |= BPF_DEVCG_ACC_READ;
236 break;
237 case 'w':
238 bpf_access |= BPF_DEVCG_ACC_WRITE;
239 break;
240 case 'm':
241 bpf_access |= BPF_DEVCG_ACC_MKNOD;
242 break;
243 default:
244 ret = EINVAL;
245 goto out;
246 }
247 }
248 }
249
250 if (!bpf_access)
251 bpf_access = DEVCG_ACC_ALL;
252
253 bpf_type = 0;
254 if (tb[OCI_LINUX_CGROUPS_DEVICES_TYPE]) {
255 devtype = blobmsg_get_string(tb[OCI_LINUX_CGROUPS_DEVICES_TYPE]);
256
257 switch (devtype[0]) {
258 case 'c':
259 bpf_type = BPF_DEVCG_DEV_CHAR;
260 break;
261 case 'b':
262 bpf_type = BPF_DEVCG_DEV_BLOCK;
263 break;
264 case 'a':
265 bpf_type = DEVCG_DEV_ALL;
266 break;
267 default:
268 ret = EINVAL;
269 goto out;
270 }
271 }
272
273 if (!bpf_type)
274 bpf_type = DEVCG_DEV_ALL;
275
276 if (tb[OCI_LINUX_CGROUPS_DEVICES_MAJOR])
277 devmajor = blobmsg_cast_u64(tb[OCI_LINUX_CGROUPS_DEVICES_MAJOR]);
278 else
279 devmajor = ~0;
280
281 if (tb[OCI_LINUX_CGROUPS_DEVICES_MINOR])
282 devminor = blobmsg_cast_u64(tb[OCI_LINUX_CGROUPS_DEVICES_MINOR]);
283 else
284 devminor = ~0;
285
286 if (bpf_type == DEVCG_DEV_ALL) {
287 /* wildcard => change default policy and flush all existing rules */
288 flush_exceptions(&exceptions);
289 behavior = allow?DEVCG_DEFAULT_ALLOW:DEVCG_DEFAULT_DENY;
290 } else {
291 /* allocate and populate record for exception */
292 dl = malloc(sizeof(struct dev_exception_item));
293 if (!dl) {
294 ret = ENOSPC;
295 break;
296 }
297 dl->allow = allow;
298 dl->type = bpf_type;
299 dl->access = bpf_access;
300 dl->major = devmajor;
301 dl->minor = devminor;
302
303 /* push to exceptions list, last goes first */
304 list_add(&dl->list, &exceptions);
305 }
306 }
307 if (ret)
308 goto out;
309
310 /* add default rules */
311 ret = add_default_exceptions(&exceptions);
312 if (ret)
313 goto out;
314
315 /* calculate number of instructions to allocate */
316 list_for_each_entry(dl, &exceptions, list) {
317 has_access = dl->access != DEVCG_ACC_ALL;
318 has_type = dl->type != DEVCG_DEV_ALL;
319 has_major = dl->major != ~0;
320 has_minor = dl->minor != ~0;
321
322 total_ins += (has_type ? 1 : 0) + (has_access ? 3 : 0) + (has_major ? 1 : 0) + (has_minor ? 1 : 0) + 2;
323 }
324
325 /* acccount for loader instructions */
326 total_ins += pre_insn_len;
327
328 /* final accept/deny block */
329 total_ins += 2;
330
331 /* allocate memory for eBPF program */
332 program = calloc(total_ins, sizeof(struct bpf_insn));
333 if (!program) {
334 ret = ENOMEM;
335 goto out;
336 }
337
338 /* copy program loader instructions */
339 memcpy(program, &pre_insn, sizeof(pre_insn));
340 cur_ins = pre_insn_len;
341
342 /* generate eBPF program */
343 list_for_each_entry(dl, &exceptions, list) {
344 has_access = dl->access != DEVCG_ACC_ALL;
345 has_type = dl->type != DEVCG_DEV_ALL;
346 has_major = dl->major != ~0;
347 has_minor = dl->minor != ~0;
348
349 next_ins = (has_type ? 1 : 0) + (has_access ? 3 : 0) + (has_major ? 1 : 0) + (has_minor ? 1 : 0) + 1;
350
351 if (has_type) {
352 program[cur_ins++] = BPF_JMP_IMM(BPF_JNE, BPF_REG_2, dl->type, next_ins);
353 --next_ins;
354 }
355
356 if (has_access) {
357 program[cur_ins++] = BPF_MOV32_REG(BPF_REG_1, BPF_REG_3);
358 program[cur_ins++] = BPF_ALU32_IMM(BPF_AND, BPF_REG_1, dl->access);
359 program[cur_ins++] = BPF_JMP_REG(BPF_JNE, BPF_REG_1, BPF_REG_3, next_ins - 2);
360 next_ins -= 3;
361 }
362
363 if (has_major) {
364 program[cur_ins++] = BPF_JMP_IMM(BPF_JNE, BPF_REG_4, dl->major, next_ins);
365 --next_ins;
366 }
367
368 if (has_minor) {
369 program[cur_ins++] = BPF_JMP_IMM(BPF_JNE, BPF_REG_5, dl->minor, next_ins);
370 --next_ins;
371 }
372
373 program[cur_ins++] = BPF_MOV64_IMM(BPF_REG_0, dl->allow ? 1 : 0);
374 program[cur_ins++] = BPF_EXIT_INSN();
375 }
376
377 /* default behavior */
378 program[cur_ins++] = BPF_MOV64_IMM(BPF_REG_0, (behavior == DEVCG_DEFAULT_ALLOW)?1:0);
379 program[cur_ins++] = BPF_EXIT_INSN();
380
381 if (debug) {
382 fprintf(stderr, "cgroup devices:\na > devices.%s\n",
383 (behavior == DEVCG_DEFAULT_ALLOW)?"allow":"deny");
384
385 list_for_each_entry(dl, &exceptions, list)
386 fprintf(stderr, "%c %d:%d %s%s%s > devices.%s\n",
387 (dl->type == DEVCG_DEV_ALL)?'a':
388 (dl->type == BPF_DEVCG_DEV_CHAR)?'c':'b',
389 (dl->major == ~0)?-1:dl->major,
390 (dl->minor == ~0)?-1:dl->minor,
391 (dl->access & BPF_DEVCG_ACC_READ)?"r":"",
392 (dl->access & BPF_DEVCG_ACC_WRITE)?"w":"",
393 (dl->access & BPF_DEVCG_ACC_MKNOD)?"m":"",
394 (dl->allow)?"allow":"deny");
395
396 fprintf(stderr, "generated cgroup-devices eBPF program:\n");
397 fprintf(stderr, " [idx]\tcode\t dest\t src\t off\t imm\n");
398 for (cur_ins=0; cur_ins<total_ins; cur_ins++)
399 fprintf(stderr, " [%03d]\t%02hhx\t%3hhu\t%3hhu\t%04hx\t%d\n", cur_ins,
400 program[cur_ins].code,
401 program[cur_ins].dst_reg,
402 program[cur_ins].src_reg,
403 program[cur_ins].off,
404 program[cur_ins].imm);
405 }
406
407 assert(cur_ins == total_ins);
408 bpf_total_insn = total_ins;
409 ret = 0;
410
411 out:
412 flush_exceptions(&exceptions);
413 return ret;
414 }
415
416 /*
417 * attach eBPF program to cgroup
418 */
419 int attach_cgroups_ebpf(int cgroup_dirfd) {
420 int prog_fd;
421 #if ( __WORDSIZE == 64 )
422 uint64_t program_ptr = (uint64_t)program;
423 uint64_t license_ptr = (uint64_t)license;
424 #elif ( __WORDSIZE == 32 )
425 uint32_t program_ptr = (uint32_t)program;
426 uint32_t license_ptr = (uint32_t)license;
427 #else
428 #error
429 #endif
430 union bpf_attr load_attr = {
431 .prog_type = BPF_PROG_TYPE_CGROUP_DEVICE,
432 .license = license_ptr,
433 .insns = program_ptr,
434 .insn_cnt = bpf_total_insn,
435 };
436
437 if (!program)
438 return 0;
439
440 prog_fd = syscall_bpf(BPF_PROG_LOAD, &load_attr, sizeof(load_attr));
441 if (prog_fd < 0)
442 return EIO;
443
444 union bpf_attr attach_attr = {
445 .attach_type = BPF_CGROUP_DEVICE,
446 .target_fd = cgroup_dirfd,
447 .attach_bpf_fd = prog_fd,
448 };
449
450 return syscall_bpf(BPF_PROG_ATTACH, &attach_attr, sizeof (attach_attr));
451 }