2 * Copyright (C) 2021 Daniel Golle <daniel@makrotopia.org>
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU Lesser General Public License version 2.1
6 * as published by the Free Software Foundation
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
13 * somehow emulate devices.allow/devices.deny using eBPF
15 * OCI run-time spec defines the syntax for allowing/denying access
16 * to devices according to the definition of cgroup-v1 in the Kernel
17 * as described in Documentation/admin-guide/cgroup-v1.
21 #include <linux/bpf.h>
23 #include <sys/cdefs.h>
27 #include <sys/syscall.h>
29 #include <libubox/blobmsg.h>
30 #include <libubox/blobmsg_json.h>
31 #include <libubox/list.h>
34 #include "cgroups-bpf.h"
/* eBPF device filter built by parseOCIlinuxcgroups_devices(); NULL until a program was generated */
static struct bpf_insn *program = NULL;

/* number of instructions stored in 'program' */
static int bpf_total_insn = 0;

/* license string passed to the kernel together with the program */
static const char *license = "GPL";
/*
 * Thin wrapper around the raw bpf system call.
 *
 * cmd:  BPF_* command number
 * attr: command-specific attribute union
 * size: size of *attr in bytes
 *
 * Returns whatever syscall() returned, narrowed to int.
 */
static int
syscall_bpf(int cmd, union bpf_attr *attr, unsigned int size)
{
	long rc = syscall(__NR_bpf, cmd, attr, size);

	return (int) rc;
}
/*
 * Helpers that build one 'struct bpf_insn' as a compound literal
 * (from crun/src/libcrun/ebpf.c).
 *
 * Every helper expands to a complete '((struct bpf_insn){ ... })' expression
 * and parenthesizes its arguments so that expressions such as 'n - 2' can be
 * passed safely.
 */

/* 32-bit ALU operation OP on DST with immediate operand IMM */
#define BPF_ALU32_IMM(OP, DST, IMM) \
	((struct bpf_insn){ .code = BPF_ALU | BPF_OP(OP) | BPF_K, .dst_reg = (DST), .src_reg = 0, .off = 0, .imm = (IMM) })

/* load SIZE bytes from the address in SRC plus OFF into DST */
#define BPF_LDX_MEM(SIZE, DST, SRC, OFF) \
	((struct bpf_insn){ \
		.code = BPF_LDX | BPF_SIZE(SIZE) | BPF_MEM, .dst_reg = (DST), .src_reg = (SRC), .off = (OFF), .imm = 0 })

/* 64-bit register move: DST = SRC */
#define BPF_MOV64_REG(DST, SRC) \
	((struct bpf_insn){ .code = BPF_ALU64 | BPF_MOV | BPF_X, .dst_reg = (DST), .src_reg = (SRC), .off = 0, .imm = 0 })

/* unconditional jump by OFF instructions */
#define BPF_JMP_A(OFF) \
	((struct bpf_insn){ .code = BPF_JMP | BPF_JA, .dst_reg = 0, .src_reg = 0, .off = (OFF), .imm = 0 })

/* conditional jump by OFF instructions, comparing DST against immediate IMM */
#define BPF_JMP_IMM(OP, DST, IMM, OFF) \
	((struct bpf_insn){ .code = BPF_JMP | BPF_OP(OP) | BPF_K, .dst_reg = (DST), .src_reg = 0, .off = (OFF), .imm = (IMM) })

/* conditional jump by OFF instructions, comparing DST against register SRC */
#define BPF_JMP_REG(OP, DST, SRC, OFF) \
	((struct bpf_insn){ .code = BPF_JMP | BPF_OP(OP) | BPF_X, .dst_reg = (DST), .src_reg = (SRC), .off = (OFF), .imm = 0 })

/* 64-bit move of immediate IMM into DST */
#define BPF_MOV64_IMM(DST, IMM) \
	((struct bpf_insn){ .code = BPF_ALU64 | BPF_MOV | BPF_K, .dst_reg = (DST), .src_reg = 0, .off = 0, .imm = (IMM) })

/* 32-bit register move: DST = SRC */
#define BPF_MOV32_REG(DST, SRC) \
	((struct bpf_insn){ .code = BPF_ALU | BPF_MOV | BPF_X, .dst_reg = (DST), .src_reg = (SRC), .off = 0, .imm = 0 })

/* program exit */
#define BPF_EXIT_INSN() \
	((struct bpf_insn){ .code = BPF_JMP | BPF_EXIT, .dst_reg = 0, .src_reg = 0, .off = 0, .imm = 0 })
76 /* taken from systemd. */
77 static const struct bpf_insn pre_insn
[] = {
79 BPF_LDX_MEM (BPF_W
, BPF_REG_2
, BPF_REG_1
, 0),
80 BPF_ALU32_IMM (BPF_AND
, BPF_REG_2
, 0xFFFF),
82 BPF_LDX_MEM (BPF_W
, BPF_REG_3
, BPF_REG_1
, 0),
83 BPF_ALU32_IMM (BPF_RSH
, BPF_REG_3
, 16),
85 BPF_LDX_MEM (BPF_W
, BPF_REG_4
, BPF_REG_1
, 4),
87 BPF_LDX_MEM (BPF_W
, BPF_REG_5
, BPF_REG_1
, 8),
/* indices into the attribute table parsed from one OCI cgroups devices entry */
enum {
	OCI_LINUX_CGROUPS_DEVICES_ALLOW = 0,
	OCI_LINUX_CGROUPS_DEVICES_TYPE,
	OCI_LINUX_CGROUPS_DEVICES_MAJOR,
	OCI_LINUX_CGROUPS_DEVICES_MINOR,
	OCI_LINUX_CGROUPS_DEVICES_ACCESS,
	/* number of entries above; used to size tables */
	__OCI_LINUX_CGROUPS_DEVICES_MAX,
};
99 static const struct blobmsg_policy oci_linux_cgroups_devices_policy
[] = {
100 [OCI_LINUX_CGROUPS_DEVICES_ALLOW
] = { "allow", BLOBMSG_TYPE_BOOL
},
101 [OCI_LINUX_CGROUPS_DEVICES_TYPE
] = { "type", BLOBMSG_TYPE_STRING
},
102 [OCI_LINUX_CGROUPS_DEVICES_MAJOR
] = { "major", BLOBMSG_CAST_INT64
},
103 [OCI_LINUX_CGROUPS_DEVICES_MINOR
] = { "minor", BLOBMSG_CAST_INT64
},
104 [OCI_LINUX_CGROUPS_DEVICES_ACCESS
] = { "access", BLOBMSG_TYPE_STRING
},
108 * cgroup-v1 devices have a (default) behaviour and a list of exceptions.
109 * Define data types similar to the legacy kernel code.
/* wildcard device type: character and block devices alike */
#define DEVCG_DEV_ALL (BPF_DEVCG_DEV_CHAR | BPF_DEVCG_DEV_BLOCK)
/* wildcard access mask: mknod, read and write */
#define DEVCG_ACC_ALL (BPF_DEVCG_ACC_MKNOD | BPF_DEVCG_ACC_READ | BPF_DEVCG_ACC_WRITE)
/*
 * Default verdict used when no exception matches a device access.
 * NOTE(review): the enumerator list is not visible in this excerpt; the code
 * in this file refers to DEVCG_DEFAULT_ALLOW and DEVCG_DEFAULT_DENY.
 */
114 enum devcg_behavior
{
/*
 * One allow/deny exception for a device, kept on a list_head based list.
 * A major or minor of ~0 is treated as "any" by the program generator.
 * NOTE(review): only major, minor and list are declared in the visible
 * lines; the rest of the file also uses ->type, ->access and ->allow, whose
 * declarations are not visible in this excerpt.
 */
120 struct dev_exception_item
{
121 uint32_t major
, minor
;
/* list linkage used by list_add() and the list_for_each_entry*() iterators */
124 struct list_head list
;
129 * add a bunch of default rules
/*
 * Add the built-in default device rules to 'exceptions'.
 *
 * Every entry of the local defrules table is copied into a newly malloc()ed
 * dev_exception_item and inserted with list_add().
 *
 * NOTE(review): the declaration of 'i', the handling of a failed malloc()
 * and the return statement are not visible in this excerpt; the caller
 * stores the int result in 'ret'.
 */
131 static int add_default_exceptions(struct list_head
*exceptions
)
134 struct dev_exception_item
*cur
;
135 /* from crun/src/libcrun/cgroup.c */
136 const struct dev_exception_item defrules
[] = {
137 /* always allow mknod */
/* major/minor of ~0 act as wildcards: any character device */
138 { .allow
= true, .type
= BPF_DEVCG_DEV_CHAR
, .major
= ~0, .minor
= ~0, .access
= BPF_DEVCG_ACC_MKNOD
},
/* same for any block device */
139 { .allow
= true, .type
= BPF_DEVCG_DEV_BLOCK
, .major
= ~0, .minor
= ~0, .access
= BPF_DEVCG_ACC_MKNOD
},
/* char 1:3 is /dev/null on Linux */
141 { .allow
= true, .type
= BPF_DEVCG_DEV_CHAR
, .major
= 1, .minor
= 3, .access
= DEVCG_ACC_ALL
},
/* char 1:8 is /dev/random */
143 { .allow
= true, .type
= BPF_DEVCG_DEV_CHAR
, .major
= 1, .minor
= 8, .access
= DEVCG_ACC_ALL
},
/* char 1:7 is /dev/full */
145 { .allow
= true, .type
= BPF_DEVCG_DEV_CHAR
, .major
= 1, .minor
= 7, .access
= DEVCG_ACC_ALL
},
/* char 5:0 is /dev/tty */
147 { .allow
= true, .type
= BPF_DEVCG_DEV_CHAR
, .major
= 5, .minor
= 0, .access
= DEVCG_ACC_ALL
},
/* char 1:5 is /dev/zero */
149 { .allow
= true, .type
= BPF_DEVCG_DEV_CHAR
, .major
= 1, .minor
= 5, .access
= DEVCG_ACC_ALL
},
/* char 1:9 is /dev/urandom */
151 { .allow
= true, .type
= BPF_DEVCG_DEV_CHAR
, .major
= 1, .minor
= 9, .access
= DEVCG_ACC_ALL
},
/* char 5:1 is /dev/console */
153 { .allow
= true, .type
= BPF_DEVCG_DEV_CHAR
, .major
= 5, .minor
= 1, .access
= DEVCG_ACC_ALL
},
154 /* /dev/pts/[0-255] */
155 { .allow
= true, .type
= BPF_DEVCG_DEV_CHAR
, .major
= 136, .minor
= ~0, .access
= DEVCG_ACC_ALL
},
/* char 5:2 is /dev/ptmx */
157 { .allow
= true, .type
= BPF_DEVCG_DEV_CHAR
, .major
= 5, .minor
= 2, .access
= DEVCG_ACC_ALL
},
/* char 10:200 is /dev/net/tun */
159 { .allow
= true, .type
= BPF_DEVCG_DEV_CHAR
, .major
= 10, .minor
= 200, .access
= DEVCG_ACC_ALL
},
/* one heap copy per table entry */
162 for (i
= 0; i
< (sizeof(defrules
) / sizeof(struct dev_exception_item
)); ++i
) {
163 cur
= malloc(sizeof(struct dev_exception_item
));
168 /* add defaults to list in reverse order (last item will be first in list) */
169 memcpy(cur
, &defrules
[i
], sizeof(struct dev_exception_item
));
170 list_add(&cur
->list
, exceptions
);
177 * free all exceptions in the list
/*
 * Walk 'freelist' with the removal-safe iterator, skipping the walk when the
 * list is empty.
 * NOTE(review): the loop body is not visible in this excerpt; going by the
 * function name and the original comment it releases every exception on
 * the list.
 */
179 static void flush_exceptions(struct list_head
*freelist
)
/* dl is the current entry, dln the look-ahead entry required by the _safe iterator */
181 struct dev_exception_item
*dl
, *dln
;
183 if (!list_empty(freelist
))
184 list_for_each_entry_safe(dl
, dln
, freelist
, list
) {
191 * parse OCI cgroups devices and translate into cgroups-v2 eBPF program
/*
 * Translate the OCI cgroups devices list in 'msg' into an eBPF program.
 *
 * Visible steps:
 *  1. parse every attribute of 'msg' against oci_linux_cgroups_devices_policy
 *     and turn it into a dev_exception_item, or, for a wildcard device type,
 *     into a new default behavior with all previously collected rules flushed
 *  2. add the built-in default rules
 *  3. count the instructions needed, calloc() the global 'program', copy the
 *     pre_insn prologue and emit one block of checks plus verdict per rule,
 *     followed by the default verdict
 *  4. print the rules and the generated program to stderr
 *  5. store the instruction count in bpf_total_insn and flush the rule list
 *
 * NOTE(review): several local declarations, the error paths, the case labels
 * of both switch statements and the return statement are not visible in
 * this excerpt.
 * NOTE(review): no free() of a previously generated 'program' is visible
 * before it is overwritten; confirm this function runs at most once.
 */
193 int parseOCIlinuxcgroups_devices(struct blob_attr
*msg
)
195 struct blob_attr
*tb
[__OCI_LINUX_CGROUPS_DEVICES_MAX
];
196 struct blob_attr
*cur
;
198 int bpf_type
, bpf_access
;
/* number of prologue instructions copied in front of the rule checks */
207 pre_insn_len
= sizeof(pre_insn
) / sizeof(struct bpf_insn
),
209 char *access
, *devtype
;
210 uint32_t devmajor
, devminor
;
211 struct dev_exception_item
*dl
;
212 struct list_head exceptions
;
/* with no wildcard entry in the input the default verdict stays "allow" */
213 enum devcg_behavior behavior
= DEVCG_DEFAULT_ALLOW
;
214 INIT_LIST_HEAD(&exceptions
);
216 /* parse according to OCI spec */
217 blobmsg_for_each_attr(cur
, msg
, rem
) {
218 blobmsg_parse(oci_linux_cgroups_devices_policy
, __OCI_LINUX_CGROUPS_DEVICES_MAX
,
219 tb
, blobmsg_data(cur
), blobmsg_len(cur
));
/* entries lacking "allow" take this branch; NOTE(review): its body is not visible here */
221 if (!tb
[OCI_LINUX_CGROUPS_DEVICES_ALLOW
]) {
226 allow
= blobmsg_get_bool(tb
[OCI_LINUX_CGROUPS_DEVICES_ALLOW
]);
/*
 * optional "access" string: each character ORs one BPF_DEVCG_ACC_* bit into
 * bpf_access.
 * NOTE(review): the case labels and any reset of bpf_access are not visible
 * in this excerpt; confirm bpf_access starts from 0 for every entry, as it
 * is declared without an initializer and only OR-ed into below.
 */
229 if (tb
[OCI_LINUX_CGROUPS_DEVICES_ACCESS
]) {
230 access
= blobmsg_get_string(tb
[OCI_LINUX_CGROUPS_DEVICES_ACCESS
]);
/* an empty string or more than three characters is singled out here */
231 if ((strlen(access
) > 3) || (strlen(access
) == 0)) {
236 for (acidx
= 0; acidx
< strlen(access
); ++acidx
) {
237 switch (access
[acidx
]) {
239 bpf_access
|= BPF_DEVCG_ACC_READ
;
242 bpf_access
|= BPF_DEVCG_ACC_WRITE
;
245 bpf_access
|= BPF_DEVCG_ACC_MKNOD
;
/* presumably the branch for entries without "access": match every access kind */
255 bpf_access
= DEVCG_ACC_ALL
;
/* optional "type": only the first character of the string is examined */
258 if (tb
[OCI_LINUX_CGROUPS_DEVICES_TYPE
]) {
259 devtype
= blobmsg_get_string(tb
[OCI_LINUX_CGROUPS_DEVICES_TYPE
]);
261 switch (devtype
[0]) {
263 bpf_type
= BPF_DEVCG_DEV_CHAR
;
266 bpf_type
= BPF_DEVCG_DEV_BLOCK
;
269 bpf_type
= DEVCG_DEV_ALL
;
/* presumably the branch for entries without "type": match every device type */
278 bpf_type
= DEVCG_DEV_ALL
;
/* optional "major"/"minor": the 64-bit cast result is narrowed to uint32_t by the assignment */
280 if (tb
[OCI_LINUX_CGROUPS_DEVICES_MAJOR
])
281 devmajor
= blobmsg_cast_u64(tb
[OCI_LINUX_CGROUPS_DEVICES_MAJOR
]);
285 if (tb
[OCI_LINUX_CGROUPS_DEVICES_MINOR
])
286 devminor
= blobmsg_cast_u64(tb
[OCI_LINUX_CGROUPS_DEVICES_MINOR
]);
290 if (bpf_type
== DEVCG_DEV_ALL
) {
291 /* wildcard => change default policy and flush all existing rules */
292 flush_exceptions(&exceptions
);
293 behavior
= allow
?DEVCG_DEFAULT_ALLOW
:DEVCG_DEFAULT_DENY
;
295 /* allocate and populate record for exception */
296 dl
= malloc(sizeof(struct dev_exception_item
));
303 dl
->access
= bpf_access
;
304 dl
->major
= devmajor
;
305 dl
->minor
= devminor
;
307 /* push to exceptions list, last goes first */
308 list_add(&dl
->list
, &exceptions
);
314 /* add default rules */
315 ret
= add_default_exceptions(&exceptions
);
319 /* calculate number of instructions to allocate */
320 list_for_each_entry(dl
, &exceptions
, list
) {
/* a check is only emitted for fields that are not wildcards */
321 has_access
= dl
->access
!= DEVCG_ACC_ALL
;
322 has_type
= dl
->type
!= DEVCG_DEV_ALL
;
323 has_major
= dl
->major
!= ~0;
324 has_minor
= dl
->minor
!= ~0;
/* 1 insn per type/major/minor compare, 3 for the access test (mov, and, jne), 2 for the verdict (mov, exit) */
326 total_ins
+= (has_type
? 1 : 0) + (has_access
? 3 : 0) + (has_major
? 1 : 0) + (has_minor
? 1 : 0) + 2;
329 /* account for loader instructions */
330 total_ins
+= pre_insn_len
;
332 /* final accept/deny block */
335 /* allocate memory for eBPF program */
336 program
= calloc(total_ins
, sizeof(struct bpf_insn
));
342 /* copy program loader instructions */
343 memcpy(program
, &pre_insn
, sizeof(pre_insn
));
344 cur_ins
= pre_insn_len
;
346 /* generate eBPF program */
347 list_for_each_entry(dl
, &exceptions
, list
) {
348 has_access
= dl
->access
!= DEVCG_ACC_ALL
;
349 has_type
= dl
->type
!= DEVCG_DEV_ALL
;
350 has_major
= dl
->major
!= ~0;
351 has_minor
= dl
->minor
!= ~0;
/*
 * next_ins is used as the jump distance that skips the remainder of this
 * rule, verdict included, when a check does not match.
 * NOTE(review): the statements adjusting next_ins between the individual
 * checks, and the conditions guarding each check, are not visible in this
 * excerpt.
 */
353 next_ins
= (has_type
? 1 : 0) + (has_access
? 3 : 0) + (has_major
? 1 : 0) + (has_minor
? 1 : 0) + 1;
/* device type (R2) differs from the rule's type: skip the rule */
356 program
[cur_ins
++] = BPF_JMP_IMM(BPF_JNE
, BPF_REG_2
, dl
->type
, next_ins
);
/* R1 = requested access (R3) masked with the rule's access bits; skip the rule unless all requested bits survive */
361 program
[cur_ins
++] = BPF_MOV32_REG(BPF_REG_1
, BPF_REG_3
);
362 program
[cur_ins
++] = BPF_ALU32_IMM(BPF_AND
, BPF_REG_1
, dl
->access
);
363 program
[cur_ins
++] = BPF_JMP_REG(BPF_JNE
, BPF_REG_1
, BPF_REG_3
, next_ins
- 2);
/* major (R4) differs: skip the rule */
368 program
[cur_ins
++] = BPF_JMP_IMM(BPF_JNE
, BPF_REG_4
, dl
->major
, next_ins
);
/* minor (R5) differs: skip the rule */
373 program
[cur_ins
++] = BPF_JMP_IMM(BPF_JNE
, BPF_REG_5
, dl
->minor
, next_ins
);
/* all checks matched: return the rule's verdict in R0 (1 for allow, 0 for deny) */
377 program
[cur_ins
++] = BPF_MOV64_IMM(BPF_REG_0
, dl
->allow
? 1 : 0);
378 program
[cur_ins
++] = BPF_EXIT_INSN();
381 /* default behavior */
382 program
[cur_ins
++] = BPF_MOV64_IMM(BPF_REG_0
, (behavior
== DEVCG_DEFAULT_ALLOW
)?1:0);
383 program
[cur_ins
++] = BPF_EXIT_INSN();
/* print the rule set in a devices.allow/devices.deny like notation */
386 fprintf(stderr
, "cgroup devices:\na > devices.%s\n",
387 (behavior
== DEVCG_DEFAULT_ALLOW
)?"allow":"deny");
389 list_for_each_entry(dl
, &exceptions
, list
)
390 fprintf(stderr
, "%c %d:%d %s%s%s > devices.%s\n",
391 (dl
->type
== DEVCG_DEV_ALL
)?'a':
392 (dl
->type
== BPF_DEVCG_DEV_CHAR
)?'c':'b',
393 (dl
->major
== ~0)?-1:dl
->major
,
394 (dl
->minor
== ~0)?-1:dl
->minor
,
395 (dl
->access
& BPF_DEVCG_ACC_READ
)?"r":"",
396 (dl
->access
& BPF_DEVCG_ACC_WRITE
)?"w":"",
397 (dl
->access
& BPF_DEVCG_ACC_MKNOD
)?"m":"",
398 (dl
->allow
)?"allow":"deny");
400 fprintf(stderr
, "generated cgroup-devices eBPF program:\n");
401 fprintf(stderr
, " [idx]\tcode\t dest\t src\t off\t imm\n");
/*
 * NOTE(review): this dump loop reuses cur_ins and leaves it equal to
 * total_ins, so whenever it runs the assert below can no longer detect a
 * mismatch between counted and emitted instructions; how the dump is
 * guarded is not visible in this excerpt.
 */
402 for (cur_ins
=0; cur_ins
<total_ins
; cur_ins
++)
403 fprintf(stderr
, " [%03d]\t%02hhx\t%3hhu\t%3hhu\t%04hx\t%d\n", cur_ins
,
404 program
[cur_ins
].code
,
405 program
[cur_ins
].dst_reg
,
406 program
[cur_ins
].src_reg
,
407 program
[cur_ins
].off
,
408 program
[cur_ins
].imm
);
/* the number of emitted instructions must equal the number counted up front */
411 assert(cur_ins
== total_ins
);
412 bpf_total_insn
= total_ins
;
/* the rule list is no longer needed once the program exists */
416 flush_exceptions(&exceptions
);
421 * attach eBPF program to cgroup
/*
 * Load the generated program into the kernel and attach it to a cgroup.
 *
 * cgroup_dirfd: file descriptor used as the attach target
 *
 * The program is loaded as BPF_PROG_TYPE_CGROUP_DEVICE with BPF_PROG_LOAD and
 * attached as BPF_CGROUP_DEVICE with BPF_PROG_ATTACH; the result of the
 * attach call is returned.
 *
 * NOTE(review): the declaration of prog_fd and whatever happens between the
 * load and the attach (e.g. checking for a failed load or a missing program)
 * is not visible in this excerpt.
 * NOTE(review): on the visible return path prog_fd is not closed after the
 * attach; confirm whether keeping it open is intended.
 */
423 int attach_cgroups_ebpf(int cgroup_dirfd
) {
/* pointers are converted to integers of pointer width before being stored in the attribute union */
425 #if ( __WORDSIZE == 64 )
426 uint64_t program_ptr
= (uint64_t)program
;
427 uint64_t license_ptr
= (uint64_t)license
;
428 #elif ( __WORDSIZE == 32 )
429 uint32_t program_ptr
= (uint32_t)program
;
430 uint32_t license_ptr
= (uint32_t)license
;
/* attributes for BPF_PROG_LOAD: program type, license, instruction array and its length */
434 union bpf_attr load_attr
= {
435 .prog_type
= BPF_PROG_TYPE_CGROUP_DEVICE
,
436 .license
= license_ptr
,
437 .insns
= program_ptr
,
438 .insn_cnt
= bpf_total_insn
,
444 prog_fd
= syscall_bpf(BPF_PROG_LOAD
, &load_attr
, sizeof(load_attr
));
/* attributes for BPF_PROG_ATTACH: hook type, target cgroup and the program just loaded */
448 union bpf_attr attach_attr
= {
449 .attach_type
= BPF_CGROUP_DEVICE
,
450 .target_fd
= cgroup_dirfd
,
451 .attach_bpf_fd
= prog_fd
,
454 return syscall_bpf(BPF_PROG_ATTACH
, &attach_attr
, sizeof (attach_attr
));