jail: add support for cgroup devices as in OCI run-time spec
[project/procd.git] / jail / cgroups.c
1 /*
2 * Copyright (C) 2020 Daniel Golle <daniel@makrotopia.org>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU Lesser General Public License version 2.1
6 * as published by the Free Software Foundation
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 * reads unified cgroup config as proposed in
14 * https://github.com/opencontainers/runtime-spec/pull/1040
15 * attempt conversion from cgroup1 -> cgroup2
16 * https://github.com/containers/crun/blob/0.14.1/crun.1.md#cgroup-v2
17 *
18 * ToDo:
19 * - convert cgroup1 net_prio and net_cls to eBPF program
20 * - rdma (anyone?) intelrdt (anyone?)
21 */
22
23 #define _GNU_SOURCE
24
25 #include <assert.h>
26 #include <errno.h>
27 #include <fcntl.h>
28 #include <stdlib.h>
29 #include <stdio.h>
30 #include <string.h>
31 #include <sys/stat.h>
32 #include <sys/mman.h>
33 #include <unistd.h>
34 #include <libgen.h>
35 #include <inttypes.h>
36
37 #include <libubox/avl.h>
38 #include <libubox/avl-cmp.h>
39 #include <libubox/blobmsg.h>
40 #include <libubox/list.h>
41 #include <libubox/utils.h>
42
43 #include "log.h"
44 #include "cgroups.h"
45 #include "cgroups-bpf.h"
46
/*
 * Prefix of the cgroup hierarchy; cgroups_apply() uses its length to find
 * the first path component in which controllers have to be enabled.
 */
#define CGROUP_ROOT "/sys/fs/cgroup/"

/* largest value accepted for the OCI blockIO weight / leafWeight fields */
#define CGROUP_IO_WEIGHT_MAX 10000
49
50 struct cgval {
51 struct avl_node avl;
52 char *val;
53 };
54
55 struct avl_tree cgvals;
56 static char *cgroup_path;
57 static bool initialized;
58
59 void cgroups_prepare(void) {
60 initialized = false;
61 }
62
63 void cgroups_init(const char *p) {
64 avl_init(&cgvals, avl_strcmp, false, NULL);
65 cgroup_path = strdup(p);
66 initialized = true;
67 }
68
69 static void cgroups_set(const char *key, const char *val)
70 {
71 struct cgval *valp;
72
73 valp = avl_find_element(&cgvals, key, valp, avl);
74 if (!valp) {
75 valp = malloc(sizeof(struct cgval));
76 assert(valp != NULL);
77 valp->avl.key = strdup(key);
78 avl_insert(&cgvals, &valp->avl);
79 } else {
80 DEBUG("overwriting previous cgroup2 assignment %s=\"%s\"!\n", key, valp->val);
81 free(valp->val);
82 }
83
84 valp->val = strdup(val);
85 }
86
87 void cgroups_free(void)
88 {
89 struct cgval *valp, *tmp;
90
91 if (initialized) {
92 avl_remove_all_elements(&cgvals, valp, avl, tmp) {
93 free((void *)(valp->avl.key));
94 free(valp->val);
95 free(valp);
96 }
97 free(cgroup_path);
98 }
99 }
100
101 void cgroups_apply(pid_t pid)
102 {
103 struct cgval *valp;
104 char *cdir, *ent;
105 int fd;
106 size_t maxlen = strlen("cgroup.subtree_control");
107
108 bool cpuset = false,
109 cpu = false,
110 hugetlb = false,
111 io = false,
112 memory = false,
113 pids = false,
114 rdma = false;
115
116 char subtree_control[64] = { 0 };
117
118 DEBUG("using cgroup path %s\n", cgroup_path);
119 mkdir_p(cgroup_path, 0700);
120
121 /* find which controllers need to be enabled */
122 avl_for_each_element(&cgvals, valp, avl) {
123 ent = (char *)valp->avl.key;
124 if (strlen(ent) > maxlen)
125 maxlen = strlen(ent);
126
127 if (!strncmp("cpuset.", ent, 7))
128 cpuset = true;
129 else if (!strncmp("cpu.", ent, 4))
130 cpu = true;
131 else if (!strncmp("hugetlb.", ent, 8))
132 hugetlb = true;
133 else if (!strncmp("io.", ent, 3))
134 io = true;
135 else if (!strncmp("memory.", ent, 7))
136 memory = true;
137 else if (!strncmp("pids.", ent, 5))
138 pids = true;
139 else if (!strncmp("rdma.", ent, 5))
140 pids = true;
141 }
142
143 maxlen += strlen(cgroup_path) + 2;
144
145 if (cpuset)
146 strcat(subtree_control, "+cpuset ");
147
148 if (cpu)
149 strcat(subtree_control, "+cpu ");
150
151 if (hugetlb)
152 strcat(subtree_control, "+hugetlb ");
153
154 if (io)
155 strcat(subtree_control, "+io ");
156
157 if (memory)
158 strcat(subtree_control, "+memory ");
159
160 if (pids)
161 strcat(subtree_control, "+pids ");
162
163 if (rdma)
164 strcat(subtree_control, "+rdma ");
165
166 /* remove trailing space */
167 ent = strchr(subtree_control, '\0') - 1;
168 *ent = '\0';
169
170 ent = malloc(maxlen);
171 assert(ent != 0);
172
173 DEBUG("recursively applying cgroup.subtree_control = \"%s\"\n", subtree_control);
174 cdir = &cgroup_path[strlen(CGROUP_ROOT) - 2];
175 while ((cdir = strchr(cdir + 1, '/'))) {
176 *cdir = '\0';
177 snprintf(ent, maxlen, "%s/cgroup.subtree_control", cgroup_path);
178 DEBUG(" * %s\n", ent);
179 fd = open(ent, O_WRONLY);
180 assert(fd != -1);
181 write(fd, subtree_control, strlen(subtree_control));
182 close(fd);
183 *cdir = '/';
184 }
185
186 avl_for_each_element(&cgvals, valp, avl) {
187 DEBUG("applying cgroup2 %s=\"%s\"\n", (char *)valp->avl.key, valp->val);
188 snprintf(ent, maxlen, "%s/%s", cgroup_path, (char *)valp->avl.key);
189 fd = open(ent, O_WRONLY);
190 if (fd == -1) {
191 ERROR("can't open %s: %m\n", ent);
192 continue;
193 }
194 if (dprintf(fd, "%s", valp->val) < 0) {
195 ERROR("can't write to %s: %m\n", ent);
196 };
197 close(fd);
198 }
199
200 int dirfd = open(cgroup_path, O_DIRECTORY);
201 if (dirfd < 0) {
202 ERROR("can't open %s: %m\n", cgroup_path);
203 } else {
204 attach_cgroups_ebpf(dirfd);
205 close(dirfd);
206 }
207
208 snprintf(ent, maxlen, "%s/%s", cgroup_path, "cgroup.procs");
209 fd = open(ent, O_WRONLY);
210 if (fd < 0) {
211 ERROR("can't open %s: %m\n", cgroup_path);
212 } else {
213 dprintf(fd, "%d", pid);
214 close(fd);
215 }
216
217 free(ent);
218 }
219
/* indices of the fields of one blockIO "weightDevice" entry */
enum {
	OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_MAJOR = 0,
	OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_MINOR,
	OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_WEIGHT,
	OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_LEAFWEIGHT,
	/* number of fields, used to size the parse table */
	__OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_MAX,
};
227
228 static const struct blobmsg_policy oci_linux_cgroups_blockio_weightdevice_policy[] = {
229 [OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_MAJOR] = { "major", BLOBMSG_CAST_INT64 },
230 [OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_MINOR] = { "minor", BLOBMSG_CAST_INT64 },
231 [OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_WEIGHT] = { "weight", BLOBMSG_TYPE_INT32 },
232 [OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_LEAFWEIGHT] = { "leafWeight", BLOBMSG_TYPE_INT32 },
233 };
234
/* indices of the fields of one blockIO throttle device entry */
enum {
	OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAJOR = 0,
	OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MINOR,
	OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_RATE,
	/* number of fields, used to size the parse table */
	__OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAX,
};
241
242 static const struct blobmsg_policy oci_linux_cgroups_blockio_throttledevice_policy[] = {
243 [OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAJOR] = { "major", BLOBMSG_CAST_INT64 },
244 [OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MINOR] = { "minor", BLOBMSG_CAST_INT64 },
245 [OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_RATE] = { "rate", BLOBMSG_CAST_INT64 },
246 };
247
/* indices of the members of the OCI "blockIO" object */
enum {
	OCI_LINUX_CGROUPS_BLOCKIO_WEIGHT = 0,
	OCI_LINUX_CGROUPS_BLOCKIO_LEAFWEIGHT,
	OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE,
	OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEREADBPSDEVICE,
	OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEWRITEBPSDEVICE,
	OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEREADIOPSDEVICE,
	OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEWRITEIOPSDEVICE,
	/* number of members, used to size the parse table */
	__OCI_LINUX_CGROUPS_BLOCKIO_MAX,
};
258
259 static const struct blobmsg_policy oci_linux_cgroups_blockio_policy[] = {
260 [OCI_LINUX_CGROUPS_BLOCKIO_WEIGHT] = { "weight", BLOBMSG_TYPE_INT32 },
261 [OCI_LINUX_CGROUPS_BLOCKIO_LEAFWEIGHT] = { "leafWeight", BLOBMSG_TYPE_INT32 },
262 [OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE] = { "weightDevice", BLOBMSG_TYPE_ARRAY },
263 [OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEREADBPSDEVICE] = { "throttleReadBpsDevice", BLOBMSG_TYPE_ARRAY },
264 [OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEWRITEBPSDEVICE] = { "throttleWriteBpsDevice", BLOBMSG_TYPE_ARRAY },
265 [OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEREADIOPSDEVICE] = { "throttleReadIOPSDevice", BLOBMSG_TYPE_ARRAY },
266 [OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEWRITEIOPSDEVICE] = { "throttleWriteIOPSDevice", BLOBMSG_TYPE_ARRAY },
267 };
268
/* device number pair; used as the AVL key of struct iomax_line */
struct posix_dev {
	uint64_t major;	/* compared first by avl_devcmp() */
	uint64_t minor;	/* tie-breaker when the major numbers are equal */
};
273
274 struct iomax_line {
275 struct avl_node avl;
276 struct posix_dev dev;
277 uint64_t rbps;
278 uint64_t wbps;
279 uint64_t riops;
280 uint64_t wiops;
281 };
282
283 static int avl_devcmp(const void *k1, const void *k2, void *ptr)
284 {
285 struct posix_dev *d1 = (struct posix_dev *)k1, *d2 = (struct posix_dev *)k2;
286
287 if (d1->major < d2->major)
288 return -1;
289
290 if (d1->major > d2->major)
291 return 1;
292
293 if (d1->minor < d2->minor)
294 return -1;
295
296 if (d1->minor > d2->minor)
297 return 1;
298
299 return 0;
300 }
301
302 static struct iomax_line *get_iomax_line(struct avl_tree *iomax, uint64_t major, uint64_t minor)
303 {
304 struct iomax_line *l;
305 struct posix_dev d;
306 d.major = major;
307 d.minor = minor;
308 l = avl_find_element(iomax, &d, l, avl);
309 if (!l) {
310 l = malloc(sizeof(struct iomax_line));
311 assert(l != NULL);
312 l->dev.major = d.major;
313 l->dev.minor = d.minor;
314 l->avl.key = &l->dev;
315 l->rbps = -1;
316 l->wbps = -1;
317 l->riops = -1;
318 l->wiops = -1;
319 avl_insert(iomax, &l->avl);
320 }
321
322 return l;
323 }
324
325 static int parseOCIlinuxcgroups_legacy_blockio(struct blob_attr *msg)
326 {
327 struct blob_attr *tb[__OCI_LINUX_CGROUPS_BLOCKIO_MAX],
328 *tbwd[__OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_MAX],
329 *tbtd[__OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAX],
330 *cur;
331 int rem;
332 int weight = -1, leafweight = -1;
333 size_t numweightstrs = 0, numiomaxstrs = 0, strtotlen = 1;
334 char **weightstrs = NULL, **iomaxstrs = NULL, **curstr;
335 char *weightstr, *iomaxstr;
336 struct avl_tree iomax;
337 struct iomax_line *curiomax, *tmp;
338
339 blobmsg_parse(oci_linux_cgroups_blockio_policy, __OCI_LINUX_CGROUPS_BLOCKIO_MAX, tb, blobmsg_data(msg), blobmsg_len(msg));
340
341 if (tb[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHT]) {
342 weight = blobmsg_get_u32(tb[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHT]);
343 ++numweightstrs;
344 }
345
346 if (weight > CGROUP_IO_WEIGHT_MAX)
347 return ERANGE;
348
349 if (tb[OCI_LINUX_CGROUPS_BLOCKIO_LEAFWEIGHT])
350 leafweight = blobmsg_get_u32(tb[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHT]);
351
352 if (leafweight > CGROUP_IO_WEIGHT_MAX)
353 return ERANGE;
354
355 blobmsg_for_each_attr(cur, tb[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE], rem)
356 ++numweightstrs;
357
358 weightstrs = calloc(numweightstrs + 1, sizeof(char *));
359 assert(weightstrs != 0);
360 numweightstrs = 0;
361
362 if (weight > -1)
363 if (asprintf(&weightstrs[numweightstrs++], "default %d", weight) < 0)
364 return ENOMEM;
365
366 blobmsg_for_each_attr(cur, tb[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE], rem) {
367 uint64_t major, minor;
368 int devweight = weight, devleafweight = leafweight;
369
370 blobmsg_parse(oci_linux_cgroups_blockio_weightdevice_policy, __OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_MAX, tbwd, blobmsg_data(cur), blobmsg_len(cur));
371 if (!tbwd[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_MAJOR] ||
372 !tbwd[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_MINOR])
373 return ENODATA;
374
375 if (!tbwd[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_WEIGHT] &&
376 !tbwd[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_LEAFWEIGHT])
377 return ENODATA;
378
379 if (tbwd[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_WEIGHT])
380 devweight = blobmsg_get_u32(tbwd[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_WEIGHT]);
381
382 if (devweight > CGROUP_IO_WEIGHT_MAX)
383 return ERANGE;
384
385 if (tbwd[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_LEAFWEIGHT])
386 devleafweight = blobmsg_get_u32(tbwd[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_LEAFWEIGHT]);
387
388 if (devleafweight > CGROUP_IO_WEIGHT_MAX)
389 return ERANGE;
390
391 if (tbwd[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_LEAFWEIGHT])
392 return ENOTSUP;
393
394 major = blobmsg_cast_u64(tbwd[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_MAJOR]);
395 minor = blobmsg_cast_u64(tbwd[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_MINOR]);
396
397 if (asprintf(&weightstrs[numweightstrs++], "%" PRIu64 ":%" PRIu64 " %u", major, minor, devweight) < 0)
398 return ENOMEM;
399 }
400
401 if (numweightstrs) {
402 curstr = weightstrs;
403 while (*curstr)
404 strtotlen += strlen(*(curstr++)) + 1;
405
406 weightstr = calloc(strtotlen, sizeof(char));
407 assert(weightstr != 0);
408
409 curstr = weightstrs;
410 while (*curstr) {
411 strcat(weightstr, *curstr);
412 strcat(weightstr, "\n");
413 free(*(curstr++));
414 }
415
416 cgroups_set("io.bfq.weight", weightstr);
417 free(weightstr);
418 };
419
420 free(weightstrs);
421
422 avl_init(&iomax, avl_devcmp, false, NULL);
423
424 blobmsg_for_each_attr(cur, tb[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEREADBPSDEVICE], rem) {
425 struct iomax_line *l;
426
427 blobmsg_parse(oci_linux_cgroups_blockio_throttledevice_policy, __OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAX, tbtd, blobmsg_data(cur), blobmsg_len(cur));
428
429 if (!tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAJOR] ||
430 !tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MINOR] ||
431 !tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_RATE])
432 return ENODATA;
433
434 l = get_iomax_line(&iomax,
435 blobmsg_cast_u64(tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAJOR]),
436 blobmsg_cast_u64(tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MINOR]));
437
438 l->rbps = blobmsg_cast_u64(tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_RATE]);
439 }
440
441 blobmsg_for_each_attr(cur, tb[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEWRITEBPSDEVICE], rem) {
442 struct iomax_line *l;
443
444 blobmsg_parse(oci_linux_cgroups_blockio_throttledevice_policy, __OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAX, tbtd, blobmsg_data(cur), blobmsg_len(cur));
445
446 if (!tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAJOR] ||
447 !tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MINOR] ||
448 !tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_RATE])
449 return ENODATA;
450
451 l = get_iomax_line(&iomax,
452 blobmsg_cast_u64(tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAJOR]),
453 blobmsg_cast_u64(tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MINOR]));
454
455 l->wbps = blobmsg_cast_u64(tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_RATE]);
456 }
457
458 blobmsg_for_each_attr(cur, tb[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEREADIOPSDEVICE], rem) {
459 struct iomax_line *l;
460
461 blobmsg_parse(oci_linux_cgroups_blockio_throttledevice_policy, __OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAX, tbtd, blobmsg_data(cur), blobmsg_len(cur));
462
463 if (!tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAJOR] ||
464 !tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MINOR] ||
465 !tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_RATE])
466 return ENODATA;
467
468 l = get_iomax_line(&iomax,
469 blobmsg_cast_u64(tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAJOR]),
470 blobmsg_cast_u64(tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MINOR]));
471
472 l->riops = blobmsg_cast_u64(tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_RATE]);
473 }
474
475 blobmsg_for_each_attr(cur, tb[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEWRITEIOPSDEVICE], rem) {
476 struct iomax_line *l;
477
478 blobmsg_parse(oci_linux_cgroups_blockio_throttledevice_policy, __OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAX, tbtd, blobmsg_data(cur), blobmsg_len(cur));
479
480 if (!tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAJOR] ||
481 !tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MINOR] ||
482 !tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_RATE])
483 return ENODATA;
484
485 l = get_iomax_line(&iomax,
486 blobmsg_cast_u64(tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAJOR]),
487 blobmsg_cast_u64(tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MINOR]));
488
489 l->wiops = blobmsg_cast_u64(tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_RATE]);
490 }
491
492 avl_for_each_element(&iomax, curiomax, avl)
493 ++numiomaxstrs;
494
495 if (!numiomaxstrs)
496 return 0;
497
498 iomaxstrs = calloc(numiomaxstrs + 1, sizeof(char *));
499 assert(iomaxstrs != 0);
500 numiomaxstrs = 0;
501
502 avl_for_each_element(&iomax, curiomax, avl) {
503 char iomaxlstr[160];
504 char lstr[32];
505
506 sprintf(iomaxlstr, "%" PRIu64 ":%" PRIu64 " ", curiomax->dev.major, curiomax->dev.minor);
507
508 if (curiomax->rbps != -1) {
509 sprintf(lstr, "rbps=%" PRIu64 " ", curiomax->rbps);
510 strcat(iomaxlstr, lstr);
511 }
512 if (curiomax->wbps != -1) {
513 sprintf(lstr, "wbps=%" PRIu64 " ", curiomax->wbps);
514 strcat(iomaxlstr, lstr);
515 }
516 if (curiomax->riops != -1) {
517 sprintf(lstr, "riops=%" PRIu64 " ", curiomax->riops);
518 strcat(iomaxlstr, lstr);
519 }
520 if (curiomax->wiops != -1) {
521 sprintf(lstr, "wiops=%" PRIu64 " ", curiomax->wiops);
522 strcat(iomaxlstr, lstr);
523 }
524
525 iomaxstrs[numiomaxstrs++] = strdup(iomaxlstr);
526 }
527
528 avl_for_each_element_safe(&iomax, curiomax, avl, tmp) {
529 avl_delete(&iomax, &curiomax->avl);
530 free(curiomax);
531 }
532
533 strtotlen = 1; /* 1 accounts for \0 at end of string */
534 if (numiomaxstrs) {
535 curstr = iomaxstrs;
536 while (*curstr)
537 strtotlen += strlen(*(curstr++)) + 1; /* +1 accounts for \n at end of line */
538
539 iomaxstr = calloc(strtotlen, sizeof(char));
540 assert(iomaxstr != 0);
541 curstr = iomaxstrs;
542
543 while (*curstr) {
544 strcat(iomaxstr, *curstr);
545 strcat(iomaxstr, "\n");
546 free(*(curstr++));
547 }
548
549 cgroups_set("io.max", iomaxstr);
550 free(iomaxstr);
551 };
552
553 free(iomaxstrs);
554
555 return 0;
556 }
557
558
/* indices of the members of the OCI "cpu" object */
enum {
	OCI_LINUX_CGROUPS_CPU_SHARES = 0,
	OCI_LINUX_CGROUPS_CPU_PERIOD,
	OCI_LINUX_CGROUPS_CPU_QUOTA,
	OCI_LINUX_CGROUPS_CPU_REALTIMERUNTIME,
	OCI_LINUX_CGROUPS_CPU_REALTIMEPERIOD,
	OCI_LINUX_CGROUPS_CPU_CPUS,
	OCI_LINUX_CGROUPS_CPU_MEMS,
	/* number of members, used to size the parse table */
	__OCI_LINUX_CGROUPS_CPU_MAX,
};
569
570 static const struct blobmsg_policy oci_linux_cgroups_cpu_policy[] = {
571 [OCI_LINUX_CGROUPS_CPU_SHARES] = { "shares", BLOBMSG_CAST_INT64 },
572 [OCI_LINUX_CGROUPS_CPU_PERIOD] = { "period", BLOBMSG_CAST_INT64 },
573 [OCI_LINUX_CGROUPS_CPU_QUOTA] = { "quota", BLOBMSG_CAST_INT64 }, /* signed int64! */
574 [OCI_LINUX_CGROUPS_CPU_REALTIMEPERIOD] = { "realtimePeriod", BLOBMSG_CAST_INT64 },
575 [OCI_LINUX_CGROUPS_CPU_REALTIMERUNTIME] = { "realtimeRuntime", BLOBMSG_CAST_INT64 },
576 [OCI_LINUX_CGROUPS_CPU_CPUS] = { "cpus", BLOBMSG_TYPE_STRING },
577 [OCI_LINUX_CGROUPS_CPU_MEMS] = { "mems", BLOBMSG_TYPE_STRING },
578 };
579
580 static int parseOCIlinuxcgroups_legacy_cpu(struct blob_attr *msg)
581 {
582 struct blob_attr *tb[__OCI_LINUX_CGROUPS_CPU_MAX];
583 uint64_t shares, period = 0;
584 int64_t quota = -2; /* unset */
585 char tmp[32] = { 0 };
586
587 blobmsg_parse(oci_linux_cgroups_cpu_policy, __OCI_LINUX_CGROUPS_CPU_MAX, tb, blobmsg_data(msg), blobmsg_len(msg));
588
589 if (tb[OCI_LINUX_CGROUPS_CPU_REALTIMEPERIOD] ||
590 tb[OCI_LINUX_CGROUPS_CPU_REALTIMERUNTIME])
591 return ENOTSUP; /* no equivalent in cgroup2 */
592
593 if (tb[OCI_LINUX_CGROUPS_CPU_SHARES]) {
594 shares = blobmsg_cast_u64(tb[OCI_LINUX_CGROUPS_CPU_SHARES]);
595 if ((shares < 2) || (shares > 262144))
596 return ERANGE;
597
598 snprintf(tmp, sizeof(tmp), "%" PRIu64, (((uint64_t)1) + ((shares - 2) * 9999) / 262142));
599 cgroups_set("cpu.weight", tmp);
600 tmp[0] = '\0';
601 }
602
603 if (tb[OCI_LINUX_CGROUPS_CPU_QUOTA])
604 quota = blobmsg_cast_s64(tb[OCI_LINUX_CGROUPS_CPU_QUOTA]);
605
606 if (tb[OCI_LINUX_CGROUPS_CPU_PERIOD])
607 period = blobmsg_cast_u64(tb[OCI_LINUX_CGROUPS_CPU_PERIOD]);
608
609 if (period) {
610 if (quota >= 0)
611 snprintf(tmp, sizeof(tmp), "%" PRId64 " %" PRIu64 , quota, period);
612 else
613 snprintf(tmp, sizeof(tmp), "max %" PRIu64, period); /* assume default */
614 } else if (quota >= 0) {
615 snprintf(tmp, sizeof(tmp), "%" PRId64, quota);
616 } else if (quota == -1) {
617 strcpy(tmp, "max");
618 }
619
620 if (tmp[0])
621 cgroups_set("cpu.max", tmp);
622
623 if (tb[OCI_LINUX_CGROUPS_CPU_CPUS])
624 cgroups_set("cpuset.cpus", blobmsg_get_string(tb[OCI_LINUX_CGROUPS_CPU_CPUS]));
625
626 if (tb[OCI_LINUX_CGROUPS_CPU_MEMS])
627 cgroups_set("cpuset.mems", blobmsg_get_string(tb[OCI_LINUX_CGROUPS_CPU_MEMS]));
628
629 return 0;
630 }
631
632
/* indices of the members of the OCI "memory" object */
enum {
	OCI_LINUX_CGROUPS_MEMORY_LIMIT = 0,
	OCI_LINUX_CGROUPS_MEMORY_RESERVATION,
	OCI_LINUX_CGROUPS_MEMORY_SWAP,
	OCI_LINUX_CGROUPS_MEMORY_KERNEL,
	OCI_LINUX_CGROUPS_MEMORY_KERNELTCP,
	OCI_LINUX_CGROUPS_MEMORY_SWAPPINESS,
	OCI_LINUX_CGROUPS_MEMORY_DISABLEOOMKILLER,
	OCI_LINUX_CGROUPS_MEMORY_USEHIERARCHY,
	/* number of members, used to size the parse table */
	__OCI_LINUX_CGROUPS_MEMORY_MAX,
};
644
645 static const struct blobmsg_policy oci_linux_cgroups_memory_policy[] = {
646 [OCI_LINUX_CGROUPS_MEMORY_LIMIT] = { "limit", BLOBMSG_CAST_INT64 }, /* signed int64! */
647 [OCI_LINUX_CGROUPS_MEMORY_RESERVATION] = { "reservation", BLOBMSG_CAST_INT64 }, /* signed int64! */
648 [OCI_LINUX_CGROUPS_MEMORY_SWAP] = { "swap", BLOBMSG_CAST_INT64 }, /* signed int64! */
649 [OCI_LINUX_CGROUPS_MEMORY_KERNEL] = { "kernel", BLOBMSG_CAST_INT64 }, /* signed int64! ignored */
650 [OCI_LINUX_CGROUPS_MEMORY_KERNELTCP] = { "kernelTCP", BLOBMSG_CAST_INT64 }, /* signed int64! ignored */
651 [OCI_LINUX_CGROUPS_MEMORY_SWAPPINESS] = { "swappiness", BLOBMSG_CAST_INT64 },
652 [OCI_LINUX_CGROUPS_MEMORY_DISABLEOOMKILLER] = { "disableOOMKiller", BLOBMSG_TYPE_BOOL },
653 [OCI_LINUX_CGROUPS_MEMORY_USEHIERARCHY] { "useHierarchy", BLOBMSG_TYPE_BOOL },
654 };
655
656 static int parseOCIlinuxcgroups_legacy_memory(struct blob_attr *msg)
657 {
658 struct blob_attr *tb[__OCI_LINUX_CGROUPS_MEMORY_MAX];
659 char tmp[32] = { 0 };
660 int64_t limit = -1, swap, reservation;
661
662 blobmsg_parse(oci_linux_cgroups_memory_policy, __OCI_LINUX_CGROUPS_MEMORY_MAX, tb, blobmsg_data(msg), blobmsg_len(msg));
663
664 /*
665 * not all properties of the OCI memory section can be mapped to cgroup2
666 * kernel memory accounting is always enabled and included in the set
667 * memory limit, hence these options can be ignored
668 * disableOOMKiller could be emulated using oom_score_adj + seccomp eBPF
669 * preventing self-upgrade (but allow downgrade)
670 *
671 * see also https://github.com/opencontainers/runtime-spec/issues/1005
672 */
673 if (tb[OCI_LINUX_CGROUPS_MEMORY_SWAPPINESS] ||
674 tb[OCI_LINUX_CGROUPS_MEMORY_DISABLEOOMKILLER] ||
675 tb[OCI_LINUX_CGROUPS_MEMORY_USEHIERARCHY])
676 return ENOTSUP;
677
678
679 if (tb[OCI_LINUX_CGROUPS_MEMORY_LIMIT]) {
680 limit = blobmsg_cast_s64(tb[OCI_LINUX_CGROUPS_MEMORY_LIMIT]);
681 if (limit == -1)
682 strcpy(tmp, "max");
683 else
684 snprintf(tmp, sizeof(tmp), "%" PRId64, limit);
685
686 cgroups_set("memory.max", tmp);
687 }
688
689 if (tb[OCI_LINUX_CGROUPS_MEMORY_RESERVATION]) {
690 reservation = blobmsg_cast_s64(tb[OCI_LINUX_CGROUPS_MEMORY_RESERVATION]);
691
692 if (reservation == -1)
693 strcpy(tmp, "max");
694 else
695 snprintf(tmp, sizeof(tmp), "%" PRId64, reservation);
696
697 cgroups_set("memory.low", tmp);
698 }
699
700 /* OCI 'swap' acounts for memory+swap */
701 if (tb[OCI_LINUX_CGROUPS_MEMORY_SWAP]) {
702 swap = blobmsg_cast_s64(tb[OCI_LINUX_CGROUPS_MEMORY_SWAP]);
703
704 if (swap == -1)
705 strcpy(tmp, "max");
706 else if (limit == -1 || (limit < swap))
707 snprintf(tmp, sizeof(tmp), "%" PRId64, swap);
708 else
709 snprintf(tmp, sizeof(tmp), "%" PRId64, limit - swap);
710
711 cgroups_set("memory.swap_max", tmp);
712 }
713
714 return 0;
715 }
716
717
/* indices of the members of the OCI "pids" object */
enum {
	OCI_LINUX_CGROUPS_PIDS_LIMIT = 0,
	/* number of members, used to size the parse table */
	__OCI_LINUX_CGROUPS_PIDS_MAX,
};
722
723 static const struct blobmsg_policy oci_linux_cgroups_pids_policy[] = {
724 [OCI_LINUX_CGROUPS_PIDS_LIMIT] = { "limit", BLOBMSG_CAST_INT64 },
725 };
726
727 static int parseOCIlinuxcgroups_legacy_pids(struct blob_attr *msg)
728 {
729 struct blob_attr *tb[__OCI_LINUX_CGROUPS_MEMORY_MAX];
730 char tmp[32] = { 0 };
731
732 blobmsg_parse(oci_linux_cgroups_pids_policy, __OCI_LINUX_CGROUPS_PIDS_MAX, tb, blobmsg_data(msg), blobmsg_len(msg));
733
734 if (!tb[OCI_LINUX_CGROUPS_PIDS_LIMIT])
735 return EINVAL;
736
737 snprintf(tmp, sizeof(tmp), "%" PRIu64, blobmsg_cast_u64(tb[OCI_LINUX_CGROUPS_PIDS_LIMIT]));
738
739 cgroups_set("pids.max", tmp);
740
741 return 0;
742 }
743
744 static int parseOCIlinuxcgroups_unified(struct blob_attr *msg)
745 {
746 struct blob_attr *cur;
747 int rem;
748
749 blobmsg_for_each_attr(cur, msg, rem) {
750 if (blobmsg_type(cur) != BLOBMSG_TYPE_STRING)
751 return EINVAL;
752
753 /* restrict keys */
754 if (strchr(blobmsg_name(cur), '/') ||
755 !strcmp(blobmsg_name(cur), "cgroup.subtree_control") ||
756 !strcmp(blobmsg_name(cur), "cgroup.procs") ||
757 !strcmp(blobmsg_name(cur), "cgroup.threads") ||
758 !strcmp(blobmsg_name(cur), "cgroup.freeze"))
759 return EINVAL;
760
761 cgroups_set(blobmsg_name(cur), blobmsg_get_string(cur));
762 }
763
764 return 0;
765 }
766
/* indices of the members of the OCI linux "resources" object */
enum {
	OCI_LINUX_CGROUPS_BLOCKIO = 0,
	OCI_LINUX_CGROUPS_CPU,
	OCI_LINUX_CGROUPS_DEVICES,
	OCI_LINUX_CGROUPS_HUGEPAGELIMITS,
	OCI_LINUX_CGROUPS_INTELRDT,
	OCI_LINUX_CGROUPS_MEMORY,
	OCI_LINUX_CGROUPS_NETWORK,
	OCI_LINUX_CGROUPS_PIDS,
	OCI_LINUX_CGROUPS_RDMA,
	OCI_LINUX_CGROUPS_UNIFIED,
	/* number of members, used to size the parse table */
	__OCI_LINUX_CGROUPS_MAX,
};
780
781 static const struct blobmsg_policy oci_linux_cgroups_policy[] = {
782 [OCI_LINUX_CGROUPS_BLOCKIO] = { "blockIO", BLOBMSG_TYPE_TABLE },
783 [OCI_LINUX_CGROUPS_CPU] = { "cpu", BLOBMSG_TYPE_TABLE },
784 [OCI_LINUX_CGROUPS_DEVICES] = { "devices", BLOBMSG_TYPE_ARRAY },
785 [OCI_LINUX_CGROUPS_HUGEPAGELIMITS] = { "hugepageLimits", BLOBMSG_TYPE_ARRAY },
786 [OCI_LINUX_CGROUPS_INTELRDT] = { "intelRdt", BLOBMSG_TYPE_TABLE },
787 [OCI_LINUX_CGROUPS_MEMORY] = { "memory", BLOBMSG_TYPE_TABLE },
788 [OCI_LINUX_CGROUPS_NETWORK] = { "network", BLOBMSG_TYPE_TABLE },
789 [OCI_LINUX_CGROUPS_PIDS] = { "pids", BLOBMSG_TYPE_TABLE },
790 [OCI_LINUX_CGROUPS_RDMA] = { "rdma", BLOBMSG_TYPE_TABLE },
791 [OCI_LINUX_CGROUPS_UNIFIED] = { "unified", BLOBMSG_TYPE_TABLE },
792 };
793
794 int parseOCIlinuxcgroups(struct blob_attr *msg)
795 {
796 struct blob_attr *tb[__OCI_LINUX_CGROUPS_MAX];
797 int ret;
798
799 blobmsg_parse(oci_linux_cgroups_policy, __OCI_LINUX_CGROUPS_MAX, tb, blobmsg_data(msg), blobmsg_len(msg));
800
801 if (tb[OCI_LINUX_CGROUPS_HUGEPAGELIMITS] ||
802 tb[OCI_LINUX_CGROUPS_INTELRDT] ||
803 tb[OCI_LINUX_CGROUPS_NETWORK] ||
804 tb[OCI_LINUX_CGROUPS_RDMA])
805 return ENOTSUP;
806
807 if (tb[OCI_LINUX_CGROUPS_BLOCKIO]) {
808 ret = parseOCIlinuxcgroups_legacy_blockio(tb[OCI_LINUX_CGROUPS_BLOCKIO]);
809 if (ret)
810 return ret;
811 }
812
813 if (tb[OCI_LINUX_CGROUPS_CPU]) {
814 ret = parseOCIlinuxcgroups_legacy_cpu(tb[OCI_LINUX_CGROUPS_CPU]);
815 if (ret)
816 return ret;
817 }
818
819 if (tb[OCI_LINUX_CGROUPS_DEVICES]) {
820 ret = parseOCIlinuxcgroups_devices(tb[OCI_LINUX_CGROUPS_DEVICES]);
821 if (ret)
822 return ret;
823 }
824
825 if (tb[OCI_LINUX_CGROUPS_MEMORY]) {
826 ret = parseOCIlinuxcgroups_legacy_memory(tb[OCI_LINUX_CGROUPS_MEMORY]);
827 if (ret)
828 return ret;
829 }
830
831 if (tb[OCI_LINUX_CGROUPS_PIDS]) {
832 ret = parseOCIlinuxcgroups_legacy_pids(tb[OCI_LINUX_CGROUPS_PIDS]);
833 if (ret)
834 return ret;
835 }
836
837 if (tb[OCI_LINUX_CGROUPS_UNIFIED]) {
838 ret = parseOCIlinuxcgroups_unified(tb[OCI_LINUX_CGROUPS_UNIFIED]);
839 if (ret)
840 return ret;
841 }
842
843 return 0;
844 }