treewide: replace local mkdir_p implementations
[project/procd.git] / jail / cgroups.c
1 /*
2 * Copyright (C) 2020 Daniel Golle <daniel@makrotopia.org>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU Lesser General Public License version 2.1
6 * as published by the Free Software Foundation
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 * reads unified cgroup config as proposed in
14 * https://github.com/opencontainers/runtime-spec/pull/1040
15 * attempt conversion from cgroup1 -> cgroup2
16 * https://github.com/containers/crun/blob/0.14.1/crun.1.md#cgroup-v2
17 *
18 * ToDo:
19 * - convert cgroup1 devices to eBPF program
20 * - convert cgroup1 net_prio and net_cls to eBPF program
21 * - rdma (anyone?) intelrdt (anyone?)
22 */
23
24 #define _GNU_SOURCE
25
26 #include <assert.h>
27 #include <errno.h>
28 #include <fcntl.h>
29 #include <stdlib.h>
30 #include <stdio.h>
31 #include <string.h>
32 #include <sys/stat.h>
33 #include <sys/mman.h>
34 #include <unistd.h>
35 #include <libgen.h>
36 #include <inttypes.h>
37
38 #include <libubox/avl.h>
39 #include <libubox/avl-cmp.h>
40 #include <libubox/blobmsg.h>
41 #include <libubox/list.h>
42 #include <libubox/utils.h>
43
44 #include "log.h"
45 #include "cgroups.h"
46
47 #define CGROUP_ROOT "/sys/fs/cgroup/"
48 #define CGROUP_IO_WEIGHT_MAX 10000
49
/*
 * One pending cgroup2 assignment: the AVL node's key is the (heap-allocated)
 * cgroup file name, val is the (heap-allocated) string to be written to it.
 */
struct cgval {
	struct avl_node avl;
	char *val;
};

/* all pending assignments, keyed by cgroup file name (see cgroups_init) */
struct avl_tree cgvals;
/* heap copy of the cgroup directory path handed to cgroups_init() */
static char *cgroup_path;
/* true once cgroups_init() has run; checked by cgroups_free() */
static bool initialized;
58
59 void cgroups_prepare(void) {
60 initialized = false;
61 }
62
63 void cgroups_init(const char *p) {
64 avl_init(&cgvals, avl_strcmp, false, NULL);
65 cgroup_path = strdup(p);
66 initialized = true;
67 }
68
69 static void cgroups_set(const char *key, const char *val)
70 {
71 struct cgval *valp;
72
73 valp = avl_find_element(&cgvals, key, valp, avl);
74 if (!valp) {
75 valp = malloc(sizeof(struct cgval));
76 assert(valp != NULL);
77 valp->avl.key = strdup(key);
78 avl_insert(&cgvals, &valp->avl);
79 } else {
80 DEBUG("overwriting previous cgroup2 assignment %s=\"%s\"!\n", key, valp->val);
81 free(valp->val);
82 }
83
84 valp->val = strdup(val);
85 }
86
87 void cgroups_free(void)
88 {
89 struct cgval *valp, *tmp;
90
91 if (initialized) {
92 avl_remove_all_elements(&cgvals, valp, avl, tmp) {
93 free((void *)(valp->avl.key));
94 free(valp->val);
95 free(valp);
96 }
97 free(cgroup_path);
98 }
99 }
100
101 void cgroups_apply(pid_t pid)
102 {
103 struct cgval *valp;
104 char *cdir, *ent;
105 int fd;
106 size_t maxlen = strlen("cgroup.subtree_control");
107
108 bool cpuset = false,
109 cpu = false,
110 hugetlb = false,
111 io = false,
112 memory = false,
113 pids = false,
114 rdma = false;
115
116 char subtree_control[64] = { 0 };
117
118 DEBUG("using cgroup path %s\n", cgroup_path);
119 mkdir_p(cgroup_path, 0700);
120
121 /* find which controllers need to be enabled */
122 avl_for_each_element(&cgvals, valp, avl) {
123 ent = (char *)valp->avl.key;
124 if (strlen(ent) > maxlen)
125 maxlen = strlen(ent);
126
127 if (!strncmp("cpuset.", ent, 7))
128 cpuset = true;
129 else if (!strncmp("cpu.", ent, 4))
130 cpu = true;
131 else if (!strncmp("hugetlb.", ent, 8))
132 hugetlb = true;
133 else if (!strncmp("io.", ent, 3))
134 io = true;
135 else if (!strncmp("memory.", ent, 7))
136 memory = true;
137 else if (!strncmp("pids.", ent, 5))
138 pids = true;
139 else if (!strncmp("rdma.", ent, 5))
140 pids = true;
141 }
142
143 maxlen += strlen(cgroup_path) + 2;
144
145 if (cpuset)
146 strcat(subtree_control, "+cpuset ");
147
148 if (cpu)
149 strcat(subtree_control, "+cpu ");
150
151 if (hugetlb)
152 strcat(subtree_control, "+hugetlb ");
153
154 if (io)
155 strcat(subtree_control, "+io ");
156
157 if (memory)
158 strcat(subtree_control, "+memory ");
159
160 if (pids)
161 strcat(subtree_control, "+pids ");
162
163 if (rdma)
164 strcat(subtree_control, "+rdma ");
165
166 /* remove trailing space */
167 ent = strchr(subtree_control, '\0') - 1;
168 *ent = '\0';
169
170 ent = malloc(maxlen);
171 assert(ent != 0);
172
173 DEBUG("recursively applying cgroup.subtree_control = \"%s\"\n", subtree_control);
174 cdir = &cgroup_path[strlen(CGROUP_ROOT) - 2];
175 while ((cdir = strchr(cdir + 1, '/'))) {
176 *cdir = '\0';
177 snprintf(ent, maxlen, "%s/cgroup.subtree_control", cgroup_path);
178 DEBUG(" * %s\n", ent);
179 fd = open(ent, O_WRONLY);
180 assert(fd != -1);
181 write(fd, subtree_control, strlen(subtree_control));
182 close(fd);
183 *cdir = '/';
184 }
185
186 avl_for_each_element(&cgvals, valp, avl) {
187 DEBUG("applying cgroup2 %s=\"%s\"\n", (char *)valp->avl.key, valp->val);
188 snprintf(ent, maxlen, "%s/%s", cgroup_path, (char *)valp->avl.key);
189 fd = open(ent, O_WRONLY);
190 if (fd == -1) {
191 ERROR("can't open %s: %m\n", ent);
192 continue;
193 }
194 if (dprintf(fd, "%s", valp->val) < 0) {
195 ERROR("can't write to %s: %m\n", ent);
196 };
197 close(fd);
198 }
199
200 snprintf(ent, maxlen, "%s/%s", cgroup_path, "cgroup.procs");
201 fd = open(ent, O_WRONLY);
202 assert(fd != -1);
203 dprintf(fd, "%d", pid);
204 close(fd);
205
206 free(ent);
207 }
208
209 enum {
210 OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_MAJOR,
211 OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_MINOR,
212 OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_WEIGHT,
213 OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_LEAFWEIGHT,
214 __OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_MAX,
215 };
216
217 static const struct blobmsg_policy oci_linux_cgroups_blockio_weightdevice_policy[] = {
218 [OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_MAJOR] = { "major", BLOBMSG_CAST_INT64 },
219 [OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_MINOR] = { "minor", BLOBMSG_CAST_INT64 },
220 [OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_WEIGHT] = { "weight", BLOBMSG_TYPE_INT32 },
221 [OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_LEAFWEIGHT] = { "leafWeight", BLOBMSG_TYPE_INT32 },
222 };
223
224 enum {
225 OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAJOR,
226 OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MINOR,
227 OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_RATE,
228 __OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAX,
229 };
230
231 static const struct blobmsg_policy oci_linux_cgroups_blockio_throttledevice_policy[] = {
232 [OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAJOR] = { "major", BLOBMSG_CAST_INT64 },
233 [OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MINOR] = { "minor", BLOBMSG_CAST_INT64 },
234 [OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_RATE] = { "rate", BLOBMSG_CAST_INT64 },
235 };
236
237 enum {
238 OCI_LINUX_CGROUPS_BLOCKIO_WEIGHT,
239 OCI_LINUX_CGROUPS_BLOCKIO_LEAFWEIGHT,
240 OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE,
241 OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEREADBPSDEVICE,
242 OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEWRITEBPSDEVICE,
243 OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEREADIOPSDEVICE,
244 OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEWRITEIOPSDEVICE,
245 __OCI_LINUX_CGROUPS_BLOCKIO_MAX,
246 };
247
248 static const struct blobmsg_policy oci_linux_cgroups_blockio_policy[] = {
249 [OCI_LINUX_CGROUPS_BLOCKIO_WEIGHT] = { "weight", BLOBMSG_TYPE_INT32 },
250 [OCI_LINUX_CGROUPS_BLOCKIO_LEAFWEIGHT] = { "leafWeight", BLOBMSG_TYPE_INT32 },
251 [OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE] = { "weightDevice", BLOBMSG_TYPE_ARRAY },
252 [OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEREADBPSDEVICE] = { "throttleReadBpsDevice", BLOBMSG_TYPE_ARRAY },
253 [OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEWRITEBPSDEVICE] = { "throttleWriteBpsDevice", BLOBMSG_TYPE_ARRAY },
254 [OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEREADIOPSDEVICE] = { "throttleReadIOPSDevice", BLOBMSG_TYPE_ARRAY },
255 [OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEWRITEIOPSDEVICE] = { "throttleWriteIOPSDevice", BLOBMSG_TYPE_ARRAY },
256 };
257
/* device number pair; used as the AVL key of struct iomax_line */
struct posix_dev {
	uint64_t major;
	uint64_t minor;
};

/*
 * Collected throttle limits of one device, i.e. one line of "io.max".
 * get_iomax_line() points avl.key at the embedded dev member and initializes
 * each limit to -1, which the line formatter treats as "not set".
 */
struct iomax_line {
	struct avl_node avl;
	struct posix_dev dev;
	uint64_t rbps;	/* read bytes per second */
	uint64_t wbps;	/* write bytes per second */
	uint64_t riops;	/* read I/O operations per second */
	uint64_t wiops;	/* write I/O operations per second */
};
271
272 static int avl_devcmp(const void *k1, const void *k2, void *ptr)
273 {
274 struct posix_dev *d1 = (struct posix_dev *)k1, *d2 = (struct posix_dev *)k2;
275
276 if (d1->major < d2->major)
277 return -1;
278
279 if (d1->major > d2->major)
280 return 1;
281
282 if (d1->minor < d2->minor)
283 return -1;
284
285 if (d1->minor > d2->minor)
286 return 1;
287
288 return 0;
289 }
290
291 static struct iomax_line *get_iomax_line(struct avl_tree *iomax, uint64_t major, uint64_t minor)
292 {
293 struct iomax_line *l;
294 struct posix_dev d;
295 d.major = major;
296 d.minor = minor;
297 l = avl_find_element(iomax, &d, l, avl);
298 if (!l) {
299 l = malloc(sizeof(struct iomax_line));
300 assert(l != NULL);
301 l->dev.major = d.major;
302 l->dev.minor = d.minor;
303 l->avl.key = &l->dev;
304 l->rbps = -1;
305 l->wbps = -1;
306 l->riops = -1;
307 l->wiops = -1;
308 avl_insert(iomax, &l->avl);
309 }
310
311 return l;
312 }
313
314 static int parseOCIlinuxcgroups_legacy_blockio(struct blob_attr *msg)
315 {
316 struct blob_attr *tb[__OCI_LINUX_CGROUPS_BLOCKIO_MAX],
317 *tbwd[__OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_MAX],
318 *tbtd[__OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAX],
319 *cur;
320 int rem;
321 int weight = -1, leafweight = -1;
322 size_t numweightstrs = 0, numiomaxstrs = 0, strtotlen = 1;
323 char **weightstrs = NULL, **iomaxstrs = NULL, **curstr;
324 char *weightstr, *iomaxstr;
325 struct avl_tree iomax;
326 struct iomax_line *curiomax, *tmp;
327
328 blobmsg_parse(oci_linux_cgroups_blockio_policy, __OCI_LINUX_CGROUPS_BLOCKIO_MAX, tb, blobmsg_data(msg), blobmsg_len(msg));
329
330 if (tb[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHT]) {
331 weight = blobmsg_get_u32(tb[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHT]);
332 ++numweightstrs;
333 }
334
335 if (weight > CGROUP_IO_WEIGHT_MAX)
336 return ERANGE;
337
338 if (tb[OCI_LINUX_CGROUPS_BLOCKIO_LEAFWEIGHT])
339 leafweight = blobmsg_get_u32(tb[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHT]);
340
341 if (leafweight > CGROUP_IO_WEIGHT_MAX)
342 return ERANGE;
343
344 blobmsg_for_each_attr(cur, tb[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE], rem)
345 ++numweightstrs;
346
347 weightstrs = calloc(numweightstrs + 1, sizeof(char *));
348 assert(weightstrs != 0);
349 numweightstrs = 0;
350
351 if (weight > -1)
352 asprintf(&weightstrs[numweightstrs++], "default %d", weight);
353
354 blobmsg_for_each_attr(cur, tb[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE], rem) {
355 uint64_t major, minor;
356 int devweight = weight, devleafweight = leafweight;
357
358 blobmsg_parse(oci_linux_cgroups_blockio_weightdevice_policy, __OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_MAX, tbwd, blobmsg_data(cur), blobmsg_len(cur));
359 if (!tbwd[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_MAJOR] ||
360 !tbwd[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_MINOR])
361 return ENODATA;
362
363 if (!tbwd[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_WEIGHT] &&
364 !tbwd[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_LEAFWEIGHT])
365 return ENODATA;
366
367 if (tbwd[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_WEIGHT])
368 devweight = blobmsg_get_u32(tbwd[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_WEIGHT]);
369
370 if (devweight > CGROUP_IO_WEIGHT_MAX)
371 return ERANGE;
372
373 if (tbwd[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_LEAFWEIGHT])
374 devleafweight = blobmsg_get_u32(tbwd[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_LEAFWEIGHT]);
375
376 if (devleafweight > CGROUP_IO_WEIGHT_MAX)
377 return ERANGE;
378
379 if (tbwd[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_LEAFWEIGHT])
380 return ENOTSUP;
381
382 major = blobmsg_cast_u64(tbwd[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_MAJOR]);
383 minor = blobmsg_cast_u64(tbwd[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_MINOR]);
384
385 asprintf(&weightstrs[numweightstrs++], "%" PRIu64 ":%" PRIu64 " %u", major, minor, devweight);
386 }
387
388 if (numweightstrs) {
389 curstr = weightstrs;
390 while (*curstr)
391 strtotlen += strlen(*(curstr++)) + 1;
392
393 weightstr = calloc(strtotlen, sizeof(char));
394 assert(weightstr != 0);
395
396 curstr = weightstrs;
397 while (*curstr) {
398 strcat(weightstr, *curstr);
399 strcat(weightstr, "\n");
400 free(*(curstr++));
401 }
402
403 cgroups_set("io.bfq.weight", weightstr);
404 free(weightstr);
405 };
406
407 free(weightstrs);
408
409 avl_init(&iomax, avl_devcmp, false, NULL);
410
411 blobmsg_for_each_attr(cur, tb[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEREADBPSDEVICE], rem) {
412 struct iomax_line *l;
413
414 blobmsg_parse(oci_linux_cgroups_blockio_throttledevice_policy, __OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAX, tbtd, blobmsg_data(cur), blobmsg_len(cur));
415
416 if (!tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAJOR] ||
417 !tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MINOR] ||
418 !tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_RATE])
419 return ENODATA;
420
421 l = get_iomax_line(&iomax,
422 blobmsg_cast_u64(tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAJOR]),
423 blobmsg_cast_u64(tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MINOR]));
424
425 l->rbps = blobmsg_cast_u64(tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_RATE]);
426 }
427
428 blobmsg_for_each_attr(cur, tb[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEWRITEBPSDEVICE], rem) {
429 struct iomax_line *l;
430
431 blobmsg_parse(oci_linux_cgroups_blockio_throttledevice_policy, __OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAX, tbtd, blobmsg_data(cur), blobmsg_len(cur));
432
433 if (!tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAJOR] ||
434 !tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MINOR] ||
435 !tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_RATE])
436 return ENODATA;
437
438 l = get_iomax_line(&iomax,
439 blobmsg_cast_u64(tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAJOR]),
440 blobmsg_cast_u64(tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MINOR]));
441
442 l->wbps = blobmsg_cast_u64(tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_RATE]);
443 }
444
445 blobmsg_for_each_attr(cur, tb[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEREADIOPSDEVICE], rem) {
446 struct iomax_line *l;
447
448 blobmsg_parse(oci_linux_cgroups_blockio_throttledevice_policy, __OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAX, tbtd, blobmsg_data(cur), blobmsg_len(cur));
449
450 if (!tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAJOR] ||
451 !tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MINOR] ||
452 !tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_RATE])
453 return ENODATA;
454
455 l = get_iomax_line(&iomax,
456 blobmsg_cast_u64(tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAJOR]),
457 blobmsg_cast_u64(tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MINOR]));
458
459 l->riops = blobmsg_cast_u64(tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_RATE]);
460 }
461
462 blobmsg_for_each_attr(cur, tb[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEWRITEIOPSDEVICE], rem) {
463 struct iomax_line *l;
464
465 blobmsg_parse(oci_linux_cgroups_blockio_throttledevice_policy, __OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAX, tbtd, blobmsg_data(cur), blobmsg_len(cur));
466
467 if (!tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAJOR] ||
468 !tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MINOR] ||
469 !tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_RATE])
470 return ENODATA;
471
472 l = get_iomax_line(&iomax,
473 blobmsg_cast_u64(tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAJOR]),
474 blobmsg_cast_u64(tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MINOR]));
475
476 l->wiops = blobmsg_cast_u64(tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_RATE]);
477 }
478
479 avl_for_each_element(&iomax, curiomax, avl)
480 ++numiomaxstrs;
481
482 if (!numiomaxstrs)
483 return 0;
484
485 iomaxstrs = calloc(numiomaxstrs + 1, sizeof(char *));
486 assert(iomaxstrs != 0);
487 numiomaxstrs = 0;
488
489 avl_for_each_element(&iomax, curiomax, avl) {
490 char iomaxlstr[160];
491 char lstr[32];
492
493 sprintf(iomaxlstr, "%" PRIu64 ":%" PRIu64 " ", curiomax->dev.major, curiomax->dev.minor);
494
495 if (curiomax->rbps != -1) {
496 sprintf(lstr, "rbps=%" PRIu64 " ", curiomax->rbps);
497 strcat(iomaxlstr, lstr);
498 }
499 if (curiomax->wbps != -1) {
500 sprintf(lstr, "wbps=%" PRIu64 " ", curiomax->wbps);
501 strcat(iomaxlstr, lstr);
502 }
503 if (curiomax->riops != -1) {
504 sprintf(lstr, "riops=%" PRIu64 " ", curiomax->riops);
505 strcat(iomaxlstr, lstr);
506 }
507 if (curiomax->wiops != -1) {
508 sprintf(lstr, "wiops=%" PRIu64 " ", curiomax->wiops);
509 strcat(iomaxlstr, lstr);
510 }
511
512 iomaxstrs[numiomaxstrs++] = strdup(iomaxlstr);
513 }
514
515 avl_for_each_element_safe(&iomax, curiomax, avl, tmp) {
516 avl_delete(&iomax, &curiomax->avl);
517 free(curiomax);
518 }
519
520 strtotlen = 1; /* 1 accounts for \0 at end of string */
521 if (numiomaxstrs) {
522 curstr = iomaxstrs;
523 while (*curstr)
524 strtotlen += strlen(*(curstr++)) + 1; /* +1 accounts for \n at end of line */
525
526 iomaxstr = calloc(strtotlen, sizeof(char));
527 assert(iomaxstr != 0);
528 curstr = iomaxstrs;
529
530 while (*curstr) {
531 strcat(iomaxstr, *curstr);
532 strcat(iomaxstr, "\n");
533 free(*(curstr++));
534 }
535
536 cgroups_set("io.max", iomaxstr);
537 free(iomaxstr);
538 };
539
540 free(iomaxstrs);
541
542 return 0;
543 }
544
545
546 enum {
547 OCI_LINUX_CGROUPS_CPU_SHARES,
548 OCI_LINUX_CGROUPS_CPU_PERIOD,
549 OCI_LINUX_CGROUPS_CPU_QUOTA,
550 OCI_LINUX_CGROUPS_CPU_REALTIMERUNTIME,
551 OCI_LINUX_CGROUPS_CPU_REALTIMEPERIOD,
552 OCI_LINUX_CGROUPS_CPU_CPUS,
553 OCI_LINUX_CGROUPS_CPU_MEMS,
554 __OCI_LINUX_CGROUPS_CPU_MAX,
555 };
556
557 static const struct blobmsg_policy oci_linux_cgroups_cpu_policy[] = {
558 [OCI_LINUX_CGROUPS_CPU_SHARES] = { "shares", BLOBMSG_CAST_INT64 },
559 [OCI_LINUX_CGROUPS_CPU_PERIOD] = { "period", BLOBMSG_CAST_INT64 },
560 [OCI_LINUX_CGROUPS_CPU_QUOTA] = { "quota", BLOBMSG_CAST_INT64 }, /* signed int64! */
561 [OCI_LINUX_CGROUPS_CPU_REALTIMEPERIOD] = { "realtimePeriod", BLOBMSG_CAST_INT64 },
562 [OCI_LINUX_CGROUPS_CPU_REALTIMERUNTIME] = { "realtimeRuntime", BLOBMSG_CAST_INT64 },
563 [OCI_LINUX_CGROUPS_CPU_CPUS] = { "cpus", BLOBMSG_TYPE_STRING },
564 [OCI_LINUX_CGROUPS_CPU_MEMS] = { "mems", BLOBMSG_TYPE_STRING },
565 };
566
567 static int parseOCIlinuxcgroups_legacy_cpu(struct blob_attr *msg)
568 {
569 struct blob_attr *tb[__OCI_LINUX_CGROUPS_CPU_MAX];
570 uint64_t shares, period = 0;
571 int64_t quota = -2; /* unset */
572 char tmp[32] = { 0 };
573
574 blobmsg_parse(oci_linux_cgroups_cpu_policy, __OCI_LINUX_CGROUPS_CPU_MAX, tb, blobmsg_data(msg), blobmsg_len(msg));
575
576 if (tb[OCI_LINUX_CGROUPS_CPU_REALTIMEPERIOD] ||
577 tb[OCI_LINUX_CGROUPS_CPU_REALTIMERUNTIME])
578 return ENOTSUP; /* no equivalent in cgroup2 */
579
580 if (tb[OCI_LINUX_CGROUPS_CPU_SHARES]) {
581 shares = blobmsg_cast_u64(tb[OCI_LINUX_CGROUPS_CPU_SHARES]);
582 if ((shares < 2) || (shares > 262144))
583 return ERANGE;
584
585 snprintf(tmp, sizeof(tmp), "%" PRIu64, (((uint64_t)1) + ((shares - 2) * 9999) / 262142));
586 cgroups_set("cpu.weight", tmp);
587 tmp[0] = '\0';
588 }
589
590 if (tb[OCI_LINUX_CGROUPS_CPU_QUOTA])
591 quota = blobmsg_cast_s64(tb[OCI_LINUX_CGROUPS_CPU_QUOTA]);
592
593 if (tb[OCI_LINUX_CGROUPS_CPU_PERIOD])
594 period = blobmsg_cast_u64(tb[OCI_LINUX_CGROUPS_CPU_PERIOD]);
595
596 if (period) {
597 if (quota >= 0)
598 snprintf(tmp, sizeof(tmp), "%" PRId64 " %" PRIu64 , quota, period);
599 else
600 snprintf(tmp, sizeof(tmp), "max %" PRIu64, period); /* assume default */
601 } else if (quota >= 0) {
602 snprintf(tmp, sizeof(tmp), "%" PRId64, quota);
603 } else if (quota == -1) {
604 strcpy(tmp, "max");
605 }
606
607 if (tmp[0])
608 cgroups_set("cpu.max", tmp);
609
610 if (tb[OCI_LINUX_CGROUPS_CPU_CPUS])
611 cgroups_set("cpuset.cpus", blobmsg_get_string(tb[OCI_LINUX_CGROUPS_CPU_CPUS]));
612
613 if (tb[OCI_LINUX_CGROUPS_CPU_MEMS])
614 cgroups_set("cpuset.mems", blobmsg_get_string(tb[OCI_LINUX_CGROUPS_CPU_MEMS]));
615
616 return 0;
617 }
618
619
620 enum {
621 OCI_LINUX_CGROUPS_MEMORY_LIMIT,
622 OCI_LINUX_CGROUPS_MEMORY_RESERVATION,
623 OCI_LINUX_CGROUPS_MEMORY_SWAP,
624 OCI_LINUX_CGROUPS_MEMORY_KERNEL,
625 OCI_LINUX_CGROUPS_MEMORY_KERNELTCP,
626 OCI_LINUX_CGROUPS_MEMORY_SWAPPINESS,
627 OCI_LINUX_CGROUPS_MEMORY_DISABLEOOMKILLER,
628 OCI_LINUX_CGROUPS_MEMORY_USEHIERARCHY,
629 __OCI_LINUX_CGROUPS_MEMORY_MAX,
630 };
631
632 static const struct blobmsg_policy oci_linux_cgroups_memory_policy[] = {
633 [OCI_LINUX_CGROUPS_MEMORY_LIMIT] = { "limit", BLOBMSG_CAST_INT64 }, /* signed int64! */
634 [OCI_LINUX_CGROUPS_MEMORY_RESERVATION] = { "reservation", BLOBMSG_CAST_INT64 }, /* signed int64! */
635 [OCI_LINUX_CGROUPS_MEMORY_SWAP] = { "swap", BLOBMSG_CAST_INT64 }, /* signed int64! */
636 [OCI_LINUX_CGROUPS_MEMORY_KERNEL] = { "kernel", BLOBMSG_CAST_INT64 }, /* signed int64! ignored */
637 [OCI_LINUX_CGROUPS_MEMORY_KERNELTCP] = { "kernelTCP", BLOBMSG_CAST_INT64 }, /* signed int64! ignored */
638 [OCI_LINUX_CGROUPS_MEMORY_SWAPPINESS] = { "swappiness", BLOBMSG_CAST_INT64 },
639 [OCI_LINUX_CGROUPS_MEMORY_DISABLEOOMKILLER] = { "disableOOMKiller", BLOBMSG_TYPE_BOOL },
640 [OCI_LINUX_CGROUPS_MEMORY_USEHIERARCHY] { "useHierarchy", BLOBMSG_TYPE_BOOL },
641 };
642
643 static int parseOCIlinuxcgroups_legacy_memory(struct blob_attr *msg)
644 {
645 struct blob_attr *tb[__OCI_LINUX_CGROUPS_MEMORY_MAX];
646 char tmp[32] = { 0 };
647 int64_t limit, swap, reservation;
648
649 blobmsg_parse(oci_linux_cgroups_memory_policy, __OCI_LINUX_CGROUPS_MEMORY_MAX, tb, blobmsg_data(msg), blobmsg_len(msg));
650
651 /*
652 * not all properties of the OCI memory section can be mapped to cgroup2
653 * kernel memory accounting is always enabled and included in the set
654 * memory limit, hence these options can be ignored
655 * disableOOMKiller could be emulated using oom_score_adj + seccomp eBPF
656 * preventing self-upgrade (but allow downgrade)
657 *
658 * see also https://github.com/opencontainers/runtime-spec/issues/1005
659 */
660 if (tb[OCI_LINUX_CGROUPS_MEMORY_SWAPPINESS] ||
661 tb[OCI_LINUX_CGROUPS_MEMORY_DISABLEOOMKILLER] ||
662 tb[OCI_LINUX_CGROUPS_MEMORY_USEHIERARCHY])
663 return ENOTSUP;
664
665
666 if (tb[OCI_LINUX_CGROUPS_MEMORY_LIMIT]) {
667 limit = blobmsg_cast_s64(tb[OCI_LINUX_CGROUPS_MEMORY_LIMIT]);
668 if (limit == -1)
669 strcpy(tmp, "max");
670 else
671 snprintf(tmp, sizeof(tmp), "%" PRId64, limit);
672
673 cgroups_set("memory.max", tmp);
674 }
675
676 if (tb[OCI_LINUX_CGROUPS_MEMORY_RESERVATION]) {
677 reservation = blobmsg_cast_s64(tb[OCI_LINUX_CGROUPS_MEMORY_RESERVATION]);
678
679 if (reservation == -1)
680 strcpy(tmp, "max");
681 else
682 snprintf(tmp, sizeof(tmp), "%" PRId64, reservation);
683
684 cgroups_set("memory.low", tmp);
685 }
686
687 /* OCI 'swap' acounts for memory+swap */
688 if (tb[OCI_LINUX_CGROUPS_MEMORY_SWAP]) {
689 swap = blobmsg_cast_s64(tb[OCI_LINUX_CGROUPS_MEMORY_SWAP]);
690
691 if (swap == -1)
692 strcpy(tmp, "max");
693 else if (limit == -1 || (limit < swap))
694 snprintf(tmp, sizeof(tmp), "%" PRId64, swap);
695 else
696 snprintf(tmp, sizeof(tmp), "%" PRId64, limit - swap);
697
698 cgroups_set("memory.swap_max", tmp);
699 }
700
701 return 0;
702 }
703
704
705 enum {
706 OCI_LINUX_CGROUPS_PIDS_LIMIT,
707 __OCI_LINUX_CGROUPS_PIDS_MAX,
708 };
709
710 static const struct blobmsg_policy oci_linux_cgroups_pids_policy[] = {
711 [OCI_LINUX_CGROUPS_PIDS_LIMIT] = { "limit", BLOBMSG_CAST_INT64 },
712 };
713
714 static int parseOCIlinuxcgroups_legacy_pids(struct blob_attr *msg)
715 {
716 struct blob_attr *tb[__OCI_LINUX_CGROUPS_MEMORY_MAX];
717 char tmp[32] = { 0 };
718
719 blobmsg_parse(oci_linux_cgroups_pids_policy, __OCI_LINUX_CGROUPS_PIDS_MAX, tb, blobmsg_data(msg), blobmsg_len(msg));
720
721 if (!tb[OCI_LINUX_CGROUPS_PIDS_LIMIT])
722 return EINVAL;
723
724 snprintf(tmp, sizeof(tmp), "%" PRIu64, blobmsg_cast_u64(tb[OCI_LINUX_CGROUPS_PIDS_LIMIT]));
725
726 cgroups_set("pids.max", tmp);
727
728 return 0;
729 }
730
731 static int parseOCIlinuxcgroups_unified(struct blob_attr *msg)
732 {
733 struct blob_attr *cur;
734 int rem;
735
736 blobmsg_for_each_attr(cur, msg, rem) {
737 if (blobmsg_type(cur) != BLOBMSG_TYPE_STRING)
738 return EINVAL;
739
740 /* restrict keys */
741 if (strchr(blobmsg_name(cur), '/') ||
742 !strcmp(blobmsg_name(cur), "cgroup.subtree_control") ||
743 !strcmp(blobmsg_name(cur), "cgroup.procs") ||
744 !strcmp(blobmsg_name(cur), "cgroup.threads") ||
745 !strcmp(blobmsg_name(cur), "cgroup.freeze"))
746 return EINVAL;
747
748 cgroups_set(blobmsg_name(cur), blobmsg_get_string(cur));
749 }
750
751 return 0;
752 }
753
754 enum {
755 OCI_LINUX_CGROUPS_BLOCKIO,
756 OCI_LINUX_CGROUPS_CPU,
757 OCI_LINUX_CGROUPS_DEVICES,
758 OCI_LINUX_CGROUPS_HUGEPAGELIMITS,
759 OCI_LINUX_CGROUPS_INTELRDT,
760 OCI_LINUX_CGROUPS_MEMORY,
761 OCI_LINUX_CGROUPS_NETWORK,
762 OCI_LINUX_CGROUPS_PIDS,
763 OCI_LINUX_CGROUPS_RDMA,
764 OCI_LINUX_CGROUPS_UNIFIED,
765 __OCI_LINUX_CGROUPS_MAX,
766 };
767
768 static const struct blobmsg_policy oci_linux_cgroups_policy[] = {
769 [OCI_LINUX_CGROUPS_BLOCKIO] = { "blockIO", BLOBMSG_TYPE_TABLE },
770 [OCI_LINUX_CGROUPS_CPU] = { "cpu", BLOBMSG_TYPE_TABLE },
771 [OCI_LINUX_CGROUPS_DEVICES] = { "devices", BLOBMSG_TYPE_ARRAY },
772 [OCI_LINUX_CGROUPS_HUGEPAGELIMITS] = { "hugepageLimits", BLOBMSG_TYPE_ARRAY },
773 [OCI_LINUX_CGROUPS_INTELRDT] = { "intelRdt", BLOBMSG_TYPE_TABLE },
774 [OCI_LINUX_CGROUPS_MEMORY] = { "memory", BLOBMSG_TYPE_TABLE },
775 [OCI_LINUX_CGROUPS_NETWORK] = { "network", BLOBMSG_TYPE_TABLE },
776 [OCI_LINUX_CGROUPS_PIDS] = { "pids", BLOBMSG_TYPE_TABLE },
777 [OCI_LINUX_CGROUPS_RDMA] = { "rdma", BLOBMSG_TYPE_TABLE },
778 [OCI_LINUX_CGROUPS_UNIFIED] = { "unified", BLOBMSG_TYPE_TABLE },
779 };
780
781 int parseOCIlinuxcgroups(struct blob_attr *msg)
782 {
783 struct blob_attr *tb[__OCI_LINUX_CGROUPS_MAX];
784 int ret;
785
786 blobmsg_parse(oci_linux_cgroups_policy, __OCI_LINUX_CGROUPS_MAX, tb, blobmsg_data(msg), blobmsg_len(msg));
787
788 if (tb[OCI_LINUX_CGROUPS_DEVICES] ||
789 tb[OCI_LINUX_CGROUPS_HUGEPAGELIMITS] ||
790 tb[OCI_LINUX_CGROUPS_INTELRDT] ||
791 tb[OCI_LINUX_CGROUPS_NETWORK] ||
792 tb[OCI_LINUX_CGROUPS_RDMA])
793 return ENOTSUP;
794
795 if (tb[OCI_LINUX_CGROUPS_BLOCKIO]) {
796 ret = parseOCIlinuxcgroups_legacy_blockio(tb[OCI_LINUX_CGROUPS_BLOCKIO]);
797 if (ret)
798 return ret;
799 }
800
801 if (tb[OCI_LINUX_CGROUPS_CPU]) {
802 ret = parseOCIlinuxcgroups_legacy_cpu(tb[OCI_LINUX_CGROUPS_CPU]);
803 if (ret)
804 return ret;
805 }
806
807 if (tb[OCI_LINUX_CGROUPS_MEMORY]) {
808 ret = parseOCIlinuxcgroups_legacy_memory(tb[OCI_LINUX_CGROUPS_MEMORY]);
809 if (ret)
810 return ret;
811 }
812
813 if (tb[OCI_LINUX_CGROUPS_PIDS]) {
814 ret = parseOCIlinuxcgroups_legacy_pids(tb[OCI_LINUX_CGROUPS_PIDS]);
815 if (ret)
816 return ret;
817 }
818
819 if (tb[OCI_LINUX_CGROUPS_UNIFIED]) {
820 ret = parseOCIlinuxcgroups_unified(tb[OCI_LINUX_CGROUPS_UNIFIED]);
821 if (ret)
822 return ret;
823 }
824
825 return 0;
826 }