trace: use standard POSIX header for basename()
[project/procd.git] / jail / cgroups.c
index 2317472eb28a6d8296c91552748a061d534155ca..2d3dce4d31acc25b0c4c38643e55928f7743847f 100644 (file)
  * https://github.com/containers/crun/blob/0.14.1/crun.1.md#cgroup-v2
  *
  * ToDo:
- *  - convert cgroup1 devices to eBPF program
  *  - convert cgroup1 net_prio and net_cls to eBPF program
  *  - rdma (anyone?) intelrdt (anyone?)
  */
 
 #define _GNU_SOURCE
 
-#include <assert.h>
 #include <errno.h>
 #include <fcntl.h>
 #include <stdlib.h>
 #include <libubox/avl-cmp.h>
 #include <libubox/blobmsg.h>
 #include <libubox/list.h>
+#include <libubox/utils.h>
 
-#include "fs.h"
 #include "log.h"
 #include "cgroups.h"
+#include "cgroups-bpf.h"
 
 #define CGROUP_ROOT "/sys/fs/cgroup/"
 #define CGROUP_IO_WEIGHT_MAX 10000
@@ -54,10 +53,16 @@ struct cgval {
 
 struct avl_tree cgvals;
 static char *cgroup_path;
+static bool initialized;
+
+void cgroups_prepare(void) {
+       initialized = false;
+}
 
 void cgroups_init(const char *p) {
        avl_init(&cgvals, avl_strcmp, false, NULL);
        cgroup_path = strdup(p);
+       initialized = true;
 }
 
 static void cgroups_set(const char *key, const char *val)
@@ -67,7 +72,9 @@ static void cgroups_set(const char *key, const char *val)
        valp = avl_find_element(&cgvals, key, valp, avl);
        if (!valp) {
                valp = malloc(sizeof(struct cgval));
-               assert(valp != NULL);
+               if (!valp)
+                       exit(ENOMEM);
+
                valp->avl.key = strdup(key);
                avl_insert(&cgvals, &valp->avl);
        } else {
@@ -81,13 +88,15 @@ static void cgroups_set(const char *key, const char *val)
 void cgroups_free(void)
 {
        struct cgval *valp, *tmp;
-       avl_for_each_element_safe(&cgvals, valp, avl, tmp) {
-               avl_delete(&cgvals, &valp->avl);
-               free((void *)(valp->avl.key));
-               free(valp->val);
-               free(valp);
+
+       if (initialized) {
+               avl_remove_all_elements(&cgvals, valp, avl, tmp) {
+                       free((void *)(valp->avl.key));
+                       free(valp->val);
+                       free(valp);
+               }
+               free(cgroup_path);
        }
-       free(cgroup_path);
 }
 
 void cgroups_apply(pid_t pid)
@@ -129,7 +138,7 @@ void cgroups_apply(pid_t pid)
                else if (!strncmp("pids.", ent, 5))
                        pids = true;
                else if (!strncmp("rdma.", ent, 5))
-                       pids = true;
+                       rdma = true;
        }
 
        maxlen += strlen(cgroup_path) + 2;
@@ -160,7 +169,8 @@ void cgroups_apply(pid_t pid)
        *ent = '\0';
 
        ent = malloc(maxlen);
-       assert(ent != 0);
+       if (!ent)
+               exit(ENOMEM);
 
        DEBUG("recursively applying cgroup.subtree_control = \"%s\"\n", subtree_control);
        cdir = &cgroup_path[strlen(CGROUP_ROOT) - 2];
@@ -168,9 +178,17 @@ void cgroups_apply(pid_t pid)
                *cdir = '\0';
                snprintf(ent, maxlen, "%s/cgroup.subtree_control", cgroup_path);
                DEBUG(" * %s\n", ent);
-               fd = open(ent, O_WRONLY);
-               assert(fd != -1);
-               write(fd, subtree_control, strlen(subtree_control));
+               if ((fd = open(ent, O_WRONLY)) < 0) {
+                       ERROR("can't open %s: %m\n", ent);
+                       continue;
+               }
+
+               if (write(fd, subtree_control, strlen(subtree_control)) == -1) {
+                       ERROR("can't write to %s: %m\n", ent);
+                       close(fd);
+                       continue;
+               }
+
                close(fd);
                *cdir = '/';
        }
@@ -179,7 +197,7 @@ void cgroups_apply(pid_t pid)
                DEBUG("applying cgroup2 %s=\"%s\"\n", (char *)valp->avl.key, valp->val);
                snprintf(ent, maxlen, "%s/%s", cgroup_path, (char *)valp->avl.key);
                fd = open(ent, O_WRONLY);
-               if (fd == -1) {
+               if (fd < 0) {
                        ERROR("can't open %s: %m\n", ent);
                        continue;
                }
@@ -189,15 +207,24 @@ void cgroups_apply(pid_t pid)
                close(fd);
        }
 
+       int dirfd = open(cgroup_path, O_DIRECTORY);
+       if (dirfd < 0) {
+               ERROR("can't open %s: %m\n", cgroup_path);
+       } else {
+               attach_cgroups_ebpf(dirfd);
+               close(dirfd);
+       }
+
        snprintf(ent, maxlen, "%s/%s", cgroup_path, "cgroup.procs");
        fd = open(ent, O_WRONLY);
-       assert(fd != -1);
-       dprintf(fd, "%d", pid);
-       close(fd);
+       if (fd < 0) {
+               ERROR("can't open %s: %m\n", cgroup_path);
+       } else {
+               dprintf(fd, "%d", pid);
+               close(fd);
+       }
 
        free(ent);
-
-       cgroups_free();
 }
 
 enum {
@@ -291,7 +318,9 @@ static struct iomax_line *get_iomax_line(struct avl_tree *iomax, uint64_t major,
        l = avl_find_element(iomax, &d, l, avl);
        if (!l) {
                l = malloc(sizeof(struct iomax_line));
-               assert(l != NULL);
+               if (!l)
+                       exit(ENOMEM);
+
                l->dev.major = d.major;
                l->dev.minor = d.minor;
                l->avl.key = &l->dev;
@@ -339,11 +368,14 @@ static int parseOCIlinuxcgroups_legacy_blockio(struct blob_attr *msg)
                ++numweightstrs;
 
        weightstrs = calloc(numweightstrs + 1, sizeof(char *));
-       assert(weightstrs != 0);
+       if (!weightstrs)
+               exit(ENOMEM);
+
        numweightstrs = 0;
 
        if (weight > -1)
-               asprintf(&weightstrs[numweightstrs++], "default %d", weight);
+               if (asprintf(&weightstrs[numweightstrs++], "default %d", weight) < 0)
+                       return ENOMEM;
 
        blobmsg_for_each_attr(cur, tb[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE], rem) {
                uint64_t major, minor;
@@ -376,7 +408,8 @@ static int parseOCIlinuxcgroups_legacy_blockio(struct blob_attr *msg)
                major = blobmsg_cast_u64(tbwd[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_MAJOR]);
                minor = blobmsg_cast_u64(tbwd[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_MINOR]);
 
-               asprintf(&weightstrs[numweightstrs++], "%" PRIu64 ":%" PRIu64 " %u", major, minor, devweight);
+               if (asprintf(&weightstrs[numweightstrs++], "%" PRIu64 ":%" PRIu64 " %u", major, minor, devweight) < 0)
+                       return ENOMEM;
        }
 
        if (numweightstrs) {
@@ -385,7 +418,8 @@ static int parseOCIlinuxcgroups_legacy_blockio(struct blob_attr *msg)
                        strtotlen += strlen(*(curstr++)) + 1;
 
                weightstr = calloc(strtotlen, sizeof(char));
-               assert(weightstr != 0);
+               if (!weightstr)
+                       exit(ENOMEM);
 
                curstr = weightstrs;
                while (*curstr) {
@@ -477,7 +511,9 @@ static int parseOCIlinuxcgroups_legacy_blockio(struct blob_attr *msg)
                return 0;
 
        iomaxstrs = calloc(numiomaxstrs + 1, sizeof(char *));
-       assert(iomaxstrs != 0);
+       if (!iomaxstrs)
+               exit(ENOMEM);
+
        numiomaxstrs = 0;
 
        avl_for_each_element(&iomax, curiomax, avl) {
@@ -518,7 +554,9 @@ static int parseOCIlinuxcgroups_legacy_blockio(struct blob_attr *msg)
                        strtotlen += strlen(*(curstr++)) + 1; /* +1 accounts for \n at end of line */
 
                iomaxstr = calloc(strtotlen, sizeof(char));
-               assert(iomaxstr != 0);
+               if (!iomaxstr)
+                       exit(ENOMEM);
+
                curstr = iomaxstrs;
 
                while (*curstr) {
@@ -627,8 +665,8 @@ static const struct blobmsg_policy oci_linux_cgroups_memory_policy[] = {
        [OCI_LINUX_CGROUPS_MEMORY_LIMIT] = { "limit", BLOBMSG_CAST_INT64 }, /* signed int64! */
        [OCI_LINUX_CGROUPS_MEMORY_RESERVATION] = { "reservation", BLOBMSG_CAST_INT64 }, /* signed int64! */
        [OCI_LINUX_CGROUPS_MEMORY_SWAP] = { "swap", BLOBMSG_CAST_INT64 }, /* signed int64! */
-       [OCI_LINUX_CGROUPS_MEMORY_KERNEL] = { "kernel", BLOBMSG_CAST_INT64 }, /* signed int64! */
-       [OCI_LINUX_CGROUPS_MEMORY_KERNELTCP] = { "kernelTCP", BLOBMSG_CAST_INT64 }, /* signed int64! */
+       [OCI_LINUX_CGROUPS_MEMORY_KERNEL] = { "kernel", BLOBMSG_CAST_INT64 }, /* signed int64! ignored */
+       [OCI_LINUX_CGROUPS_MEMORY_KERNELTCP] = { "kernelTCP", BLOBMSG_CAST_INT64 }, /* signed int64! ignored */
        [OCI_LINUX_CGROUPS_MEMORY_SWAPPINESS] = { "swappiness", BLOBMSG_CAST_INT64 },
        [OCI_LINUX_CGROUPS_MEMORY_DISABLEOOMKILLER] = { "disableOOMKiller", BLOBMSG_TYPE_BOOL },
        [OCI_LINUX_CGROUPS_MEMORY_USEHIERARCHY] { "useHierarchy", BLOBMSG_TYPE_BOOL },
@@ -638,16 +676,24 @@ static int parseOCIlinuxcgroups_legacy_memory(struct blob_attr *msg)
 {
        struct blob_attr *tb[__OCI_LINUX_CGROUPS_MEMORY_MAX];
        char tmp[32] = { 0 };
-       int64_t limit, swap, reservation;
+       int64_t limit = -1, swap, reservation;
 
        blobmsg_parse(oci_linux_cgroups_memory_policy, __OCI_LINUX_CGROUPS_MEMORY_MAX, tb, blobmsg_data(msg), blobmsg_len(msg));
 
-       if (tb[OCI_LINUX_CGROUPS_MEMORY_KERNEL] ||
-           tb[OCI_LINUX_CGROUPS_MEMORY_KERNELTCP] ||
-           tb[OCI_LINUX_CGROUPS_MEMORY_SWAPPINESS] ||
+       /*
+        * Not all properties of the OCI memory section can be mapped to cgroup2.
+        * Kernel memory accounting is always enabled and included in the set
+        *   memory limit, hence 'kernel' and 'kernelTCP' can be ignored.
+        * disableOOMKiller could be emulated using oom_score_adj + seccomp eBPF
+        *   preventing self-upgrade (but allowing downgrade).
+        *
+        * see also https://github.com/opencontainers/runtime-spec/issues/1005
+        */
+       if (tb[OCI_LINUX_CGROUPS_MEMORY_SWAPPINESS] ||
            tb[OCI_LINUX_CGROUPS_MEMORY_DISABLEOOMKILLER] ||
            tb[OCI_LINUX_CGROUPS_MEMORY_USEHIERARCHY])
-               return ENOTSUP; /* no equivalent in cgroup2 */
+               return ENOTSUP;
+
 
        if (tb[OCI_LINUX_CGROUPS_MEMORY_LIMIT]) {
                limit = blobmsg_cast_s64(tb[OCI_LINUX_CGROUPS_MEMORY_LIMIT]);
@@ -670,13 +716,16 @@ static int parseOCIlinuxcgroups_legacy_memory(struct blob_attr *msg)
                cgroups_set("memory.low", tmp);
        }
 
+       /* OCI 'swap' accounts for memory+swap */
        if (tb[OCI_LINUX_CGROUPS_MEMORY_SWAP]) {
                swap = blobmsg_cast_s64(tb[OCI_LINUX_CGROUPS_MEMORY_SWAP]);
 
                if (swap == -1)
                        strcpy(tmp, "max");
-               else
+               else if (limit == -1 || (limit < swap))
                        snprintf(tmp, sizeof(tmp), "%" PRId64, swap);
+               else
+                       snprintf(tmp, sizeof(tmp), "%" PRId64, limit - swap);
 
                cgroups_set("memory.swap_max", tmp);
        }
@@ -720,6 +769,14 @@ static int parseOCIlinuxcgroups_unified(struct blob_attr *msg)
                if (blobmsg_type(cur) != BLOBMSG_TYPE_STRING)
                        return EINVAL;
 
+               /* restrict keys */
+               if (strchr(blobmsg_name(cur), '/') ||
+                   !strcmp(blobmsg_name(cur), "cgroup.subtree_control") ||
+                   !strcmp(blobmsg_name(cur), "cgroup.procs") ||
+                   !strcmp(blobmsg_name(cur), "cgroup.threads") ||
+                   !strcmp(blobmsg_name(cur), "cgroup.freeze"))
+                       return EINVAL;
+
                cgroups_set(blobmsg_name(cur), blobmsg_get_string(cur));
        }
 
@@ -760,8 +817,7 @@ int parseOCIlinuxcgroups(struct blob_attr *msg)
 
        blobmsg_parse(oci_linux_cgroups_policy, __OCI_LINUX_CGROUPS_MAX, tb, blobmsg_data(msg), blobmsg_len(msg));
 
-       if (tb[OCI_LINUX_CGROUPS_DEVICES] ||
-           tb[OCI_LINUX_CGROUPS_HUGEPAGELIMITS] ||
+       if (tb[OCI_LINUX_CGROUPS_HUGEPAGELIMITS] ||
            tb[OCI_LINUX_CGROUPS_INTELRDT] ||
            tb[OCI_LINUX_CGROUPS_NETWORK] ||
            tb[OCI_LINUX_CGROUPS_RDMA])
@@ -779,6 +835,12 @@ int parseOCIlinuxcgroups(struct blob_attr *msg)
                        return ret;
        }
 
+       if (tb[OCI_LINUX_CGROUPS_DEVICES]) {
+               ret = parseOCIlinuxcgroups_devices(tb[OCI_LINUX_CGROUPS_DEVICES]);
+               if (ret)
+                       return ret;
+       }
+
        if (tb[OCI_LINUX_CGROUPS_MEMORY]) {
                ret = parseOCIlinuxcgroups_legacy_memory(tb[OCI_LINUX_CGROUPS_MEMORY]);
                if (ret)