jail: parse OCI cgroups resources
authorDaniel Golle <daniel@makrotopia.org>
Wed, 5 Aug 2020 17:37:53 +0000 (18:37 +0100)
committerDaniel Golle <daniel@makrotopia.org>
Thu, 6 Aug 2020 14:19:18 +0000 (15:19 +0100)
Start pure cgroup2 implementation with emulation of (some) cgroup1
properties.
Initially support converting cpu, memory, blockIO, pids to unified in
addition to directly specifying unified attributes as suggested in
https://github.com/opencontainers/runtime-spec/pull/1040

Support for converting devices and network into BPF programs is
planned.

Now that containers have their representation in the unified cgroup
hierarchy, make sure using cgroup namespaces also produces meaningful
results.

Signed-off-by: Daniel Golle <daniel@makrotopia.org>
CMakeLists.txt
jail/cgroups.c [new file with mode: 0644]
jail/cgroups.h [new file with mode: 0644]
jail/jail.c

index 1f77662615f3d8b944bd9f8b1c6c9df59b94d6a4..c7adfa363b770e993d0a98c28ec17e9f5fa528db 100644 (file)
@@ -108,7 +108,7 @@ IF(SECCOMP_SUPPORT)
 ENDIF()
 
 IF(JAIL_SUPPORT)
-ADD_EXECUTABLE(ujail jail/jail.c jail/elf.c jail/fs.c jail/capabilities.c ${SOURCES_OCI_SECCOMP})
+ADD_EXECUTABLE(ujail jail/jail.c jail/cgroups.c jail/elf.c jail/fs.c jail/capabilities.c ${SOURCES_OCI_SECCOMP})
 TARGET_LINK_LIBRARIES(ujail ${ubox} ${ubus} ${blobmsg_json})
 INSTALL(TARGETS ujail
        RUNTIME DESTINATION ${CMAKE_INSTALL_SBINDIR}
diff --git a/jail/cgroups.c b/jail/cgroups.c
new file mode 100644 (file)
index 0000000..2317472
--- /dev/null
@@ -0,0 +1,801 @@
+/*
+ * Copyright (C) 2020 Daniel Golle <daniel@makrotopia.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License version 2.1
+ * as published by the Free Software Foundation
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * reads unified cgroup config as proposed in
+ * https://github.com/opencontainers/runtime-spec/pull/1040
+ * attempt conversion from cgroup1 -> cgroup2
+ * https://github.com/containers/crun/blob/0.14.1/crun.1.md#cgroup-v2
+ *
+ * ToDo:
+ *  - convert cgroup1 devices to eBPF program
+ *  - convert cgroup1 net_prio and net_cls to eBPF program
+ *  - rdma (anyone?) intelrdt (anyone?)
+ */
+
+#define _GNU_SOURCE
+
+#include <assert.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <unistd.h>
+#include <libgen.h>
+#include <inttypes.h>
+
+#include <libubox/avl.h>
+#include <libubox/avl-cmp.h>
+#include <libubox/blobmsg.h>
+#include <libubox/list.h>
+
+#include "fs.h"
+#include "log.h"
+#include "cgroups.h"
+
+#define CGROUP_ROOT "/sys/fs/cgroup/"
+#define CGROUP_IO_WEIGHT_MAX 10000
+
+struct cgval {
+       struct avl_node avl;
+       char *val;
+};
+
+struct avl_tree cgvals;
+static char *cgroup_path;
+
+void cgroups_init(const char *p) {
+       avl_init(&cgvals, avl_strcmp, false, NULL);
+       cgroup_path = strdup(p);
+}
+
+static void cgroups_set(const char *key, const char *val)
+{
+       struct cgval *valp;
+
+       valp = avl_find_element(&cgvals, key, valp, avl);
+       if (!valp) {
+               valp = malloc(sizeof(struct cgval));
+               assert(valp != NULL);
+               valp->avl.key = strdup(key);
+               avl_insert(&cgvals, &valp->avl);
+       } else {
+               DEBUG("overwriting previous cgroup2 assignment %s=\"%s\"!\n", key, valp->val);
+               free(valp->val);
+       }
+
+       valp->val = strdup(val);
+}
+
+void cgroups_free(void)
+{
+       struct cgval *valp, *tmp;
+       avl_for_each_element_safe(&cgvals, valp, avl, tmp) {
+               avl_delete(&cgvals, &valp->avl);
+               free((void *)(valp->avl.key));
+               free(valp->val);
+               free(valp);
+       }
+       free(cgroup_path);
+}
+
+void cgroups_apply(pid_t pid)
+{
+       struct cgval *valp;
+       char *cdir, *ent;
+       int fd;
+       size_t maxlen = strlen("cgroup.subtree_control");
+
+       bool cpuset = false,
+            cpu = false,
+            hugetlb = false,
+            io = false,
+            memory = false,
+            pids = false,
+            rdma = false;
+
+       char subtree_control[64] = { 0 };
+
+       DEBUG("using cgroup path %s\n", cgroup_path);
+       mkdir_p(cgroup_path, 0700);
+
+       /* find which controllers need to be enabled */
+       avl_for_each_element(&cgvals, valp, avl) {
+               ent = (char *)valp->avl.key;
+               if (strlen(ent) > maxlen)
+                       maxlen = strlen(ent);
+
+               if (!strncmp("cpuset.", ent, 7))
+                       cpuset = true;
+               else if (!strncmp("cpu.", ent, 4))
+                       cpu = true;
+               else if (!strncmp("hugetlb.", ent, 8))
+                       hugetlb = true;
+               else if (!strncmp("io.", ent, 3))
+                       io = true;
+               else if (!strncmp("memory.", ent, 7))
+                       memory = true;
+               else if (!strncmp("pids.", ent, 5))
+                       pids = true;
+               else if (!strncmp("rdma.", ent, 5))
+                       pids = true;
+       }
+
+       maxlen += strlen(cgroup_path) + 2;
+
+       if (cpuset)
+               strcat(subtree_control, "+cpuset ");
+
+       if (cpu)
+               strcat(subtree_control, "+cpu ");
+
+       if (hugetlb)
+               strcat(subtree_control, "+hugetlb ");
+
+       if (io)
+               strcat(subtree_control, "+io ");
+
+       if (memory)
+               strcat(subtree_control, "+memory ");
+
+       if (pids)
+               strcat(subtree_control, "+pids ");
+
+       if (rdma)
+               strcat(subtree_control, "+rdma ");
+
+       /* remove trailing space */
+       ent = strchr(subtree_control, '\0') - 1;
+       *ent = '\0';
+
+       ent = malloc(maxlen);
+       assert(ent != 0);
+
+       DEBUG("recursively applying cgroup.subtree_control = \"%s\"\n", subtree_control);
+       cdir = &cgroup_path[strlen(CGROUP_ROOT) - 2];
+       while ((cdir = strchr(cdir + 1, '/'))) {
+               *cdir = '\0';
+               snprintf(ent, maxlen, "%s/cgroup.subtree_control", cgroup_path);
+               DEBUG(" * %s\n", ent);
+               fd = open(ent, O_WRONLY);
+               assert(fd != -1);
+               write(fd, subtree_control, strlen(subtree_control));
+               close(fd);
+               *cdir = '/';
+       }
+
+       avl_for_each_element(&cgvals, valp, avl) {
+               DEBUG("applying cgroup2 %s=\"%s\"\n", (char *)valp->avl.key, valp->val);
+               snprintf(ent, maxlen, "%s/%s", cgroup_path, (char *)valp->avl.key);
+               fd = open(ent, O_WRONLY);
+               if (fd == -1) {
+                       ERROR("can't open %s: %m\n", ent);
+                       continue;
+               }
+               if (dprintf(fd, "%s", valp->val) < 0) {
+                       ERROR("can't write to %s: %m\n", ent);
+               };
+               close(fd);
+       }
+
+       snprintf(ent, maxlen, "%s/%s", cgroup_path, "cgroup.procs");
+       fd = open(ent, O_WRONLY);
+       assert(fd != -1);
+       dprintf(fd, "%d", pid);
+       close(fd);
+
+       free(ent);
+
+       cgroups_free();
+}
+
+enum {
+       OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_MAJOR,
+       OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_MINOR,
+       OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_WEIGHT,
+       OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_LEAFWEIGHT,
+       __OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_MAX,
+};
+
+static const struct blobmsg_policy oci_linux_cgroups_blockio_weightdevice_policy[] = {
+       [OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_MAJOR] = { "major", BLOBMSG_CAST_INT64 },
+       [OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_MINOR] = { "minor", BLOBMSG_CAST_INT64 },
+       [OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_WEIGHT] = { "weight", BLOBMSG_TYPE_INT32 },
+       [OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_LEAFWEIGHT] = { "leafWeight", BLOBMSG_TYPE_INT32 },
+};
+
+enum {
+       OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAJOR,
+       OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MINOR,
+       OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_RATE,
+       __OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAX,
+};
+
+static const struct blobmsg_policy oci_linux_cgroups_blockio_throttledevice_policy[] = {
+       [OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAJOR] = { "major", BLOBMSG_CAST_INT64 },
+       [OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MINOR] = { "minor", BLOBMSG_CAST_INT64 },
+       [OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_RATE] = { "rate", BLOBMSG_CAST_INT64 },
+};
+
+enum {
+       OCI_LINUX_CGROUPS_BLOCKIO_WEIGHT,
+       OCI_LINUX_CGROUPS_BLOCKIO_LEAFWEIGHT,
+       OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE,
+       OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEREADBPSDEVICE,
+       OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEWRITEBPSDEVICE,
+       OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEREADIOPSDEVICE,
+       OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEWRITEIOPSDEVICE,
+       __OCI_LINUX_CGROUPS_BLOCKIO_MAX,
+};
+
+static const struct blobmsg_policy oci_linux_cgroups_blockio_policy[] = {
+       [OCI_LINUX_CGROUPS_BLOCKIO_WEIGHT] = { "weight", BLOBMSG_TYPE_INT32 },
+       [OCI_LINUX_CGROUPS_BLOCKIO_LEAFWEIGHT] = { "leafWeight", BLOBMSG_TYPE_INT32 },
+       [OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE] = { "weightDevice", BLOBMSG_TYPE_ARRAY },
+       [OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEREADBPSDEVICE] = { "throttleReadBpsDevice", BLOBMSG_TYPE_ARRAY },
+       [OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEWRITEBPSDEVICE] = { "throttleWriteBpsDevice", BLOBMSG_TYPE_ARRAY },
+       [OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEREADIOPSDEVICE] = { "throttleReadIOPSDevice", BLOBMSG_TYPE_ARRAY },
+       [OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEWRITEIOPSDEVICE] = { "throttleWriteIOPSDevice", BLOBMSG_TYPE_ARRAY },
+};
+
+struct posix_dev {
+       uint64_t major;
+       uint64_t minor;
+};
+
+struct iomax_line {
+       struct avl_node avl;
+       struct posix_dev dev;
+       uint64_t rbps;
+       uint64_t wbps;
+       uint64_t riops;
+       uint64_t wiops;
+};
+
+static int avl_devcmp(const void *k1, const void *k2, void *ptr)
+{
+       struct posix_dev *d1 = (struct posix_dev *)k1, *d2 = (struct posix_dev *)k2;
+
+       if (d1->major < d2->major)
+               return -1;
+
+       if (d1->major > d2->major)
+               return 1;
+
+       if (d1->minor < d2->minor)
+               return -1;
+
+       if (d1->minor > d2->minor)
+               return 1;
+
+       return 0;
+}
+
+static struct iomax_line *get_iomax_line(struct avl_tree *iomax, uint64_t major, uint64_t minor)
+{
+       struct iomax_line *l;
+       struct posix_dev d;
+       d.major = major;
+       d.minor = minor;
+       l = avl_find_element(iomax, &d, l, avl);
+       if (!l) {
+               l = malloc(sizeof(struct iomax_line));
+               assert(l != NULL);
+               l->dev.major = d.major;
+               l->dev.minor = d.minor;
+               l->avl.key = &l->dev;
+               l->rbps = -1;
+               l->wbps = -1;
+               l->riops = -1;
+               l->wiops = -1;
+               avl_insert(iomax, &l->avl);
+       }
+
+       return l;
+}
+
+static int parseOCIlinuxcgroups_legacy_blockio(struct blob_attr *msg)
+{
+       struct blob_attr *tb[__OCI_LINUX_CGROUPS_BLOCKIO_MAX],
+                        *tbwd[__OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_MAX],
+                        *tbtd[__OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAX],
+                        *cur;
+       int rem;
+       int weight = -1, leafweight = -1;
+       size_t numweightstrs = 0, numiomaxstrs = 0, strtotlen = 1;
+       char **weightstrs = NULL, **iomaxstrs = NULL, **curstr;
+       char *weightstr, *iomaxstr;
+       struct avl_tree iomax;
+       struct iomax_line *curiomax, *tmp;
+
+       blobmsg_parse(oci_linux_cgroups_blockio_policy, __OCI_LINUX_CGROUPS_BLOCKIO_MAX, tb, blobmsg_data(msg), blobmsg_len(msg));
+
+       if (tb[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHT]) {
+               weight = blobmsg_get_u32(tb[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHT]);
+               ++numweightstrs;
+       }
+
+       if (weight > CGROUP_IO_WEIGHT_MAX)
+               return ERANGE;
+
+       if (tb[OCI_LINUX_CGROUPS_BLOCKIO_LEAFWEIGHT])
+               leafweight = blobmsg_get_u32(tb[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHT]);
+
+       if (leafweight > CGROUP_IO_WEIGHT_MAX)
+               return ERANGE;
+
+       blobmsg_for_each_attr(cur, tb[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE], rem)
+               ++numweightstrs;
+
+       weightstrs = calloc(numweightstrs + 1, sizeof(char *));
+       assert(weightstrs != 0);
+       numweightstrs = 0;
+
+       if (weight > -1)
+               asprintf(&weightstrs[numweightstrs++], "default %d", weight);
+
+       blobmsg_for_each_attr(cur, tb[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE], rem) {
+               uint64_t major, minor;
+               int devweight = weight, devleafweight = leafweight;
+
+               blobmsg_parse(oci_linux_cgroups_blockio_weightdevice_policy, __OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_MAX, tbwd, blobmsg_data(cur), blobmsg_len(cur));
+               if (!tbwd[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_MAJOR] ||
+                   !tbwd[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_MINOR])
+                       return ENODATA;
+
+               if (!tbwd[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_WEIGHT] &&
+                   !tbwd[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_LEAFWEIGHT])
+                       return ENODATA;
+
+               if (tbwd[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_WEIGHT])
+                       devweight = blobmsg_get_u32(tbwd[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_WEIGHT]);
+
+               if (devweight > CGROUP_IO_WEIGHT_MAX)
+                       return ERANGE;
+
+               if (tbwd[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_LEAFWEIGHT])
+                       devleafweight = blobmsg_get_u32(tbwd[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_LEAFWEIGHT]);
+
+               if (devleafweight > CGROUP_IO_WEIGHT_MAX)
+                       return ERANGE;
+
+               if (tbwd[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_LEAFWEIGHT])
+                       return ENOTSUP;
+
+               major = blobmsg_cast_u64(tbwd[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_MAJOR]);
+               minor = blobmsg_cast_u64(tbwd[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_MINOR]);
+
+               asprintf(&weightstrs[numweightstrs++], "%" PRIu64 ":%" PRIu64 " %u", major, minor, devweight);
+       }
+
+       if (numweightstrs) {
+               curstr = weightstrs;
+               while (*curstr)
+                       strtotlen += strlen(*(curstr++)) + 1;
+
+               weightstr = calloc(strtotlen, sizeof(char));
+               assert(weightstr != 0);
+
+               curstr = weightstrs;
+               while (*curstr) {
+                       strcat(weightstr, *curstr);
+                       strcat(weightstr, "\n");
+                       free(*(curstr++));
+               }
+
+               cgroups_set("io.bfq.weight", weightstr);
+               free(weightstr);
+       };
+
+       free(weightstrs);
+
+       avl_init(&iomax, avl_devcmp, false, NULL);
+
+       blobmsg_for_each_attr(cur, tb[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEREADBPSDEVICE], rem) {
+               struct iomax_line *l;
+
+               blobmsg_parse(oci_linux_cgroups_blockio_throttledevice_policy, __OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAX, tbtd, blobmsg_data(cur), blobmsg_len(cur));
+
+               if (!tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAJOR] ||
+                   !tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MINOR] ||
+                   !tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_RATE])
+                       return ENODATA;
+
+               l = get_iomax_line(&iomax,
+                                  blobmsg_cast_u64(tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAJOR]),
+                                  blobmsg_cast_u64(tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MINOR]));
+
+               l->rbps = blobmsg_cast_u64(tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_RATE]);
+       }
+
+       blobmsg_for_each_attr(cur, tb[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEWRITEBPSDEVICE], rem) {
+               struct iomax_line *l;
+
+               blobmsg_parse(oci_linux_cgroups_blockio_throttledevice_policy, __OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAX, tbtd, blobmsg_data(cur), blobmsg_len(cur));
+
+               if (!tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAJOR] ||
+                   !tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MINOR] ||
+                   !tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_RATE])
+                       return ENODATA;
+
+               l = get_iomax_line(&iomax,
+                                  blobmsg_cast_u64(tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAJOR]),
+                                  blobmsg_cast_u64(tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MINOR]));
+
+               l->wbps = blobmsg_cast_u64(tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_RATE]);
+       }
+
+       blobmsg_for_each_attr(cur, tb[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEREADIOPSDEVICE], rem) {
+               struct iomax_line *l;
+
+               blobmsg_parse(oci_linux_cgroups_blockio_throttledevice_policy, __OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAX, tbtd, blobmsg_data(cur), blobmsg_len(cur));
+
+               if (!tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAJOR] ||
+                   !tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MINOR] ||
+                   !tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_RATE])
+                       return ENODATA;
+
+               l = get_iomax_line(&iomax,
+                                  blobmsg_cast_u64(tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAJOR]),
+                                  blobmsg_cast_u64(tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MINOR]));
+
+               l->riops = blobmsg_cast_u64(tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_RATE]);
+       }
+
+       blobmsg_for_each_attr(cur, tb[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEWRITEIOPSDEVICE], rem) {
+               struct iomax_line *l;
+
+               blobmsg_parse(oci_linux_cgroups_blockio_throttledevice_policy, __OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAX, tbtd, blobmsg_data(cur), blobmsg_len(cur));
+
+               if (!tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAJOR] ||
+                   !tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MINOR] ||
+                   !tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_RATE])
+                       return ENODATA;
+
+               l = get_iomax_line(&iomax,
+                                  blobmsg_cast_u64(tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAJOR]),
+                                  blobmsg_cast_u64(tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MINOR]));
+
+               l->wiops = blobmsg_cast_u64(tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_RATE]);
+       }
+
+       avl_for_each_element(&iomax, curiomax, avl)
+               ++numiomaxstrs;
+
+       if (!numiomaxstrs)
+               return 0;
+
+       iomaxstrs = calloc(numiomaxstrs + 1, sizeof(char *));
+       assert(iomaxstrs != 0);
+       numiomaxstrs = 0;
+
+       avl_for_each_element(&iomax, curiomax, avl) {
+               char iomaxlstr[160];
+               char lstr[32];
+
+               sprintf(iomaxlstr, "%" PRIu64 ":%" PRIu64 " ", curiomax->dev.major, curiomax->dev.minor);
+
+               if (curiomax->rbps != -1) {
+                       sprintf(lstr, "rbps=%" PRIu64 " ", curiomax->rbps);
+                       strcat(iomaxlstr, lstr);
+               }
+               if (curiomax->wbps != -1) {
+                       sprintf(lstr, "wbps=%" PRIu64 " ", curiomax->wbps);
+                       strcat(iomaxlstr, lstr);
+               }
+               if (curiomax->riops != -1) {
+                       sprintf(lstr, "riops=%" PRIu64 " ", curiomax->riops);
+                       strcat(iomaxlstr, lstr);
+               }
+               if (curiomax->wiops != -1) {
+                       sprintf(lstr, "wiops=%" PRIu64 " ", curiomax->wiops);
+                       strcat(iomaxlstr, lstr);
+               }
+
+               iomaxstrs[numiomaxstrs++] = strdup(iomaxlstr);
+       }
+
+       avl_for_each_element_safe(&iomax, curiomax, avl, tmp) {
+               avl_delete(&iomax, &curiomax->avl);
+               free(curiomax);
+       }
+
+       strtotlen = 1; /* 1 accounts for \0 at end of string */
+       if (numiomaxstrs) {
+               curstr = iomaxstrs;
+               while (*curstr)
+                       strtotlen += strlen(*(curstr++)) + 1; /* +1 accounts for \n at end of line */
+
+               iomaxstr = calloc(strtotlen, sizeof(char));
+               assert(iomaxstr != 0);
+               curstr = iomaxstrs;
+
+               while (*curstr) {
+                       strcat(iomaxstr, *curstr);
+                       strcat(iomaxstr, "\n");
+                       free(*(curstr++));
+               }
+
+               cgroups_set("io.max", iomaxstr);
+               free(iomaxstr);
+       };
+
+       free(iomaxstrs);
+
+       return 0;
+}
+
+
+enum {
+       OCI_LINUX_CGROUPS_CPU_SHARES,
+       OCI_LINUX_CGROUPS_CPU_PERIOD,
+       OCI_LINUX_CGROUPS_CPU_QUOTA,
+       OCI_LINUX_CGROUPS_CPU_REALTIMERUNTIME,
+       OCI_LINUX_CGROUPS_CPU_REALTIMEPERIOD,
+       OCI_LINUX_CGROUPS_CPU_CPUS,
+       OCI_LINUX_CGROUPS_CPU_MEMS,
+       __OCI_LINUX_CGROUPS_CPU_MAX,
+};
+
+static const struct blobmsg_policy oci_linux_cgroups_cpu_policy[] = {
+       [OCI_LINUX_CGROUPS_CPU_SHARES] = { "shares", BLOBMSG_CAST_INT64 },
+       [OCI_LINUX_CGROUPS_CPU_PERIOD] = { "period", BLOBMSG_CAST_INT64 },
+       [OCI_LINUX_CGROUPS_CPU_QUOTA] = { "quota", BLOBMSG_CAST_INT64 }, /* signed int64! */
+       [OCI_LINUX_CGROUPS_CPU_REALTIMEPERIOD] = { "realtimePeriod", BLOBMSG_CAST_INT64 },
+       [OCI_LINUX_CGROUPS_CPU_REALTIMERUNTIME] = { "realtimeRuntime", BLOBMSG_CAST_INT64 },
+       [OCI_LINUX_CGROUPS_CPU_CPUS] = { "cpus", BLOBMSG_TYPE_STRING },
+       [OCI_LINUX_CGROUPS_CPU_MEMS] = { "mems", BLOBMSG_TYPE_STRING },
+};
+
+static int parseOCIlinuxcgroups_legacy_cpu(struct blob_attr *msg)
+{
+       struct blob_attr *tb[__OCI_LINUX_CGROUPS_CPU_MAX];
+       uint64_t shares, period = 0;
+       int64_t quota = -2; /* unset */
+       char tmp[32] = { 0 };
+
+       blobmsg_parse(oci_linux_cgroups_cpu_policy, __OCI_LINUX_CGROUPS_CPU_MAX, tb, blobmsg_data(msg), blobmsg_len(msg));
+
+       if (tb[OCI_LINUX_CGROUPS_CPU_REALTIMEPERIOD] ||
+           tb[OCI_LINUX_CGROUPS_CPU_REALTIMERUNTIME])
+               return ENOTSUP; /* no equivalent in cgroup2 */
+
+       if (tb[OCI_LINUX_CGROUPS_CPU_SHARES]) {
+               shares = blobmsg_cast_u64(tb[OCI_LINUX_CGROUPS_CPU_SHARES]);
+               if ((shares < 2) || (shares > 262144))
+                       return ERANGE;
+
+               snprintf(tmp, sizeof(tmp), "%" PRIu64, (((uint64_t)1) + ((shares - 2) * 9999) / 262142));
+               cgroups_set("cpu.weight", tmp);
+               tmp[0] = '\0';
+       }
+
+       if (tb[OCI_LINUX_CGROUPS_CPU_QUOTA])
+               quota = blobmsg_cast_s64(tb[OCI_LINUX_CGROUPS_CPU_QUOTA]);
+
+       if (tb[OCI_LINUX_CGROUPS_CPU_PERIOD])
+               period = blobmsg_cast_u64(tb[OCI_LINUX_CGROUPS_CPU_PERIOD]);
+
+       if (period) {
+               if (quota >= 0)
+                       snprintf(tmp, sizeof(tmp), "%" PRId64 " %" PRIu64 , quota, period);
+               else
+                       snprintf(tmp, sizeof(tmp), "max %" PRIu64, period); /* assume default */
+       } else if (quota >= 0) {
+               snprintf(tmp, sizeof(tmp), "%" PRId64, quota);
+       } else if (quota == -1) {
+               strcpy(tmp, "max");
+       }
+
+       if (tmp[0])
+               cgroups_set("cpu.max", tmp);
+
+       if (tb[OCI_LINUX_CGROUPS_CPU_CPUS])
+               cgroups_set("cpuset.cpus", blobmsg_get_string(tb[OCI_LINUX_CGROUPS_CPU_CPUS]));
+
+       if (tb[OCI_LINUX_CGROUPS_CPU_MEMS])
+               cgroups_set("cpuset.mems", blobmsg_get_string(tb[OCI_LINUX_CGROUPS_CPU_MEMS]));
+
+       return 0;
+}
+
+
+enum {
+       OCI_LINUX_CGROUPS_MEMORY_LIMIT,
+       OCI_LINUX_CGROUPS_MEMORY_RESERVATION,
+       OCI_LINUX_CGROUPS_MEMORY_SWAP,
+       OCI_LINUX_CGROUPS_MEMORY_KERNEL,
+       OCI_LINUX_CGROUPS_MEMORY_KERNELTCP,
+       OCI_LINUX_CGROUPS_MEMORY_SWAPPINESS,
+       OCI_LINUX_CGROUPS_MEMORY_DISABLEOOMKILLER,
+       OCI_LINUX_CGROUPS_MEMORY_USEHIERARCHY,
+       __OCI_LINUX_CGROUPS_MEMORY_MAX,
+};
+
+static const struct blobmsg_policy oci_linux_cgroups_memory_policy[] = {
+       [OCI_LINUX_CGROUPS_MEMORY_LIMIT] = { "limit", BLOBMSG_CAST_INT64 }, /* signed int64! */
+       [OCI_LINUX_CGROUPS_MEMORY_RESERVATION] = { "reservation", BLOBMSG_CAST_INT64 }, /* signed int64! */
+       [OCI_LINUX_CGROUPS_MEMORY_SWAP] = { "swap", BLOBMSG_CAST_INT64 }, /* signed int64! */
+       [OCI_LINUX_CGROUPS_MEMORY_KERNEL] = { "kernel", BLOBMSG_CAST_INT64 }, /* signed int64! */
+       [OCI_LINUX_CGROUPS_MEMORY_KERNELTCP] = { "kernelTCP", BLOBMSG_CAST_INT64 }, /* signed int64! */
+       [OCI_LINUX_CGROUPS_MEMORY_SWAPPINESS] = { "swappiness", BLOBMSG_CAST_INT64 },
+       [OCI_LINUX_CGROUPS_MEMORY_DISABLEOOMKILLER] = { "disableOOMKiller", BLOBMSG_TYPE_BOOL },
+       [OCI_LINUX_CGROUPS_MEMORY_USEHIERARCHY] { "useHierarchy", BLOBMSG_TYPE_BOOL },
+};
+
+static int parseOCIlinuxcgroups_legacy_memory(struct blob_attr *msg)
+{
+       struct blob_attr *tb[__OCI_LINUX_CGROUPS_MEMORY_MAX];
+       char tmp[32] = { 0 };
+       int64_t limit, swap, reservation;
+
+       blobmsg_parse(oci_linux_cgroups_memory_policy, __OCI_LINUX_CGROUPS_MEMORY_MAX, tb, blobmsg_data(msg), blobmsg_len(msg));
+
+       if (tb[OCI_LINUX_CGROUPS_MEMORY_KERNEL] ||
+           tb[OCI_LINUX_CGROUPS_MEMORY_KERNELTCP] ||
+           tb[OCI_LINUX_CGROUPS_MEMORY_SWAPPINESS] ||
+           tb[OCI_LINUX_CGROUPS_MEMORY_DISABLEOOMKILLER] ||
+           tb[OCI_LINUX_CGROUPS_MEMORY_USEHIERARCHY])
+               return ENOTSUP; /* no equivalent in cgroup2 */
+
+       if (tb[OCI_LINUX_CGROUPS_MEMORY_LIMIT]) {
+               limit = blobmsg_cast_s64(tb[OCI_LINUX_CGROUPS_MEMORY_LIMIT]);
+               if (limit == -1)
+                       strcpy(tmp, "max");
+               else
+                       snprintf(tmp, sizeof(tmp), "%" PRId64, limit);
+
+               cgroups_set("memory.max", tmp);
+       }
+
+       if (tb[OCI_LINUX_CGROUPS_MEMORY_RESERVATION]) {
+               reservation = blobmsg_cast_s64(tb[OCI_LINUX_CGROUPS_MEMORY_RESERVATION]);
+
+               if (reservation == -1)
+                       strcpy(tmp, "max");
+               else
+                       snprintf(tmp, sizeof(tmp), "%" PRId64, reservation);
+
+               cgroups_set("memory.low", tmp);
+       }
+
+       if (tb[OCI_LINUX_CGROUPS_MEMORY_SWAP]) {
+               swap = blobmsg_cast_s64(tb[OCI_LINUX_CGROUPS_MEMORY_SWAP]);
+
+               if (swap == -1)
+                       strcpy(tmp, "max");
+               else
+                       snprintf(tmp, sizeof(tmp), "%" PRId64, swap);
+
+               cgroups_set("memory.swap_max", tmp);
+       }
+
+       return 0;
+}
+
+
+enum {
+       OCI_LINUX_CGROUPS_PIDS_LIMIT,
+       __OCI_LINUX_CGROUPS_PIDS_MAX,
+};
+
+static const struct blobmsg_policy oci_linux_cgroups_pids_policy[] = {
+       [OCI_LINUX_CGROUPS_PIDS_LIMIT] = { "limit", BLOBMSG_CAST_INT64 },
+};
+
+static int parseOCIlinuxcgroups_legacy_pids(struct blob_attr *msg)
+{
+       struct blob_attr *tb[__OCI_LINUX_CGROUPS_MEMORY_MAX];
+       char tmp[32] = { 0 };
+
+       blobmsg_parse(oci_linux_cgroups_pids_policy, __OCI_LINUX_CGROUPS_PIDS_MAX, tb, blobmsg_data(msg), blobmsg_len(msg));
+
+       if (!tb[OCI_LINUX_CGROUPS_PIDS_LIMIT])
+               return EINVAL;
+
+       snprintf(tmp, sizeof(tmp), "%" PRIu64, blobmsg_cast_u64(tb[OCI_LINUX_CGROUPS_PIDS_LIMIT]));
+
+       cgroups_set("pids.max", tmp);
+
+       return 0;
+}
+
+static int parseOCIlinuxcgroups_unified(struct blob_attr *msg)
+{
+       struct blob_attr *cur;
+       int rem;
+
+       blobmsg_for_each_attr(cur, msg, rem) {
+               if (blobmsg_type(cur) != BLOBMSG_TYPE_STRING)
+                       return EINVAL;
+
+               cgroups_set(blobmsg_name(cur), blobmsg_get_string(cur));
+       }
+
+       return 0;
+}
+
+enum {
+       OCI_LINUX_CGROUPS_BLOCKIO,
+       OCI_LINUX_CGROUPS_CPU,
+       OCI_LINUX_CGROUPS_DEVICES,
+       OCI_LINUX_CGROUPS_HUGEPAGELIMITS,
+       OCI_LINUX_CGROUPS_INTELRDT,
+       OCI_LINUX_CGROUPS_MEMORY,
+       OCI_LINUX_CGROUPS_NETWORK,
+       OCI_LINUX_CGROUPS_PIDS,
+       OCI_LINUX_CGROUPS_RDMA,
+       OCI_LINUX_CGROUPS_UNIFIED,
+       __OCI_LINUX_CGROUPS_MAX,
+};
+
+static const struct blobmsg_policy oci_linux_cgroups_policy[] = {
+       [OCI_LINUX_CGROUPS_BLOCKIO] = { "blockIO", BLOBMSG_TYPE_TABLE },
+       [OCI_LINUX_CGROUPS_CPU] = { "cpu", BLOBMSG_TYPE_TABLE },
+       [OCI_LINUX_CGROUPS_DEVICES] = { "devices", BLOBMSG_TYPE_ARRAY },
+       [OCI_LINUX_CGROUPS_HUGEPAGELIMITS] = { "hugepageLimits", BLOBMSG_TYPE_ARRAY },
+       [OCI_LINUX_CGROUPS_INTELRDT] = { "intelRdt", BLOBMSG_TYPE_TABLE },
+       [OCI_LINUX_CGROUPS_MEMORY] = { "memory", BLOBMSG_TYPE_TABLE },
+       [OCI_LINUX_CGROUPS_NETWORK] = { "network", BLOBMSG_TYPE_TABLE },
+       [OCI_LINUX_CGROUPS_PIDS] = { "pids", BLOBMSG_TYPE_TABLE },
+       [OCI_LINUX_CGROUPS_RDMA] = { "rdma", BLOBMSG_TYPE_TABLE },
+       [OCI_LINUX_CGROUPS_UNIFIED] = { "unified", BLOBMSG_TYPE_TABLE },
+};
+
+int parseOCIlinuxcgroups(struct blob_attr *msg)
+{
+       struct blob_attr *tb[__OCI_LINUX_CGROUPS_MAX];
+       int ret;
+
+       blobmsg_parse(oci_linux_cgroups_policy, __OCI_LINUX_CGROUPS_MAX, tb, blobmsg_data(msg), blobmsg_len(msg));
+
+       if (tb[OCI_LINUX_CGROUPS_DEVICES] ||
+           tb[OCI_LINUX_CGROUPS_HUGEPAGELIMITS] ||
+           tb[OCI_LINUX_CGROUPS_INTELRDT] ||
+           tb[OCI_LINUX_CGROUPS_NETWORK] ||
+           tb[OCI_LINUX_CGROUPS_RDMA])
+               return ENOTSUP;
+
+       if (tb[OCI_LINUX_CGROUPS_BLOCKIO]) {
+               ret = parseOCIlinuxcgroups_legacy_blockio(tb[OCI_LINUX_CGROUPS_BLOCKIO]);
+               if (ret)
+                       return ret;
+       }
+
+       if (tb[OCI_LINUX_CGROUPS_CPU]) {
+               ret = parseOCIlinuxcgroups_legacy_cpu(tb[OCI_LINUX_CGROUPS_CPU]);
+               if (ret)
+                       return ret;
+       }
+
+       if (tb[OCI_LINUX_CGROUPS_MEMORY]) {
+               ret = parseOCIlinuxcgroups_legacy_memory(tb[OCI_LINUX_CGROUPS_MEMORY]);
+               if (ret)
+                       return ret;
+       }
+
+       if (tb[OCI_LINUX_CGROUPS_PIDS]) {
+               ret = parseOCIlinuxcgroups_legacy_pids(tb[OCI_LINUX_CGROUPS_PIDS]);
+               if (ret)
+                       return ret;
+       }
+
+       if (tb[OCI_LINUX_CGROUPS_UNIFIED]) {
+               ret = parseOCIlinuxcgroups_unified(tb[OCI_LINUX_CGROUPS_UNIFIED]);
+               if (ret)
+                       return ret;
+       }
+
+       return 0;
+}
diff --git a/jail/cgroups.h b/jail/cgroups.h
new file mode 100644 (file)
index 0000000..1b3051c
--- /dev/null
@@ -0,0 +1,22 @@
+/*
+ * Copyright (C) 2020 Daniel Golle <daniel@makrotopia.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License version 2.1
+ * as published by the Free Software Foundation
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef _JAIL_CGROUPS_H
+#define _JAIL_CGROUPS_H
+
+void cgroups_init(const char *p);
+int parseOCIlinuxcgroups(struct blob_attr *msg);
+void cgroups_apply(pid_t pid);
+void cgroups_free(void);
+
+#endif
index a3d35ad2cab1b181ec21cb581cbf2efe39122da5..d59afa522387aef4d55a2121b2b8cdd0aa601ba3 100644 (file)
@@ -51,6 +51,7 @@
 #include "jail.h"
 #include "log.h"
 #include "seccomp-oci.h"
+#include "cgroups.h"
 
 #include <libubox/utils.h>
 #include <libubox/blobmsg.h>
@@ -238,11 +239,11 @@ static void free_rlimits(void) {
                free(opts.rlimits[type]);
 }
 
-static void free_opts(bool child) {
+static void free_opts(bool parent) {
        char **tmp;
 
        /* we need to keep argv, envp and seccomp filter in child */
-       if (child) {
+       if (parent) {
                if (opts.ociseccomp) {
                        free(opts.ociseccomp->filter);
                        free(opts.ociseccomp);
@@ -259,6 +260,7 @@ static void free_opts(bool child) {
                        free(*(tmp++));
 
                free(opts.envp);
+               cgroups_free();
        };
 
        free_rlimits();
@@ -276,6 +278,7 @@ static void free_opts(bool child) {
        free_hooklist(opts.hooks.poststart);
        free_hooklist(opts.hooks.poststop);
 }
+
 static int mount_overlay(char *jail_root, char *overlaydir) {
        char *upperdir, *workdir, *optsstr, *upperetc, *upperresolvconf;
        const char mountoptsformat[] = "lowerdir=%s,upperdir=%s,workdir=%s";
@@ -1105,6 +1108,10 @@ static int exec_jail(void *arg)
                ERROR("parent had an error, child exiting\n");
                return EXIT_FAILURE;
        }
+
+       if (opts.namespace & CLONE_NEWCGROUP)
+               unshare(CLONE_NEWCGROUP);
+
        if ((opts.namespace & CLONE_NEWUSER) || (opts.setns.user != -1)) {
                if (setregid(0, 0) < 0) {
                        ERROR("setgid\n");
@@ -1950,33 +1957,6 @@ static int parseOCIdevices(struct blob_attr *msg)
        return 0;
 }
 
-enum {
-       OCI_LINUX_RESOURCES,
-       OCI_LINUX_SECCOMP,
-       OCI_LINUX_SYSCTL,
-       OCI_LINUX_NAMESPACES,
-       OCI_LINUX_DEVICES,
-       OCI_LINUX_UIDMAPPINGS,
-       OCI_LINUX_GIDMAPPINGS,
-       OCI_LINUX_MASKEDPATHS,
-       OCI_LINUX_READONLYPATHS,
-       OCI_LINUX_ROOTFSPROPAGATION,
-       __OCI_LINUX_MAX,
-};
-
-static const struct blobmsg_policy oci_linux_policy[] = {
-       [OCI_LINUX_RESOURCES] = { "resources", BLOBMSG_TYPE_TABLE },
-       [OCI_LINUX_SECCOMP] = { "seccomp", BLOBMSG_TYPE_TABLE },
-       [OCI_LINUX_SYSCTL] = { "sysctl", BLOBMSG_TYPE_TABLE },
-       [OCI_LINUX_NAMESPACES] = { "namespaces", BLOBMSG_TYPE_ARRAY },
-       [OCI_LINUX_DEVICES] = { "devices", BLOBMSG_TYPE_ARRAY },
-       [OCI_LINUX_UIDMAPPINGS] = { "uidMappings", BLOBMSG_TYPE_ARRAY },
-       [OCI_LINUX_GIDMAPPINGS] = { "gidMappings", BLOBMSG_TYPE_ARRAY },
-       [OCI_LINUX_MASKEDPATHS] = { "maskedPaths", BLOBMSG_TYPE_ARRAY },
-       [OCI_LINUX_READONLYPATHS] = { "readonlyPaths", BLOBMSG_TYPE_ARRAY },
-       [OCI_LINUX_ROOTFSPROPAGATION] = { "rootfsPropagation", BLOBMSG_TYPE_STRING },
-};
-
 static int parseOCIsysctl(struct blob_attr *msg)
 {
        struct blob_attr *cur;
@@ -2020,12 +2000,44 @@ static int parseOCIsysctl(struct blob_attr *msg)
        return 0;
 }
 
+
+enum {
+       OCI_LINUX_CGROUPSPATH,
+       OCI_LINUX_RESOURCES,
+       OCI_LINUX_SECCOMP,
+       OCI_LINUX_SYSCTL,
+       OCI_LINUX_NAMESPACES,
+       OCI_LINUX_DEVICES,
+       OCI_LINUX_UIDMAPPINGS,
+       OCI_LINUX_GIDMAPPINGS,
+       OCI_LINUX_MASKEDPATHS,
+       OCI_LINUX_READONLYPATHS,
+       OCI_LINUX_ROOTFSPROPAGATION,
+       __OCI_LINUX_MAX,
+};
+
+static const struct blobmsg_policy oci_linux_policy[] = {
+       [OCI_LINUX_CGROUPSPATH] = { "cgroupsPath", BLOBMSG_TYPE_STRING },
+       [OCI_LINUX_RESOURCES] = { "resources", BLOBMSG_TYPE_TABLE },
+       [OCI_LINUX_SECCOMP] = { "seccomp", BLOBMSG_TYPE_TABLE },
+       [OCI_LINUX_SYSCTL] = { "sysctl", BLOBMSG_TYPE_TABLE },
+       [OCI_LINUX_NAMESPACES] = { "namespaces", BLOBMSG_TYPE_ARRAY },
+       [OCI_LINUX_DEVICES] = { "devices", BLOBMSG_TYPE_ARRAY },
+       [OCI_LINUX_UIDMAPPINGS] = { "uidMappings", BLOBMSG_TYPE_ARRAY },
+       [OCI_LINUX_GIDMAPPINGS] = { "gidMappings", BLOBMSG_TYPE_ARRAY },
+       [OCI_LINUX_MASKEDPATHS] = { "maskedPaths", BLOBMSG_TYPE_ARRAY },
+       [OCI_LINUX_READONLYPATHS] = { "readonlyPaths", BLOBMSG_TYPE_ARRAY },
+       [OCI_LINUX_ROOTFSPROPAGATION] = { "rootfsPropagation", BLOBMSG_TYPE_STRING },
+};
+
 static int parseOCIlinux(struct blob_attr *msg)
 {
        struct blob_attr *tb[__OCI_LINUX_MAX];
        struct blob_attr *cur;
        int rem;
        int res = 0;
+       char *cgpath;
+       char cgfullpath[256] = "/sys/fs/cgroup";
 
        blobmsg_parse(oci_linux_policy, __OCI_LINUX_MAX, tb, blobmsg_data(msg), blobmsg_len(msg));
 
@@ -2083,6 +2095,37 @@ static int parseOCIlinux(struct blob_attr *msg)
                        return res;
        }
 
+       if (tb[OCI_LINUX_CGROUPSPATH]) {
+               cgpath = blobmsg_get_string(tb[OCI_LINUX_CGROUPSPATH]);
+               if (cgpath[0] == '/') {
+                       if (strlen(cgpath) >= (sizeof(cgfullpath) - strlen(cgfullpath)))
+                               return E2BIG;
+
+                       strcat(cgfullpath, cgpath);
+               } else {
+                       strcat(cgfullpath, "/containers/");
+                       strcat(cgfullpath, opts.name); /* should be container name rather than jail name */
+                       strcat(cgfullpath, "/");
+                       if (strlen(cgpath) >= (sizeof(cgfullpath) - strlen(cgfullpath)))
+                               return E2BIG;
+
+                       strcat(cgfullpath, cgpath);
+               }
+       } else {
+               strcat(cgfullpath, "/containers/");
+               strcat(cgfullpath, opts.name); /* should be container name rather than jail name */
+               strcat(cgfullpath, "/");
+               strcat(cgfullpath, opts.name); /* should be container instance name rather than jail name */
+       }
+
+       cgroups_init(cgfullpath);
+
+       if (tb[OCI_LINUX_RESOURCES]) {
+               res = parseOCIlinuxcgroups(tb[OCI_LINUX_RESOURCES]);
+               if (res)
+                       return res;
+       }
+
        return 0;
 }
 
@@ -2609,7 +2652,7 @@ static void post_main(struct uloop_timeout *t)
                }
 #endif
 
-               jail_process.pid = clone(exec_jail, child_stack + STACK_SIZE, SIGCHLD | opts.namespace, NULL);
+               jail_process.pid = clone(exec_jail, child_stack + STACK_SIZE, SIGCHLD | (opts.namespace & (~CLONE_NEWCGROUP)), NULL);
        } else {
                jail_process.pid = fork();
        }
@@ -2652,6 +2695,9 @@ static void post_main(struct uloop_timeout *t)
                close(pipes[0]);
                set_oom_score_adj();
 
+               if (opts.ocibundle)
+                       cgroups_apply(jail_process.pid);
+
                if (opts.namespace & CLONE_NEWUSER) {
                        if (write_setgroups(jail_process.pid, true)) {
                                ERROR("can't write setgroups\n");