* https://github.com/containers/crun/blob/0.14.1/crun.1.md#cgroup-v2
*
* ToDo:
- * - convert cgroup1 devices to eBPF program
* - convert cgroup1 net_prio and net_cls to eBPF program
* - rdma (anyone?) intelrdt (anyone?)
*/
#define _GNU_SOURCE
-#include <assert.h>
#include <errno.h>
#include <fcntl.h>
#include <stdlib.h>
#include <libubox/avl-cmp.h>
#include <libubox/blobmsg.h>
#include <libubox/list.h>
+#include <libubox/utils.h>
-#include "fs.h"
#include "log.h"
#include "cgroups.h"
+#include "cgroups-bpf.h"
#define CGROUP_ROOT "/sys/fs/cgroup/"
#define CGROUP_IO_WEIGHT_MAX 10000
struct avl_tree cgvals;
static char *cgroup_path;
+static bool initialized;
+
+void cgroups_prepare(void) { /* mark the cgroups state as not initialized */
+ initialized = false; /* cgroups_free() only cleans up while this flag is true; cgroups_init() sets it */
+}
void cgroups_init(const char *p) {
avl_init(&cgvals, avl_strcmp, false, NULL);
cgroup_path = strdup(p);
+ initialized = true;
}
static void cgroups_set(const char *key, const char *val)
valp = avl_find_element(&cgvals, key, valp, avl);
if (!valp) {
valp = malloc(sizeof(struct cgval));
- assert(valp != NULL);
+ if (!valp)
+ exit(ENOMEM);
+
valp->avl.key = strdup(key);
avl_insert(&cgvals, &valp->avl);
} else {
void cgroups_free(void)
{
struct cgval *valp, *tmp;
- avl_for_each_element_safe(&cgvals, valp, avl, tmp) {
- avl_delete(&cgvals, &valp->avl);
- free((void *)(valp->avl.key));
- free(valp->val);
- free(valp);
+
+ if (initialized) {
+ avl_remove_all_elements(&cgvals, valp, avl, tmp) {
+ free((void *)(valp->avl.key));
+ free(valp->val);
+ free(valp);
+ }
+ free(cgroup_path);
}
- free(cgroup_path);
}
void cgroups_apply(pid_t pid)
else if (!strncmp("pids.", ent, 5))
pids = true;
else if (!strncmp("rdma.", ent, 5))
- pids = true;
+ rdma = true;
}
maxlen += strlen(cgroup_path) + 2;
*ent = '\0';
ent = malloc(maxlen);
- assert(ent != 0);
+ if (!ent)
+ exit(ENOMEM);
DEBUG("recursively applying cgroup.subtree_control = \"%s\"\n", subtree_control);
cdir = &cgroup_path[strlen(CGROUP_ROOT) - 2];
*cdir = '\0';
snprintf(ent, maxlen, "%s/cgroup.subtree_control", cgroup_path);
DEBUG(" * %s\n", ent);
- fd = open(ent, O_WRONLY);
- assert(fd != -1);
- write(fd, subtree_control, strlen(subtree_control));
+ if ((fd = open(ent, O_WRONLY)) < 0) {
+ ERROR("can't open %s: %m\n", ent);
+ continue;
+ }
+
+ if (write(fd, subtree_control, strlen(subtree_control)) == -1) {
+ ERROR("can't write to %s: %m\n", ent);
+ close(fd);
+ continue;
+ }
+
close(fd);
*cdir = '/';
}
DEBUG("applying cgroup2 %s=\"%s\"\n", (char *)valp->avl.key, valp->val);
snprintf(ent, maxlen, "%s/%s", cgroup_path, (char *)valp->avl.key);
fd = open(ent, O_WRONLY);
- if (fd == -1) {
+ if (fd < 0) {
ERROR("can't open %s: %m\n", ent);
continue;
}
close(fd);
}
+ int dirfd = open(cgroup_path, O_DIRECTORY);
+ if (dirfd < 0) {
+ ERROR("can't open %s: %m\n", cgroup_path);
+ } else {
+ attach_cgroups_ebpf(dirfd);
+ close(dirfd);
+ }
+
snprintf(ent, maxlen, "%s/%s", cgroup_path, "cgroup.procs");
fd = open(ent, O_WRONLY);
- assert(fd != -1);
- dprintf(fd, "%d", pid);
- close(fd);
+ if (fd < 0) {
+ ERROR("can't open %s: %m\n", cgroup_path);
+ } else {
+ dprintf(fd, "%d", pid);
+ close(fd);
+ }
free(ent);
-
- cgroups_free();
}
enum {
l = avl_find_element(iomax, &d, l, avl);
if (!l) {
l = malloc(sizeof(struct iomax_line));
- assert(l != NULL);
+ if (!l)
+ exit(ENOMEM);
+
l->dev.major = d.major;
l->dev.minor = d.minor;
l->avl.key = &l->dev;
++numweightstrs;
weightstrs = calloc(numweightstrs + 1, sizeof(char *));
- assert(weightstrs != 0);
+ if (!weightstrs)
+ exit(ENOMEM);
+
numweightstrs = 0;
if (weight > -1)
- asprintf(&weightstrs[numweightstrs++], "default %d", weight);
+ if (asprintf(&weightstrs[numweightstrs++], "default %d", weight) < 0)
+ return ENOMEM;
blobmsg_for_each_attr(cur, tb[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE], rem) {
uint64_t major, minor;
major = blobmsg_cast_u64(tbwd[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_MAJOR]);
minor = blobmsg_cast_u64(tbwd[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_MINOR]);
- asprintf(&weightstrs[numweightstrs++], "%" PRIu64 ":%" PRIu64 " %u", major, minor, devweight);
+ if (asprintf(&weightstrs[numweightstrs++], "%" PRIu64 ":%" PRIu64 " %u", major, minor, devweight) < 0)
+ return ENOMEM;
}
if (numweightstrs) {
strtotlen += strlen(*(curstr++)) + 1;
weightstr = calloc(strtotlen, sizeof(char));
- assert(weightstr != 0);
+ if (!weightstr)
+ exit(ENOMEM);
curstr = weightstrs;
while (*curstr) {
return 0;
iomaxstrs = calloc(numiomaxstrs + 1, sizeof(char *));
- assert(iomaxstrs != 0);
+ if (!iomaxstrs)
+ exit(ENOMEM);
+
numiomaxstrs = 0;
avl_for_each_element(&iomax, curiomax, avl) {
strtotlen += strlen(*(curstr++)) + 1; /* +1 accounts for \n at end of line */
iomaxstr = calloc(strtotlen, sizeof(char));
- assert(iomaxstr != 0);
+ if (!iomaxstr)
+ exit(ENOMEM);
+
curstr = iomaxstrs;
while (*curstr) {
[OCI_LINUX_CGROUPS_MEMORY_LIMIT] = { "limit", BLOBMSG_CAST_INT64 }, /* signed int64! */
[OCI_LINUX_CGROUPS_MEMORY_RESERVATION] = { "reservation", BLOBMSG_CAST_INT64 }, /* signed int64! */
[OCI_LINUX_CGROUPS_MEMORY_SWAP] = { "swap", BLOBMSG_CAST_INT64 }, /* signed int64! */
- [OCI_LINUX_CGROUPS_MEMORY_KERNEL] = { "kernel", BLOBMSG_CAST_INT64 }, /* signed int64! */
- [OCI_LINUX_CGROUPS_MEMORY_KERNELTCP] = { "kernelTCP", BLOBMSG_CAST_INT64 }, /* signed int64! */
+ [OCI_LINUX_CGROUPS_MEMORY_KERNEL] = { "kernel", BLOBMSG_CAST_INT64 }, /* signed int64! ignored */
+ [OCI_LINUX_CGROUPS_MEMORY_KERNELTCP] = { "kernelTCP", BLOBMSG_CAST_INT64 }, /* signed int64! ignored */
[OCI_LINUX_CGROUPS_MEMORY_SWAPPINESS] = { "swappiness", BLOBMSG_CAST_INT64 },
[OCI_LINUX_CGROUPS_MEMORY_DISABLEOOMKILLER] = { "disableOOMKiller", BLOBMSG_TYPE_BOOL },
[OCI_LINUX_CGROUPS_MEMORY_USEHIERARCHY] { "useHierarchy", BLOBMSG_TYPE_BOOL },
{
struct blob_attr *tb[__OCI_LINUX_CGROUPS_MEMORY_MAX];
char tmp[32] = { 0 };
- int64_t limit, swap, reservation;
+ int64_t limit = -1, swap, reservation;
blobmsg_parse(oci_linux_cgroups_memory_policy, __OCI_LINUX_CGROUPS_MEMORY_MAX, tb, blobmsg_data(msg), blobmsg_len(msg));
- if (tb[OCI_LINUX_CGROUPS_MEMORY_KERNEL] ||
- tb[OCI_LINUX_CGROUPS_MEMORY_KERNELTCP] ||
- tb[OCI_LINUX_CGROUPS_MEMORY_SWAPPINESS] ||
+ /*
+ * not all properties of the OCI memory section can be mapped to cgroup2
+ * kernel memory accounting is always enabled and included in the set
+ * memory limit, hence these options can be ignored
+ * disableOOMKiller could be emulated using oom_score_adj + seccomp eBPF
+ * preventing self-upgrade (but allow downgrade)
+ *
+ * see also https://github.com/opencontainers/runtime-spec/issues/1005
+ */
+ if (tb[OCI_LINUX_CGROUPS_MEMORY_SWAPPINESS] ||
tb[OCI_LINUX_CGROUPS_MEMORY_DISABLEOOMKILLER] ||
tb[OCI_LINUX_CGROUPS_MEMORY_USEHIERARCHY])
- return ENOTSUP; /* no equivalent in cgroup2 */
+ return ENOTSUP;
+
if (tb[OCI_LINUX_CGROUPS_MEMORY_LIMIT]) {
limit = blobmsg_cast_s64(tb[OCI_LINUX_CGROUPS_MEMORY_LIMIT]);
cgroups_set("memory.low", tmp);
}
+ /* OCI 'swap' accounts for memory+swap */
if (tb[OCI_LINUX_CGROUPS_MEMORY_SWAP]) {
swap = blobmsg_cast_s64(tb[OCI_LINUX_CGROUPS_MEMORY_SWAP]);
if (swap == -1)
strcpy(tmp, "max");
- else
+ else if (limit == -1 || (limit < swap))
snprintf(tmp, sizeof(tmp), "%" PRId64, swap);
+ else
+ snprintf(tmp, sizeof(tmp), "%" PRId64, limit - swap);
cgroups_set("memory.swap_max", tmp);
}
if (blobmsg_type(cur) != BLOBMSG_TYPE_STRING)
return EINVAL;
+ /* restrict keys */
+ if (strchr(blobmsg_name(cur), '/') ||
+ !strcmp(blobmsg_name(cur), "cgroup.subtree_control") ||
+ !strcmp(blobmsg_name(cur), "cgroup.procs") ||
+ !strcmp(blobmsg_name(cur), "cgroup.threads") ||
+ !strcmp(blobmsg_name(cur), "cgroup.freeze"))
+ return EINVAL;
+
cgroups_set(blobmsg_name(cur), blobmsg_get_string(cur));
}
blobmsg_parse(oci_linux_cgroups_policy, __OCI_LINUX_CGROUPS_MAX, tb, blobmsg_data(msg), blobmsg_len(msg));
- if (tb[OCI_LINUX_CGROUPS_DEVICES] ||
- tb[OCI_LINUX_CGROUPS_HUGEPAGELIMITS] ||
+ if (tb[OCI_LINUX_CGROUPS_HUGEPAGELIMITS] ||
tb[OCI_LINUX_CGROUPS_INTELRDT] ||
tb[OCI_LINUX_CGROUPS_NETWORK] ||
tb[OCI_LINUX_CGROUPS_RDMA])
return ret;
}
+ if (tb[OCI_LINUX_CGROUPS_DEVICES]) {
+ ret = parseOCIlinuxcgroups_devices(tb[OCI_LINUX_CGROUPS_DEVICES]);
+ if (ret)
+ return ret;
+ }
+
if (tb[OCI_LINUX_CGROUPS_MEMORY]) {
ret = parseOCIlinuxcgroups_legacy_memory(tb[OCI_LINUX_CGROUPS_MEMORY]);
if (ret)