jail: add support for running OCI bundle
authorDaniel Golle <daniel@makrotopia.org>
Fri, 10 Jul 2020 09:56:58 +0000 (10:56 +0100)
committerDaniel Golle <daniel@makrotopia.org>
Fri, 10 Jul 2020 17:31:52 +0000 (18:31 +0100)
Prepare ujail for running OCI bundled Linux containers.
This adds handling of most of the JSON schema defined by the
Open Container Initiative Runtime Specification.

What is supported by this commit:
 * basic OCI process definition
 * seccomp filters (no args yet)
 * capabilities (100%)
 * namespaces (100%)
 * uid/gid mappings for userns (100%)
 * mounts (no free form mounts yet)
 * env (100%, limited to a low number of entries)
 * hostname (100%)
 * terminal (no consoleSize yet)

What is still missing:
 * complex mounts
 * maskedPaths, readonlyPaths
 * referencing existing namespaces
 * all hooks
 * rlimits
 * oomScoreAdj
 * additionalGids
 * cgroups
 * devices
 * sysctl
 * rootfsPropagation
 * personality and bi-arch (ie. 32-bit container on 64-bit host)

Signed-off-by: Daniel Golle <daniel@makrotopia.org>
CMakeLists.txt
jail/capabilities.c
jail/capabilities.h
jail/jail.c
jail/seccomp-bpf.h
jail/seccomp-oci.c [new file with mode: 0644]
jail/seccomp-oci.h [new file with mode: 0644]
jail/seccomp-syscalls-helpers.h [new file with mode: 0644]
jail/seccomp.c
service/instance.c
service/instance.h

index 3eb79f9ea08899e49d4098eeeadbc08dcb429b32..8084674dc1e07f3bfae6079f267d0e79b24f7d27 100644 (file)
@@ -103,8 +103,12 @@ INSTALL(TARGETS preload-seccomp
 ADD_DEPENDENCIES(preload-seccomp syscall-names-h)
 endif()
 
+IF(SECCOMP_SUPPORT)
+  SET(SOURCES_OCI_SECCOMP jail/seccomp-oci.c)
+ENDIF()
+
 IF(JAIL_SUPPORT)
-ADD_EXECUTABLE(ujail jail/jail.c jail/elf.c jail/fs.c jail/capabilities.c)
+ADD_EXECUTABLE(ujail jail/jail.c jail/elf.c jail/fs.c jail/capabilities.c ${SOURCES_OCI_SECCOMP})
 TARGET_LINK_LIBRARIES(ujail ${ubox} ${ubus} ${blobmsg_json})
 INSTALL(TARGETS ujail
        RUNTIME DESTINATION ${CMAKE_INSTALL_SBINDIR}
index 76e06a6d5c7a60511adbe9c52d89982a78049b24..3c95f81199b2640ec88991d9172f83f57a5bf989 100644 (file)
@@ -1,5 +1,6 @@
 /*
  * Copyright (C) 2015 Etienne CHAMPETIER <champetier.etienne@gmail.com>
+ * Copyright (C) 2020 Daniel Golle <daniel@makrotopia.org>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License version 2.1
@@ -14,6 +15,7 @@
 #define _GNU_SOURCE 1
 #include <syslog.h>
 #include <sys/prctl.h>
+#include <sys/capability.h>
 
 #include <libubox/blobmsg.h>
 #include <libubox/blobmsg_json.h>
 #include "../capabilities-names.h"
 #include "capabilities.h"
 
+#define JAIL_CAP_ERROR (1LLU << (CAP_LAST_CAP+1))
+#define JAIL_CAP_ALL (0xffffffffffffffffLLU)
+
 static int find_capabilities(const char *name)
 {
        int i;
 
        for (i = 0; i <= CAP_LAST_CAP; i++)
-               if (capabilities_names[i] && !strcmp(capabilities_names[i], name))
+               if (capabilities_names[i] && !strcasecmp(capabilities_names[i], name))
                        return i;
 
        return -1;
 }
 
+/* Indices of the capability sets looked up in an OCI "capabilities" object. */
+enum {
+       OCI_CAPABILITIES_BOUNDING,
+       OCI_CAPABILITIES_EFFECTIVE,
+       OCI_CAPABILITIES_INHERITABLE,
+       OCI_CAPABILITIES_PERMITTED,
+       OCI_CAPABILITIES_AMBIENT,
+       __OCI_CAPABILITIES_MAX
+};
+
+/* blobmsg policy: JSON key of each capability set; every set is an array. */
+static const struct blobmsg_policy oci_capabilities_policy[] = {
+       [OCI_CAPABILITIES_BOUNDING] = { "bounding", BLOBMSG_TYPE_ARRAY },
+       [OCI_CAPABILITIES_EFFECTIVE] = { "effective", BLOBMSG_TYPE_ARRAY },
+       [OCI_CAPABILITIES_INHERITABLE] = { "inheritable", BLOBMSG_TYPE_ARRAY },
+       [OCI_CAPABILITIES_PERMITTED] = { "permitted", BLOBMSG_TYPE_ARRAY },
+       [OCI_CAPABILITIES_AMBIENT] = { "ambient", BLOBMSG_TYPE_ARRAY },
+};
+
+/*
+ * Convert one OCI capability array into a bitmask.
+ *
+ * Returns JAIL_CAP_ALL when the array is absent (msg == NULL),
+ * JAIL_CAP_ERROR as soon as an entry is not known to find_capabilities(),
+ * and otherwise a mask with bit N set for every listed capability number N.
+ */
+static uint64_t parseOCIcap(struct blob_attr *msg)
+{
+       struct blob_attr *entry;
+       uint64_t mask = 0;
+       int left;
+
+       /* every capset is optional; an absent one is reported as all-ones */
+       if (msg == NULL)
+               return JAIL_CAP_ALL;
+
+       blobmsg_for_each_attr(entry, msg, left) {
+               int idx = find_capabilities(blobmsg_get_string(entry));
+
+               if (idx < 0)
+                       return JAIL_CAP_ERROR;
+
+               mask |= 1LLU << idx;
+       }
+
+       return mask;
+}
+
+/*
+ * Parse the OCI "capabilities" object in msg into *capset.
+ *
+ * The sets are handled in enum order (bounding, effective, inheritable,
+ * permitted, ambient). The first set containing an unknown capability name
+ * aborts parsing with EINVAL; sets handled before it have already been
+ * stored. On success capset->apply is set to 1 and 0 is returned.
+ */
+int parseOCIcapabilities(struct jail_capset *capset, struct blob_attr *msg)
+{
+       struct blob_attr *tb[__OCI_CAPABILITIES_MAX];
+       /* destination field for each policy index */
+       uint64_t *const target[__OCI_CAPABILITIES_MAX] = {
+               [OCI_CAPABILITIES_BOUNDING] = &capset->bounding,
+               [OCI_CAPABILITIES_EFFECTIVE] = &capset->effective,
+               [OCI_CAPABILITIES_INHERITABLE] = &capset->inheritable,
+               [OCI_CAPABILITIES_PERMITTED] = &capset->permitted,
+               [OCI_CAPABILITIES_AMBIENT] = &capset->ambient,
+       };
+       uint64_t mask;
+       int set;
+
+       blobmsg_parse(oci_capabilities_policy, __OCI_CAPABILITIES_MAX, tb, blobmsg_data(msg), blobmsg_len(msg));
+
+       for (set = 0; set < __OCI_CAPABILITIES_MAX; set++) {
+               mask = parseOCIcap(tb[set]);
+               if (mask == JAIL_CAP_ERROR)
+                       return EINVAL;
+
+               *target[set] = mask;
+       }
+
+       capset->apply = 1;
+
+       return 0;
+}
+
+
+/*
+ * Apply a capability configuration to the calling process.
+ *
+ * Does nothing (returns 0) unless ocicapset.apply is set. A set whose mask
+ * equals JAIL_CAP_ALL is left as it currently is. Otherwise:
+ *  - capabilities missing from the bounding mask are dropped from the
+ *    bounding set (one requested but not present is only reported),
+ *  - effective, permitted and inheritable sets are replaced via capset(),
+ *  - ambient capabilities are raised or lowered to match the ambient mask.
+ *
+ * Returns 0 on success, the errno of a failing prctl(), or -1 when
+ * capget()/capset() fail.
+ */
+int applyOCIcapabilities(struct jail_capset ocicapset)
+{
+       struct __user_cap_header_struct uh = {};
+       /*
+        * _LINUX_CAPABILITY_VERSION_3 transfers each 64-bit set as
+        * _LINUX_CAPABILITY_U32S_3 32-bit words, so the kernel reads and
+        * writes that many data structs; a single struct would be overrun.
+        */
+       struct __user_cap_data_struct ud[_LINUX_CAPABILITY_U32S_3];
+       uint64_t effective = 0, permitted = 0, inheritable = 0;
+       int cap;
+       int is_set;
+       int word;
+       int err;
+
+       if (!ocicapset.apply)
+               return 0;
+
+       /* drop from bounding set */
+       if (ocicapset.bounding != JAIL_CAP_ALL) {
+               for (cap = 0; cap <= CAP_LAST_CAP; cap++) {
+                       if (!prctl(PR_CAPBSET_READ, cap, 0, 0, 0)) {
+                               /* can't raise */
+                               if (ocicapset.bounding & (1LLU << cap))
+                                       ERROR("capability %s (%d) is not in bounding set\n", capabilities_names[cap], cap);
+
+                               continue;
+                       }
+                       if ( (ocicapset.bounding & (1LLU << cap)) == 0) {
+                               DEBUG("dropping capability %s (%d) from bounding set\n", capabilities_names[cap], cap);
+                               if (prctl(PR_CAPBSET_DROP, cap, 0, 0, 0)) {
+                                       /* keep errno, logging may overwrite it */
+                                       err = errno;
+                                       ERROR("prctl(PR_CAPBSET_DROP, %d) failed: %m\n", cap);
+                                       return err;
+                               }
+                       } else {
+                               DEBUG("keeping capability %s (%d)\n", capabilities_names[cap], cap);
+                       }
+               }
+       }
+
+       /* set effective, permitted and inheritable */
+       uh.version = _LINUX_CAPABILITY_VERSION_3;
+       uh.pid = getpid();
+
+       if (capget(&uh, ud)) {
+               ERROR("capget() failed\n");
+               return -1;
+       }
+
+       /* assemble the current 64-bit sets from their 32-bit words */
+       for (word = 0; word < _LINUX_CAPABILITY_U32S_3; word++) {
+               effective |= (uint64_t)ud[word].effective << (32 * word);
+               permitted |= (uint64_t)ud[word].permitted << (32 * word);
+               inheritable |= (uint64_t)ud[word].inheritable << (32 * word);
+       }
+
+       DEBUG("old capabilities: Pe=%016llx Pp=%016llx Pi=%016llx\n",
+               (unsigned long long)effective,
+               (unsigned long long)permitted,
+               (unsigned long long)inheritable);
+
+       if (ocicapset.effective != JAIL_CAP_ALL)
+               effective = ocicapset.effective;
+
+       if (ocicapset.permitted != JAIL_CAP_ALL)
+               permitted = ocicapset.permitted;
+
+       if (ocicapset.inheritable != JAIL_CAP_ALL)
+               inheritable = ocicapset.inheritable;
+
+       DEBUG("new capabilities: Pe=%016llx Pp=%016llx Pi=%016llx\n",
+               (unsigned long long)effective,
+               (unsigned long long)permitted,
+               (unsigned long long)inheritable);
+
+       /* split the 64-bit sets back into the words capset() expects */
+       for (word = 0; word < _LINUX_CAPABILITY_U32S_3; word++) {
+               ud[word].effective = (uint32_t)(effective >> (32 * word));
+               ud[word].permitted = (uint32_t)(permitted >> (32 * word));
+               ud[word].inheritable = (uint32_t)(inheritable >> (32 * word));
+       }
+
+       if (capset(&uh, ud)) {
+               ERROR("capset() failed\n");
+               return -1;
+       }
+
+       /* edit ambient set */
+       if (ocicapset.ambient != JAIL_CAP_ALL) {
+               for (cap = 0; cap <= CAP_LAST_CAP; cap++) {
+                       is_set = prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_IS_SET, cap, 0, 0);
+                       if ( (ocicapset.ambient & (1LLU << cap)) == 0) {
+                               if (is_set) {
+                                       DEBUG("dropping capability %s (%d) from ambient set\n", capabilities_names[cap], cap);
+                                       if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_LOWER, cap, 0, 0)) {
+                                               err = errno;
+                                               ERROR("prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_LOWER, %d, 0, 0) failed: %m\n", cap);
+                                               return err;
+                                       }
+                               }
+                       } else {
+                               if (!is_set) {
+                                       DEBUG("raising capability %s (%d) to ambient set\n", capabilities_names[cap], cap);
+                                       if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, cap, 0, 0)) {
+                                               err = errno;
+                                               ERROR("prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, %d, 0, 0) failed: %m\n", cap);
+                                               return err;
+                                       }
+                               }
+                       }
+               }
+       }
+
+       return 0;
+}
+
 int drop_capabilities(const char *file)
 {
        enum {
@@ -81,7 +253,7 @@ int drop_capabilities(const char *file)
 
        if (capdrop == 0LLU) {
                DEBUG("cap.keep empty -> only dropping capabilities from cap.drop (blacklist)\n");
-               capdrop = 0xffffffffffffffffLLU;
+               capdrop = JAIL_CAP_ALL;
        } else {
                DEBUG("cap.keep has at least one capability -> dropping every capabilities not in cap.keep (whitelist)\n");
        }
index 11b8cc27e9d54b0cbc0e89e34153a8e97c031ee4..cc5f54d4fdc88058a1dbbb9e427f6909c13152e2 100644 (file)
 #ifndef _JAIL_CAPABILITIES_H_
 #define _JAIL_CAPABILITIES_H_
 
+#include <libubox/blobmsg.h>
+
+/*
+ * Capability configuration taken from an OCI spec.
+ * Each set is a bitmask in which bit N stands for capability number N;
+ * parseOCIcapabilities() stores an all-ones mask for a set that is absent.
+ */
+struct jail_capset {
+       uint64_t bounding;
+       uint64_t effective;
+       uint64_t inheritable;
+       uint64_t permitted;
+       uint64_t ambient;
+       uint8_t apply; /* set to 1 by parseOCIcapabilities(); applyOCIcapabilities() is a no-op while 0 */
+};
+
 int drop_capabilities(const char *file);
 
+int parseOCIcapabilities(struct jail_capset *capset, struct blob_attr *msg);
+int applyOCIcapabilities(struct jail_capset capset);
+
 #endif
index 45906904451bee621108f7e5df3fb12930fe2538..645c836f0eba61b17f1badb055b76fb3374b90f0 100644 (file)
@@ -28,6 +28,7 @@
 #include <libgen.h>
 #include <sched.h>
 #include <linux/limits.h>
+#include <linux/filter.h>
 #include <signal.h>
 
 #include "capabilities.h"
 #include "fs.h"
 #include "jail.h"
 #include "log.h"
+#include "seccomp-oci.h"
 
+#include <libubox/utils.h>
+#include <libubox/blobmsg.h>
+#include <libubox/blobmsg_json.h>
+#include <libubox/list.h>
+#include <libubox/vlist.h>
 #include <libubox/uloop.h>
 #include <libubus.h>
 
 #define STACK_SIZE     (1024 * 1024)
-#define OPT_ARGS       "S:C:n:h:r:w:d:psulocU:G:NR:fFO:T:Ey"
+#define OPT_ARGS       "S:C:n:h:r:w:d:psulocU:G:NR:fFO:T:EyJ:"
 
 static struct {
        char *name;
        char *hostname;
        char **jail_argv;
+       char *cwd;
        char *seccomp;
+       struct sock_fprog *ociseccomp;
        char *capabilities;
+       struct jail_capset capset;
        char *user;
        char *group;
        char *extroot;
        char *overlaydir;
        char *tmpoverlaysize;
+       char **envp;
+       char *uidmap;
+       char *gidmap;
        int no_new_privs;
        int namespace;
        int procfs;
@@ -65,6 +78,7 @@ static struct {
        int require_jail;
 } opts;
 
+static struct blob_buf ocibuf;
 
 extern int pivot_root(const char *new_root, const char *put_old);
 
@@ -154,9 +168,9 @@ int mount_bind(const char *root, const char *path, int readonly, int error) {
 }
 
 static int mount_overlay(char *jail_root, char *overlaydir) {
-       char *upperdir, *workdir, *optsstr;
+       char *upperdir, *workdir, *optsstr, *upperetc, *upperresolvconf;
        const char mountoptsformat[] = "lowerdir=%s,upperdir=%s,workdir=%s";
-       int ret = -1;
+       int ret = -1, fd;
 
        if (asprintf(&upperdir, "%s%s", overlaydir, "/upper") < 0)
                goto out;
@@ -170,6 +184,31 @@ static int mount_overlay(char *jail_root, char *overlaydir) {
        if (mkdir_p(upperdir, 0755) || mkdir_p(workdir, 0755))
                goto opts_printf;
 
+/*
+ * make sure /etc/resolv.conf exists in overlay and is owned by jail userns root
+ * this is to work-around a bug in overlayfs described in the overlayfs-userns
+ * patch:
+ * 3. modification of a file 'hithere' which is in l but not yet
+ * in u, and which is not owned by T, is not allowed, even if
+ * writes to u are allowed.  This may be a bug in overlayfs,
+ * but it is safe behavior.
+ */
+       if (asprintf(&upperetc, "%s/etc", upperdir) < 0)
+               goto opts_printf;
+
+       if (mkdir_p(upperetc, 0755))
+               goto upper_etc_printf;
+
+       if (asprintf(&upperresolvconf, "%s/resolv.conf", upperetc) < 0)
+               goto upper_etc_printf;
+
+       fd = creat(upperresolvconf, 0644);
+       if (fd == -1) {
+               ERROR("creat(%s) failed: %m\n", upperresolvconf);
+               goto upper_resolvconf_printf;
+       }
+       close(fd);
+
        DEBUG("mount -t overlay %s %s (%s)\n", jail_root, jail_root, optsstr);
 
        if (mount(jail_root, jail_root, "overlay", MS_NOATIME, optsstr))
@@ -177,6 +216,10 @@ static int mount_overlay(char *jail_root, char *overlaydir) {
 
        ret = 0;
 
+upper_resolvconf_printf:
+       free(upperresolvconf);
+upper_etc_printf:
+       free(upperetc);
 opts_printf:
        free(optsstr);
 work_printf:
@@ -398,7 +441,29 @@ static int build_jail_fs(void)
        return 0;
 }
 
-static int write_uid_gid_map(pid_t child_pid, bool gidmap, int id)
+/*
+ * Write a prepared mapping string to /proc/<child_pid>/uid_map, or to
+ * /proc/<child_pid>/gid_map when gidmap is true.
+ *
+ * mapstr is free()d here once it has been written; on failure -1 is
+ * returned and mapstr is left untouched.
+ */
+static int write_uid_gid_map(pid_t child_pid, bool gidmap, char *mapstr)
+{
+       int map_file;
+       char map_path[64];
+
+       if (snprintf(map_path, sizeof(map_path), "/proc/%d/%s",
+               child_pid, gidmap?"gid_map":"uid_map") < 0)
+               return -1;
+
+       if ((map_file = open(map_path, O_WRONLY)) == -1)
+               return -1;
+
+       /*
+        * dprintf() returns the number of bytes written and a negative
+        * value on error, so only a negative result is a failure.
+        */
+       if (dprintf(map_file, "%s", mapstr) < 0) {
+               close(map_file);
+               return -1;
+       }
+
+       close(map_file);
+       free(mapstr);
+       return 0;
+}
+
+static int write_single_uid_gid_map(pid_t child_pid, bool gidmap, int id)
 {
        int map_file;
        char map_path[64];
@@ -433,7 +498,7 @@ static int write_setgroups(pid_t child_pid, bool allow)
                return -1;
        }
 
-       if (dprintf(setgroups_file, allow?"allow":"deny") == -1) {
+       if (dprintf(setgroups_file, "%s", allow?"allow":"deny") == -1) {
                close(setgroups_file);
                return -1;
        }
@@ -475,7 +540,7 @@ static void get_jail_user(int *user, int *user_gid, int *gr_gid)
 
 static void set_jail_user(int pw_uid, int user_gid, int gr_gid)
 {
-       if ((user_gid != -1) && initgroups(opts.user, user_gid)) {
+       if (opts.user && (user_gid != -1) && initgroups(opts.user, user_gid)) {
                ERROR("failed to initgroups() for user %s: %m\n", opts.user);
                exit(EXIT_FAILURE);
        }
@@ -492,7 +557,7 @@ static void set_jail_user(int pw_uid, int user_gid, int gr_gid)
 }
 
 #define MAX_ENVP       8
-static char** build_envp(const char *seccomp)
+static char** build_envp(const char *seccomp, char **ocienvp)
 {
        static char *envp[MAX_ENVP];
        static char preload_var[PATH_MAX];
@@ -500,6 +565,8 @@ static char** build_envp(const char *seccomp)
        static char debug_var[] = "LD_DEBUG=all";
        static char container_var[] = "container=ujail";
        const char *preload_lib = find_lib("libpreload-seccomp.so");
+       char **addenv;
+
        int count = 0;
 
        if (seccomp && !preload_lib) {
@@ -518,6 +585,14 @@ static char** build_envp(const char *seccomp)
        if (debug > 1)
                envp[count++] = debug_var;
 
+       addenv = ocienvp;
+       while (addenv && *addenv) {
+               envp[count++] = *(addenv++);
+               if (count >= MAX_ENVP) {
+                       ERROR("environment limited to %d extra records, truncating\n", MAX_ENVP);
+                       break;
+               }
+       }
        return envp;
 }
 
@@ -548,6 +623,7 @@ static void usage(void)
        fprintf(stderr, "  -T <size>\tuse tmpfs r/w overlayfs with <size>\n");
        fprintf(stderr, "  -E\t\tfail if jail cannot be setup\n");
        fprintf(stderr, "  -y\t\tprovide jail console\n");
+       fprintf(stderr, "  -J <dir>\tstart OCI bundle\n");
        fprintf(stderr, "\nWarning: by default root inside the jail is the same\n\
 and he has the same powers as root outside the jail,\n\
 thus he can escape the jail and/or break stuff.\n\
@@ -584,18 +660,18 @@ static int exec_jail(void *pipes_ptr)
        close(pipes[2]);
 
        if (opts.namespace & CLONE_NEWUSER) {
-               if (setgid(0) < 0) {
+               if (setregid(0, 0) < 0) {
                        ERROR("setgid\n");
                        exit(EXIT_FAILURE);
                }
-               if (setuid(0) < 0) {
+               if (setreuid(0, 0) < 0) {
                        ERROR("setuid\n");
                        exit(EXIT_FAILURE);
                }
-//             if (setgroups(0, NULL) < 0) {
-//                     ERROR("setgroups\n");
-//                     exit(EXIT_FAILURE);
-//             }
+               if (setgroups(0, NULL) < 0) {
+                       ERROR("setgroups\n");
+                       exit(EXIT_FAILURE);
+               }
        }
 
        if (opts.namespace && opts.hostname && strlen(opts.hostname) > 0
@@ -609,6 +685,9 @@ static int exec_jail(void *pipes_ptr)
                exit(EXIT_FAILURE);
        }
 
+       if (applyOCIcapabilities(opts.capset))
+               exit(EXIT_FAILURE);
+
        if (opts.capabilities && drop_capabilities(opts.capabilities))
                exit(EXIT_FAILURE);
 
@@ -619,13 +698,17 @@ static int exec_jail(void *pipes_ptr)
 
        if (!(opts.namespace & CLONE_NEWUSER)) {
                get_jail_user(&pw_uid, &pw_gid, &gr_gid);
-               set_jail_user(pw_uid, pw_gid, gr_gid);
+
+               set_jail_user(opts.pw_uid?:pw_uid, opts.pw_gid?:pw_gid, opts.gr_gid?:gr_gid);
        }
 
-       char **envp = build_envp(opts.seccomp);
+       char **envp = build_envp(opts.seccomp, opts.envp);
        if (!envp)
                exit(EXIT_FAILURE);
 
+       if (opts.ociseccomp && applyOCIlinuxseccomp(opts.ociseccomp))
+               exit(EXIT_FAILURE);
+
        INFO("exec-ing %s\n", *opts.jail_argv);
        execve(*opts.jail_argv, opts.jail_argv, envp);
        /* we get there only if execve fails */
@@ -702,12 +785,482 @@ static void netns_updown(pid_t pid, bool start)
        ubus_free(ctx);
 }
 
+
+/* Indices into oci_root_policy. */
+enum {
+       OCI_ROOT_PATH,
+       OCI_ROOT_READONLY,
+       __OCI_ROOT_MAX,
+};
+
+/* blobmsg policy for the OCI "root" object. */
+static const struct blobmsg_policy oci_root_policy[] = {
+       [OCI_ROOT_PATH] = { "path", BLOBMSG_TYPE_STRING },
+       [OCI_ROOT_READONLY] = { "readonly", BLOBMSG_TYPE_BOOL },
+};
+
+/*
+ * Parse the OCI "root" object.
+ *
+ * opts.extroot is set to the directory part of jsonfile (everything up to
+ * and including the last '/') followed by the "path" value; the string
+ * lives in a static buffer. opts.ronly is taken from "readonly" when that
+ * key is present and left unchanged otherwise.
+ *
+ * Returns 0 on success, ENODATA if "path" is missing, ENOTDIR if jsonfile
+ * contains no '/', ENAMETOOLONG if the result does not fit in PATH_MAX.
+ */
+static int parseOCIroot(const char *jsonfile, struct blob_attr *msg)
+{
+       static char rootpath[PATH_MAX] = { 0 };
+       struct blob_attr *tb[__OCI_ROOT_MAX];
+       const char *slash;
+       int dirlen;
+       int len;
+
+       blobmsg_parse(oci_root_policy, __OCI_ROOT_MAX, tb, blobmsg_data(msg), blobmsg_len(msg));
+
+       if (!tb[OCI_ROOT_PATH])
+               return ENODATA;
+
+       slash = strrchr(jsonfile, '/');
+       if (!slash)
+               return ENOTDIR;
+
+       /* keep the trailing '/' of the bundle directory */
+       dirlen = (int)(slash - jsonfile) + 1;
+
+       /* snprintf always terminates and lets us detect truncation */
+       len = snprintf(rootpath, sizeof(rootpath), "%.*s%s", dirlen, jsonfile,
+                      blobmsg_get_string(tb[OCI_ROOT_PATH]));
+       if (len < 0 || (size_t)len >= sizeof(rootpath))
+               return ENAMETOOLONG;
+
+       opts.extroot = rootpath;
+
+       /* "readonly" is optional, don't read a missing attribute */
+       if (tb[OCI_ROOT_READONLY])
+               opts.ronly = blobmsg_get_bool(tb[OCI_ROOT_READONLY]);
+
+       return 0;
+}
+
+
+/* Indices into oci_mount_policy. */
+enum {
+       OCI_MOUNT_SOURCE,
+       OCI_MOUNT_DESTINATION,
+       OCI_MOUNT_TYPE,
+       OCI_MOUNT_OPTIONS,
+       __OCI_MOUNT_MAX,
+};
+
+/* blobmsg policy for one entry of the OCI "mounts" array. */
+static const struct blobmsg_policy oci_mount_policy[] = {
+       [OCI_MOUNT_SOURCE] = { "source", BLOBMSG_TYPE_STRING },
+       [OCI_MOUNT_DESTINATION] = { "destination", BLOBMSG_TYPE_STRING },
+       [OCI_MOUNT_TYPE] = { "type", BLOBMSG_TYPE_STRING },
+       [OCI_MOUNT_OPTIONS] = { "options", BLOBMSG_TYPE_ARRAY },
+};
+
+/*
+ * Handle one OCI mount entry.
+ *
+ * Only three mounts are recognised: proc on /proc (sets opts.procfs),
+ * sysfs on /sys (sets opts.sysfs) and tmpfs on /dev (nothing to do).
+ * Any other mount is logged and ignored.
+ *
+ * Returns 0, or EINVAL when "destination" is missing.
+ */
+static int parseOCImount(struct blob_attr *msg)
+{
+       struct blob_attr *tb[__OCI_MOUNT_MAX];
+       const char *destination;
+       const char *source = "";
+       const char *type = "";
+       char *options = NULL;
+
+       blobmsg_parse(oci_mount_policy, __OCI_MOUNT_MAX, tb, blobmsg_data(msg), blobmsg_len(msg));
+
+       if (!tb[OCI_MOUNT_DESTINATION])
+               return EINVAL;
+
+       destination = blobmsg_get_string(tb[OCI_MOUNT_DESTINATION]);
+
+       /* "type" and "source" may be absent; never hand NULL to strcmp/printf */
+       if (tb[OCI_MOUNT_TYPE])
+               type = blobmsg_get_string(tb[OCI_MOUNT_TYPE]);
+
+       if (tb[OCI_MOUNT_SOURCE])
+               source = blobmsg_get_string(tb[OCI_MOUNT_SOURCE]);
+
+       if (!strcmp("proc", type) && !strcmp("/proc", destination)) {
+               opts.procfs = true;
+               return 0;
+       }
+
+       if (!strcmp("sysfs", type) && !strcmp("/sys", destination)) {
+               opts.sysfs = true;
+               return 0;
+       }
+
+       if (!strcmp("tmpfs", type) && !strcmp("/dev", destination)) {
+               /* we always mount a small tmpfs on /dev */
+               return 0;
+       }
+
+       if (tb[OCI_MOUNT_OPTIONS])
+               options = blobmsg_format_json(tb[OCI_MOUNT_OPTIONS], true);
+
+       INFO("ignoring unsupported mount %s %s -t %s -o %s\n",
+               source, destination, type, options ? options : "");
+
+       /* blobmsg_format_json() returns an allocated string */
+       free(options);
+
+       return 0;
+}
+
+
+/* Indices into oci_process_user_policy. */
+enum {
+       OCI_PROCESS_USER_UID,
+       OCI_PROCESS_USER_GID,
+       OCI_PROCESS_USER_UMASK,
+       OCI_PROCESS_USER_ADDITIONALGIDS,
+       __OCI_PROCESS_USER_MAX,
+};
+
+/* blobmsg policy for the "user" object of an OCI process. */
+static const struct blobmsg_policy oci_process_user_policy[] = {
+       [OCI_PROCESS_USER_UID] = { "uid", BLOBMSG_TYPE_INT32 },
+       [OCI_PROCESS_USER_GID] = { "gid", BLOBMSG_TYPE_INT32 },
+       [OCI_PROCESS_USER_UMASK] = { "umask", BLOBMSG_TYPE_INT32 },
+       [OCI_PROCESS_USER_ADDITIONALGIDS] = { "additionalGids", BLOBMSG_TYPE_ARRAY },
+};
+
+/*
+ * Copy "uid" and "gid" of the OCI process user into opts.
+ * A present "gid" is stored in both opts.pw_gid and opts.gr_gid.
+ * "umask" and "additionalGids" are parsed but not used yet.
+ * Always returns 0.
+ */
+static int parseOCIprocessuser(struct blob_attr *msg) {
+       struct blob_attr *tb[__OCI_PROCESS_USER_MAX];
+       struct blob_attr *uid_attr, *gid_attr;
+
+       blobmsg_parse(oci_process_user_policy, __OCI_PROCESS_USER_MAX, tb, blobmsg_data(msg), blobmsg_len(msg));
+
+       uid_attr = tb[OCI_PROCESS_USER_UID];
+       gid_attr = tb[OCI_PROCESS_USER_GID];
+
+       if (uid_attr != NULL)
+               opts.pw_uid = blobmsg_get_u32(uid_attr);
+
+       if (gid_attr != NULL) {
+               opts.pw_gid = blobmsg_get_u32(gid_attr);
+               opts.gr_gid = blobmsg_get_u32(gid_attr);
+       }
+
+       /* ToDo: umask, additional GIDs */
+
+       return 0;
+}
+
+/* Indices into oci_process_policy. */
+enum {
+       OCI_PROCESS_ARGS,
+       OCI_PROCESS_CAPABILITIES,
+       OCI_PROCESS_CWD,
+       OCI_PROCESS_ENV,
+       OCI_PROCESS_NONEWPRIVILEGES,
+       OCI_PROCESS_RLIMITS,
+       OCI_PROCESS_TERMINAL,
+       OCI_PROCESS_USER,
+       __OCI_PROCESS_MAX,
+};
+
+/* blobmsg policy for the OCI "process" object. */
+static const struct blobmsg_policy oci_process_policy[] = {
+       [OCI_PROCESS_ARGS] = { "args", BLOBMSG_TYPE_ARRAY },
+       [OCI_PROCESS_CAPABILITIES] = { "capabilities", BLOBMSG_TYPE_TABLE },
+       [OCI_PROCESS_CWD] = { "cwd", BLOBMSG_TYPE_STRING },
+       [OCI_PROCESS_ENV] = { "env", BLOBMSG_TYPE_ARRAY },
+       [OCI_PROCESS_NONEWPRIVILEGES] = { "noNewPrivileges", BLOBMSG_TYPE_BOOL },
+       [OCI_PROCESS_RLIMITS] = { "rlimits", BLOBMSG_TYPE_ARRAY },
+       [OCI_PROCESS_TERMINAL] = { "terminal", BLOBMSG_TYPE_BOOL },
+       [OCI_PROCESS_USER] = { "user", BLOBMSG_TYPE_TABLE },
+};
+
+/*
+ * Parse the OCI "process" object into opts.
+ *
+ * Fills opts.jail_argv (NULL-terminated, entries point into the parsed
+ * blob), opts.envp (NULL-terminated, entries are strdup()ed), opts.cwd,
+ * and, when the respective key is present, opts.console and
+ * opts.no_new_privs. "user" and "capabilities" are delegated to
+ * parseOCIprocessuser() and parseOCIcapabilities().
+ *
+ * Returns 0 on success, ENOENT if "args" is missing, ENODATA if it is
+ * empty, ENOMEM on allocation failure, or the error of a sub-parser.
+ */
+static int parseOCIprocess(struct blob_attr *msg)
+{
+       struct blob_attr *tb[__OCI_PROCESS_MAX];
+       struct blob_attr *cur;
+       unsigned int sz = 0;
+       int rem;
+       int res;
+
+       blobmsg_parse(oci_process_policy, __OCI_PROCESS_MAX, tb, blobmsg_data(msg), blobmsg_len(msg));
+
+       if (!tb[OCI_PROCESS_ARGS])
+               return ENOENT;
+
+       blobmsg_for_each_attr(cur, tb[OCI_PROCESS_ARGS], rem)
+               ++sz;
+
+       if (!sz)
+               return ENODATA;
+
+       /* one extra zeroed slot terminates the vector */
+       opts.jail_argv = calloc(1 + sz, sizeof(char*));
+       if (!opts.jail_argv)
+               return ENOMEM;
+
+       sz = 0;
+       blobmsg_for_each_attr(cur, tb[OCI_PROCESS_ARGS], rem)
+               opts.jail_argv[sz++] = blobmsg_get_string(cur);
+
+       /* both flags are optional, don't read a missing attribute */
+       if (tb[OCI_PROCESS_TERMINAL])
+               opts.console = blobmsg_get_bool(tb[OCI_PROCESS_TERMINAL]);
+
+       if (tb[OCI_PROCESS_NONEWPRIVILEGES])
+               opts.no_new_privs = blobmsg_get_bool(tb[OCI_PROCESS_NONEWPRIVILEGES]);
+
+       if (tb[OCI_PROCESS_CWD])
+               opts.cwd = blobmsg_get_string(tb[OCI_PROCESS_CWD]);
+
+       sz = 0;
+       blobmsg_for_each_attr(cur, tb[OCI_PROCESS_ENV], rem)
+               ++sz;
+
+       if (sz > 0) {
+               opts.envp = calloc(1 + sz, sizeof(char*));
+               if (!opts.envp)
+                       return ENOMEM;
+
+               sz = 0;
+               blobmsg_for_each_attr(cur, tb[OCI_PROCESS_ENV], rem) {
+                       opts.envp[sz] = strdup(blobmsg_get_string(cur));
+                       if (!opts.envp[sz])
+                               return ENOMEM;
+
+                       ++sz;
+               }
+       }
+
+       if (tb[OCI_PROCESS_USER] && (res = parseOCIprocessuser(tb[OCI_PROCESS_USER])))
+               return res;
+
+       if (tb[OCI_PROCESS_CAPABILITIES] &&
+           (res = parseOCIcapabilities(&opts.capset, tb[OCI_PROCESS_CAPABILITIES])))
+               return res;
+
+       /* ToDo: rlimits */
+
+       return 0;
+}
+
+/* Indices into oci_linux_namespace_policy. */
+enum {
+       OCI_LINUX_NAMESPACE_TYPE,
+       OCI_LINUX_NAMESPACE_PATH,
+       __OCI_LINUX_NAMESPACE_MAX,
+};
+
+/* blobmsg policy for one entry of the "namespaces" array; both keys are strings. */
+static const struct blobmsg_policy oci_linux_namespace_policy[] = {
+       [OCI_LINUX_NAMESPACE_TYPE] = { "type", BLOBMSG_TYPE_STRING },
+       [OCI_LINUX_NAMESPACE_PATH] = { "path", BLOBMSG_TYPE_STRING },
+};
+
+/*
+ * Map an OCI namespace type name to its CLONE_NEW* flag.
+ * Returns 0 for a name that is not in the table.
+ */
+static unsigned int resolve_nstype(char *type) {
+       static const struct {
+               const char *name;
+               unsigned int flag;
+       } nstypes[] = {
+               { "pid", CLONE_NEWPID },
+               { "network", CLONE_NEWNET },
+               { "mount", CLONE_NEWNS },
+               { "ipc", CLONE_NEWIPC },
+               { "uts", CLONE_NEWUTS },
+               { "user", CLONE_NEWUSER },
+               { "cgroup", CLONE_NEWCGROUP },
+       };
+       size_t i;
+
+       for (i = 0; i < sizeof(nstypes) / sizeof(nstypes[0]); i++) {
+               if (strcmp(nstypes[i].name, type) == 0)
+                       return nstypes[i].flag;
+       }
+
+       return 0;
+}
+
+/*
+ * Handle one entry of the OCI "namespaces" array by OR-ing the flag for
+ * its "type" into opts.namespace.
+ *
+ * Returns 0 on success, EINVAL if "type" is missing and ENOTSUP if the
+ * entry carries a "path".
+ */
+static int parseOCIlinuxns(struct blob_attr *msg)
+{
+       struct blob_attr *tb[__OCI_LINUX_NAMESPACE_MAX];
+       struct blob_attr *type_attr, *path_attr;
+
+       blobmsg_parse(oci_linux_namespace_policy, __OCI_LINUX_NAMESPACE_MAX, tb, blobmsg_data(msg), blobmsg_len(msg));
+
+       type_attr = tb[OCI_LINUX_NAMESPACE_TYPE];
+       path_attr = tb[OCI_LINUX_NAMESPACE_PATH];
+
+       if (type_attr == NULL)
+               return EINVAL;
+
+       if (path_attr != NULL)
+               return ENOTSUP; /* ToDo */
+
+       opts.namespace |= resolve_nstype(blobmsg_get_string(type_attr));
+
+       return 0;
+}
+
+
+/* Indices into oci_linux_uidgidmap_policy. */
+enum {
+       OCI_LINUX_UIDGIDMAP_CONTAINERID,
+       OCI_LINUX_UIDGIDMAP_HOSTID,
+       OCI_LINUX_UIDGIDMAP_SIZE,
+       __OCI_LINUX_UIDGIDMAP_MAX,
+};
+
+/* blobmsg policy for one uid/gid mapping entry; all three keys are required. */
+static const struct blobmsg_policy oci_linux_uidgidmap_policy[] = {
+       [OCI_LINUX_UIDGIDMAP_CONTAINERID] = { "containerID", BLOBMSG_TYPE_INT32 },
+       [OCI_LINUX_UIDGIDMAP_HOSTID] = { "hostID", BLOBMSG_TYPE_INT32 },
+       [OCI_LINUX_UIDGIDMAP_SIZE] = { "size", BLOBMSG_TYPE_INT32 },
+};
+
+/*
+ * Convert an array of OCI id mappings into one string holding a
+ * "<containerID> <hostID> <size>\n" line per entry and store it in
+ * opts.gidmap (is_gidmap true) or opts.uidmap (is_gidmap false).
+ * The string is heap-allocated.
+ *
+ * Returns 0 on success or when the array is empty, EINVAL if an entry
+ * lacks one of the three keys, ENOMEM on allocation failure. Nothing is
+ * stored in opts on failure.
+ */
+static int parseOCIuidgidmappings(struct blob_attr *msg, bool is_gidmap)
+{
+       /* the ids are read as unsigned 32-bit values, print them as such */
+       const char *map_format = "%u %u %u\n";
+       struct blob_attr *tb[__OCI_LINUX_UIDGIDMAP_MAX];
+       struct blob_attr *cur;
+       int rem, len;
+       int res = 0;
+       char **mappings;
+       char *map, *pos;
+       unsigned int cnt = 0, i;
+       size_t totallen = 0, linelen;
+
+       /* count number of mappings */
+       blobmsg_for_each_attr(cur, msg, rem)
+               cnt++;
+
+       if (!cnt)
+               return 0;
+
+       /* allocate array for mappings */
+       mappings = calloc(1 + cnt, sizeof(char*));
+       if (!mappings)
+               return ENOMEM;
+
+       /* from here on cnt is the number of lines successfully formatted */
+       cnt = 0;
+       blobmsg_for_each_attr(cur, msg, rem) {
+               blobmsg_parse(oci_linux_uidgidmap_policy, __OCI_LINUX_UIDGIDMAP_MAX, tb, blobmsg_data(cur), blobmsg_len(cur));
+
+               if (!tb[OCI_LINUX_UIDGIDMAP_CONTAINERID] ||
+                   !tb[OCI_LINUX_UIDGIDMAP_HOSTID] ||
+                   !tb[OCI_LINUX_UIDGIDMAP_SIZE]) {
+                       res = EINVAL;
+                       goto out;
+               }
+
+               /* write mapping line into allocated string */
+               len = asprintf(&mappings[cnt], map_format,
+                        blobmsg_get_u32(tb[OCI_LINUX_UIDGIDMAP_CONTAINERID]),
+                        blobmsg_get_u32(tb[OCI_LINUX_UIDGIDMAP_HOSTID]),
+                        blobmsg_get_u32(tb[OCI_LINUX_UIDGIDMAP_SIZE]));
+
+               if (len < 0) {
+                       /* pointer content is unspecified after a failed asprintf */
+                       mappings[cnt] = NULL;
+                       res = ENOMEM;
+                       goto out;
+               }
+
+               cnt++;
+               totallen += len;
+       }
+
+       /* allocate combined mapping string: all lines plus terminating NUL */
+       map = calloc(1 + totallen, sizeof(char));
+       if (!map) {
+               res = ENOMEM;
+               goto out;
+       }
+
+       /* concatenate mapping strings into combined string */
+       pos = map;
+       for (i = 0; i < cnt; i++) {
+               linelen = strlen(mappings[i]);
+               memcpy(pos, mappings[i], linelen);
+               pos += linelen;
+       }
+
+       if (is_gidmap)
+               opts.gidmap = map;
+       else
+               opts.uidmap = map;
+
+out:
+       for (i = 0; i < cnt; i++)
+               free(mappings[i]);
+       free(mappings);
+
+       return res;
+}
+
+/* Indices into oci_linux_policy. */
+enum {
+       OCI_LINUX_RESOURCES,
+       OCI_LINUX_SECCOMP,
+       OCI_LINUX_SYSCTL,
+       OCI_LINUX_NAMESPACES,
+       OCI_LINUX_UIDMAPPINGS,
+       OCI_LINUX_GIDMAPPINGS,
+       OCI_LINUX_MASKEDPATHS,
+       OCI_LINUX_READONLYPATHS,
+       OCI_LINUX_ROOTFSPROPAGATION,
+       __OCI_LINUX_MAX,
+};
+
+/* blobmsg policy for the OCI "linux" object. */
+static const struct blobmsg_policy oci_linux_policy[] = {
+       [OCI_LINUX_RESOURCES] = { "resources", BLOBMSG_TYPE_TABLE },
+       [OCI_LINUX_SECCOMP] = { "seccomp", BLOBMSG_TYPE_TABLE },
+       [OCI_LINUX_SYSCTL] = { "sysctl", BLOBMSG_TYPE_TABLE },
+       [OCI_LINUX_NAMESPACES] = { "namespaces", BLOBMSG_TYPE_ARRAY },
+       [OCI_LINUX_UIDMAPPINGS] = { "uidMappings", BLOBMSG_TYPE_ARRAY },
+       [OCI_LINUX_GIDMAPPINGS] = { "gidMappings", BLOBMSG_TYPE_ARRAY },
+       [OCI_LINUX_MASKEDPATHS] = { "maskedPaths", BLOBMSG_TYPE_ARRAY },
+       [OCI_LINUX_READONLYPATHS] = { "readonlyPaths", BLOBMSG_TYPE_ARRAY },
+       [OCI_LINUX_ROOTFSPROPAGATION] = { "rootfsPropagation", BLOBMSG_TYPE_STRING },
+};
+
+/*
+ * Parse the OCI "linux" object: namespaces, uid/gid mappings and seccomp.
+ * The remaining keys of the policy are parsed but not acted upon here.
+ *
+ * Returns 0 on success, the error of the first failing sub-parser, or
+ * EINVAL when parseOCIlinuxseccomp() yields no filter.
+ */
+static int parseOCIlinux(struct blob_attr *msg)
+{
+       struct blob_attr *tb[__OCI_LINUX_MAX];
+       struct blob_attr *cur;
+       int rem;
+       int res = 0;
+
+       blobmsg_parse(oci_linux_policy, __OCI_LINUX_MAX, tb, blobmsg_data(msg), blobmsg_len(msg));
+
+       if (tb[OCI_LINUX_NAMESPACES]) {
+               blobmsg_for_each_attr(cur, tb[OCI_LINUX_NAMESPACES], rem) {
+                       res = parseOCIlinuxns(cur);
+                       if (res)
+                               return res;
+               }
+       }
+
+       if (tb[OCI_LINUX_UIDMAPPINGS]) {
+               /* the uid map must be built from uidMappings, not gidMappings */
+               res = parseOCIuidgidmappings(tb[OCI_LINUX_UIDMAPPINGS], 0);
+               if (res)
+                       return res;
+       }
+
+       if (tb[OCI_LINUX_GIDMAPPINGS]) {
+               res = parseOCIuidgidmappings(tb[OCI_LINUX_GIDMAPPINGS], 1);
+               if (res)
+                       return res;
+       }
+
+       if (tb[OCI_LINUX_SECCOMP]) {
+               opts.ociseccomp = parseOCIlinuxseccomp(tb[OCI_LINUX_SECCOMP]);
+               if (!opts.ociseccomp)
+                       return EINVAL;
+       }
+
+       return 0;
+}
+
+/* Indices into oci_policy. */
+enum {
+       OCI_VERSION,
+       OCI_HOSTNAME,
+       OCI_PROCESS,
+       OCI_ROOT,
+       OCI_MOUNTS,
+       OCI_LINUX,
+       __OCI_MAX,
+};
+
+/* blobmsg policy for the top level of an OCI config.json. */
+static const struct blobmsg_policy oci_policy[] = {
+       [OCI_VERSION] = { "ociVersion", BLOBMSG_TYPE_STRING },
+       [OCI_HOSTNAME] = { "hostname", BLOBMSG_TYPE_STRING },
+       [OCI_PROCESS] = { "process", BLOBMSG_TYPE_TABLE },
+       [OCI_ROOT] = { "root", BLOBMSG_TYPE_TABLE },
+       [OCI_MOUNTS] = { "mounts", BLOBMSG_TYPE_ARRAY },
+       [OCI_LINUX] = { "linux", BLOBMSG_TYPE_TABLE },
+};
+
+/*
+ * Load the OCI runtime spec from jsonfile into ocibuf and fill opts.
+ *
+ * Sections are handled in the order process, root, mounts, linux; the
+ * first problem stops parsing and its code is returned:
+ *   ENOENT  - the file could not be loaded as JSON
+ *   ENOMSG  - "ociVersion" is missing
+ *   ENOTSUP - "ociVersion" does not start with "1.0"
+ *   ENODATA - "process", "root" or "mounts" is missing
+ * or whatever the section parser returned. Returns 0 on success.
+ */
+static int parseOCI(const char *jsonfile)
+{
+       struct blob_attr *tb[__OCI_MAX];
+       struct blob_attr *mnt;
+       const char *version;
+       int left;
+       int err;
+
+       blob_buf_init(&ocibuf, 0);
+       if (!blobmsg_add_json_from_file(&ocibuf, jsonfile))
+               return ENOENT;
+
+       blobmsg_parse(oci_policy, __OCI_MAX, tb, blob_data(ocibuf.head), blob_len(ocibuf.head));
+
+       if (!tb[OCI_VERSION])
+               return ENOMSG;
+
+       /* only the leading "1.0" of the version string is compared */
+       version = blobmsg_get_string(tb[OCI_VERSION]);
+       if (strncmp("1.0", version, 3) != 0) {
+               ERROR("unsupported ociVersion %s\n", version);
+               return ENOTSUP;
+       }
+
+       if (tb[OCI_HOSTNAME])
+               opts.hostname = blobmsg_get_string(tb[OCI_HOSTNAME]);
+
+       if (!tb[OCI_PROCESS])
+               return ENODATA;
+
+       err = parseOCIprocess(tb[OCI_PROCESS]);
+       if (err)
+               return err;
+
+       if (!tb[OCI_ROOT])
+               return ENODATA;
+
+       err = parseOCIroot(jsonfile, tb[OCI_ROOT]);
+       if (err)
+               return err;
+
+       if (!tb[OCI_MOUNTS])
+               return ENODATA;
+
+       blobmsg_for_each_attr(mnt, tb[OCI_MOUNTS], left) {
+               err = parseOCImount(mnt);
+               if (err)
+                       return err;
+       }
+
+       if (tb[OCI_LINUX])
+               return parseOCIlinux(tb[OCI_LINUX]);
+
+       return 0;
+}
+
 int main(int argc, char **argv)
 {
        sigset_t sigmask;
        uid_t uid = getuid();
-       char log[] = "/dev/log";
-       char ubus[] = "/var/run/ubus.sock";
+       const char log[] = "/dev/log";
+       const char ubus[] = "/var/run/ubus.sock";
+       char *jsonfile = NULL;
        int ch, i;
        int pipes[4];
        char sig_buf[1];
@@ -802,19 +1355,32 @@ int main(int argc, char **argv)
                case 'y':
                        opts.console = 1;
                        break;
+               case 'J':
+                       asprintf(&jsonfile, "%s/config.json", optarg);
+                       break;
                }
        }
 
        if (opts.namespace)
                opts.namespace |= CLONE_NEWIPC | CLONE_NEWPID;
 
+       if (jsonfile) {
+               int ocires;
+               ocires = parseOCI(jsonfile);
+               free(jsonfile);
+               if (ocires) {
+                       ERROR("parsing of OCI JSON spec has failed: %s (%d)\n", strerror(ocires), ocires);
+                       return ocires;
+               }
+       }
+
        if (opts.tmpoverlaysize && strlen(opts.tmpoverlaysize) > 8) {
                ERROR("size parameter too long: \"%s\"\n", opts.tmpoverlaysize);
                return -1;
        }
 
        /* no <binary> param found */
-       if (argc - optind < 1) {
+       if (!jsonfile && (argc - optind < 1)) {
                usage();
                return EXIT_FAILURE;
        }
@@ -825,12 +1391,14 @@ int main(int argc, char **argv)
        }
        DEBUG("Using namespaces(0x%08x), capabilities(%d), seccomp(%d)\n",
                opts.namespace,
-               opts.capabilities != 0,
-               opts.seccomp != 0);
-
-       opts.jail_argv = &argv[optind];
+               opts.capabilities != 0 || opts.capset.apply,
+               opts.seccomp != 0 || opts.ociseccomp != 0);
 
-       get_jail_user(&opts.pw_uid, &opts.pw_gid, &opts.gr_gid);
+       if (!jsonfile) {
+               opts.jail_argv = &argv[optind];
+               if (opts.namespace & CLONE_NEWUSER)
+                       get_jail_user(&opts.pw_uid, &opts.pw_gid, &opts.gr_gid);
+       }
 
        if (!opts.extroot) {
                if (opts.namespace && add_path_and_deps(*opts.jail_argv, 1, -1, 0)) {
@@ -908,17 +1476,23 @@ int main(int argc, char **argv)
                }
                close(pipes[0]);
                if (opts.namespace & CLONE_NEWUSER) {
-                       bool has_gr = (opts.gr_gid != -1);
-                       if (write_setgroups(jail_process.pid, false)) {
+                       if (write_setgroups(jail_process.pid, true)) {
                                ERROR("can't write setgroups\n");
                                return -1;
                        }
-                       if (opts.pw_uid != -1) {
-                               write_uid_gid_map(jail_process.pid, 0, opts.pw_uid);
-                               write_uid_gid_map(jail_process.pid, 1, has_gr?opts.gr_gid:opts.pw_gid);
+                       if (!opts.uidmap) {
+                               bool has_gr = (opts.gr_gid != -1);
+                               if (opts.pw_uid != -1) {
+                                       write_single_uid_gid_map(jail_process.pid, 0, opts.pw_uid);
+                                       write_single_uid_gid_map(jail_process.pid, 1, has_gr?opts.gr_gid:opts.pw_gid);
+                               } else {
+                                       write_single_uid_gid_map(jail_process.pid, 0, 65534);
+                                       write_single_uid_gid_map(jail_process.pid, 1, has_gr?opts.gr_gid:65534);
+                               }
                        } else {
-                               write_uid_gid_map(jail_process.pid, 0, 65534);
-                               write_uid_gid_map(jail_process.pid, 1, has_gr?opts.gr_gid:65534);
+                               write_uid_gid_map(jail_process.pid, 0, opts.uidmap);
+                               if (opts.gidmap)
+                                       write_uid_gid_map(jail_process.pid, 1, opts.gidmap);
                        }
                }
 
index fd6b3e257c385c4e3b03ccb9ed89fe67fec70ff4..d7903f10cca40ef6b670a494bca7bf3a78644486 100644 (file)
 #define SECCOMP_RET_TRAP       0x00030000U /* disallow and force a SIGSYS */
 #define SECCOMP_RET_ERRNO      0x00050000U /* returns an errno */
 #define SECCOMP_RET_LOG                0x00070000U
+#define SECCOMP_RET_LOGALLOW   0x7ffc0000U
 #define SECCOMP_RET_TRACE      0x7ff00000U /* pass to a tracer or disallow */
 #define SECCOMP_RET_ALLOW      0x7fff0000U /* allow */
+#define SECCOMP_RET_KILLPROCESS        0x80000000U
 #define SECCOMP_RET_ERROR(x)   (SECCOMP_RET_ERRNO | ((x) & 0x0000ffffU))
 #define SECCOMP_RET_LOGGER(x)  (SECCOMP_RET_LOG | ((x) & 0x0000ffffU))
 
diff --git a/jail/seccomp-oci.c b/jail/seccomp-oci.c
new file mode 100644 (file)
index 0000000..06fa2d1
--- /dev/null
@@ -0,0 +1,272 @@
+/*
+ * parse and setup OCI seccomp filter
+ * Copyright (c) 2020 Daniel Golle <daniel@makrotopia.org>
+ * seccomp example with syscall reporting
+ * Copyright (c) 2012 The Chromium OS Authors <chromium-os-dev@chromium.org>
+ * Authors:
+ *  Kees Cook <keescook@chromium.org>
+ *  Will Drewry <wad@chromium.org>
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+#define _GNU_SOURCE 1
+#include <stddef.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include <libubox/utils.h>
+#include <libubox/blobmsg.h>
+#include <libubox/blobmsg_json.h>
+
+#include "log.h"
+#include "seccomp-bpf.h"
+#include "seccomp-oci.h"
+#include "../syscall-names.h"
+#include "seccomp-syscalls-helpers.h"
+
+/*
+ * Map a seccomp action name (SCMP_ACT_*) to the corresponding
+ * SECCOMP_RET_* filter return value. A name that is not in the table is
+ * logged and treated as SECCOMP_RET_KILL.
+ */
+static uint32_t resolve_action(char *actname)
+{
+       static const struct {
+               const char *name;
+               uint32_t ret;
+       } actions[] = {
+               { "SCMP_ACT_KILL", SECCOMP_RET_KILL },
+               { "SCMP_ACT_KILL_PROCESS", SECCOMP_RET_KILLPROCESS },
+               { "SCMP_ACT_TRAP", SECCOMP_RET_TRAP },
+               { "SCMP_ACT_ERRNO", SECCOMP_RET_ERRNO },
+               { "SCMP_ACT_ERROR", SECCOMP_RET_ERRNO },
+               { "SCMP_ACT_TRACE", SECCOMP_RET_TRACE },
+               { "SCMP_ACT_ALLOW", SECCOMP_RET_ALLOW },
+               { "SCMP_ACT_LOG", SECCOMP_RET_LOGALLOW },
+       };
+       size_t i;
+
+       for (i = 0; i < sizeof(actions) / sizeof(actions[0]); i++)
+               if (!strcmp(actname, actions[i].name))
+                       return actions[i].ret;
+
+       ERROR("unknown seccomp action %s\n", actname);
+       return SECCOMP_RET_KILL;
+}
+
+/*
+ * Map a seccomp architecture name (SCMP_ARCH_*) to its AUDIT_ARCH_*
+ * constant. Returns 0 when the name is unknown (which is logged) or when
+ * the architecture is recognised but not usable (SCMP_ARCH_X32).
+ */
+static uint32_t resolve_architecture(char *archname)
+{
+       static const struct {
+               const char *name;
+               uint32_t arch;
+       } archs[] = {
+               { "SCMP_ARCH_X86", AUDIT_ARCH_I386 },
+               { "SCMP_ARCH_X86_64", AUDIT_ARCH_X86_64 },
+               /*
+                * would be AUDIT_ARCH_X86_64, but
+                * 32-bit userland on 64-bit kernel is not supported yet
+                */
+               { "SCMP_ARCH_X32", 0 },
+               { "SCMP_ARCH_ARM", AUDIT_ARCH_ARM },
+               { "SCMP_ARCH_AARCH64", AUDIT_ARCH_AARCH64 },
+               { "SCMP_ARCH_MIPS", AUDIT_ARCH_MIPS },
+               { "SCMP_ARCH_MIPS64", AUDIT_ARCH_MIPS64 },
+               { "SCMP_ARCH_MIPS64N32", AUDIT_ARCH_MIPS64N32 },
+               { "SCMP_ARCH_MIPSEL", AUDIT_ARCH_MIPSEL },
+               { "SCMP_ARCH_MIPSEL64", AUDIT_ARCH_MIPSEL64 },
+               { "SCMP_ARCH_MIPSEL64N32", AUDIT_ARCH_MIPSEL64N32 },
+               { "SCMP_ARCH_PPC", AUDIT_ARCH_PPC },
+               { "SCMP_ARCH_PPC64", AUDIT_ARCH_PPC64 },
+               { "SCMP_ARCH_PPC64LE", AUDIT_ARCH_PPC64LE },
+               { "SCMP_ARCH_S390", AUDIT_ARCH_S390 },
+               { "SCMP_ARCH_S390X", AUDIT_ARCH_S390X },
+               { "SCMP_ARCH_PARISC", AUDIT_ARCH_PARISC },
+               { "SCMP_ARCH_PARISC64", AUDIT_ARCH_PARISC64 },
+       };
+       size_t i;
+
+       for (i = 0; i < sizeof(archs) / sizeof(archs[0]); i++)
+               if (!strcmp(archname, archs[i].name))
+                       return archs[i].arch;
+
+       ERROR("unknown seccomp architecture %s\n", archname);
+       return 0;
+}
+
+/*
+ * Keys looked up in the seccomp table passed to parseOCIlinuxseccomp(),
+ * together with the blobmsg type each one must have to be accepted.
+ */
+enum {
+       OCI_LINUX_SECCOMP_DEFAULTACTION,
+       OCI_LINUX_SECCOMP_ARCHITECTURES,
+       OCI_LINUX_SECCOMP_FLAGS,
+       OCI_LINUX_SECCOMP_SYSCALLS,
+       __OCI_LINUX_SECCOMP_MAX,
+};
+
+static const struct blobmsg_policy oci_linux_seccomp_policy[] = {
+       [OCI_LINUX_SECCOMP_DEFAULTACTION] = { "defaultAction", BLOBMSG_TYPE_STRING },
+       [OCI_LINUX_SECCOMP_ARCHITECTURES] = { "architectures", BLOBMSG_TYPE_ARRAY },
+       [OCI_LINUX_SECCOMP_FLAGS] = { "flags", BLOBMSG_TYPE_ARRAY },
+       [OCI_LINUX_SECCOMP_SYSCALLS] = { "syscalls", BLOBMSG_TYPE_ARRAY },
+};
+
+/*
+ * Keys of one element of the "syscalls" array. The initialisers below are
+ * listed in a different order than the enum; this is harmless because
+ * each one is a designated initialiser.
+ */
+enum {
+       OCI_LINUX_SECCOMP_SYSCALLS_NAMES,
+       OCI_LINUX_SECCOMP_SYSCALLS_ACTION,
+       OCI_LINUX_SECCOMP_SYSCALLS_ERRNORET,
+       OCI_LINUX_SECCOMP_SYSCALLS_ARGS,
+       __OCI_LINUX_SECCOMP_SYSCALLS_MAX
+};
+
+static const struct blobmsg_policy oci_linux_seccomp_syscalls_policy[] = {
+       [OCI_LINUX_SECCOMP_SYSCALLS_NAMES] = { "names", BLOBMSG_TYPE_ARRAY },
+       [OCI_LINUX_SECCOMP_SYSCALLS_ERRNORET] = { "errnoRet", BLOBMSG_TYPE_INT32 },
+       [OCI_LINUX_SECCOMP_SYSCALLS_ARGS] = { "args", BLOBMSG_TYPE_ARRAY },
+       [OCI_LINUX_SECCOMP_SYSCALLS_ACTION] = { "action", BLOBMSG_TYPE_STRING },
+};
+
+/*
+ * Keys of one element of a syscall rule's "args" array. These are parsed
+ * by parseOCIlinuxseccomp() but not yet turned into filter instructions.
+ */
+enum {
+       OCI_LINUX_SECCOMP_SYSCALLS_ARGS_INDEX,
+       OCI_LINUX_SECCOMP_SYSCALLS_ARGS_VALUE,
+       OCI_LINUX_SECCOMP_SYSCALLS_ARGS_VALUETWO,
+       OCI_LINUX_SECCOMP_SYSCALLS_ARGS_OP,
+       __OCI_LINUX_SECCOMP_SYSCALLS_ARGS_MAX
+};
+
+static const struct blobmsg_policy oci_linux_seccomp_syscalls_args_policy[] = {
+       [OCI_LINUX_SECCOMP_SYSCALLS_ARGS_INDEX] = { "index", BLOBMSG_TYPE_INT32 },
+       [OCI_LINUX_SECCOMP_SYSCALLS_ARGS_VALUE] = { "value", BLOBMSG_TYPE_INT64 },
+       [OCI_LINUX_SECCOMP_SYSCALLS_ARGS_VALUETWO] = { "valueTwo", BLOBMSG_TYPE_INT64 },
+       [OCI_LINUX_SECCOMP_SYSCALLS_ARGS_OP] = { "op", BLOBMSG_TYPE_STRING },
+};
+
+/*
+ * Build a seccomp BPF program from an OCI seccomp table.
+ *
+ * Generated filter layout:
+ *   - load the architecture and kill unless it equals ARCH_NR
+ *   - load the syscall number
+ *   - per listed syscall name: compare + return that rule's action
+ *   - final instruction: return the default action
+ *
+ * An ERRNO action without "errnoRet" (per rule or as default action) is
+ * turned into EPERM. "args" entries are parsed but not yet evaluated.
+ *
+ * Returns a heap-allocated program (the caller owns both prog and
+ * prog->filter), or NULL on any error: missing default action,
+ * architecture mismatch, rule without action, "errnoRet" on a non-ERRNO
+ * action, unknown syscall name, oversized filter or allocation failure.
+ */
+struct sock_fprog *parseOCIlinuxseccomp(struct blob_attr *msg)
+{
+       struct blob_attr *tb[__OCI_LINUX_SECCOMP_MAX];
+       struct blob_attr *tbn[__OCI_LINUX_SECCOMP_SYSCALLS_MAX];
+       struct blob_attr *tba[__OCI_LINUX_SECCOMP_SYSCALLS_ARGS_MAX];
+       struct blob_attr *cur, *curn, *curarg;
+       int rem, remn, remargs, sc;
+       struct sock_filter *filter;
+       struct sock_fprog *prog;
+       /* 4 fixed header instructions + 1 trailing default return */
+       int sz = 5, idx = 0;
+       uint32_t default_policy = 0;
+       /* stays 0 if "architectures" is absent or has no usable entry */
+       uint32_t seccomp_arch = 0;
+
+       blobmsg_parse(oci_linux_seccomp_policy, __OCI_LINUX_SECCOMP_MAX, tb, blobmsg_data(msg), blobmsg_len(msg));
+
+       if (!tb[OCI_LINUX_SECCOMP_DEFAULTACTION]) {
+               ERROR("seccomp: no default action set\n");
+               return NULL;
+       }
+
+       default_policy = resolve_action(blobmsg_get_string(tb[OCI_LINUX_SECCOMP_DEFAULTACTION]));
+       /*
+        * A bare SECCOMP_RET_ERRNO carries errno 0, which would make blocked
+        * syscalls look successful; use EPERM as for per-syscall rules.
+        */
+       if (default_policy == SECCOMP_RET_ERRNO)
+               default_policy = SECCOMP_RET_ERROR(EPERM);
+
+       /* verify architecture while ignoring the x86_64 anomaly for now */
+       blobmsg_for_each_attr(cur, tb[OCI_LINUX_SECCOMP_ARCHITECTURES], rem) {
+               seccomp_arch = resolve_architecture(blobmsg_get_string(cur));
+               /* take the first useful arch for now */
+               if (seccomp_arch)
+                       break;
+       }
+
+       if (ARCH_NR != seccomp_arch) {
+               ERROR("seccomp architecture doesn't match system\n");
+               return NULL;
+       }
+
+       /* first pass: count instructions so the filter can be allocated */
+       blobmsg_for_each_attr(cur, tb[OCI_LINUX_SECCOMP_SYSCALLS], rem) {
+               blobmsg_parse(oci_linux_seccomp_syscalls_policy, __OCI_LINUX_SECCOMP_SYSCALLS_MAX, tbn, blobmsg_data(cur), blobmsg_len(cur));
+               blobmsg_for_each_attr(curn, tbn[OCI_LINUX_SECCOMP_SYSCALLS_NAMES], remn)
+                       sz += 2;
+
+               if (tbn[OCI_LINUX_SECCOMP_SYSCALLS_ARGS])
+                       blobmsg_for_each_attr(curarg, tbn[OCI_LINUX_SECCOMP_SYSCALLS_ARGS], remargs)
+                               sz++;
+       }
+
+       prog = malloc(sizeof(struct sock_fprog));
+       if (!prog)
+               return NULL;
+
+       filter = calloc(sz, sizeof(struct sock_filter));
+       if (!filter) {
+               ERROR("failed to allocate memory for seccomp filter\n");
+               goto errout2;
+       }
+
+       /* validate arch */
+       set_filter(&filter[idx++], BPF_LD + BPF_W + BPF_ABS, 0, 0, arch_nr);
+       set_filter(&filter[idx++], BPF_JMP + BPF_JEQ + BPF_K, 1, 0, ARCH_NR);
+       set_filter(&filter[idx++], BPF_RET + BPF_K, 0, 0, SECCOMP_RET_KILL);
+
+       /* get syscall */
+       set_filter(&filter[idx++], BPF_LD + BPF_W + BPF_ABS, 0, 0, syscall_nr);
+
+       /* second pass: emit one compare/return pair per syscall name */
+       blobmsg_for_each_attr(cur, tb[OCI_LINUX_SECCOMP_SYSCALLS], rem) {
+               uint32_t action;
+               blobmsg_parse(oci_linux_seccomp_syscalls_policy, __OCI_LINUX_SECCOMP_SYSCALLS_MAX, tbn, blobmsg_data(cur), blobmsg_len(cur));
+
+               /* resolve_action() cannot cope with a missing action string */
+               if (!tbn[OCI_LINUX_SECCOMP_SYSCALLS_ACTION]) {
+                       ERROR("seccomp: syscall rule without action\n");
+                       goto errout1;
+               }
+
+               action = resolve_action(blobmsg_get_string(tbn[OCI_LINUX_SECCOMP_SYSCALLS_ACTION]));
+               if (tbn[OCI_LINUX_SECCOMP_SYSCALLS_ERRNORET]) {
+                       if (action != SECCOMP_RET_ERRNO) {
+                               ERROR("seccomp: errnoRet given for non-ERRNO action\n");
+                               goto errout1;
+                       }
+
+                       action = SECCOMP_RET_ERROR(blobmsg_get_u32(tbn[OCI_LINUX_SECCOMP_SYSCALLS_ERRNORET]));
+               } else if (action == SECCOMP_RET_ERRNO)
+                       action = SECCOMP_RET_ERROR(EPERM);
+
+               blobmsg_for_each_attr(curn, tbn[OCI_LINUX_SECCOMP_SYSCALLS_NAMES], remn) {
+                       sc = find_syscall(blobmsg_get_string(curn));
+                       if (sc == -1) {
+                               ERROR("unknown syscall '%s'\n", blobmsg_get_string(curn));
+                               goto errout1;
+                       }
+
+                       /* add rule to filter: skip the return unless the number matches */
+                       set_filter(&filter[idx++], BPF_JMP + BPF_JEQ + BPF_K, 0, 1, sc);
+                       set_filter(&filter[idx++], BPF_RET + BPF_K, 0, 0, action);
+
+               }
+               blobmsg_for_each_attr(curarg, tbn[OCI_LINUX_SECCOMP_SYSCALLS_ARGS], remargs) {
+                       blobmsg_parse(oci_linux_seccomp_syscalls_args_policy, __OCI_LINUX_SECCOMP_SYSCALLS_ARGS_MAX, tba, blobmsg_data(curarg), blobmsg_len(curarg));
+                       /* ToDo: process args */
+               }
+       }
+
+       /* prog->len is an unsigned short; refuse instead of silently truncating */
+       if (idx + 1 > BPF_MAXINSNS) {
+               ERROR("seccomp: filter exceeds %d instructions\n", BPF_MAXINSNS);
+               goto errout1;
+       }
+
+       set_filter(&filter[idx], BPF_RET + BPF_K, 0, 0, default_policy);
+
+       prog->len = (unsigned short) idx + 1;
+       prog->filter = filter;
+
+       return prog;
+
+errout1:
+       /* prog->filter has not been assigned yet, so free the local array */
+       free(filter);
+errout2:
+       free(prog);
+       return NULL;
+}
+
+
+/*
+ * Install a filter built by parseOCIlinuxseccomp() on the calling
+ * process: set no_new_privs, then load the program in
+ * SECCOMP_MODE_FILTER.
+ *
+ * Takes ownership of prog: both prog->filter and prog are released on
+ * every path, so the caller must not use prog afterwards.
+ *
+ * Returns 0 on success or the errno of the failing prctl() call.
+ */
+int applyOCIlinuxseccomp(struct sock_fprog *prog)
+{
+       int err = 0;
+
+       /* capture errno right away; ERROR() and free() may overwrite it */
+       if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
+               err = errno;
+               ERROR("prctl(PR_SET_NO_NEW_PRIVS) failed: %m\n");
+       } else if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, prog)) {
+               err = errno;
+               ERROR("prctl(PR_SET_SECCOMP) failed: %m\n");
+       }
+
+       /* the kernel keeps its own copy of an installed filter */
+       free(prog->filter);
+       free(prog);
+
+       return err;
+}
diff --git a/jail/seccomp-oci.h b/jail/seccomp-oci.h
new file mode 100644 (file)
index 0000000..f8e0b6a
--- /dev/null
@@ -0,0 +1,33 @@
+/*
+ * Copyright (C) 2020 Daniel Golle <daniel@makrotopia.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License version 2.1
+ * as published by the Free Software Foundation
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+#ifndef _JAIL_SECCOMP_OCI_H_
+#define _JAIL_SECCOMP_OCI_H_
+
+#include <linux/filter.h>
+
+#ifdef SECCOMP_SUPPORT
+struct sock_fprog *parseOCIlinuxseccomp(struct blob_attr *msg);
+int applyOCIlinuxseccomp(struct sock_fprog *prog);
+#else
+
+
+/*
+ * Stub used when built without SECCOMP_SUPPORT: no filter is produced.
+ * static inline so that including this header from several source files
+ * does not create duplicate external definitions.
+ */
+static inline struct sock_fprog *parseOCIlinuxseccomp(struct blob_attr *msg) {
+       return NULL;
+}
+
+/*
+ * Stub used when built without SECCOMP_SUPPORT: always reports ENOTSUP.
+ * static inline so that including this header from several source files
+ * does not create duplicate external definitions.
+ */
+static inline int applyOCIlinuxseccomp(struct sock_fprog *prog) {
+       return ENOTSUP;
+}
+#endif
+
+#endif
diff --git a/jail/seccomp-syscalls-helpers.h b/jail/seccomp-syscalls-helpers.h
new file mode 100644 (file)
index 0000000..f86e468
--- /dev/null
@@ -0,0 +1,37 @@
+/*
+ * Copyright (C) 2015 John Crispin <blogic@openwrt.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License version 2.1
+ * as published by the Free Software Foundation
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+#ifndef _JAIL_SECCOMP_HELPERS_H_
+#define _JAIL_SECCOMP_HELPERS_H_
+
+/*
+ * Look up a syscall by name in the generated syscall table.
+ * Returns the syscall number, or -1 if no entry has that name.
+ */
+static int find_syscall(const char *name)
+{
+       int idx = 0;
+
+       while (idx < SYSCALL_COUNT) {
+               int nr = syscall_index_to_number(idx++);
+               const char *known = syscall_name(nr);
+
+               /* entries without a name are skipped */
+               if (known && strcmp(known, name) == 0)
+                       return nr;
+       }
+
+       return -1;
+}
+
+/*
+ * Fill one BPF instruction slot: opcode, jump offsets for the true and
+ * false case, and the immediate operand k.
+ */
+static void set_filter(struct sock_filter *filter, __u16 code, __u8 jt, __u8 jf, __u32 k)
+{
+       *filter = (struct sock_filter) {
+               .code = code,
+               .jt = jt,
+               .jf = jf,
+               .k = k,
+       };
+}
+
+#endif
index a00250c7342f99fb11c85b79095cab0fc295c6f1..dac4245fc779be805ca21c9547d6efb76595e08c 100644 (file)
 #include "seccomp-bpf.h"
 #include "seccomp.h"
 #include "../syscall-names.h"
-
-static int find_syscall(const char *name)
-{
-       int i;
-
-       for (i = 0; i < SYSCALL_COUNT; i++) {
-               int sc = syscall_index_to_number(i);
-               if (syscall_name(sc) && !strcmp(syscall_name(sc), name))
-                       return sc;
-       }
-
-       return -1;
-}
-
-static void set_filter(struct sock_filter *filter, __u16 code, __u8 jt, __u8 jf, __u32 k)
-{
-       filter->code = code;
-       filter->jt = jt;
-       filter->jf = jf;
-       filter->k = k;
-}
+#include "seccomp-syscalls-helpers.h"
 
 int install_syscall_filter(const char *argv, const char *file)
 {
index 142208a41e84d5f467829575ce824f592bbdb62b..c65da5051e936a47046582144bb46db0519dd7f6 100644 (file)
@@ -65,6 +65,7 @@ enum {
        INSTANCE_ATTR_EXTROOT,
        INSTANCE_ATTR_OVERLAYDIR,
        INSTANCE_ATTR_TMPOVERLAYSIZE,
+       INSTANCE_ATTR_BUNDLE,
        __INSTANCE_ATTR_MAX
 };
 
@@ -95,6 +96,7 @@ static const struct blobmsg_policy instance_attr[__INSTANCE_ATTR_MAX] = {
        [INSTANCE_ATTR_EXTROOT] = { "extroot", BLOBMSG_TYPE_STRING },
        [INSTANCE_ATTR_OVERLAYDIR] = { "overlaydir", BLOBMSG_TYPE_STRING },
        [INSTANCE_ATTR_TMPOVERLAYSIZE] = { "tmpoverlaysize", BLOBMSG_TYPE_STRING },
+       [INSTANCE_ATTR_BUNDLE] = { "bundle", BLOBMSG_TYPE_STRING },
 };
 
 enum {
@@ -294,6 +296,11 @@ jail_run(struct service_instance *in, char **argv)
                argv[argc++] = in->tmpoverlaysize;
        }
 
+       if (in->bundle) {
+               argv[argc++] = "-J";
+               argv[argc++] = in->bundle;
+       }
+
        if (in->require_jail)
                argv[argc++] = "-E";
 
@@ -484,7 +491,7 @@ instance_start(struct service_instance *in)
                return;
        }
 
-       if (!in->command) {
+       if (!in->bundle && !in->command) {
                LOG("Not starting instance %s::%s, command not set\n", in->srv->name, in->name);
                return;
        }
@@ -802,7 +809,8 @@ instance_config_changed(struct service_instance *in, struct service_instance *in
                return true;
        if (in->respawn_timeout != in_new->respawn_timeout)
                return true;
-
+       if (in->bundle && in_new->bundle && strcmp(in->bundle, in_new->bundle))
+               return true;
        if ((!in->seccomp && in_new->seccomp) ||
            (in->seccomp && !in_new->seccomp) ||
            (in->seccomp && in_new->seccomp && strcmp(in->seccomp, in_new->seccomp)))
@@ -996,6 +1004,9 @@ instance_jail_parse(struct service_instance *in, struct blob_attr *attr)
        if (in->no_new_privs)
                jail->argc++;
 
+       if (in->bundle)
+               jail->argc += 2;
+
        return true;
 }
 
@@ -1035,8 +1046,8 @@ instance_config_parse(struct service_instance *in)
        blobmsg_parse(instance_attr, __INSTANCE_ATTR_MAX, tb,
                blobmsg_data(in->config), blobmsg_data_len(in->config));
 
-       if (!instance_config_parse_command(in, tb))
-               return false;
+       if (!tb[INSTANCE_ATTR_BUNDLE] && !instance_config_parse_command(in, tb))
+                       return false;
 
        if (tb[INSTANCE_ATTR_TERMTIMEOUT])
                in->term_timeout = blobmsg_get_u32(tb[INSTANCE_ATTR_TERMTIMEOUT]);
@@ -1113,6 +1124,9 @@ instance_config_parse(struct service_instance *in)
        if (tb[INSTANCE_ATTR_TMPOVERLAYSIZE])
                in->tmpoverlaysize = strdup(blobmsg_get_string(tb[INSTANCE_ATTR_TMPOVERLAYSIZE]));
 
+       if (tb[INSTANCE_ATTR_BUNDLE])
+               in->bundle = strdup(blobmsg_get_string(tb[INSTANCE_ATTR_BUNDLE]));
+
        if (tb[INSTANCE_ATTR_PIDFILE]) {
                char *pidfile = blobmsg_get_string(tb[INSTANCE_ATTR_PIDFILE]);
                if (pidfile)
@@ -1264,6 +1278,7 @@ instance_free(struct service_instance *in)
        free(in->extroot);
        free(in->overlaydir);
        free(in->tmpoverlaysize);
+       free(in->bundle);
        free(in->jail.name);
        free(in->jail.hostname);
        free(in->seccomp);
@@ -1324,6 +1339,8 @@ void instance_dump(struct blob_buf *b, struct service_instance *in, int verbose)
                blobmsg_add_u32(b, "pid", in->proc.pid);
        if (in->command)
                blobmsg_add_blob(b, in->command);
+       if (in->bundle)
+               blobmsg_add_string(b, "bundle", in->bundle);
        blobmsg_add_u32(b, "term_timeout", in->term_timeout);
        if (!in->proc.pending)
                blobmsg_add_u32(b, "exit_code", in->exit_code);
@@ -1393,17 +1410,19 @@ void instance_dump(struct blob_buf *b, struct service_instance *in, int verbose)
                void *r = blobmsg_open_table(b, "jail");
                if (in->jail.name)
                        blobmsg_add_string(b, "name", in->jail.name);
-               if (in->jail.hostname)
-                       blobmsg_add_string(b, "hostname", in->jail.hostname);
-
-               blobmsg_add_u8(b, "procfs", in->jail.procfs);
-               blobmsg_add_u8(b, "sysfs", in->jail.sysfs);
-               blobmsg_add_u8(b, "ubus", in->jail.ubus);
-               blobmsg_add_u8(b, "log", in->jail.log);
-               blobmsg_add_u8(b, "ronly", in->jail.ronly);
-               blobmsg_add_u8(b, "netns", in->jail.netns);
-               blobmsg_add_u8(b, "userns", in->jail.userns);
-               blobmsg_add_u8(b, "cgroupsns", in->jail.cgroupsns);
+               if (!in->bundle) {
+                       if (in->jail.hostname)
+                               blobmsg_add_string(b, "hostname", in->jail.hostname);
+
+                       blobmsg_add_u8(b, "procfs", in->jail.procfs);
+                       blobmsg_add_u8(b, "sysfs", in->jail.sysfs);
+                       blobmsg_add_u8(b, "ubus", in->jail.ubus);
+                       blobmsg_add_u8(b, "log", in->jail.log);
+                       blobmsg_add_u8(b, "ronly", in->jail.ronly);
+                       blobmsg_add_u8(b, "netns", in->jail.netns);
+                       blobmsg_add_u8(b, "userns", in->jail.userns);
+                       blobmsg_add_u8(b, "cgroupsns", in->jail.cgroupsns);
+               }
                blobmsg_add_u8(b, "console", (in->console.fd.fd > -1));
                blobmsg_close_table(b, r);
                if (!avl_is_empty(&in->jail.mount.avl)) {
index 4400cd4cd02888e3c4eaabf61442a1faef4d9a7d..e8ee15caa600811c9728f731e48317bf30c2fc63 100644 (file)
@@ -70,6 +70,7 @@ struct service_instance {
        char *extroot;
        char *overlaydir;
        char *tmpoverlaysize;
+       char *bundle;
        int syslog_facility;
        int exit_code;