jail: don't ignore return value of write()
[project/procd.git] / jail / jail.c
index 8c1b5630cd6ac8ec3dba3312d7673b2870589071..92ced457dd3951388bc1141e677da68621bcc2bb 100644 (file)
 #include "seccomp-oci.h"
 #include "cgroups.h"
 
-#include <libubox/utils.h>
 #include <libubox/blobmsg.h>
 #include <libubox/blobmsg_json.h>
 #include <libubox/list.h>
 #include <libubox/vlist.h>
 #include <libubox/uloop.h>
+#include <libubox/utils.h>
 #include <libubus.h>
 
 #ifndef CLONE_NEWCGROUP
@@ -132,6 +132,7 @@ static struct {
        int pw_uid;
        int pw_gid;
        int gr_gid;
+       int root_map_uid;
        gid_t *additional_gids;
        size_t num_additional_gids;
        mode_t umask;
@@ -248,6 +249,7 @@ static void free_opts(bool parent) {
 
        free_library_search();
        mount_free();
+       cgroups_free();
 
        /* we need to keep argv, envp and seccomp filter in child */
        if (parent) { /* parent-only */
@@ -258,9 +260,6 @@ static void free_opts(bool parent) {
 
                free_oci_envp(opts.jail_argv);
                free_oci_envp(opts.envp);
-       } else { /* child-only */
-               if (opts.ocibundle)
-                       cgroups_free();
        }
 
        free_rlimits();
@@ -271,6 +270,8 @@ static void free_opts(bool parent) {
        free(opts.uidmap);
        free(opts.gidmap);
        free(opts.annotations);
+       free(opts.extroot);
+       free(opts.overlaydir);
        free_hooklist(opts.hooks.createRuntime);
        free_hooklist(opts.hooks.createContainer);
        free_hooklist(opts.hooks.startContainer);
@@ -466,9 +467,6 @@ static void run_hooklist(void)
        if (!((unsigned long)s.st_mode & (S_IXUSR | S_IXGRP | S_IXOTH)))
                hook_process_handler(&hook_process, EPERM);
 
-       if (!((unsigned long)s.st_mode & (S_IRUSR | S_IRGRP | S_IROTH)))
-               hook_process_handler(&hook_process, EPERM);
-
        hook_running = 1;
        hook_process.pid = fork();
        if (hook_process.pid == 0) {
@@ -518,8 +516,7 @@ static int apply_sysctl(const char *jail_root)
        if (!opts.sysctl)
                return 0;
 
-       asprintf(&procdir, "%s/proc", jail_root);
-       if (!procdir)
+       if (asprintf(&procdir, "%s/proc", jail_root) < 0)
                return ENOMEM;
 
        mkdir(procdir, 0700);
@@ -529,8 +526,7 @@ static int apply_sysctl(const char *jail_root)
        cur = opts.sysctl;
 
        while (*cur) {
-               asprintf(&fname, "%s/sys/%s", procdir, (*cur)->entry);
-               if (!fname)
+               if (asprintf(&fname, "%s/sys/%s", procdir, (*cur)->entry) < 0)
                        return ENOMEM;
 
                DEBUG("sysctl: writing '%s' to %s\n", (*cur)->value, fname);
@@ -540,7 +536,10 @@ static int apply_sysctl(const char *jail_root)
                        ERROR("sysctl: can't open %s\n", fname);
                        return errno;
                }
-               write(f, (*cur)->value, strlen((*cur)->value));
+               if (write(f, (*cur)->value, strlen((*cur)->value)) < 0) {
+                       ERROR("sysctl: write to %s\n", fname);
+                       return errno;
+               }
 
                free(fname);
                close(f);
@@ -611,11 +610,11 @@ only_default_devices:
        }
 
        /* Dev symbolic links as defined in OCI spec */
-       symlink("/dev/pts/ptmx", "/dev/ptmx");
-       symlink("/proc/self/fd", "/dev/fd");
-       symlink("/proc/self/fd/0", "/dev/stdin");
-       symlink("/proc/self/fd/1", "/dev/stdout");
-       symlink("/proc/self/fd/2", "/dev/stderr");
+       (void) symlink("/dev/pts/ptmx", "/dev/ptmx");
+       (void) symlink("/proc/self/fd", "/dev/fd");
+       (void) symlink("/proc/self/fd/0", "/dev/stdin");
+       (void) symlink("/proc/self/fd/1", "/dev/stdout");
+       (void) symlink("/proc/self/fd/2", "/dev/stderr");
 
        return 0;
 }
@@ -627,6 +626,7 @@ static void enter_jail_fs(void);
 static int build_jail_fs(void)
 {
        char *overlaydir = NULL;
+       int ret;
 
        old_umask = umask(0);
 
@@ -678,8 +678,11 @@ static int build_jail_fs(void)
        if (opts.overlaydir)
                overlaydir = opts.overlaydir;
 
-       if (overlaydir)
-               mount_overlay(jail_root, overlaydir);
+       if (overlaydir) {
+               ret = mount_overlay(jail_root, overlaydir);
+               if (ret)
+                       return ret;
+       }
 
        if (chdir(jail_root)) {
                ERROR("chdir(%s) (jail_root) failed: %m\n", jail_root);
@@ -695,7 +698,7 @@ static int build_jail_fs(void)
                create_dev_console(jail_root);
 
        /* make sure /etc/resolv.conf exists if in new network namespace */
-       if (opts.namespace & CLONE_NEWNET) {
+       if (!opts.extroot && opts.namespace & CLONE_NEWNET) {
                char jailetc[PATH_MAX], jaillink[PATH_MAX];
 
                snprintf(jailetc, PATH_MAX, "%s/etc", jail_root);
@@ -704,7 +707,7 @@ static int build_jail_fs(void)
                if (overlaydir)
                        unlink(jaillink);
 
-               symlink("../dev/resolv.conf.d/resolv.conf.auto", jaillink);
+               (void) symlink("../dev/resolv.conf.d/resolv.conf.auto", jaillink);
        }
 
        run_hooks(opts.hooks.createContainer, enter_jail_fs);
@@ -897,12 +900,13 @@ static int apply_rlimits(void)
        return 0;
 }
 
-#define MAX_ENVP       8
+#define MAX_ENVP       64
 static char** build_envp(const char *seccomp, char **ocienvp)
 {
        static char *envp[MAX_ENVP];
        static char preload_var[PATH_MAX];
        static char seccomp_var[PATH_MAX];
+       static char seccomp_debug_var[20];
        static char debug_var[] = "LD_DEBUG=all";
        static char container_var[] = "container=ujail";
        const char *preload_lib = find_lib("libpreload-seccomp.so");
@@ -917,6 +921,8 @@ static char** build_envp(const char *seccomp, char **ocienvp)
        if (seccomp) {
                snprintf(seccomp_var, sizeof(seccomp_var), "SECCOMP_FILE=%s", seccomp);
                envp[count++] = seccomp_var;
+               snprintf(seccomp_debug_var, sizeof(seccomp_debug_var), "SECCOMP_DEBUG=%2d", debug);
+               envp[count++] = seccomp_debug_var;
                snprintf(preload_var, sizeof(preload_var), "LD_PRELOAD=%s", preload_lib);
                envp[count++] = preload_var;
        }
@@ -965,7 +971,7 @@ static void usage(void)
        fprintf(stderr, "  -E\t\tfail if jail cannot be setup\n");
        fprintf(stderr, "  -y\t\tprovide jail console\n");
        fprintf(stderr, "  -J <dir>\tcreate container from OCI bundle\n");
-       fprintf(stderr, "  -j\t\tstart container immediately\n");
+       fprintf(stderr, "  -i\t\tstart container immediately\n");
        fprintf(stderr, "  -P <pidfile>\tcreate <pidfile>\n");
        fprintf(stderr, "\nWarning: by default root inside the jail is the same\n\
 and he has the same powers as root outside the jail,\n\
@@ -1006,8 +1012,7 @@ static int setns_open(unsigned long nstype)
 {
        int *fd = get_namespace_fd(nstype);
 
-       if (!*fd)
-               return EFAULT;
+       assert(fd != NULL);
 
        if (*fd == -1)
                return 0;
@@ -1096,6 +1101,8 @@ static int exec_jail(void *arg)
        char buf[1];
 
        exit_from_child = true;
+       prctl(PR_SET_SECUREBITS, 0);
+
        uloop_init();
        signals_init();
 
@@ -1126,6 +1133,8 @@ static int exec_jail(void *arg)
        if (opts.namespace & CLONE_NEWCGROUP)
                unshare(CLONE_NEWCGROUP);
 
+       setns_open(CLONE_NEWCGROUP);
+
        if ((opts.namespace & CLONE_NEWUSER) || (opts.setns.user != -1)) {
                if (setregid(0, 0) < 0) {
                        ERROR("setgid\n");
@@ -1214,9 +1223,10 @@ static void post_start_hook(void)
        if (opts.set_umask)
                umask(opts.umask);
 
-       /* restore securebits back to normal */
+       /* restore securebits back to normal (and lock them if not in userns) */
        if (opts.capset.apply) {
-               if (prctl(PR_SET_SECUREBITS, 0)) {
+               if (prctl(PR_SET_SECUREBITS, (opts.namespace & CLONE_NEWUSER)?0:
+                   SECBIT_KEEP_CAPS_LOCKED|SECBIT_NO_SETUID_FIXUP_LOCKED|SECBIT_NOROOT_LOCKED)) {
                        ERROR("prctl(PR_SET_SECUREBITS) failed: %m\n");
                        free_and_exit(EXIT_FAILURE);
                }
@@ -1323,7 +1333,7 @@ static const struct blobmsg_policy oci_root_policy[] = {
 
 static int parseOCIroot(const char *jsonfile, struct blob_attr *msg)
 {
-       static char extroot[PATH_MAX] = { 0 };
+       char extroot[PATH_MAX] = { 0 };
        struct blob_attr *tb[__OCI_ROOT_MAX];
        char *cur;
        char *root_path;
@@ -1348,7 +1358,10 @@ static int parseOCIroot(const char *jsonfile, struct blob_attr *msg)
 
        strncat(extroot, root_path, PATH_MAX - (strlen(extroot) + 1));
 
-       opts.extroot = extroot;
+       /* follow symbolic link(s) */
+       opts.extroot = realpath(extroot, NULL);
+       if (!opts.extroot)
+               return errno;
 
        if (tb[OCI_ROOT_READONLY])
                opts.ronly = blobmsg_get_bool(tb[OCI_ROOT_READONLY]);
@@ -1818,8 +1831,14 @@ static int parseOCIlinuxns(struct blob_attr *msg)
        }
 
        return 0;
-};
+}
 
+static void get_jail_root_user(bool is_gidmap, uint32_t container_id, uint32_t host_id, uint32_t size)
+{
+       /* record host_id only for a non-empty uid (not gid) mapping whose container id is 0 */
+       if (!is_gidmap && size >= 1 && container_id == 0)
+               opts.root_map_uid = host_id;
+}
 
 enum {
        OCI_LINUX_UIDGIDMAP_CONTAINERID,
@@ -1866,6 +1885,10 @@ static int parseOCIuidgidmappings(struct blob_attr *msg, bool is_gidmap)
        blobmsg_for_each_attr(cur, msg, rem) {
                blobmsg_parse(oci_linux_uidgidmap_policy, __OCI_LINUX_UIDGIDMAP_MAX, tb, blobmsg_data(cur), blobmsg_len(cur));
 
+               get_jail_root_user(is_gidmap, blobmsg_get_u32(tb[OCI_LINUX_UIDGIDMAP_CONTAINERID]),
+                        blobmsg_get_u32(tb[OCI_LINUX_UIDGIDMAP_HOSTID]),
+                        blobmsg_get_u32(tb[OCI_LINUX_UIDGIDMAP_SIZE]));
+
                /* write mapping line into pre-allocated string */
                len = snprintf(&map[pos], totallen + 1, "%d %d %d\n",
                         blobmsg_get_u32(tb[OCI_LINUX_UIDGIDMAP_CONTAINERID]),
@@ -2397,6 +2420,18 @@ jail_writepid(pid_t pid)
        return 0;
 }
 
+static int checkpath(const char *path)
+{
+       /* returns 0 if path can be opened as a directory, else logs and returns -1 */
+       int fd = open(path, O_RDONLY | O_DIRECTORY | O_CLOEXEC);
+       if (fd != -1) {
+               close(fd);
+               return 0;
+       }
+       ERROR("path %s open failed %m\n", path);
+       return -1;
+}
+
 static struct ubus_method container_methods[] = {
        UBUS_METHOD_NOARG("start", handle_start),
        UBUS_METHOD_NOARG("state", handle_state),
@@ -2460,7 +2495,7 @@ int main(int argc, char **argv)
                        opts.namespace |= CLONE_NEWCGROUP;
                        break;
                case 'R':
-                       opts.extroot = optarg;
+                       opts.extroot = realpath(optarg, NULL);
                        break;
                case 's':
                        opts.namespace |= CLONE_NEWNS;
@@ -2509,7 +2544,7 @@ int main(int argc, char **argv)
                        opts.group = optarg;
                        break;
                case 'O':
-                       opts.overlaydir = optarg;
+                       opts.overlaydir = realpath(optarg, NULL);
                        break;
                case 'T':
                        opts.tmpoverlaysize = optarg;
@@ -2547,6 +2582,12 @@ int main(int argc, char **argv)
        opts.setns.time = -1;
 #endif
 
+       /*
+        * uid in parent user namespace representing root user in new
+        * user namespace, defaults to nobody unless specified in uidMappings
+        */
+       opts.root_map_uid = 65534;
+
        if (opts.capabilities && parseOCIcapabilities_from_file(&opts.capset, opts.capabilities)) {
                ERROR("failed to read capabilities from file %s\n", opts.capabilities);
                ret=-1;
@@ -2562,7 +2603,10 @@ int main(int argc, char **argv)
                        ret=-1;
                        goto errout;
                }
-               asprintf(&jsonfile, "%s/config.json", opts.ocibundle);
+               if (asprintf(&jsonfile, "%s/config.json", opts.ocibundle) < 0) {
+                       ret=-ENOMEM;
+                       goto errout;
+               }
                ocires = parseOCI(jsonfile);
                free(jsonfile);
                if (ocires) {
@@ -2587,6 +2631,18 @@ int main(int argc, char **argv)
                goto errout;
        }
 
+       if (opts.extroot && checkpath(opts.extroot)) {
+               ERROR("invalid rootfs path '%s'", opts.extroot);
+               ret=-1;
+               goto errout;
+       }
+
+       if (opts.overlaydir && checkpath(opts.overlaydir)) {
+               ERROR("invalid rootfs overlay path '%s'", opts.overlaydir);
+               ret=-1;
+               goto errout;
+       }
+
        /* no <binary> param found */
        if (!opts.ocibundle && (argc - optind < 1)) {
                usage();
@@ -2735,7 +2791,7 @@ static void post_main(struct uloop_timeout *t)
 
                        }
                        if (opts.sysfs || opts.ocibundle)
-                               add_mount("sysfs", "/sys", "sysfs", MS_NOATIME | MS_NODEV | MS_NOEXEC | MS_NOSUID | MS_RDONLY, 0, NULL, -1);
+                               add_mount("sysfs", "/sys", "sysfs", MS_RELATIME | MS_NODEV | MS_NOEXEC | MS_NOSUID | MS_RDONLY, 0, NULL, -1);
 
                        if (opts.ocibundle)
                                add_mount("shm", "/dev/shm", "tmpfs", MS_NOSUID | MS_NOEXEC | MS_NODEV, 0, "mode=1777", -1);
@@ -2753,9 +2809,22 @@ static void post_main(struct uloop_timeout *t)
                if (opts.setns.time != -1) {
                        timens_fd = ns_open_pid("time", getpid());
                        setns_open(CLONE_NEWTIME);
+               } else {
+                       timens_fd = -1;
                }
 #endif
 
+               if (opts.namespace & CLONE_NEWUSER) {
+                       if (prctl(PR_SET_SECUREBITS, SECBIT_NO_SETUID_FIXUP)) {
+                               ERROR("prctl(PR_SET_SECUREBITS) failed: %m\n");
+                               free_and_exit(EXIT_FAILURE);
+                       }
+                       if (seteuid(opts.root_map_uid)) {
+                               ERROR("seteuid(%d) failed: %m\n", opts.root_map_uid);
+                               free_and_exit(EXIT_FAILURE);
+                       }
+               }
+
                jail_process.pid = clone(exec_jail, child_stack + STACK_SIZE, SIGCHLD | (opts.namespace & (~CLONE_NEWCGROUP)), NULL);
        } else {
                jail_process.pid = fork();
@@ -2767,13 +2836,19 @@ static void post_main(struct uloop_timeout *t)
 
                uloop_process_add(&jail_process);
                jail_running = 1;
-               seteuid(0);
+               if (seteuid(0)) {
+                       ERROR("seteuid(%d) failed: %m\n", opts.root_map_uid);
+                       free_and_exit(EXIT_FAILURE);
+               }
+
+               prctl(PR_SET_SECUREBITS, 0);
+
                if (pidns_fd != -1) {
                        setns(pidns_fd, CLONE_NEWPID);
                        close(pidns_fd);
                }
 #ifdef CLONE_NEWTIME
-               if (timens_fd != -1)
+               if (timens_fd != -1) {
                        setns(timens_fd, CLONE_NEWTIME);
                        close(timens_fd);
                }