#define RLIM_NLIMITS 16
#endif
+#include <assert.h>
#include <stdlib.h>
#include <unistd.h>
#include <errno.h>
#include <linux/filter.h>
#include <linux/limits.h>
#include <linux/nsfs.h>
+#include <linux/securebits.h>
#include <signal.h>
#include <inttypes.h>
#include "jail.h"
#include "log.h"
#include "seccomp-oci.h"
+#include "cgroups.h"
-#include <libubox/utils.h>
#include <libubox/blobmsg.h>
#include <libubox/blobmsg_json.h>
#include <libubox/list.h>
#include <libubox/vlist.h>
#include <libubox/uloop.h>
+#include <libubox/utils.h>
#include <libubus.h>
#ifndef CLONE_NEWCGROUP
#endif
#define STACK_SIZE (1024 * 1024)
-#define OPT_ARGS "S:C:n:h:r:w:d:psulocU:G:NR:fFO:T:EyJ:"
+#define OPT_ARGS "S:C:n:h:r:w:d:psulocU:G:NR:fFO:T:EyJ:iP:"
+
+#define OCI_VERSION_STRING "1.0.2"
struct hook_execvpe {
char *file;
char **envp;
char *uidmap;
char *gidmap;
+ char *pidfile;
struct sysctl_val **sysctl;
int no_new_privs;
int namespace;
int pw_uid;
int pw_gid;
int gr_gid;
+ int root_map_uid;
gid_t *additional_gids;
size_t num_additional_gids;
mode_t umask;
int oom_score_adj;
bool set_oom_score_adj;
struct mknod_args **devices;
+ char *ocibundle;
+ bool immediately;
+ struct blob_attr *annotations;
} opts;
+static struct blob_buf ocibuf;
+
+extern int pivot_root(const char *new_root, const char *put_old);
+
+int debug = 0;
+
+static char child_stack[STACK_SIZE];
+
+static struct ubus_context *parent_ctx;
+
+int console_fd;
+
+
static inline bool has_namespaces(void)
{
return ((opts.setns.pid != -1) ||
opts.namespace);
}
+/*
+ * Release a NULL-terminated vector of strings (argv/envp style): every
+ * element is passed to free() first, then the vector itself.
+ * Passing NULL is a no-op.
+ */
+static void free_oci_envp(char **p) {
+	char **entry;
+
+	if (!p)
+		return;
+
+	for (entry = p; *entry; ++entry)
+		free(*entry);
+
+	free(p);
+}
+
static void free_hooklist(struct hook_execvpe **hooklist)
{
struct hook_execvpe *cur;
- char **tmp;
if (!hooklist)
return;
cur = *hooklist;
while (cur) {
+ free_oci_envp(cur->argv);
+ free_oci_envp(cur->envp);
free(cur->file);
- tmp = cur->argv;
- while (tmp)
- free(*(tmp++));
-
- free(cur->argv);
-
- tmp = cur->envp;
- while (tmp)
- free(*(tmp++));
-
- free(cur->envp);
free(cur++);
}
free(hooklist);
free(opts.rlimits[type]);
}
-static void free_opts(bool child) {
- char **tmp;
+static void free_opts(bool parent) {
+
+ free_library_search();
+ mount_free();
+ cgroups_free();
/* we need to keep argv, envp and seccomp filter in child */
- if (child) {
+ if (parent) { /* parent-only */
if (opts.ociseccomp) {
free(opts.ociseccomp->filter);
free(opts.ociseccomp);
}
- tmp = opts.jail_argv;
- while(tmp)
- free(*(tmp++));
-
- free(opts.jail_argv);
-
- tmp = opts.envp;
- while (tmp)
- free(*(tmp++));
-
- free(opts.envp);
- };
+ free_oci_envp(opts.jail_argv);
+ free_oci_envp(opts.envp);
+ }
free_rlimits();
free_sysctl();
free_devices();
free(opts.hostname);
free(opts.cwd);
- free(opts.extroot);
free(opts.uidmap);
free(opts.gidmap);
+ free(opts.annotations);
+ free(opts.extroot);
+ free(opts.overlaydir);
free_hooklist(opts.hooks.createRuntime);
free_hooklist(opts.hooks.createContainer);
free_hooklist(opts.hooks.startContainer);
free_hooklist(opts.hooks.poststop);
}
-static struct blob_buf ocibuf;
-
-extern int pivot_root(const char *new_root, const char *put_old);
-
-int debug = 0;
-
-static char child_stack[STACK_SIZE];
-
-int console_fd;
-
static int mount_overlay(char *jail_root, char *overlaydir) {
char *upperdir, *workdir, *optsstr, *upperetc, *upperresolvconf;
const char mountoptsformat[] = "lowerdir=%s,upperdir=%s,workdir=%s";
goto upper_etc_printf;
fd = creat(upperresolvconf, 0644);
- if (fd == -1) {
- ERROR("creat(%s) failed: %m\n", upperresolvconf);
- goto upper_resolvconf_printf;
+ if (fd < 0) {
+ if (errno != EEXIST)
+ ERROR("creat(%s) failed: %m\n", upperresolvconf);
+ } else {
+ close(fd);
}
- close(fd);
-
DEBUG("mount -t overlay %s %s (%s)\n", jail_root, jail_root, optsstr);
if (mount(jail_root, jail_root, "overlay", MS_NOATIME, optsstr))
- goto opts_printf;
+ goto upper_resolvconf_printf;
ret = 0;
static void pass_console(int console_fd)
{
- struct ubus_context *ctx = ubus_connect(NULL);
+ struct ubus_context *child_ctx = ubus_connect(NULL);
static struct blob_buf req;
uint32_t id;
- if (!ctx)
+ if (!child_ctx)
return;
blob_buf_init(&req, 0);
blobmsg_add_string(&req, "name", opts.name);
- if (ubus_lookup_id(ctx, "container", &id) ||
- ubus_invoke_fd(ctx, id, "console_set", req.head, NULL, NULL, 3000, console_fd))
+ if (ubus_lookup_id(child_ctx, "container", &id) ||
+ ubus_invoke_fd(child_ctx, id, "console_set", req.head, NULL, NULL, 3000, console_fd))
INFO("ubus request failed\n");
else
close(console_fd);
blob_buf_free(&req);
- ubus_free(ctx);
+ ubus_free(child_ctx);
}
static int create_dev_console(const char *jail_root)
/* Open UNIX/98 virtual console */
console_fd = posix_openpt(O_RDWR | O_NOCTTY);
- if (console_fd == -1)
+ if (console_fd < 0)
return -1;
console_fname = ptsname(console_fd);
snprintf(dev_console_path, sizeof(dev_console_path), "%s/dev/console", jail_root);
close(creat(dev_console_path, 0620));
- if (mount(console_fname, dev_console_path, NULL, MS_BIND, NULL))
+ if (mount(console_fname, dev_console_path, "bind", MS_BIND, NULL))
goto no_console;
/* use PTY slave for stdio */
static int hook_running = 0;
static int hook_return_code = 0;
+static struct hook_execvpe **current_hook = NULL;
+typedef void (*hook_return_handler)(void);
+static hook_return_handler hook_return_cb = NULL;
static void hook_process_timeout_cb(struct uloop_timeout *t);
static struct uloop_timeout hook_process_timeout = {
.cb = hook_process_timeout_cb,
};
+static void run_hooklist(void);
static void hook_process_handler(struct uloop_process *c, int ret)
{
uloop_timeout_cancel(&hook_process_timeout);
+
if (WIFEXITED(ret)) {
hook_return_code = WEXITSTATUS(ret);
- DEBUG("hook (%d) exited with exit: %d\n", c->pid, hook_return_code);
+ if (hook_return_code)
+ ERROR("hook (%d) exited with exit: %d\n", c->pid, hook_return_code);
+ else
+ DEBUG("hook (%d) exited with exit: %d\n", c->pid, hook_return_code);
+
} else {
hook_return_code = WTERMSIG(ret);
- DEBUG("hook (%d) exited with signal: %d\n", c->pid, hook_return_code);
+ ERROR("hook (%d) exited with signal: %d\n", c->pid, hook_return_code);
}
hook_running = 0;
- uloop_end();
+ ++current_hook;
+ run_hooklist();
}
static struct uloop_process hook_process = {
kill(hook_process.pid, SIGKILL);
}
-static int run_hook(struct hook_execvpe *hook)
+/*
+ * Run the hook current_hook points at, or hand control to hook_return_cb
+ * once the NULL terminator of the hook list has been reached.
+ *
+ * Hooks which cannot be started (stat() failure, not executable, fork()
+ * failure) are reported through hook_process_handler(), which advances
+ * current_hook and calls run_hooklist() again. Every one of those paths has
+ * to return afterwards: falling through would dereference a NULL hook or
+ * fork and register a process for a hook which has already been handled.
+ */
+static void run_hooklist(void)
{
+	struct hook_execvpe *hook = *current_hook;
struct stat s;
+	if (!hook) {
+		/* end of the list, continue with whatever comes after the hooks */
+		hook_return_cb();
+		return;
+	}
+
DEBUG("executing hook %s\n", hook->file);
- if (stat(hook->file, &s))
- return ENOENT;
+	if (stat(hook->file, &s)) {
+		hook_process_handler(&hook_process, ENOENT);
+		return;
+	}
- if (!((unsigned long)s.st_mode & (S_IXUSR | S_IXGRP | S_IXOTH)))
- return EPERM;
-
- if (!((unsigned long)s.st_mode & (S_IRUSR | S_IRGRP | S_IROTH)))
- return EPERM;
-
- uloop_init();
+	if (!((unsigned long)s.st_mode & (S_IXUSR | S_IXGRP | S_IXOTH))) {
+		hook_process_handler(&hook_process, EPERM);
+		return;
+	}
hook_running = 1;
hook_process.pid = fork();
- if (hook_process.pid > 0) {
- /* parent */
- uloop_process_add(&hook_process);
-
- if (hook->timeout > 0)
- uloop_timeout_set(&hook_process_timeout, 1000 * hook->timeout);
-
- uloop_run();
- if (hook_running) {
- DEBUG("uloop interrupted, killing hook process\n");
- kill(hook_process.pid, SIGTERM);
- uloop_timeout_set(&hook_process_timeout, 1000);
- uloop_run();
- }
- uloop_done();
-
- waitpid(hook_process.pid, NULL, WCONTINUED);
-
- return hook_return_code;
- } else if (hook_process.pid == 0) {
+	if (hook_process.pid == 0) {
/* child */
- execvpe(hook->file, hook->argv, hook->envp);
- hook_running = 0;
+		execve(hook->file, hook->argv, hook->envp);
+		ERROR("execve error %m\n");
_exit(errno);
- } else {
+	} else if (hook_process.pid < 0) {
/* fork error */
+		ERROR("hook fork error\n");
hook_running = 0;
- return errno;
+		hook_process_handler(&hook_process, errno);
+		/* there is no child process to register or wait for */
+		return;
}
-}
-static int run_hooks(struct hook_execvpe **hooklist)
-{
- struct hook_execvpe **cur;
- int res;
+	/* parent */
+	uloop_process_add(&hook_process);
- if (!hooklist)
- return 0; /* Nothing to do */
+	if (hook->timeout > 0)
+		uloop_timeout_set(&hook_process_timeout, 1000 * hook->timeout);
- cur = hooklist;
+	uloop_run();
+	if (hook_running) {
+		/* the loop ended while the hook is still alive: SIGTERM, then SIGKILL after 1s */
+		DEBUG("uloop interrupted, killing hook process\n");
+		kill(hook_process.pid, SIGTERM);
+		uloop_timeout_set(&hook_process_timeout, 1000);
+		uloop_run();
+	}
+}
- while (*cur) {
- res = run_hook(*cur);
- if (res)
- DEBUG(" error running hook %s\n", (*cur)->file);
- else
- DEBUG(" success running hook %s\n", (*cur)->file);
+/*
+ * Start working through a NULL-terminated list of hooks.
+ * return_cb is stored as hook_return_cb, which run_hooklist() calls once the
+ * end of the list is reached; without a list it is called right away.
+ */
+static void run_hooks(struct hook_execvpe **hooklist, hook_return_handler return_cb)
+{
+	if (!hooklist) {
+		/*
+		 * nothing to run -- must not fall through, run_hooklist()
+		 * dereferences current_hook unconditionally
+		 */
+		return_cb();
+		return;
+	}
- ++cur;
- }
+	current_hook = hooklist;
+	hook_return_cb = return_cb;
- return 0;
+	run_hooklist();
}
static int apply_sysctl(const char *jail_root)
if (!opts.sysctl)
return 0;
- asprintf(&procdir, "%s/proc", jail_root);
- if (!procdir)
+ if (asprintf(&procdir, "%s/proc", jail_root) < 0)
return ENOMEM;
mkdir(procdir, 0700);
cur = opts.sysctl;
while (*cur) {
- asprintf(&fname, "%s/sys/%s", procdir, (*cur)->entry);
- if (!fname)
+ if (asprintf(&fname, "%s/sys/%s", procdir, (*cur)->entry) < 0)
return ENOMEM;
DEBUG("sysctl: writing '%s' to %s\n", (*cur)->value, fname);
f = open(fname, O_WRONLY);
- if (f == -1) {
+ if (f < 0) {
ERROR("sysctl: can't open %s\n", fname);
+ free(fname);
+ return errno;
+ }
+ if (write(f, (*cur)->value, strlen((*cur)->value)) < 0) {
+ ERROR("sysctl: write to %s\n", fname);
+ free(fname);
+ close(f);
return errno;
}
- write(f, (*cur)->value, strlen((*cur)->value));
free(fname);
close(f);
+/*
+ * Create the device nodes requested in opts.devices (only paths below
+ * /dev/ are accepted, missing parent directories are created) and the /dev
+ * symbolic links defined by the OCI spec.
+ * Returns an errno-style code as soon as a requested device cannot be
+ * created, 0 at the end.
+ */
static int create_devices(void)
{
struct mknod_args **cur, *curdef;
+	char *path, *tmp;
if (!opts.devices)
goto only_default_devices;
cur = opts.devices;
while (*cur) {
- DEBUG("creating %s (mode=%08o)\n", (*cur)->path, (*cur)->mode);
- if (mknod((*cur)->path, (*cur)->mode, (*cur)->dev))
+		path = (*cur)->path;
+		/*
+		 * don't allow devices outside of /dev: compare including the
+		 * trailing slash, a bare "/dev" prefix would also match
+		 * siblings of /dev such as "/devfoo/bar"
+		 */
+		if (strncmp(path, "/dev/", 5))
+			return EPERM;
+
+		/* make sure parent folder exists */
+		tmp = strrchr(path, '/');
+		if (!tmp)
+			return EINVAL;
+
+		/* temporarily cut the path at its last '/' to get the parent */
+		*tmp = '\0';
+		if (strcmp(path, "/dev")) {
+			DEBUG("creating directory %s\n", path);
+
+			mkdir_p(path, 0755);
+		}
+		*tmp = '/';
+
+		DEBUG("creating %s (mode=%08o)\n", path, (*cur)->mode);
+
+		/* create device */
+		if (mknod(path, (*cur)->mode, (*cur)->dev))
return errno;
+		/* change owner, if needed */
if (((*cur)->uid || (*cur)->gid) &&
- chown((*cur)->path, (*cur)->uid, (*cur)->gid))
+		    chown(path, (*cur)->uid, (*cur)->gid))
return errno;
++cur;
}
/* Dev symbolic links as defined in OCI spec */
- symlink("/dev/pts/ptmx", "/dev/ptmx");
- symlink("/proc/self/fd", "/dev/fd");
- symlink("/proc/self/fd/0", "/dev/stdin");
- symlink("/proc/self/fd/1", "/dev/stdout");
- symlink("/proc/self/fd/2", "/dev/stderr");
+	(void) symlink("/dev/pts/ptmx", "/dev/ptmx");
+	(void) symlink("/proc/self/fd", "/dev/fd");
+	(void) symlink("/proc/self/fd/0", "/dev/stdin");
+	(void) symlink("/proc/self/fd/1", "/dev/stdout");
+	(void) symlink("/proc/self/fd/2", "/dev/stderr");
return 0;
}
+static char jail_root[] = "/tmp/ujail-XXXXXX";
+static char tmpovdir[] = "/tmp/ujail-overlay-XXXXXX";
+static mode_t old_umask;
+static void enter_jail_fs(void);
static int build_jail_fs(void)
{
- char jail_root[] = "/tmp/ujail-XXXXXX";
- char tmpovdir[] = "/tmp/ujail-overlay-XXXXXX";
char *overlaydir = NULL;
- mode_t old_umask;
+ int ret;
old_umask = umask(0);
}
/* oldroot can't be MS_SHARED else pivot_root() fails */
- if (mount("none", "/", NULL, MS_REC|MS_PRIVATE, NULL)) {
+ if (mount("none", "/", "none", MS_REC|MS_PRIVATE, NULL)) {
ERROR("private mount failed %m\n");
return -1;
}
if (opts.extroot) {
- if (mount(opts.extroot, jail_root, NULL, MS_BIND, NULL)) {
+ if (mount(opts.extroot, jail_root, "bind", MS_BIND, NULL)) {
ERROR("extroot mount failed %m\n");
return -1;
}
if (opts.overlaydir)
overlaydir = opts.overlaydir;
- if (overlaydir)
- mount_overlay(jail_root, overlaydir);
+ if (overlaydir) {
+ ret = mount_overlay(jail_root, overlaydir);
+ if (ret)
+ return ret;
+ }
if (chdir(jail_root)) {
ERROR("chdir(%s) (jail_root) failed: %m\n", jail_root);
create_dev_console(jail_root);
/* make sure /etc/resolv.conf exists if in new network namespace */
- if (opts.namespace & CLONE_NEWNET) {
+ if (!opts.extroot && opts.namespace & CLONE_NEWNET) {
char jailetc[PATH_MAX], jaillink[PATH_MAX];
snprintf(jailetc, PATH_MAX, "%s/etc", jail_root);
if (overlaydir)
unlink(jaillink);
- symlink("../dev/resolv.conf.d/resolv.conf.auto", jaillink);
+ (void) symlink("../dev/resolv.conf.d/resolv.conf.auto", jaillink);
}
- run_hooks(opts.hooks.createContainer);
+ run_hooks(opts.hooks.createContainer, enter_jail_fs);
+
+ return 0;
+}
+
+static bool exit_from_child;
+/*
+ * Release resources and terminate the process with exit code ret.
+ * Unless exit_from_child is set, cgroups (only when an OCI bundle is in
+ * use) and the parent_ctx ubus connection are released as well, and
+ * free_opts() is told to do its parent-only part.
+ */
+static void free_and_exit(int ret)
+{
+	const bool in_parent = !exit_from_child;
+
+	if (in_parent) {
+		if (opts.ocibundle)
+			cgroups_free();
+
+		if (parent_ctx)
+			ubus_free(parent_ctx);
+	}
+
+	free_opts(in_parent);
+
+	exit(ret);
+}
+static void post_jail_fs(void);
+static void enter_jail_fs(void)
+{
char dirbuf[sizeof(jail_root) + 4];
+
snprintf(dirbuf, sizeof(dirbuf), "%s/old", jail_root);
mkdir(dirbuf, 0755);
if (pivot_root(jail_root, dirbuf) == -1) {
ERROR("pivot_root(%s, %s) failed: %m\n", jail_root, dirbuf);
- return -1;
+ free_and_exit(-1);
}
if (chdir("/")) {
ERROR("chdir(/) (after pivot_root) failed: %m\n");
- return -1;
+ free_and_exit(-1);
}
snprintf(dirbuf, sizeof(dirbuf), "/old%s", jail_root);
if (create_devices()) {
ERROR("create_devices() failed\n");
- return -1;
+ free_and_exit(-1);
}
if (opts.ronly)
- mount(NULL, "/", NULL, MS_REMOUNT | MS_BIND | MS_RDONLY, 0);
+ mount(NULL, "/", "bind", MS_REMOUNT | MS_BIND | MS_RDONLY, 0);
umask(old_umask);
-
- return 0;
+ post_jail_fs();
}
static int write_uid_gid_map(pid_t child_pid, bool gidmap, char *mapstr)
child_pid, gidmap?"gid_map":"uid_map") < 0)
return -1;
- if ((map_file = open(map_path, O_WRONLY)) == -1)
+ if ((map_file = open(map_path, O_WRONLY)) < 0)
return -1;
if (dprintf(map_file, "%s", mapstr)) {
}
close(map_file);
- free(mapstr);
return 0;
}
child_pid, gidmap?"gid_map":"uid_map") < 0)
return -1;
- if ((map_file = open(map_path, O_WRONLY)) == -1)
+ if ((map_file = open(map_path, O_WRONLY)) < 0)
return -1;
- if (dprintf(map_file, map_format, 0, id, 1) == -1) {
+ if (dprintf(map_file, map_format, 0, id, 1) < 0) {
close(map_file);
return -1;
}
return -1;
}
- if ((setgroups_file = open(setgroups_path, O_WRONLY)) == -1) {
+ if ((setgroups_file = open(setgroups_path, O_WRONLY)) < 0) {
return -1;
}
if (!p) {
ERROR("failed to get uid/gid for user %s: %d (%s)\n",
opts.user, errno, strerror(errno));
- exit(EXIT_FAILURE);
+ free_and_exit(EXIT_FAILURE);
}
*user = p->pw_uid;
*user_gid = p->pw_gid;
g = getgrnam(opts.group);
if (!g) {
ERROR("failed to get gid for group %s: %m\n", opts.group);
- exit(EXIT_FAILURE);
+ free_and_exit(EXIT_FAILURE);
}
*gr_gid = g->gr_gid;
} else {
{
if (opts.user && (user_gid != -1) && initgroups(opts.user, user_gid)) {
ERROR("failed to initgroups() for user %s: %m\n", opts.user);
- exit(EXIT_FAILURE);
+ free_and_exit(EXIT_FAILURE);
}
if ((gr_gid != -1) && setregid(gr_gid, gr_gid)) {
ERROR("failed to set group id %d: %m\n", gr_gid);
- exit(EXIT_FAILURE);
+ free_and_exit(EXIT_FAILURE);
}
if ((pw_uid != -1) && setreuid(pw_uid, pw_uid)) {
ERROR("failed to set user id %d: %m\n", pw_uid);
- exit(EXIT_FAILURE);
+ free_and_exit(EXIT_FAILURE);
}
}
return 0;
}
-#define MAX_ENVP 8
+#define MAX_ENVP 64
static char** build_envp(const char *seccomp, char **ocienvp)
{
static char *envp[MAX_ENVP];
static char preload_var[PATH_MAX];
static char seccomp_var[PATH_MAX];
+ static char seccomp_debug_var[20];
static char debug_var[] = "LD_DEBUG=all";
static char container_var[] = "container=ujail";
const char *preload_lib = find_lib("libpreload-seccomp.so");
if (seccomp) {
snprintf(seccomp_var, sizeof(seccomp_var), "SECCOMP_FILE=%s", seccomp);
envp[count++] = seccomp_var;
+ snprintf(seccomp_debug_var, sizeof(seccomp_debug_var), "SECCOMP_DEBUG=%2d", debug);
+ envp[count++] = seccomp_debug_var;
snprintf(preload_var, sizeof(preload_var), "LD_PRELOAD=%s", preload_lib);
envp[count++] = preload_var;
}
fprintf(stderr, " -T <size>\tuse tmpfs r/w overlayfs with <size>\n");
fprintf(stderr, " -E\t\tfail if jail cannot be setup\n");
fprintf(stderr, " -y\t\tprovide jail console\n");
- fprintf(stderr, " -J <dir>\tstart OCI bundle\n");
+ fprintf(stderr, " -J <dir>\tcreate container from OCI bundle\n");
+ fprintf(stderr, " -i\t\tstart container immediately\n");
+ fprintf(stderr, " -P <pidfile>\tcreate <pidfile>\n");
fprintf(stderr, "\nWarning: by default root inside the jail is the same\n\
and he has the same powers as root outside the jail,\n\
thus he can escape the jail and/or break stuff.\n\
{
int *fd = get_namespace_fd(nstype);
- if (!*fd)
- return EFAULT;
+ assert(fd != NULL);
- if (*fd == -1)
+ if (*fd < 0)
return 0;
if (setns(*fd, nstype) == -1) {
return 0;
}
-static int exec_jail(void *pipes_ptr)
+static int jail_running = 0;
+static int jail_return_code = 0;
+
+static void jail_process_timeout_cb(struct uloop_timeout *t);
+static struct uloop_timeout jail_process_timeout = {
+ .cb = jail_process_timeout_cb,
+};
+static void poststop(void);
+/*
+ * uloop process callback of jail_process: cancel the pending kill timeout,
+ * record exit status or signal number in jail_return_code, mark the jail as
+ * no longer running and go on with poststop().
+ */
+static void jail_process_handler(struct uloop_process *c, int ret)
+{
+	const bool exited = WIFEXITED(ret);
+
+	uloop_timeout_cancel(&jail_process_timeout);
+
+	jail_return_code = exited ? WEXITSTATUS(ret) : WTERMSIG(ret);
+	if (exited)
+		INFO("jail (%d) exited with exit: %d\n", c->pid, jail_return_code);
+	else
+		INFO("jail (%d) exited with signal: %d\n", c->pid, jail_return_code);
+
+	jail_running = 0;
+	poststop();
+}
+
+static struct uloop_process jail_process = {
+ .cb = jail_process_handler,
+};
+
+static void jail_process_timeout_cb(struct uloop_timeout *t)
+{
+ DEBUG("jail process failed to stop, sending SIGKILL\n");
+ kill(jail_process.pid, SIGKILL);
+}
+
+static void jail_handle_signal(int signo)
+{
+ if (hook_running) {
+ DEBUG("forwarding signal %d to the hook process\n", signo);
+ kill(hook_process.pid, signo);
+ }
+
+ if (jail_running) {
+ DEBUG("forwarding signal %d to the jailed process\n", signo);
+ kill(jail_process.pid, signo);
+ }
+}
+
+/*
+ * Install jail_handle_signal() as handler for every signal except SIGCHLD,
+ * SIGPIPE and SIGSEGV, so that those signals get forwarded to the running
+ * hook or jail process.
+ */
+static void signals_init(void)
+{
+	int i;
+	sigset_t sigmask;
+
+	sigfillset(&sigmask);
+	/* valid signal numbers start at 1, 0 is not a signal */
+	for (i = 1; i < _NSIG; i++) {
+		struct sigaction s = { 0 };
+
+		/* sigismember(): 1 = member, 0 = not a member, -1 = error */
+		if (sigismember(&sigmask, i) != 1)
+			continue;
+		if ((i == SIGCHLD) || (i == SIGPIPE) || (i == SIGSEGV))
+			continue;
+
+		s.sa_handler = jail_handle_signal;
+		/* best effort: this fails for signals which cannot be caught */
+		sigaction(i, &s, NULL);
+	}
+}
+
+static void pre_exec_jail(struct uloop_timeout *t);
+static struct uloop_timeout pre_exec_timeout = {
+ .cb = pre_exec_jail,
+};
+
+int pipes[4];
+static int exec_jail(void *arg)
{
- int *pipes = (int*)pipes_ptr;
char buf[1];
- int pw_uid, pw_gid, gr_gid;
+
+ exit_from_child = true;
+ prctl(PR_SET_SECUREBITS, 0);
+
+ uloop_init();
+ signals_init();
close(pipes[0]);
close(pipes[3]);
setns_open(CLONE_NEWNS);
setns_open(CLONE_NEWIPC);
setns_open(CLONE_NEWUTS);
-#ifdef CLONE_NEWTIME
- setns_open(CLONE_NEWTIME);
-#endif
buf[0] = 'i';
if (write(pipes[1], buf, 1) < 1) {
ERROR("can't write to parent\n");
- exit(EXIT_FAILURE);
+ return EXIT_FAILURE;
}
+ close(pipes[1]);
if (read(pipes[2], buf, 1) < 1) {
ERROR("can't read from parent\n");
- exit(EXIT_FAILURE);
+ return EXIT_FAILURE;
}
if (buf[0] != 'O') {
ERROR("parent had an error, child exiting\n");
- exit(EXIT_FAILURE);
+ return EXIT_FAILURE;
}
- close(pipes[1]);
- close(pipes[2]);
+ if (opts.namespace & CLONE_NEWCGROUP)
+ unshare(CLONE_NEWCGROUP);
+
+ setns_open(CLONE_NEWCGROUP);
if ((opts.namespace & CLONE_NEWUSER) || (opts.setns.user != -1)) {
if (setregid(0, 0) < 0) {
ERROR("setgid\n");
- exit(EXIT_FAILURE);
+ free_and_exit(EXIT_FAILURE);
}
if (setreuid(0, 0) < 0) {
ERROR("setuid\n");
- exit(EXIT_FAILURE);
+ free_and_exit(EXIT_FAILURE);
}
if (setgroups(0, NULL) < 0) {
ERROR("setgroups\n");
- exit(EXIT_FAILURE);
+ free_and_exit(EXIT_FAILURE);
}
}
if (opts.namespace && opts.hostname && strlen(opts.hostname) > 0
&& sethostname(opts.hostname, strlen(opts.hostname))) {
ERROR("sethostname(%s) failed: %m\n", opts.hostname);
- exit(EXIT_FAILURE);
+ free_and_exit(EXIT_FAILURE);
}
+ uloop_timeout_add(&pre_exec_timeout);
+ uloop_run();
+
+ free_and_exit(-1);
+ return -1;
+}
+
+static void pre_exec_jail(struct uloop_timeout *t)
+{
if ((opts.namespace & CLONE_NEWNS) && build_jail_fs()) {
ERROR("failed to build jail fs\n");
- exit(EXIT_FAILURE);
+ free_and_exit(EXIT_FAILURE);
+ } else {
+ run_hooks(opts.hooks.createContainer, post_jail_fs);
+ }
+}
+
+static void post_start_hook(void);
+static void post_jail_fs(void)
+{
+ char buf[1];
+
+ if (read(pipes[2], buf, 1) < 1) {
+ ERROR("can't read from parent\n");
+ free_and_exit(EXIT_FAILURE);
+ }
+ if (buf[0] != '!') {
+ ERROR("parent had an error, child exiting\n");
+ free_and_exit(EXIT_FAILURE);
}
- run_hooks(opts.hooks.startContainer);
+ close(pipes[2]);
+
+ run_hooks(opts.hooks.startContainer, post_start_hook);
+}
- if (!(opts.namespace & CLONE_NEWUSER) && (opts.setns.user == -1)) {
- get_jail_user(&pw_uid, &pw_gid, &gr_gid);
+static void post_start_hook(void)
+{
+ int pw_uid, pw_gid, gr_gid;
- set_jail_user(opts.pw_uid?:pw_uid, opts.pw_gid?:pw_gid, opts.gr_gid?:gr_gid);
+ /*
+ * make sure setuid/setgid won't drop capabilities in case capabilities
+ * have been specified explicitly.
+ */
+ if (opts.capset.apply) {
+ if (prctl(PR_SET_SECUREBITS, SECBIT_NO_SETUID_FIXUP)) {
+ ERROR("prctl(PR_SET_SECUREBITS) failed: %m\n");
+ free_and_exit(EXIT_FAILURE);
+ }
}
+ /* drop capabilities, retain those still needed to further setup jail */
+ if (applyOCIcapabilities(opts.capset, (1LLU << CAP_SETGID) | (1LLU << CAP_SETUID) | (1LLU << CAP_SETPCAP)))
+ free_and_exit(EXIT_FAILURE);
+
+ /* use either cmdline-supplied user/group or uid/gid from OCI spec */
+ get_jail_user(&pw_uid, &pw_gid, &gr_gid);
+ set_jail_user(opts.pw_uid?:pw_uid, opts.pw_gid?:pw_gid, opts.gr_gid?:gr_gid);
+
if (opts.additional_gids &&
(setgroups(opts.num_additional_gids, opts.additional_gids) < 0)) {
ERROR("setgroups failed: %m\n");
- exit(EXIT_FAILURE);
+ free_and_exit(EXIT_FAILURE);
}
if (opts.set_umask)
umask(opts.umask);
- if (applyOCIcapabilities(opts.capset))
- exit(EXIT_FAILURE);
+ /* restore securebits back to normal (and lock them if not in userns) */
+ if (opts.capset.apply) {
+ if (prctl(PR_SET_SECUREBITS, (opts.namespace & CLONE_NEWUSER)?0:
+ SECBIT_KEEP_CAPS_LOCKED|SECBIT_NO_SETUID_FIXUP_LOCKED|SECBIT_NOROOT_LOCKED)) {
+ ERROR("prctl(PR_SET_SECUREBITS) failed: %m\n");
+ free_and_exit(EXIT_FAILURE);
+ }
+ }
- if (opts.capabilities && drop_capabilities(opts.capabilities))
- exit(EXIT_FAILURE);
+ /* drop remaining capabilities to end up with specified sets */
+ if (applyOCIcapabilities(opts.capset, 0))
+ free_and_exit(EXIT_FAILURE);
if (opts.no_new_privs && prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
- ERROR("prctl(PR_SET_NO_NEW_PRIVS) failed: %m\n");
- exit(EXIT_FAILURE);
+ ERROR("prctl(PR_SET_NO_NEW_PRIVS) failed: %m\n");
+ free_and_exit(EXIT_FAILURE);
}
char **envp = build_envp(opts.seccomp, opts.envp);
if (!envp)
- exit(EXIT_FAILURE);
+ free_and_exit(EXIT_FAILURE);
if (opts.cwd && chdir(opts.cwd))
- exit(EXIT_FAILURE);
+ free_and_exit(EXIT_FAILURE);
if (opts.ociseccomp && applyOCIlinuxseccomp(opts.ociseccomp))
- exit(EXIT_FAILURE);
+ free_and_exit(EXIT_FAILURE);
uloop_end();
free_opts(false);
exit(EXIT_FAILURE);
}
-static int jail_running = 0;
-static int jail_return_code = 0;
-
-static void jail_process_timeout_cb(struct uloop_timeout *t);
-static struct uloop_timeout jail_process_timeout = {
- .cb = jail_process_timeout_cb,
-};
-
-static void jail_process_handler(struct uloop_process *c, int ret)
+static int ns_open_pid(const char *nstype, const pid_t target_ns)
{
- uloop_timeout_cancel(&jail_process_timeout);
- if (WIFEXITED(ret)) {
- jail_return_code = WEXITSTATUS(ret);
- INFO("jail (%d) exited with exit: %d\n", c->pid, jail_return_code);
- } else {
- jail_return_code = WTERMSIG(ret);
- INFO("jail (%d) exited with signal: %d\n", c->pid, jail_return_code);
- }
- jail_running = 0;
- uloop_end();
-}
+ char pid_pid_path[PATH_MAX];
-static struct uloop_process jail_process = {
- .cb = jail_process_handler,
-};
+ snprintf(pid_pid_path, sizeof(pid_pid_path), "/proc/%u/ns/%s", target_ns, nstype);
-static void jail_process_timeout_cb(struct uloop_timeout *t)
-{
- DEBUG("jail process failed to stop, sending SIGKILL\n");
- kill(jail_process.pid, SIGKILL);
+ return open(pid_pid_path, O_RDONLY);
}
-static void jail_handle_signal(int signo)
+static void netns_updown(pid_t pid, bool start)
{
- if (hook_running) {
- DEBUG("forwarding signal %d to the hook process\n", signo);
- kill(hook_process.pid, signo);
- }
-
- if (jail_running) {
- DEBUG("forwarding signal %d to the jailed process\n", signo);
- kill(jail_process.pid, signo);
- }
-}
-
-static int netns_open_pid(const pid_t target_ns)
-{
- char pid_net_path[PATH_MAX];
-
- snprintf(pid_net_path, sizeof(pid_net_path), "/proc/%u/ns/net", target_ns);
-
- return open(pid_net_path, O_RDONLY);
-}
-
-static int pidns_open_pid(const pid_t target_ns)
-{
- char pid_pid_path[PATH_MAX];
-
- snprintf(pid_pid_path, sizeof(pid_pid_path), "/proc/%u/ns/pid", target_ns);
-
- return open(pid_pid_path, O_RDONLY);
-}
-
-static void netns_updown(pid_t pid, bool start)
-{
- struct ubus_context *ctx = ubus_connect(NULL);
static struct blob_buf req;
uint32_t id;
- if (!ctx)
+ if (!parent_ctx)
return;
blob_buf_init(&req, 0);
blobmsg_add_u32(&req, "pid", pid);
blobmsg_add_u8(&req, "start", start);
- if (ubus_lookup_id(ctx, "network", &id) ||
- ubus_invoke(ctx, id, "netns_updown", req.head, NULL, NULL, 3000))
+ if (ubus_lookup_id(parent_ctx, "network", &id) ||
+ ubus_invoke(parent_ctx, id, "netns_updown", req.head, NULL, NULL, 3000))
INFO("ubus request failed\n");
blob_buf_free(&req);
- ubus_free(ctx);
}
static int parseOCIenvarray(struct blob_attr *msg, char ***envp)
static int parseOCIroot(const char *jsonfile, struct blob_attr *msg)
{
- static char rootpath[PATH_MAX] = { 0 };
+ char extroot[PATH_MAX] = { 0 };
struct blob_attr *tb[__OCI_ROOT_MAX];
char *cur;
+ char *root_path;
blobmsg_parse(oci_root_policy, __OCI_ROOT_MAX, tb, blobmsg_data(msg), blobmsg_len(msg));
if (!tb[OCI_ROOT_PATH])
return ENODATA;
- strncpy(rootpath, jsonfile, PATH_MAX);
- cur = strrchr(rootpath, '/');
+ root_path = blobmsg_get_string(tb[OCI_ROOT_PATH]);
+
+ /* prepend bundle directory in case of relative paths */
+ if (root_path[0] != '/') {
+ strncpy(extroot, jsonfile, PATH_MAX - 1);
- if (!cur)
- return ENOTDIR;
+ cur = strrchr(extroot, '/');
- *(++cur) = '\0';
- strncat(rootpath, blobmsg_get_string(tb[OCI_ROOT_PATH]), PATH_MAX - (strlen(rootpath) + 1));
+ if (!cur)
+ return ENOTDIR;
- opts.extroot = rootpath;
+ *(++cur) = '\0';
+ }
+
+ strncat(extroot, root_path, PATH_MAX - (strlen(extroot) + 1));
+
+ /* follow symbolic link(s) */
+ opts.extroot = realpath(extroot, NULL);
+ if (!opts.extroot)
+ return errno;
- opts.ronly = blobmsg_get_bool(tb[OCI_ROOT_READONLY]);
+ if (tb[OCI_ROOT_READONLY])
+ opts.ronly = blobmsg_get_bool(tb[OCI_ROOT_READONLY]);
return 0;
}
goto errout;
}
- (*hooklist)[idx] = malloc(sizeof(struct hook_execvpe));
+ (*hooklist)[idx] = calloc(1, sizeof(struct hook_execvpe));
if (tb[OCI_HOOK_ARGS]) {
ret = parseOCIenvarray(tb[OCI_HOOK_ARGS], &((*hooklist)[idx]->argv));
if (ret)
return 0;
}
+enum {
+ OCI_PROCESS_RLIMIT_TYPE,
+ OCI_PROCESS_RLIMIT_SOFT,
+ OCI_PROCESS_RLIMIT_HARD,
+ __OCI_PROCESS_RLIMIT_MAX,
+};
+
+static const struct blobmsg_policy oci_process_rlimit_policy[] = {
+ [OCI_PROCESS_RLIMIT_TYPE] = { "type", BLOBMSG_TYPE_STRING },
+ [OCI_PROCESS_RLIMIT_SOFT] = { "soft", BLOBMSG_CAST_INT64 },
+ [OCI_PROCESS_RLIMIT_HARD] = { "hard", BLOBMSG_CAST_INT64 },
+};
+
/* from manpage GETRLIMIT(2) */
static const char* const rlimit_names[RLIM_NLIMITS] = {
[RLIMIT_AS] = "AS",
}
-static int parseOCIrlimits(struct blob_attr *msg)
+/*
+ * Parse a single rlimit object ("type", "soft", "hard") and store it in
+ * opts.rlimits[].
+ * Returns 0 on success, ENODATA if one of the three fields is missing,
+ * EINVAL if resolve_rlimit() does not know the type, ENOTUNIQ if a limit of
+ * that type has already been set and ENOMEM if allocation fails.
+ */
+static int parseOCIrlimit(struct blob_attr *msg)
{
- struct blob_attr *cur, *cure;
- int rem, reme;
+	struct blob_attr *tb[__OCI_PROCESS_RLIMIT_MAX];
int limtype = -1;
struct rlimit *curlim;
- rlim_t soft, hard;
- bool sethard = false, setsoft = false;
- blobmsg_for_each_attr(cur, msg, rem) {
- blobmsg_for_each_attr(cure, cur, reme) {
- if (!strcmp(blobmsg_name(cure), "type") && (blobmsg_type(cure) == BLOBMSG_TYPE_STRING)) {
- limtype = resolve_rlimit(blobmsg_get_string(cure));
- } else if (!strcmp(blobmsg_name(cure), "soft")) {
- switch (blobmsg_type(cure)) {
- case BLOBMSG_TYPE_INT32:
- soft = blobmsg_get_u32(cure);
- break;
- case BLOBMSG_TYPE_INT64:
- soft = blobmsg_get_u64(cure);
- break;
- default:
- return EINVAL;
- }
- setsoft = true;
- } else if (!strcmp(blobmsg_name(cure), "hard")) {
- switch (blobmsg_type(cure)) {
- case BLOBMSG_TYPE_INT32:
- hard = blobmsg_get_u32(cure);
- break;
- case BLOBMSG_TYPE_INT64:
- hard = blobmsg_get_u64(cure);
- break;
- default:
- return EINVAL;
- }
- sethard = true;
- } else {
- return EINVAL;
- }
- }
+	blobmsg_parse(oci_process_rlimit_policy, __OCI_PROCESS_RLIMIT_MAX, tb, blobmsg_data(msg), blobmsg_len(msg));
- if (limtype < 0)
- return EINVAL;
+	if (!tb[OCI_PROCESS_RLIMIT_TYPE] ||
+	    !tb[OCI_PROCESS_RLIMIT_SOFT] ||
+	    !tb[OCI_PROCESS_RLIMIT_HARD])
+		return ENODATA;
- if (opts.rlimits[limtype])
- return ENOTUNIQ;
+	limtype = resolve_rlimit(blobmsg_get_string(tb[OCI_PROCESS_RLIMIT_TYPE]));
- if (!sethard || !setsoft)
- return ENODATA;
+	if (limtype < 0)
+		return EINVAL;
- curlim = malloc(sizeof(struct rlimit));
- curlim->rlim_cur = soft;
- curlim->rlim_max = hard;
+	if (opts.rlimits[limtype])
+		return ENOTUNIQ;
- opts.rlimits[limtype] = curlim;
- }
+	curlim = malloc(sizeof(*curlim));
+	/* don't write through a NULL pointer when out of memory */
+	if (!curlim)
+		return ENOMEM;
+
+	curlim->rlim_cur = blobmsg_cast_u64(tb[OCI_PROCESS_RLIMIT_SOFT]);
+	curlim->rlim_max = blobmsg_cast_u64(tb[OCI_PROCESS_RLIMIT_HARD]);
+
+	opts.rlimits[limtype] = curlim;
return 0;
};
static int parseOCIprocess(struct blob_attr *msg)
{
- struct blob_attr *tb[__OCI_PROCESS_MAX];
- int res;
+ struct blob_attr *tb[__OCI_PROCESS_MAX], *cur;
+ int rem, res;
blobmsg_parse(oci_process_policy, __OCI_PROCESS_MAX, tb, blobmsg_data(msg), blobmsg_len(msg));
if (res)
return res;
- opts.console = blobmsg_get_bool(tb[OCI_PROCESS_TERMINAL]);
- opts.no_new_privs = blobmsg_get_bool(tb[OCI_PROCESS_NONEWPRIVILEGES]);
+ if (tb[OCI_PROCESS_TERMINAL])
+ opts.console = blobmsg_get_bool(tb[OCI_PROCESS_TERMINAL]);
+
+ if (tb[OCI_PROCESS_NONEWPRIVILEGES])
+ opts.no_new_privs = blobmsg_get_bool(tb[OCI_PROCESS_NONEWPRIVILEGES]);
if (tb[OCI_PROCESS_CWD])
opts.cwd = strdup(blobmsg_get_string(tb[OCI_PROCESS_CWD]));
(res = parseOCIcapabilities(&opts.capset, tb[OCI_PROCESS_CAPABILITIES])))
return res;
- if (tb[OCI_PROCESS_RLIMITS] &&
- (res = parseOCIrlimits(tb[OCI_PROCESS_RLIMITS])))
- return res;
+ if (tb[OCI_PROCESS_RLIMITS]) {
+ blobmsg_for_each_attr(cur, tb[OCI_PROCESS_RLIMITS], rem) {
+ res = parseOCIrlimit(cur);
+ if (res)
+ return res;
+ }
+ }
if (tb[OCI_PROCESS_OOMSCOREADJ]) {
opts.oom_score_adj = blobmsg_get_u32(tb[OCI_PROCESS_OOMSCOREADJ]);
blobmsg_get_string(tb[OCI_LINUX_NAMESPACE_PATH]));
fd = open(blobmsg_get_string(tb[OCI_LINUX_NAMESPACE_PATH]), O_RDONLY);
- if (fd == -1)
+ if (fd < 0)
return errno?:ESTALE;
- if (ioctl(fd, NS_GET_NSTYPE) != nstype)
+ if (ioctl(fd, NS_GET_NSTYPE) != nstype) {
+ close(fd);
return EINVAL;
+ }
DEBUG("opened existing %s namespace got filehandler %u\n",
blobmsg_get_string(tb[OCI_LINUX_NAMESPACE_TYPE]),
}
return 0;
-};
+}
+/*
+ * Remember in opts.root_map_uid the host id of a uid mapping which starts
+ * at container id 0 and covers at least one id.
+ * Gid mappings and all other uid mappings leave opts untouched.
+ */
+static void get_jail_root_user(bool is_gidmap, uint32_t container_id, uint32_t host_id, uint32_t size)
+{
+	if (is_gidmap)
+		return;
+
+	if (container_id != 0 || size < 1)
+		return;
+
+	opts.root_map_uid = host_id;
+}
enum {
OCI_LINUX_UIDGIDMAP_CONTAINERID,
static int parseOCIuidgidmappings(struct blob_attr *msg, bool is_gidmap)
{
- const char *map_format = "%d %d %d\n";
struct blob_attr *tb[__OCI_LINUX_UIDGIDMAP_MAX];
struct blob_attr *cur;
- int rem, len;
- char **mappings;
- char *map, *curstr;
- unsigned int cnt = 0;
- size_t totallen = 0;
-
- /* count number of mappings */
- blobmsg_for_each_attr(cur, msg, rem)
- cnt++;
-
- if (!cnt)
- return 0;
-
- /* allocate array for mappings */
- mappings = calloc(1 + cnt, sizeof(char*));
- if (!mappings)
- return ENOMEM;
-
- mappings[cnt] = NULL;
+ int rem;
+ char *map;
+ size_t len, pos, totallen = 0;
- cnt = 0;
blobmsg_for_each_attr(cur, msg, rem) {
blobmsg_parse(oci_linux_uidgidmap_policy, __OCI_LINUX_UIDGIDMAP_MAX, tb, blobmsg_data(cur), blobmsg_len(cur));
!tb[OCI_LINUX_UIDGIDMAP_SIZE])
return EINVAL;
- /* write mapping line into allocated string */
- len = asprintf(&mappings[cnt++], map_format,
+ /* count length */
+ totallen += snprintf(NULL, 0, "%d %d %d\n",
blobmsg_get_u32(tb[OCI_LINUX_UIDGIDMAP_CONTAINERID]),
blobmsg_get_u32(tb[OCI_LINUX_UIDGIDMAP_HOSTID]),
blobmsg_get_u32(tb[OCI_LINUX_UIDGIDMAP_SIZE]));
-
- if (len < 0)
- return ENOMEM;
-
- totallen += len;
}
/* allocate combined mapping string */
- map = calloc(1 + totallen, sizeof(char));
+ map = malloc(totallen + 1);
if (!map)
return ENOMEM;
- map[0] = '\0';
+ pos = 0;
+ blobmsg_for_each_attr(cur, msg, rem) {
+ blobmsg_parse(oci_linux_uidgidmap_policy, __OCI_LINUX_UIDGIDMAP_MAX, tb, blobmsg_data(cur), blobmsg_len(cur));
+
+ get_jail_root_user(is_gidmap, blobmsg_get_u32(tb[OCI_LINUX_UIDGIDMAP_CONTAINERID]),
+ blobmsg_get_u32(tb[OCI_LINUX_UIDGIDMAP_HOSTID]),
+ blobmsg_get_u32(tb[OCI_LINUX_UIDGIDMAP_SIZE]));
- /* concatenate mapping strings into combined string */
- curstr = mappings[0];
- while (curstr) {
- strcat(map, curstr);
- free(curstr++);
+ /* write mapping line into pre-allocated string */
+ len = snprintf(&map[pos], totallen + 1, "%d %d %d\n",
+ blobmsg_get_u32(tb[OCI_LINUX_UIDGIDMAP_CONTAINERID]),
+ blobmsg_get_u32(tb[OCI_LINUX_UIDGIDMAP_HOSTID]),
+ blobmsg_get_u32(tb[OCI_LINUX_UIDGIDMAP_SIZE]));
+ pos += len;
+ totallen -= len;
}
- free(mappings);
+
+ assert(totallen == 0);
if (is_gidmap)
opts.gidmap = map;
return ENOMEM;
tmp->mode = resolve_devtype(blobmsg_get_string(tb[OCI_DEVICES_TYPE]));
- if (!tmp->mode)
+ if (!tmp->mode) {
+ free(tmp);
return EINVAL;
+ }
if (tmp->mode != S_IFIFO) {
- if (!tb[OCI_DEVICES_MAJOR] || !tb[OCI_DEVICES_MINOR])
+ if (!tb[OCI_DEVICES_MAJOR] || !tb[OCI_DEVICES_MINOR]) {
+ free(tmp);
return ENODATA;
+ }
tmp->dev = makedev(blobmsg_get_u32(tb[OCI_DEVICES_MAJOR]),
blobmsg_get_u32(tb[OCI_DEVICES_MINOR]));
}
if (tb[OCI_DEVICES_FILEMODE]) {
- if (~(S_IRWXU|S_IRWXG|S_IRWXO) & blobmsg_get_u32(tb[OCI_DEVICES_FILEMODE]))
+ if (~(S_IRWXU|S_IRWXG|S_IRWXO) & blobmsg_get_u32(tb[OCI_DEVICES_FILEMODE])) {
+ free(tmp);
return EINVAL;
+ }
tmp->mode |= blobmsg_get_u32(tb[OCI_DEVICES_FILEMODE]);
} else {
return 0;
}
-enum {
- OCI_LINUX_RESOURCES,
- OCI_LINUX_SECCOMP,
- OCI_LINUX_SYSCTL,
- OCI_LINUX_NAMESPACES,
- OCI_LINUX_DEVICES,
- OCI_LINUX_UIDMAPPINGS,
- OCI_LINUX_GIDMAPPINGS,
- OCI_LINUX_MASKEDPATHS,
- OCI_LINUX_READONLYPATHS,
- OCI_LINUX_ROOTFSPROPAGATION,
- __OCI_LINUX_MAX,
-};
-
-static const struct blobmsg_policy oci_linux_policy[] = {
- [OCI_LINUX_RESOURCES] = { "resources", BLOBMSG_TYPE_TABLE },
- [OCI_LINUX_SECCOMP] = { "seccomp", BLOBMSG_TYPE_TABLE },
- [OCI_LINUX_SYSCTL] = { "sysctl", BLOBMSG_TYPE_TABLE },
- [OCI_LINUX_NAMESPACES] = { "namespaces", BLOBMSG_TYPE_ARRAY },
- [OCI_LINUX_DEVICES] = { "devices", BLOBMSG_TYPE_ARRAY },
- [OCI_LINUX_UIDMAPPINGS] = { "uidMappings", BLOBMSG_TYPE_ARRAY },
- [OCI_LINUX_GIDMAPPINGS] = { "gidMappings", BLOBMSG_TYPE_ARRAY },
- [OCI_LINUX_MASKEDPATHS] = { "maskedPaths", BLOBMSG_TYPE_ARRAY },
- [OCI_LINUX_READONLYPATHS] = { "readonlyPaths", BLOBMSG_TYPE_ARRAY },
- [OCI_LINUX_ROOTFSPROPAGATION] = { "rootfsPropagation", BLOBMSG_TYPE_STRING },
-};
-
static int parseOCIsysctl(struct blob_attr *msg)
{
struct blob_attr *cur;
return 0;
}
+
+/* indices into the attribute table which parseOCIlinux() fills in */
+enum {
+	OCI_LINUX_CGROUPSPATH = 0,
+	OCI_LINUX_RESOURCES,
+	OCI_LINUX_SECCOMP,
+	OCI_LINUX_SYSCTL,
+	OCI_LINUX_NAMESPACES,
+	OCI_LINUX_DEVICES,
+	OCI_LINUX_UIDMAPPINGS,
+	OCI_LINUX_GIDMAPPINGS,
+	OCI_LINUX_MASKEDPATHS,
+	OCI_LINUX_READONLYPATHS,
+	OCI_LINUX_ROOTFSPROPAGATION,
+	__OCI_LINUX_MAX,
+};
+
+/* member name and expected blobmsg type for each of the indices above */
+static const struct blobmsg_policy oci_linux_policy[__OCI_LINUX_MAX] = {
+	[OCI_LINUX_CGROUPSPATH] = { .name = "cgroupsPath", .type = BLOBMSG_TYPE_STRING },
+	[OCI_LINUX_RESOURCES] = { .name = "resources", .type = BLOBMSG_TYPE_TABLE },
+	[OCI_LINUX_SECCOMP] = { .name = "seccomp", .type = BLOBMSG_TYPE_TABLE },
+	[OCI_LINUX_SYSCTL] = { .name = "sysctl", .type = BLOBMSG_TYPE_TABLE },
+	[OCI_LINUX_NAMESPACES] = { .name = "namespaces", .type = BLOBMSG_TYPE_ARRAY },
+	[OCI_LINUX_DEVICES] = { .name = "devices", .type = BLOBMSG_TYPE_ARRAY },
+	[OCI_LINUX_UIDMAPPINGS] = { .name = "uidMappings", .type = BLOBMSG_TYPE_ARRAY },
+	[OCI_LINUX_GIDMAPPINGS] = { .name = "gidMappings", .type = BLOBMSG_TYPE_ARRAY },
+	[OCI_LINUX_MASKEDPATHS] = { .name = "maskedPaths", .type = BLOBMSG_TYPE_ARRAY },
+	[OCI_LINUX_READONLYPATHS] = { .name = "readonlyPaths", .type = BLOBMSG_TYPE_ARRAY },
+	[OCI_LINUX_ROOTFSPROPAGATION] = { .name = "rootfsPropagation", .type = BLOBMSG_TYPE_STRING },
+};
+
+
static int parseOCIlinux(struct blob_attr *msg)
{
struct blob_attr *tb[__OCI_LINUX_MAX];
struct blob_attr *cur;
int rem;
int res = 0;
+ char *cgpath;
+ char cgfullpath[256] = "/sys/fs/cgroup";
blobmsg_parse(oci_linux_policy, __OCI_LINUX_MAX, tb, blobmsg_data(msg), blobmsg_len(msg));
if (tb[OCI_LINUX_READONLYPATHS]) {
blobmsg_for_each_attr(cur, tb[OCI_LINUX_READONLYPATHS], rem) {
- res = add_mount(NULL, blobmsg_get_string(cur), NULL, MS_BIND | MS_REC | MS_RDONLY, NULL, 0);
+ res = add_mount(NULL, blobmsg_get_string(cur), NULL, MS_BIND | MS_REC | MS_RDONLY, 0, NULL, 0);
if (res)
return res;
}
if (tb[OCI_LINUX_MASKEDPATHS]) {
blobmsg_for_each_attr(cur, tb[OCI_LINUX_MASKEDPATHS], rem) {
- res = add_mount((void *)(-1), blobmsg_get_string(cur), NULL, 0, NULL, 1);
+ res = add_mount((void *)(-1), blobmsg_get_string(cur), NULL, 0, 0, NULL, 0);
if (res)
return res;
}
return res;
}
+ if (tb[OCI_LINUX_CGROUPSPATH]) {
+ cgpath = blobmsg_get_string(tb[OCI_LINUX_CGROUPSPATH]);
+ if (cgpath[0] == '/') {
+ if (strlen(cgpath) >= (sizeof(cgfullpath) - strlen(cgfullpath)))
+ return E2BIG;
+
+ strcat(cgfullpath, cgpath);
+ } else {
+ strcat(cgfullpath, "/containers/");
+ strcat(cgfullpath, opts.name); /* should be container name rather than jail name */
+ strcat(cgfullpath, "/");
+ if (strlen(cgpath) >= (sizeof(cgfullpath) - strlen(cgfullpath)))
+ return E2BIG;
+
+ strcat(cgfullpath, cgpath);
+ }
+ } else {
+ strcat(cgfullpath, "/containers/");
+ strcat(cgfullpath, opts.name); /* should be container name rather than jail name */
+ strcat(cgfullpath, "/");
+ strcat(cgfullpath, opts.name); /* should be container instance name rather than jail name */
+ }
+
+ cgroups_init(cgfullpath);
+
+ if (tb[OCI_LINUX_RESOURCES]) {
+ res = parseOCIlinuxcgroups(tb[OCI_LINUX_RESOURCES]);
+ if (res)
+ return res;
+ }
+
return 0;
}
OCI_MOUNTS,
OCI_HOOKS,
OCI_LINUX,
+ OCI_ANNOTATIONS,
__OCI_MAX,
};
[OCI_MOUNTS] = { "mounts", BLOBMSG_TYPE_ARRAY },
[OCI_HOOKS] = { "hooks", BLOBMSG_TYPE_TABLE },
[OCI_LINUX] = { "linux", BLOBMSG_TYPE_TABLE },
+ [OCI_ANNOTATIONS] = { "annotations", BLOBMSG_TYPE_TABLE },
};
static int parseOCI(const char *jsonfile)
int res;
blob_buf_init(&ocibuf, 0);
- if (!blobmsg_add_json_from_file(&ocibuf, jsonfile))
- return ENOENT;
+
+ if (!blobmsg_add_json_from_file(&ocibuf, jsonfile)) {
+ res=ENOENT;
+ goto errout;
+ }
blobmsg_parse(oci_policy, __OCI_MAX, tb, blob_data(ocibuf.head), blob_len(ocibuf.head));
- if (!tb[OCI_VERSION])
- return ENOMSG;
+ if (!tb[OCI_VERSION]) {
+ res=ENOMSG;
+ goto errout;
+ }
if (strncmp("1.0", blobmsg_get_string(tb[OCI_VERSION]), 3)) {
ERROR("unsupported ociVersion %s\n", blobmsg_get_string(tb[OCI_VERSION]));
- return ENOTSUP;
+ res=ENOTSUP;
+ goto errout;
}
if (tb[OCI_HOSTNAME])
opts.hostname = strdup(blobmsg_get_string(tb[OCI_HOSTNAME]));
- if (!tb[OCI_PROCESS])
- return ENODATA;
+ if (!tb[OCI_PROCESS]) {
+ res=ENODATA;
+ goto errout;
+ }
if ((res = parseOCIprocess(tb[OCI_PROCESS])))
- return res;
-
- if (!tb[OCI_ROOT])
- return ENODATA;
+ goto errout;
+ if (!tb[OCI_ROOT]) {
+ res=ENODATA;
+ goto errout;
+ }
if ((res = parseOCIroot(jsonfile, tb[OCI_ROOT])))
- return res;
+ goto errout;
- if (!tb[OCI_MOUNTS])
- return ENODATA;
+ if (!tb[OCI_MOUNTS]) {
+ res=ENODATA;
+ goto errout;
+ }
blobmsg_for_each_attr(cur, tb[OCI_MOUNTS], rem)
if ((res = parseOCImount(cur)))
- return res;
+ goto errout;
if (tb[OCI_LINUX] && (res = parseOCIlinux(tb[OCI_LINUX])))
- return res;
+ goto errout;
if (tb[OCI_HOOKS] && (res = parseOCIhooks(tb[OCI_HOOKS])))
- return res;
+ goto errout;
+
+ if (tb[OCI_ANNOTATIONS])
+ opts.annotations = blob_memdup(tb[OCI_ANNOTATIONS]);
+errout:
blob_buf_free(&ocibuf);
- return 0;
+ return res;
}
static int set_oom_score_adj(void)
snprintf(fname, sizeof(fname), "/proc/%u/oom_score_adj", jail_process.pid);
f = open(fname, O_WRONLY | O_TRUNC);
- if (f == -1)
+ if (f < 0)
return errno;
dprintf(f, "%d", opts.oom_score_adj);
}
+/* container lifecycle states as reported by the 'state' ubus method */
+enum {
+	OCI_STATE_CREATING,
+	OCI_STATE_CREATED,
+	OCI_STATE_RUNNING,
+	OCI_STATE_STOPPED,
+};
+
+/*
+ * Begin in 'creating': post_create_runtime() moves the state to 'created'
+ * and pipe_send_start_container() moves it to 'running'. Starting out as
+ * 'created' would let handle_start() accept a request before the jail
+ * has been set up and would leave OCI_STATE_CREATING unused.
+ */
+static int jail_oci_state = OCI_STATE_CREATING;
+static void pipe_send_start_container(struct uloop_timeout *t);
+/* armed by handle_start(); its callback is pipe_send_start_container() */
+static struct uloop_timeout start_container_timeout = {
+	.cb = pipe_send_start_container,
+};
+
+/*
+ * ubus 'start' method of the container object.
+ *
+ * Arms start_container_timeout when the container is in state 'created'
+ * and returns UBUS_STATUS_OK; in every other state nothing is scheduled
+ * and UBUS_STATUS_INVALID_ARGUMENT is returned.
+ * None of the ubus handler arguments are looked at.
+ */
+static int handle_start(struct ubus_context *ctx, struct ubus_object *obj,
+			struct ubus_request_data *req, const char *method,
+			struct blob_attr *msg)
+{
+	int status = UBUS_STATUS_INVALID_ARGUMENT;
+
+	if (jail_oci_state == OCI_STATE_CREATED) {
+		uloop_timeout_add(&start_container_timeout);
+		status = UBUS_STATUS_OK;
+	}
+
+	return status;
+}
+
+/* reply buffer of handle_state(), re-initialised on every call */
+static struct blob_buf bb;
+/*
+ * ubus 'state' method of the container object.
+ *
+ * Replies with ociVersion, id, status and bundle, plus the pid while the
+ * container is 'created' or 'running' and the annotations blob if one was
+ * stored in opts.annotations. Always returns UBUS_STATUS_OK.
+ */
+static int handle_state(struct ubus_context *ctx, struct ubus_object *obj,
+			struct ubus_request_data *req, const char *method,
+			struct blob_attr *msg)
+{
+	static const char *const state_names[] = {
+		[OCI_STATE_CREATING] = "creating",
+		[OCI_STATE_CREATED] = "created",
+		[OCI_STATE_RUNNING] = "running",
+		[OCI_STATE_STOPPED] = "stopped",
+	};
+	const char *statusstr = "unknown";
+	bool report_pid;
+
+	/* anything outside of the table is reported as "unknown" */
+	if (jail_oci_state >= 0 && jail_oci_state < (int)ARRAY_SIZE(state_names))
+		statusstr = state_names[jail_oci_state];
+
+	report_pid = (jail_oci_state == OCI_STATE_CREATED ||
+		      jail_oci_state == OCI_STATE_RUNNING);
+
+	blob_buf_init(&bb, 0);
+	blobmsg_add_string(&bb, "ociVersion", OCI_VERSION_STRING);
+	blobmsg_add_string(&bb, "id", opts.name);
+	blobmsg_add_string(&bb, "status", statusstr);
+	if (report_pid)
+		blobmsg_add_u32(&bb, "pid", jail_process.pid);
+
+	blobmsg_add_string(&bb, "bundle", opts.ocibundle);
+
+	if (opts.annotations)
+		blobmsg_add_blob(&bb, opts.annotations);
+
+	ubus_send_reply(ctx, req, bb.head);
+
+	return UBUS_STATUS_OK;
+}
+
+/* indices into the attribute table parsed by container_handle_kill() */
+enum {
+	CONTAINER_KILL_ATTR_SIGNAL = 0,
+	__CONTAINER_KILL_ATTR_MAX,
+};
+
+/* arguments accepted by the 'kill' ubus method */
+static const struct blobmsg_policy container_kill_attrs[__CONTAINER_KILL_ATTR_MAX] = {
+	[CONTAINER_KILL_ATTR_SIGNAL] = { .name = "signal", .type = BLOBMSG_TYPE_INT32 },
+};
+
+/*
+ * ubus 'kill' method of the container object.
+ *
+ * Sends the signal given in the optional "signal" argument (SIGTERM when
+ * absent) to the jail process.
+ *
+ * Returns UBUS_STATUS_OK on success, UBUS_STATUS_NOT_FOUND while the
+ * container is still being created, when no jail process is known or
+ * when it no longer exists, and otherwise the ubus status matching the
+ * errno reported by kill().
+ */
+static int
+container_handle_kill(struct ubus_context *ctx, struct ubus_object *obj,
+		      struct ubus_request_data *req, const char *method,
+		      struct blob_attr *msg)
+{
+	struct blob_attr *tb[__CONTAINER_KILL_ATTR_MAX], *cur;
+	int sig = SIGTERM;
+
+	blobmsg_parse(container_kill_attrs, __CONTAINER_KILL_ATTR_MAX, tb, blobmsg_data(msg), blobmsg_data_len(msg));
+
+	cur = tb[CONTAINER_KILL_ATTR_SIGNAL];
+	if (cur)
+		sig = blobmsg_get_u32(cur);
+
+	if (jail_oci_state == OCI_STATE_CREATING)
+		return UBUS_STATUS_NOT_FOUND;
+
+	/*
+	 * kill() treats pid 0 and negative pids as process groups (pid 0
+	 * being our own), so never pass on anything but a real pid.
+	 */
+	if (jail_process.pid <= 0)
+		return UBUS_STATUS_NOT_FOUND;
+
+	if (kill(jail_process.pid, sig) == 0)
+		return UBUS_STATUS_OK;
+
+	switch (errno) {
+	case EINVAL:
+		return UBUS_STATUS_INVALID_ARGUMENT;
+	case EPERM:
+		return UBUS_STATUS_PERMISSION_DENIED;
+	case ESRCH:
+		return UBUS_STATUS_NOT_FOUND;
+	default:
+		return UBUS_STATUS_UNKNOWN_ERROR;
+	}
+}
+
+/*
+ * Write pid, followed by a newline, to the file named by opts.pidfile.
+ *
+ * Returns 0 on success or when no pidfile was requested, otherwise the
+ * errno value of the call which failed. errno is left at that value so
+ * callers can report the failure using %m.
+ */
+static int
+jail_writepid(pid_t pid)
+{
+	FILE *f;
+	int err;
+
+	if (!opts.pidfile)
+		return 0;
+
+	f = fopen(opts.pidfile, "w");
+	if (f == NULL)
+		return errno;
+
+	if (fprintf(f, "%d\n", (int)pid) < 0) {
+		/* fclose() may overwrite errno, keep the reason fprintf() gave */
+		err = errno ? errno : EIO;
+		fclose(f);
+		errno = err;
+		return err;
+	}
+
+	if (fclose(f))
+		return errno;
+
+	return 0;
+}
+
+/*
+ * Check that path can be opened as a directory.
+ *
+ * Returns 0 if it can, -1 (after logging the reason) if it cannot.
+ */
+static int checkpath(const char *path)
+{
+	int fd;
+
+	fd = open(path, O_RDONLY | O_DIRECTORY | O_CLOEXEC);
+	if (fd >= 0) {
+		/* only the open itself is of interest */
+		close(fd);
+		return 0;
+	}
+
+	ERROR("path %s open failed %m\n", path);
+	return -1;
+}
+
+/* methods offered by the per-container ubus object */
+static struct ubus_method container_methods[] = {
+	UBUS_METHOD_NOARG("start", handle_start),
+	UBUS_METHOD_NOARG("state", handle_state),
+	UBUS_METHOD("kill", container_handle_kill, container_kill_attrs),
+};
+
+static struct ubus_object_type container_object_type =
+	UBUS_OBJECT_TYPE("container", container_methods);
+
+/* the object name is not set here, main() assigns it before registering */
+static struct ubus_object container_object = {
+	.methods = container_methods,
+	.n_methods = ARRAY_SIZE(container_methods),
+	.type = &container_object_type,
+};
+
+/* forward declarations of stages defined after main() */
+static void post_main(struct uloop_timeout *t);
+static void post_create_runtime(void);
+
+/* timeout which main() adds to the uloop so that post_main() gets called */
+static struct uloop_timeout post_main_timeout = {
+	.cb = post_main,
+};
+
+/*
+ * Namespace file descriptors assigned in post_main(); netns_fd is used
+ * again by poststop().
+ */
+static int netns_fd;
+static int pidns_fd;
+#ifdef CLONE_NEWTIME
+static int timens_fd;
+#endif
int main(int argc, char **argv)
{
- sigset_t sigmask;
uid_t uid = getuid();
const char log[] = "/dev/log";
- const char ubus[] = "/var/run/ubus.sock";
- char *jsonfile = NULL;
- int i, ch;
- int pipes[4];
- char sig_buf[1];
- int netns_fd;
- int pidns_fd;
+ const char ubus[] = "/var/run/ubus/ubus.sock";
+ int ret = EXIT_FAILURE;
+ int ch;
if (uid) {
ERROR("not root, aborting: %m\n");
umask(022);
mount_list_init();
init_library_search();
+ cgroups_prepare();
+ exit_from_child = false;
while ((ch = getopt(argc, argv, OPT_ARGS)) != -1) {
switch (ch) {
opts.namespace |= CLONE_NEWCGROUP;
break;
case 'R':
- opts.extroot = strdup(optarg);
+ opts.extroot = realpath(optarg, NULL);
break;
case 's':
opts.namespace |= CLONE_NEWNS;
opts.group = optarg;
break;
case 'O':
- opts.overlaydir = optarg;
+ opts.overlaydir = realpath(optarg, NULL);
break;
case 'T':
opts.tmpoverlaysize = optarg;
opts.console = 1;
break;
case 'J':
- asprintf(&jsonfile, "%s/config.json", optarg);
+ opts.ocibundle = optarg;
+ break;
+ case 'i':
+ opts.immediately = true;
+ break;
+ case 'P':
+ opts.pidfile = optarg;
break;
}
}
- if (opts.namespace && !jsonfile)
+ if (opts.namespace && !opts.ocibundle)
opts.namespace |= CLONE_NEWIPC | CLONE_NEWPID;
/* those are filehandlers, so -1 indicates unused */
opts.setns.time = -1;
#endif
- if (jsonfile) {
+ /*
+ * uid in parent user namespace representing root user in new
+ * user namespace, defaults to nobody unless specified in uidMappings
+ */
+ opts.root_map_uid = 65534;
+
+ if (opts.capabilities && parseOCIcapabilities_from_file(&opts.capset, opts.capabilities)) {
+ ERROR("failed to read capabilities from file %s\n", opts.capabilities);
+ ret=-1;
+ goto errout;
+ }
+
+ if (opts.ocibundle) {
+ char *jsonfile;
int ocires;
+
+ if (!opts.name) {
+ ERROR("OCI bundle needs a named jail\n");
+ ret=-1;
+ goto errout;
+ }
+ if (asprintf(&jsonfile, "%s/config.json", opts.ocibundle) < 0) {
+ ret=-ENOMEM;
+ goto errout;
+ }
ocires = parseOCI(jsonfile);
free(jsonfile);
if (ocires) {
ERROR("parsing of OCI JSON spec has failed: %s (%d)\n", strerror(ocires), ocires);
- return ocires;
+ ret=ocires;
+ goto errout;
+ }
+ }
+
+ if (opts.namespace & CLONE_NEWNET) {
+ if (!opts.name) {
+ ERROR("netns needs a named jail\n");
+ ret=-1;
+ goto errout;
}
}
+
if (opts.tmpoverlaysize && strlen(opts.tmpoverlaysize) > 8) {
ERROR("size parameter too long: \"%s\"\n", opts.tmpoverlaysize);
- return -1;
+ ret=-1;
+ goto errout;
+ }
+
+ if (opts.extroot && checkpath(opts.extroot)) {
+ ERROR("invalid rootfs path '%s'", opts.extroot);
+ ret=-1;
+ goto errout;
+ }
+
+ if (opts.overlaydir && checkpath(opts.overlaydir)) {
+ ERROR("invalid rootfs overlay path '%s'", opts.overlaydir);
+ ret=-1;
+ goto errout;
}
/* no <binary> param found */
- if (!jsonfile && (argc - optind < 1)) {
+ if (!opts.ocibundle && (argc - optind < 1)) {
usage();
- return EXIT_FAILURE;
+ ret=EXIT_FAILURE;
+ goto errout;
}
- if (!(jsonfile||opts.namespace||opts.capabilities||opts.seccomp)) {
+ if (!(opts.ocibundle||opts.namespace||opts.capabilities||opts.seccomp)) {
ERROR("Not using namespaces, capabilities or seccomp !!!\n\n");
usage();
- return EXIT_FAILURE;
+ ret=EXIT_FAILURE;
+ goto errout;
}
DEBUG("Using namespaces(0x%08x), capabilities(%d), seccomp(%d)\n",
opts.namespace,
- opts.capabilities != 0 || opts.capset.apply,
+ opts.capset.apply,
opts.seccomp != 0 || opts.ociseccomp != 0);
- if (!jsonfile) {
+ uloop_init();
+ signals_init();
+
+ parent_ctx = ubus_connect(NULL);
+ ubus_add_uloop(parent_ctx);
+
+ if (opts.ocibundle) {
+ char *objname;
+ if (asprintf(&objname, "container.%s", opts.name) < 0) {
+ ret=-ENOMEM;
+ goto errout;
+ }
+
+ container_object.name = objname;
+ ret = ubus_add_object(parent_ctx, &container_object);
+ if (ret) {
+ ERROR("Failed to add object: %s\n", ubus_strerror(ret));
+ ret=-1;
+ goto errout;
+ }
+ }
+
+ /* deliberately not using 'else' on unrelated conditional branches */
+ if (!opts.ocibundle) {
/* allocate NULL-terminated array for argv */
opts.jail_argv = calloc(1 + argc - optind, sizeof(char**));
- if (!opts.jail_argv)
- return EXIT_FAILURE;
-
+ if (!opts.jail_argv) {
+ ret=EXIT_FAILURE;
+ goto errout;
+ }
for (size_t s = optind; s < argc; s++)
opts.jail_argv[s - optind] = strdup(argv[s]);
if (!opts.extroot) {
if (opts.namespace && add_path_and_deps(*opts.jail_argv, 1, -1, 0)) {
ERROR("failed to load dependencies\n");
- return -1;
+ ret=-1;
+ goto errout;
}
}
if (opts.namespace && opts.seccomp && add_path_and_deps("libpreload-seccomp.so", 1, -1, 1)) {
ERROR("failed to load libpreload-seccomp.so\n");
opts.seccomp = 0;
- if (opts.require_jail)
- return -1;
+ if (opts.require_jail) {
+ ret=-1;
+ goto errout;
+ }
}
+ uloop_timeout_add(&post_main_timeout);
+ uloop_run();
+
+errout:
+ if (opts.ocibundle)
+ cgroups_free();
+
+ free_opts(true);
+
+ return ret;
+}
+
+static void post_main(struct uloop_timeout *t)
+{
if (apply_rlimits()) {
ERROR("error applying resource limits\n");
- exit(EXIT_FAILURE);
+ free_and_exit(EXIT_FAILURE);
}
if (opts.name)
prctl(PR_SET_NAME, opts.name, NULL, NULL, NULL);
- sigfillset(&sigmask);
- for (i = 0; i < _NSIG; i++) {
- struct sigaction s = { 0 };
-
- if (!sigismember(&sigmask, i))
- continue;
- if ((i == SIGCHLD) || (i == SIGPIPE) || (i == SIGSEGV))
- continue;
-
- s.sa_handler = jail_handle_signal;
- sigaction(i, &s, NULL);
- }
-
if (pipe(&pipes[0]) < 0 || pipe(&pipes[2]) < 0)
- return -1;
+ free_and_exit(-1);
if (has_namespaces()) {
if (opts.namespace & CLONE_NEWNS) {
if (!opts.extroot && (opts.user || opts.group)) {
- add_mount_bind("/etc/passwd", 0, -1);
- add_mount_bind("/etc/group", 0, -1);
+ add_mount_bind("/etc/passwd", 1, -1);
+ add_mount_bind("/etc/group", 1, -1);
}
#if defined(__GLIBC__)
if (!opts.extroot)
- add_mount_bind("/etc/nsswitch.conf", 0, -1);
+ add_mount_bind("/etc/nsswitch.conf", 1, -1);
#endif
if (!(opts.namespace & CLONE_NEWNET)) {
- add_mount_bind("/etc/resolv.conf", 0, -1);
- } else if (opts.setns.net == -1) {
+ add_mount_bind("/etc/resolv.conf", 1, 0);
+ } else if (opts.setns.ns == -1) {
+ /* new mount namespace to provide /dev/resolv.conf.d */
char hostdir[PATH_MAX];
snprintf(hostdir, PATH_MAX, "/tmp/resolv.conf-%s.d", opts.name);
mkdir_p(hostdir, 0755);
- add_mount(hostdir, "/dev/resolv.conf.d", NULL, MS_BIND | MS_NOEXEC | MS_NOATIME | MS_NOSUID | MS_NODEV | MS_RDONLY, NULL, -1);
+ add_mount(hostdir, "/dev/resolv.conf.d", NULL, MS_BIND | MS_NOEXEC | MS_NOATIME | MS_NOSUID | MS_NODEV | MS_RDONLY, 0, NULL, 0);
}
/* default mounts */
- add_mount(NULL, "/dev", "tmpfs", MS_NOATIME | MS_NOEXEC | MS_NOSUID, "size=1M", -1);
- add_mount(NULL, "/dev/pts", "devpts", MS_NOATIME | MS_NOEXEC | MS_NOSUID, "newinstance,ptmxmode=0666,mode=0620,gid=5", 0);
+ add_mount(NULL, "/dev", "tmpfs", MS_NOATIME | MS_NOEXEC | MS_NOSUID, 0, "size=1M", -1);
+ add_mount(NULL, "/dev/pts", "devpts", MS_NOATIME | MS_NOEXEC | MS_NOSUID, 0, "newinstance,ptmxmode=0666,mode=0620,gid=5", 0);
- if (opts.procfs || jsonfile) {
- add_mount("proc", "/proc", "proc", MS_NOATIME | MS_NODEV | MS_NOEXEC | MS_NOSUID, NULL, -1);
+ if (opts.procfs || opts.ocibundle) {
+ add_mount("proc", "/proc", "proc", MS_NOATIME | MS_NODEV | MS_NOEXEC | MS_NOSUID, 0, NULL, -1);
/*
* hack to make /proc/sys/net read-write while the rest of /proc/sys is read-only
* move-mount of /proc/sys/net follows because 'e' preceeds 'y' in the ASCII
* table (and in the alphabet).
*/
- if (!add_mount(NULL, "/proc/sys", NULL, MS_BIND | MS_RDONLY, NULL, -1))
+ if (!add_mount(NULL, "/proc/sys", NULL, MS_BIND | MS_RDONLY, 0, NULL, -1))
if (opts.namespace & CLONE_NEWNET)
- if (!add_mount_inner("/proc/self/net", "/proc/sys/net", NULL, MS_MOVE, NULL, -1))
- add_mount_inner("/proc/sys/net", "/proc/self/net", NULL, MS_BIND, NULL, -1);
+ if (!add_mount_inner("/proc/self/net", "/proc/sys/net", NULL, MS_MOVE, 0, NULL, -1))
+ add_mount_inner("/proc/sys/net", "/proc/self/net", NULL, MS_BIND, 0, NULL, -1);
}
- if (opts.sysfs || jsonfile)
- add_mount("sysfs", "/sys", "sysfs", MS_NOATIME | MS_NODEV | MS_NOEXEC | MS_NOSUID | MS_RDONLY, NULL, -1);
+ if (opts.sysfs || opts.ocibundle)
+ add_mount("sysfs", "/sys", "sysfs", MS_RELATIME | MS_NODEV | MS_NOEXEC | MS_NOSUID | MS_RDONLY, 0, NULL, -1);
- if (jsonfile)
- add_mount("shm", "/dev/shm", "tmpfs", MS_NOSUID | MS_NOEXEC | MS_NODEV, "mode=1777", -1);
+ if (opts.ocibundle)
+ add_mount("shm", "/dev/shm", "tmpfs", MS_NOSUID | MS_NOEXEC | MS_NODEV, 0, "mode=1777", -1);
}
if (opts.setns.pid != -1) {
- pidns_fd = pidns_open_pid(getpid());
+ pidns_fd = ns_open_pid("pid", getpid());
setns_open(CLONE_NEWPID);
} else {
pidns_fd = -1;
}
- jail_process.pid = clone(exec_jail, child_stack + STACK_SIZE, SIGCHLD | opts.namespace, &pipes);
+#ifdef CLONE_NEWTIME
+ if (opts.setns.time != -1) {
+ timens_fd = ns_open_pid("time", getpid());
+ setns_open(CLONE_NEWTIME);
+ } else {
+ timens_fd = -1;
+ }
+#endif
+
+ if (opts.namespace & CLONE_NEWUSER) {
+ if (prctl(PR_SET_SECUREBITS, SECBIT_NO_SETUID_FIXUP)) {
+ ERROR("prctl(PR_SET_SECUREBITS) failed: %m\n");
+ free_and_exit(EXIT_FAILURE);
+ }
+ if (seteuid(opts.root_map_uid)) {
+ ERROR("seteuid(%d) failed: %m\n", opts.root_map_uid);
+ free_and_exit(EXIT_FAILURE);
+ }
+ }
+
+ jail_process.pid = clone(exec_jail, child_stack + STACK_SIZE, SIGCHLD | (opts.namespace & (~CLONE_NEWCGROUP)), NULL);
} else {
jail_process.pid = fork();
}
if (jail_process.pid > 0) {
/* parent process */
+ char sig_buf[1];
+
+ uloop_process_add(&jail_process);
jail_running = 1;
- seteuid(0);
+ if (seteuid(0)) {
+ ERROR("seteuid(%d) failed: %m\n", opts.root_map_uid);
+ free_and_exit(EXIT_FAILURE);
+ }
+
+ prctl(PR_SET_SECUREBITS, 0);
+
if (pidns_fd != -1) {
setns(pidns_fd, CLONE_NEWPID);
close(pidns_fd);
}
+#ifdef CLONE_NEWTIME
+ if (timens_fd != -1) {
+ setns(timens_fd, CLONE_NEWTIME);
+ close(timens_fd);
+ }
+#endif
if (opts.setns.net != -1)
close(opts.setns.net);
if (opts.setns.ns != -1)
close(opts.setns.user);
if (opts.setns.cgroup != -1)
close(opts.setns.cgroup);
-#ifdef CLONE_NEWTIME
- if (opts.setns.time != -1)
- close(opts.setns.time);
-#endif
close(pipes[1]);
close(pipes[2]);
- run_hooks(opts.hooks.createRuntime);
if (read(pipes[0], sig_buf, 1) < 1) {
ERROR("can't read from child\n");
- return -1;
+ free_and_exit(-1);
}
close(pipes[0]);
set_oom_score_adj();
+ if (opts.ocibundle)
+ cgroups_apply(jail_process.pid);
+
if (opts.namespace & CLONE_NEWUSER) {
if (write_setgroups(jail_process.pid, true)) {
ERROR("can't write setgroups\n");
- return -1;
+ free_and_exit(-1);
}
if (!opts.uidmap) {
bool has_gr = (opts.gr_gid != -1);
}
if (opts.namespace & CLONE_NEWNET) {
- if (!opts.name) {
- ERROR("netns needs a named jail\n");
- return -1;
- }
- netns_fd = netns_open_pid(jail_process.pid);
+ netns_fd = ns_open_pid("net", jail_process.pid);
netns_updown(jail_process.pid, true);
}
- sig_buf[0] = 'O';
- if (write(pipes[3], sig_buf, 1) < 0) {
- ERROR("can't write to child\n");
- return -1;
+ if (jail_writepid(jail_process.pid)) {
+ ERROR("failed to write pidfile: %m\n");
+ free_and_exit(-1);
}
- close(pipes[3]);
- run_hooks(opts.hooks.poststart);
-
- uloop_init();
- uloop_process_add(&jail_process);
- uloop_run();
- if (jail_running) {
- DEBUG("uloop interrupted, killing jail process\n");
- kill(jail_process.pid, SIGTERM);
- uloop_timeout_set(&jail_process_timeout, 1000);
- uloop_run();
- }
- uloop_done();
- if (opts.namespace & CLONE_NEWNET) {
- setns(netns_fd, CLONE_NEWNET);
- netns_updown(getpid(), false);
- close(netns_fd);
- }
- run_hooks(opts.hooks.poststop);
- free_opts(true);
- return jail_return_code;
} else if (jail_process.pid == 0) {
/* fork child process */
- return exec_jail(&pipes);
+ free_and_exit(exec_jail(NULL));
} else {
ERROR("failed to clone/fork: %m\n");
- return EXIT_FAILURE;
+ free_and_exit(EXIT_FAILURE);
+ }
+ run_hooks(opts.hooks.createRuntime, post_create_runtime);
+}
+
+static void post_poststart(void);
+/*
+ * Callback handed to run_hooks() together with the createRuntime hooks.
+ *
+ * Writes 'O' to pipes[3], marks the container as 'created' and then either
+ * starts it right away or, for an OCI bundle without the 'immediately'
+ * option, runs the uloop until the start is requested via ubus.
+ */
+static void post_create_runtime(void)
+{
+	const char ack = 'O';
+
+	if (write(pipes[3], &ack, 1) < 0) {
+		ERROR("can't write to child\n");
+		free_and_exit(-1);
+	}
+
+	jail_oci_state = OCI_STATE_CREATED;
+
+	if (!opts.ocibundle || opts.immediately) {
+		pipe_send_start_container(NULL);
+		return;
+	}
+
+	uloop_run(); /* wait for 'start' command via ubus */
+}
+
+/*
+ * Marks the container as 'running', writes '!' to pipes[3], closes that
+ * pipe end and hands the poststart hooks to run_hooks() with
+ * post_poststart() as callback.
+ *
+ * Used as callback of start_container_timeout and also called directly
+ * with t == NULL; t is not looked at.
+ */
+static void pipe_send_start_container(struct uloop_timeout *t)
+{
+	const char go = '!';
+
+	jail_oci_state = OCI_STATE_RUNNING;
+
+	if (write(pipes[3], &go, 1) < 0) {
+		ERROR("can't write to child\n");
+		free_and_exit(-1);
+	}
+
+	/* nothing else gets written to this pipe end */
+	close(pipes[3]);
+
+	run_hooks(opts.hooks.poststart, post_poststart);
+}
+
+/*
+ * Callback handed to run_hooks() together with the poststart hooks.
+ *
+ * Runs the uloop while the jail is alive. If the loop returns with the
+ * jail still marked as running, the jail process is sent SIGTERM and the
+ * loop is run once more with jail_process_timeout set to 1000.
+ * Afterwards the uloop is torn down and poststop() is called.
+ */
+static void post_poststart(void)
+{
+	/* idle here while jail is running */
+	uloop_run();
+
+	if (jail_running) {
+		DEBUG("uloop interrupted, killing jail process\n");
+		kill(jail_process.pid, SIGTERM);
+		uloop_timeout_set(&jail_process_timeout, 1000);
+		uloop_run();
+	}
+
+	uloop_done();
+	poststop();
+}
+
+static void post_poststop(void);
+/*
+ * For jails with a network namespace: enter it through netns_fd, call
+ * netns_updown() with our own pid and 'false', and close the descriptor.
+ * In all cases the poststop hooks are then handed to run_hooks() with
+ * post_poststop() as callback.
+ */
+static void poststop(void)
+{
+	const bool has_netns = !!(opts.namespace & CLONE_NEWNET);
+
+	if (has_netns) {
+		setns(netns_fd, CLONE_NEWNET);
+		netns_updown(getpid(), false);
+		close(netns_fd);
+	}
+
+	run_hooks(opts.hooks.poststop, post_poststop);
+}
+
+/*
+ * Callback handed to run_hooks() together with the poststop hooks.
+ *
+ * Releases the options, frees the ubus context if there is one and ends
+ * the process with jail_return_code. Does not return.
+ */
+static void post_poststop(void)
+{
+	free_opts(true);
+
+	if (parent_ctx != NULL)
+		ubus_free(parent_ctx);
+
+	exit(jail_return_code);
+}