trace: use standard POSIX header for basename() master
author Tony Ambardar <itugrok@yahoo.com>
Wed, 6 Mar 2024 00:27:42 +0000 (16:27 -0800)
committer Tony Ambardar <itugrok@yahoo.com>
Sat, 30 Mar 2024 21:30:41 +0000 (14:30 -0700)
The musl libc only implements POSIX basename() but provided a GNU header
kludge in <string.h>, which was removed in musl 1.2.5 [1]. Use the standard
<libgen.h> header to avoid compilation errors like:

trace/trace.c: In function 'main':
trace/trace.c:435:64: error: implicit declaration of function 'basename';
did you mean 'rename'? [-Werror=implicit-function-declaration]
  435 | if (asprintf(&json, "/tmp/%s.%u.json", basename(*argv), child) < 0)
      |                                        ^~~~~~~~
      |                                        rename
cc1: all warnings being treated as errors

Link 1: https://git.musl-libc.org/cgit/musl/log/?qt=grep&q=basename

Signed-off-by: Tony Ambardar <itugrok@yahoo.com>
60 files changed:
.github/workflows/build.yml [new file with mode: 0644]
.gitignore
CMakeLists.txt
container.h
hotplug-dispatch.c [new file with mode: 0644]
initd/early.c
initd/init.c
initd/init.h
initd/mkdev.c
initd/preinit.c
initd/zram.c [deleted file]
inittab.c
jail/capabilities.c
jail/capabilities.h
jail/cgroups-bpf.c [new file with mode: 0644]
jail/cgroups-bpf.h [new file with mode: 0644]
jail/cgroups.c [new file with mode: 0644]
jail/cgroups.h [new file with mode: 0644]
jail/elf.c
jail/elf.h
jail/fs.c
jail/fs.h
jail/jail.c
jail/jail.h
jail/log.h
jail/netifd.c [new file with mode: 0644]
jail/netifd.h [new file with mode: 0644]
jail/preload.c
jail/seccomp-bpf.h
jail/seccomp-oci.c [new file with mode: 0644]
jail/seccomp-oci.h [new file with mode: 0644]
jail/seccomp-syscalls-helpers.h [new file with mode: 0644]
jail/seccomp.c
jail/seccomp.h
log.h
make_syscall_h.sh
plug/coldplug.c
plug/hotplug.c
procd.c
procd.h
rcS.c
service/instance.c
service/instance.h
service/service.c
service/service.h
service/trigger.c
service/watch.c
state.c
system.c
sysupgrade.c
sysupgrade.h
trace/preload.c
trace/trace.c
ubus.c
upgraded/CMakeLists.txt
upgraded/upgraded.c
utils/utils.c
utils/utils.h
uxc.c [new file with mode: 0644]
watchdog.c

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
new file mode 100644 (file)
index 0000000..4d3c40d
--- /dev/null
@@ -0,0 +1,27 @@
+# Native build check, run for every push and every pull request.
+name: OpenWrt CI testing
+
+on: [ push, pull_request ]
+# NOTE(review): these CI_* variables are presumably read by the
+# gh-actions-openwrt-ci-native action used below - confirm against that action.
+env:
+  CI_ENABLE_UNIT_TESTING: 0
+  CI_TARGET_BUILD_DEPENDS: libubox
+  CI_CMAKE_EXTRA_BUILD_ARGS: -DJAIL_SUPPORT=1
+
+jobs:
+  native_testing:
+    name: Various native checks
+    runs-on: ubuntu-20.04
+
+    steps:
+      - uses: actions/checkout@v2
+
+      - uses: ynezz/gh-actions-openwrt-ci-native@v0.0.1
+
+      # Only runs when an earlier step failed; missing files are not an error.
+      - name: Upload build artifacts
+        uses: actions/upload-artifact@v2
+        if: failure()
+        with:
+          name: native-build-artifacts
+          if-no-files-found: ignore
+          path: |
+            build/scan
+            tests/cram/**/*.t.err
index 9d80a7404d7096ac61f8465a0b586f2cc83da715..eaf1ef4a9886eb07e681f84fe027e123e2891db7 100644 (file)
@@ -2,6 +2,7 @@ procd
 askfirst
 udevtrigger
 init
+upgraded/upgraded
 .*
 Makefile
 CMakeCache.txt
index 4b3eebd7c9e18d10b7489233e6f6cb536d6938dd..e734bf723ef526b3f849b12d72158178371c4fc5 100644 (file)
@@ -18,14 +18,22 @@ INSTALL(TARGETS setlbf
 )
 
 
-SET(SOURCES procd.c signal.c state.c inittab.c rcS.c ubus.c system.c sysupgrade.c
-       service/service.c service/instance.c service/validate.c service/trigger.c service/watch.c
-       utils/utils.c)
+SET(SOURCES procd.c signal.c state.c hotplug-dispatch.c inittab.c rcS.c ubus.c
+       system.c sysupgrade.c service/service.c service/instance.c
+       service/validate.c service/trigger.c service/watch.c utils/utils.c)
 IF(NOT DISABLE_INIT)
   SET(SOURCES ${SOURCES} watchdog.c plug/coldplug.c plug/hotplug.c)
 ENDIF()
 
-SET(LIBS ubox ubus json-c blobmsg_json json_script)
+FIND_LIBRARY(ubox NAMES ubox)
+FIND_LIBRARY(ubus NAMES ubus)
+FIND_LIBRARY(uci NAMES uci)
+FIND_LIBRARY(blobmsg_json NAMES blobmsg_json)
+FIND_LIBRARY(json_script NAMES json_script)
+FIND_LIBRARY(json NAMES json-c json)
+FIND_LIBRARY(udebug NAMES udebug)
+
+SET(LIBS ${ubox} ${ubus} ${json} ${blobmsg_json} ${json_script} ${udebug})
 
 IF(DEBUG)
   ADD_DEFINITIONS(-DUDEV_DEBUG -g3)
@@ -35,28 +43,32 @@ IF(EARLY_PATH)
   ADD_DEFINITIONS(-DEARLY_PATH="${EARLY_PATH}")
 ENDIF()
 
-IF(ZRAM_TMPFS)
-  ADD_DEFINITIONS(-DZRAM_TMPFS)
-  SET(SOURCES_ZRAM initd/zram.c)
+IF(SELINUX)
+  include(FindPkgConfig)
+  pkg_search_module(SELINUX REQUIRED libselinux)
+  add_compile_definitions(WITH_SELINUX)
 ENDIF()
 
 add_subdirectory(upgraded)
 
 ADD_EXECUTABLE(procd ${SOURCES})
 TARGET_LINK_LIBRARIES(procd ${LIBS})
+SET_TARGET_PROPERTIES(procd PROPERTIES COMPILE_DEFINITIONS "HAS_UDEBUG")
 INSTALL(TARGETS procd
        RUNTIME DESTINATION ${CMAKE_INSTALL_SBINDIR}
 )
 
 FIND_PATH(ubox_include_dir libubox/uloop.h)
-INCLUDE_DIRECTORIES(${ubox_include_dir})
+FIND_PATH(udebug_include_dir NAMES udebug.h)
+INCLUDE_DIRECTORIES(${ubox_include_dir} ${udebug_include_dir})
 
 IF(DISABLE_INIT)
 ADD_DEFINITIONS(-DDISABLE_INIT)
 ELSE()
 ADD_EXECUTABLE(init initd/init.c initd/early.c initd/preinit.c initd/mkdev.c sysupgrade.c watchdog.c
-       utils/utils.c ${SOURCES_ZRAM})
-TARGET_LINK_LIBRARIES(init ${LIBS})
+       utils/utils.c)
+TARGET_INCLUDE_DIRECTORIES(init PUBLIC ${SELINUX_INCLUDE_DIRS})
+TARGET_LINK_LIBRARIES(init ${LIBS} ${SELINUX_LIBRARIES})
 INSTALL(TARGETS init
        RUNTIME DESTINATION ${CMAKE_INSTALL_SBINDIR}
 )
@@ -89,26 +101,36 @@ ADD_CUSTOM_TARGET(capabilities-names-h DEPENDS capabilities-names.h)
 
 IF(SECCOMP_SUPPORT)
 ADD_DEFINITIONS(-DSECCOMP_SUPPORT)
-ADD_LIBRARY(preload-seccomp SHARED jail/preload.c jail/seccomp.c)
-TARGET_LINK_LIBRARIES(preload-seccomp dl ubox blobmsg_json)
+ADD_LIBRARY(preload-seccomp SHARED jail/preload.c jail/seccomp.c jail/seccomp-oci.c)
+TARGET_LINK_LIBRARIES(preload-seccomp dl ${ubox} ${blobmsg_json})
 INSTALL(TARGETS preload-seccomp
        LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
 )
 ADD_DEPENDENCIES(preload-seccomp syscall-names-h)
-endif()
+SET(SOURCES_OCI_SECCOMP jail/seccomp-oci.c)
+ENDIF()
 
 IF(JAIL_SUPPORT)
-ADD_EXECUTABLE(ujail jail/jail.c jail/elf.c jail/fs.c jail/capabilities.c)
-TARGET_LINK_LIBRARIES(ujail ubox blobmsg_json)
+ADD_EXECUTABLE(ujail jail/jail.c jail/cgroups.c jail/cgroups-bpf.c jail/elf.c jail/fs.c jail/capabilities.c jail/netifd.c ${SOURCES_OCI_SECCOMP})
+TARGET_LINK_LIBRARIES(ujail ${ubox} ${ubus} ${uci} ${blobmsg_json})
 INSTALL(TARGETS ujail
        RUNTIME DESTINATION ${CMAKE_INSTALL_SBINDIR}
 )
 ADD_DEPENDENCIES(ujail capabilities-names-h)
+IF(SECCOMP_SUPPORT)
+  ADD_DEPENDENCIES(ujail syscall-names-h)
+ENDIF()
+
+ADD_EXECUTABLE(uxc uxc.c)
+TARGET_LINK_LIBRARIES(uxc ${ubox} ${ubus} ${blobmsg_json})
+INSTALL(TARGETS uxc
+       RUNTIME DESTINATION ${CMAKE_INSTALL_SBINDIR}
+)
 endif()
 
 IF(UTRACE_SUPPORT)
 ADD_EXECUTABLE(utrace trace/trace.c)
-TARGET_LINK_LIBRARIES(utrace ubox ${json} blobmsg_json)
+TARGET_LINK_LIBRARIES(utrace ${ubox} ${json} ${blobmsg_json})
 INSTALL(TARGETS utrace
        RUNTIME DESTINATION ${CMAKE_INSTALL_SBINDIR}
 )
index 7fb07683fde9836c24e41ad5c785f24cf85282a9..dede69632dfc7435d0c6430059499e01db3e96f4 100644 (file)
 
 #ifndef __CONTAINER_H
 #define __CONTAINER_H
+
 #include <stdlib.h>
+#include <stdbool.h>
+#include <sys/stat.h>
 
 static inline bool is_container() {
-       return !!getenv("container");
+       struct stat s;
+       int r = stat("/.dockerenv", &s);
+       int pv_r = stat("/pantavisor", &s);
+       return !!getenv("container") || r == 0 || pv_r == 0;
 }
 
 #endif
diff --git a/hotplug-dispatch.c b/hotplug-dispatch.c
new file mode 100644 (file)
index 0000000..4706085
--- /dev/null
@@ -0,0 +1,455 @@
+/*
+ * Copyright (C) 2021 Daniel Golle <daniel@makrotopia.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License version 2.1
+ * as published by the Free Software Foundation
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#define _GNU_SOURCE
+
+#include <sys/inotify.h>
+#include <sys/types.h>
+
+#include <dirent.h>
+#include <errno.h>
+#include <glob.h>
+#include <limits.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <libubox/avl.h>
+#include <libubox/avl-cmp.h>
+#include <libubox/list.h>
+#include <libubox/uloop.h>
+#include <libubus.h>
+
+#include "procd.h"
+
+#define HOTPLUG_BASEDIR "/etc/hotplug.d"
+#define HOTPLUG_OBJECT_PREFIX "hotplug."
+
+#define INOTIFY_SZ (sizeof(struct inotify_event) + PATH_MAX + 1)
+
+struct ubus_context *ctx;
+static char *inotify_buffer;
+static struct uloop_fd fd_inotify_read;
+
+static LIST_HEAD(subsystems);
+
+extern char **environ;
+
+struct hotplug_subsys {
+       struct list_head list;
+       struct ubus_object ubus;
+};
+
+struct envlist {
+       struct avl_node avl;
+       char *env;
+};
+
+struct hotplug_process {
+       struct ubus_object *ubus;
+       char **envp;
+       struct uloop_timeout timeout;
+       struct uloop_process process;
+       glob_t globbuf;
+       unsigned int cnt;
+       int ret;
+};
+
+/* Release a NULL-terminated environment vector together with all its strings. */
+static void env_free(char **envp)
+{
+       size_t i;
+
+       for (i = 0; envp[i] != NULL; i++)
+               free(envp[i]);
+
+       free(envp);
+}
+
+/* Tear down one hotplug run: glob results, environment vector, then the state itself. */
+static void hotplug_free(struct hotplug_process *pc)
+{
+       char **env_vec = pc->envp;
+
+       globfree(&pc->globbuf);
+       env_free(env_vec);
+       free(pc);
+}
+
+/*
+ * Child-exit callback of a hotplug script: remember its exit status and arm
+ * the run's timer (50 ms) so that the timeout callback handles the next step.
+ */
+static void hotplug_done(struct uloop_process *c, int ret)
+{
+       struct hotplug_process *run;
+
+       run = container_of(c, struct hotplug_process, process);
+       run->ret = ret;
+       uloop_timeout_set(&run->timeout, 50);
+}
+
+/*
+ * Timer callback that starts the next hotplug script of a pending call.
+ *
+ * Each invocation forks "/bin/sh -c" for one entry of pc->globbuf; the shell
+ * sources /lib/functions.sh followed by the script and runs with pc->envp as
+ * its environment. The child is registered with uloop using hotplug_done()
+ * as exit callback. Once every entry has been consumed, or when a child
+ * cannot be started, pc is released.
+ */
+static void hotplug_exec(struct uloop_timeout *t)
+{
+       struct hotplug_process *pc = container_of(t, struct hotplug_process, timeout);
+       char *script;
+       char *exec_argv[4];
+
+       /* we have reached the last entry in the globbuf */
+       if (pc->cnt == pc->globbuf.gl_pathc) {
+               hotplug_free(pc);
+               return;
+       }
+
+       if (asprintf(&script, ". /lib/functions.sh\n. %s\n", pc->globbuf.gl_pathv[pc->cnt++]) == -1) {
+               /* no timer or child is pending for pc anymore, so release it here */
+               hotplug_free(pc);
+               return;
+       }
+
+       /* prepare for execve() */
+       exec_argv[0] = "/bin/sh";
+       exec_argv[1] = "-c";
+       exec_argv[2] = script;
+       exec_argv[3] = NULL;
+
+       /* set callback in uloop_process */
+       pc->process.cb = hotplug_done;
+       pc->process.pid = fork();
+       if (pc->process.pid == 0) {
+               /*
+                * child: execve() only returns on failure; leave with _exit()
+                * so that neither atexit handlers nor stdio buffers inherited
+                * from the parent are run or flushed a second time
+                */
+               _exit(execve(exec_argv[0], exec_argv, pc->envp));
+       } else if (pc->process.pid < 0) {
+               /* fork error */
+               free(script);
+               hotplug_free(pc);
+               return;
+       }
+       /* parent */
+       free(script);
+       uloop_process_add(&pc->process);
+}
+
+/*
+ * AVL comparator for "NAME=value" strings: two keys compare equal when their
+ * variable names match. If k1 contains no '=' the result is always -1.
+ */
+static int avl_envcmp(const void *k1, const void *k2, void *ptr)
+{
+       const char *lhs = k1;
+       const char *eq = strchr(lhs, '=');
+
+       if (eq == NULL)
+               return -1;
+
+       /* name length plus one, so that the '=' itself takes part in the comparison */
+       return strncmp(lhs, k2, (eq - lhs) + 1);
+}
+
+/*
+ * Validate a NUL-terminated environment variable name.
+ *
+ * A name must be non-empty, consist only of ASCII letters, digits and '_',
+ * and must not start with a digit.
+ *
+ * Returns 0 if the name is acceptable, EINVAL otherwise.
+ */
+static int validate_envvarname(const char *envvarname)
+{
+       const char *tmp = envvarname;
+
+       /* an empty name (as in an "=value" entry) is never valid */
+       if (tmp[0] == '\0')
+               return EINVAL;
+
+       /* check for illegal characters in env variable name */
+       while (tmp[0] != '\0') {
+               if (!((tmp[0] >= 'a' && tmp[0] <= 'z') ||
+                     (tmp[0] >= 'A' && tmp[0] <= 'Z') ||
+                     (tmp[0] == '_') ||
+                     /* allow numbers unless they are at the first character */
+                     ((tmp != envvarname) && tmp[0] >= '0' && tmp[0] <= '9')))
+                       return EINVAL;
+               ++tmp;
+       }
+
+       return 0;
+}
+
+/* Indices into the attribute table parsed with hotplug_policy. */
+enum {
+       HOTPLUG_ENV,
+       __HOTPLUG_MAX
+};
+
+/* The only attribute is "env", an array; hotplug_call() reads its entries as strings. */
+static const struct blobmsg_policy hotplug_policy[__HOTPLUG_MAX] = {
+       [HOTPLUG_ENV] = { .name = "env", .type = BLOBMSG_TYPE_ARRAY },
+};
+
+/*
+ * ubus "call" handler of a hotplug subsystem object.
+ *
+ * Builds the environment for the hotplug scripts from the process
+ * environment plus the "NAME=value" strings found in the "env" array of the
+ * request, then schedules hotplug_exec() to run the scripts found by
+ * globbing the subsystem's directory below HOTPLUG_BASEDIR.
+ *
+ * Request entries are skipped when they are not strings, start with "LD_",
+ * try to assign PATH, are shorter than three characters, contain no '=' or
+ * carry an invalid variable name. Variables already present in the process
+ * environment are never overridden.
+ *
+ * Returns UBUS_STATUS_OK once the run has been scheduled or when glob()
+ * yields nothing to run, UBUS_STATUS_INVALID_ARGUMENT without an "env"
+ * attribute, UBUS_STATUS_NOT_SUPPORTED when "ASYNC=0" is passed and
+ * UBUS_STATUS_UNKNOWN_ERROR when building the environment fails.
+ */
+static int hotplug_call(struct ubus_context *ctx, struct ubus_object *obj,
+                       struct ubus_request_data *req, const char *method,
+                       struct blob_attr *msg)
+{
+       const char *subsys = &obj->name[strlen(HOTPLUG_OBJECT_PREFIX)];
+       struct blob_attr *tb[__HOTPLUG_MAX], *cur;
+       AVL_TREE(env, avl_envcmp, false, NULL);
+       struct envlist *envle, *p;
+       int rem;
+       char **envp, *globstr, *tmp, **tmpenv;
+       size_t envz = 0;
+       struct hotplug_process *pc;
+       bool async = true;
+       int err = UBUS_STATUS_UNKNOWN_ERROR;
+
+       blobmsg_parse(hotplug_policy, __HOTPLUG_MAX, tb, blobmsg_data(msg), blobmsg_len(msg));
+
+       if (!tb[HOTPLUG_ENV])
+               return UBUS_STATUS_INVALID_ARGUMENT;
+
+       tmpenv = environ;
+
+       /* first adding existing environment to avl_tree */
+       while (*tmpenv) {
+               envle = calloc(1, sizeof(struct envlist));
+               if (!envle)
+                       goto err_envle;
+
+               envle->env = strdup(*tmpenv);
+               if (!envle->env) {
+                       free(envle);
+                       goto err_envle;
+               }
+               envle->avl.key = envle->env;
+               if (avl_insert(&env, &envle->avl) == -1) {
+                       free(envle->env);
+                       free(envle);
+                       goto err_envle;
+               }
+
+               ++tmpenv;
+       }
+
+       /* then adding additional variables from ubus call */
+       blobmsg_for_each_attr(cur, tb[HOTPLUG_ENV], rem) {
+               char *enve;
+               int invalid;
+
+               /* only string entries can be "NAME=value" assignments */
+               if (blobmsg_type(cur) != BLOBMSG_TYPE_STRING)
+                       continue;
+
+               enve = blobmsg_get_string(cur);
+               if (!enve)
+                       continue;
+
+               if (!strncmp(enve, "LD_", 3))
+                       continue;
+
+               /* match the assignment itself; a bare "PATH" has no '=' and is dropped below anyway */
+               if (!strncmp(enve, "PATH=", 5))
+                       continue;
+
+               if (strlen(enve) < 3)
+                       continue;
+
+               if (!(tmp = strchr(enve, '=')))
+                       continue;
+
+               /* terminate at '=' for the name check, then always restore the message */
+               *tmp = '\0';
+               invalid = validate_envvarname(enve);
+               *tmp = '=';
+               if (invalid)
+                       continue;
+
+               if (!strcmp(enve, "ASYNC=0"))
+                       async = false;
+
+               envle = calloc(1, sizeof(struct envlist));
+               if (!envle)
+                       goto err_envle;
+
+               envle->env = strdup(enve);
+               if (!envle->env) {
+                       free(envle);
+                       goto err_envle;
+               }
+               envle->avl.key = envle->env;
+               if (avl_insert(&env, &envle->avl)) {
+                       /* do not override existing env values, just skip */
+                       free((void*)envle->env);
+                       free(envle);
+               }
+       }
+
+       /* synchronous calls are unsupported for now */
+       if (!async) {
+               err = UBUS_STATUS_NOT_SUPPORTED;
+               goto err_envle;
+       }
+
+       /* allocating new environment */
+       avl_for_each_element(&env, envle, avl)
+               ++envz;
+
+       envp = calloc(envz + 1, sizeof(char *));
+       if (!envp)
+               goto err_envle;
+
+       /* populating new environment; the strings move from the tree into envp */
+       envz = 0;
+       avl_for_each_element_safe(&env, envle, avl, p) {
+               envp[envz++] = envle->env;
+               avl_delete(&env, &envle->avl);
+               free(envle);
+       }
+
+       pc = calloc(1, sizeof(struct hotplug_process));
+       if (!pc) {
+               env_free(envp);
+               return UBUS_STATUS_UNKNOWN_ERROR;
+       }
+       pc->timeout.cb = hotplug_exec;
+       pc->envp = envp;
+       pc->cnt = 0;
+       pc->ubus = obj;
+
+       /* glob'ing for hotplug scripts */
+       if (asprintf(&globstr, "%s/%s/*", HOTPLUG_BASEDIR, subsys) == -1) {
+               hotplug_free(pc);
+               return UBUS_STATUS_UNKNOWN_ERROR;
+       }
+
+       if (glob(globstr, GLOB_DOOFFS, NULL, &pc->globbuf)) {
+               free(globstr);
+               hotplug_free(pc);
+               return UBUS_STATUS_OK;
+       }
+
+       free(globstr);
+
+       /* asynchronous call to hotplug_exec() */
+       uloop_timeout_set(&pc->timeout, 50);
+
+       return UBUS_STATUS_OK;
+
+err_envle:
+       /* drop whatever has been collected in the tree so far */
+       avl_for_each_element_safe(&env, envle, avl, p) {
+               if (envle->env)
+                       free(envle->env);
+
+               avl_delete(&env, &envle->avl);
+               free(envle);
+       }
+
+       return err;
+}
+
+/* Every hotplug subsystem object offers a single method, "call". */
+static const struct ubus_method hotplug_methods[] = {
+       UBUS_METHOD("call", hotplug_call, hotplug_policy),
+};
+
+/* Object type shared by all subsystem objects created in add_subsystem(). */
+static struct ubus_object_type hotplug_object_type =
+       UBUS_OBJECT_TYPE("hotplug", hotplug_methods);
+
+/*
+ * Create and register the ubus object for one hotplug subsystem.
+ *
+ * The object is named HOTPLUG_OBJECT_PREFIX followed by at most nlen bytes
+ * of newname and is linked into the subsystems list. The process exits with
+ * ENOMEM when memory for the object or its name cannot be allocated.
+ */
+static void add_subsystem(int nlen, char *newname)
+{
+       struct hotplug_subsys *nh = calloc(1, sizeof(struct hotplug_subsys));
+       char *name;
+
+       /* nh is dereferenced right below, so an allocation failure is fatal too */
+       if (!nh)
+               exit(ENOMEM);
+
+       if (asprintf(&name, "%s%.*s", HOTPLUG_OBJECT_PREFIX, nlen, newname) == -1)
+               exit(ENOMEM);
+
+       /* prepare and add ubus object */
+       nh->ubus.name = name;
+       nh->ubus.type = &hotplug_object_type;
+       nh->ubus.methods = hotplug_object_type.methods;
+       nh->ubus.n_methods = hotplug_object_type.n_methods;
+       list_add(&nh->list, &subsystems);
+       ubus_add_object(ctx, &nh->ubus);
+}
+
+/*
+ * Unregister and free hotplug subsystem objects.
+ *
+ * With nlen == 0 every registered subsystem is removed and name is not
+ * looked at. Otherwise only objects whose name after HOTPLUG_OBJECT_PREFIX
+ * matches name (of which at most nlen bytes are considered) are removed.
+ */
+static void remove_subsystem(int nlen, char *name)
+{
+       const size_t prefix_len = strlen(HOTPLUG_OBJECT_PREFIX);
+       struct hotplug_subsys *cur, *next;
+
+       list_for_each_entry_safe(cur, next, &subsystems, list) {
+               /* cheap length comparison first, content comparison second */
+               if (nlen &&
+                   (strlen(cur->ubus.name) != strnlen(name, nlen) + prefix_len ||
+                    strncmp(name, &cur->ubus.name[prefix_len], nlen) != 0))
+                       continue;
+
+               list_del(&cur->list);
+               ubus_remove_object(ctx, &cur->ubus);
+               free((void*)cur->ubus.name);
+               free(cur);
+       }
+}
+
+/*
+ * Register one subsystem for each visible directory in HOTPLUG_BASEDIR.
+ *
+ * Returns 0 on success or ENOENT when the base directory cannot be opened.
+ */
+static int init_subsystems(void)
+{
+       struct dirent *entry;
+       DIR *basedir = opendir(HOTPLUG_BASEDIR);
+
+       if (!basedir)
+               return ENOENT;
+
+       for (entry = readdir(basedir); entry; entry = readdir(basedir)) {
+               /* directories only; the dot test also drops "." and ".." */
+               if (entry->d_type == DT_DIR && entry->d_name[0] != '.')
+                       add_subsystem(strlen(entry->d_name), entry->d_name);
+       }
+
+       closedir(basedir);
+       return 0;
+}
+
+/*
+ * uloop callback for the inotify descriptor: performs one read() into
+ * inotify_buffer and adds or removes a subsystem object for every reported
+ * directory that was created/moved in or deleted/moved out.
+ */
+static void inotify_read_handler(struct uloop_fd *u, unsigned int events)
+{
+       int rc;
+       char *p;
+       struct inotify_event *in;
+
+       /* read inotify events */
+       while ((rc = read(u->fd, inotify_buffer, INOTIFY_SZ)) == -1 && errno == EINTR);
+
+       if (rc <= 0)
+               return;
+
+       /*
+        * process events from buffer; each record is a fixed header followed
+        * by in->len bytes of name, and 'in' is assigned at the top of the
+        * body before the step expression uses it
+        */
+       for (p = inotify_buffer;
+            rc - (p - inotify_buffer) >= (int)sizeof(struct inotify_event);
+            p += sizeof(struct inotify_event) + in->len) {
+               in = (struct inotify_event*)p;
+
+               /* skip everything but directories */
+               if (!(in->mask & IN_ISDIR))
+                       continue;
+
+               /* events without a name cannot be mapped to a subsystem */
+               if (in->len < 1)
+                       continue;
+
+               /* skip hidden files */
+               if (in->name[0] == '.')
+                       continue;
+
+               /* add/remove subsystem objects */
+               if (in->mask & (IN_CREATE | IN_MOVED_TO))
+                       add_subsystem(in->len, in->name);
+               else if (in->mask & (IN_DELETE | IN_MOVED_FROM))
+                       remove_subsystem(in->len, in->name);
+       }
+}
+
+/*
+ * (Re)initialise hotplug dispatching on the given ubus context.
+ *
+ * All previously registered subsystem objects are dropped, the subsystems
+ * found in HOTPLUG_BASEDIR are registered again and an inotify watch on that
+ * directory is added to uloop so later changes are picked up. If setting up
+ * the watch fails, the inotify descriptor and buffer are released again and
+ * only the subsystems found during the initial scan stay registered.
+ */
+void ubus_init_hotplug(struct ubus_context *newctx)
+{
+       ctx = newctx;
+       remove_subsystem(0, NULL);
+       if (init_subsystems()) {
+               printf("failed to initialize hotplug subsystems from %s\n", HOTPLUG_BASEDIR);
+               return;
+       }
+       fd_inotify_read.fd = inotify_init1(IN_NONBLOCK | IN_CLOEXEC);
+       fd_inotify_read.cb = inotify_read_handler;
+       if (fd_inotify_read.fd == -1) {
+               printf("failed to initialize inotify handler for %s\n", HOTPLUG_BASEDIR);
+               return;
+       }
+
+       inotify_buffer = calloc(1, INOTIFY_SZ);
+       if (!inotify_buffer)
+               goto err_close;
+
+       if (inotify_add_watch(fd_inotify_read.fd, HOTPLUG_BASEDIR,
+               IN_CREATE | IN_MOVED_TO | IN_DELETE | IN_MOVED_FROM | IN_ONLYDIR) == -1)
+               goto err_free;
+
+       uloop_fd_add(&fd_inotify_read, ULOOP_READ);
+       return;
+
+err_free:
+       /* without a watch the buffer would never be used */
+       free(inotify_buffer);
+       inotify_buffer = NULL;
+err_close:
+       /* do not leak the descriptor when the watch cannot be set up */
+       close(fd_inotify_read.fd);
+       fd_inotify_read.fd = -1;
+}
index 7b281b20b8233785de957d6b0dba23338e916e45..04aa10dbf05756a3df1307dcd95fc689cb6eb001 100644 (file)
@@ -58,26 +58,23 @@ early_mounts(void)
        unsigned int oldumask = umask(0);
 
        if (!is_container()) {
-               mount("proc", "/proc", "proc", MS_NOATIME | MS_NODEV | MS_NOEXEC | MS_NOSUID, 0);
-               mount("sysfs", "/sys", "sysfs", MS_NOATIME | MS_NODEV | MS_NOEXEC | MS_NOSUID, 0);
-               mount("cgroup", "/sys/fs/cgroup", "cgroup",  MS_NODEV | MS_NOEXEC | MS_NOSUID, 0);
-               mount("tmpfs", "/dev", "tmpfs", MS_NOATIME | MS_NOSUID, "mode=0755,size=512K");
+               mount("proc", "/proc", "proc", MS_NOATIME | MS_NODEV | MS_NOEXEC | MS_NOSUID, NULL);
+               mount("sysfs", "/sys", "sysfs", MS_NOATIME | MS_NODEV | MS_NOEXEC | MS_NOSUID, NULL);
+               mount("efivars", "/sys/firmware/efi/efivars", "efivarfs", MS_NOATIME | MS_NODEV | MS_NOEXEC | MS_NOSUID, NULL);
+               mount("cgroup2", "/sys/fs/cgroup", "cgroup2",  MS_NODEV | MS_NOEXEC | MS_NOSUID | MS_RELATIME, "nsdelegate");
+               mount("tmpfs", "/dev", "tmpfs", MS_NOATIME | MS_NOEXEC | MS_NOSUID, "mode=0755,size=512K");
                ignore(symlink("/tmp/shm", "/dev/shm"));
                mkdir("/dev/pts", 0755);
-               mount("devpts", "/dev/pts", "devpts", MS_NOATIME | MS_NOEXEC | MS_NOSUID, "mode=600");
+               mount("devpts", "/dev/pts", "devpts", MS_NOATIME | MS_NOEXEC | MS_NOSUID, NULL);
 
                early_dev();
        }
 
        early_console("/dev/console");
-       if (mount_zram_on_tmp()) {
-               mount("tmpfs", "/tmp", "tmpfs", MS_NOSUID | MS_NODEV | MS_NOATIME, "mode=01777");
-               mkdir("/tmp/shm", 01777);
-       } else {
-               mkdir("/tmp/shm", 01777);
-               mount("tmpfs", "/tmp/shm", "tmpfs", MS_NOSUID | MS_NODEV | MS_NOATIME,
-                               "mode=01777");
-       }
+
+       mount("tmpfs", "/tmp", "tmpfs", MS_NOSUID | MS_NODEV | MS_NOATIME, "mode=01777");
+       mkdir("/tmp/shm", 01777);
+
        mkdir("/tmp/run", 0755);
        mkdir("/tmp/lock", 0755);
        mkdir("/tmp/state", 0755);
index 29eee5050cf969a98470c5dad46a03a8f68434f7..ab6a7e180fb75a9740022caaa1de075ce57ce33e 100644 (file)
 #include <unistd.h>
 #include <stdio.h>
 
+#if defined(WITH_SELINUX)
+#include <selinux/selinux.h>
+#include <selinux/restorecon.h>
+#include <selinux/avc.h>
+#endif
+
 #include "../utils/utils.h"
 #include "init.h"
 #include "../watchdog.h"
@@ -67,6 +73,50 @@ cmdline(void)
        }
 }
 
+#if defined(WITH_SELINUX)
+/*
+ * SELinux bring-up, driven by environment variables across re-executions of
+ * argv[0]:
+ *   - SELINUX_INIT unset: load the policy, set SELINUX_INIT=1, re-exec.
+ *   - SELINUX_INIT set, INITRAMFS set and SELINUX_RESTORECON unset: relabel
+ *     the filesystem, set SELINUX_RESTORECON=1, re-exec.
+ *   - otherwise: clear both markers and return 0 so main() continues.
+ *
+ * Returns 1 only when the current step failed (or execv() returned) while
+ * 'enforce' is positive; returns 0 in every other case that comes back.
+ */
+static int
+selinux(char **argv)
+{
+       int ret;
+       /* NOTE(review): presumably updated by selinux_init_load_policy() below - confirm */
+       int enforce = selinux_status_getenforce();
+
+       /* is SELinux already initialized? */
+       if (getenv("SELINUX_INIT")) {
+               /* have initramfs permissions already been restored? */
+               if (!getenv("INITRAMFS") || getenv("SELINUX_RESTORECON")) {
+                       unsetenv("SELINUX_INIT");
+                       unsetenv("SELINUX_RESTORECON");
+                       return 0;
+               }
+               /* Second call (initramfs only): restore filesystem labels */
+               const char *exclude_list[] = { "/dev/console", "/proc", "/sys", 0 };
+               selinux_restorecon_set_exclude_list(exclude_list);
+               ret = selinux_restorecon("/", SELINUX_RESTORECON_RECURSE | SELINUX_RESTORECON_MASS_RELABEL);
+               putenv("SELINUX_RESTORECON=1");
+       } else {
+               /* First call: load policy */
+               ret = selinux_init_load_policy(&enforce);
+               putenv("SELINUX_INIT=1");
+       }
+
+       /* on success re-execute ourselves; execv() only returns if it failed */
+       if (ret == 0)
+               execv(argv[0], argv);
+
+       if (enforce > 0) {
+               fprintf(stderr, "Cannot load SELinux policy, but system in enforcing mode. Halting.\n");
+               return 1;
+       }
+
+       return 0;
+}
+#else
+/* Built without WITH_SELINUX: nothing to do, always lets main() continue. */
+static int
+selinux(char **argv)
+{
+       return 0;
+}
+#endif
+
 int
 main(int argc, char **argv)
 {
@@ -79,6 +129,8 @@ main(int argc, char **argv)
        sigaction(SIGUSR2, &sa_shutdown, NULL);
        sigaction(SIGPWR, &sa_shutdown, NULL);
 
+       if (selinux(argv))
+               exit(-1);
        early();
        cmdline();
        watchdog_init(1);
@@ -92,17 +144,18 @@ main(int argc, char **argv)
 
                execvp(kmod[0], kmod);
                ERROR("Failed to start kmodloader: %m\n");
-               exit(-1);
+               exit(EXIT_FAILURE);
        }
        if (pid <= 0) {
                ERROR("Failed to start kmodloader instance: %m\n");
        } else {
+               const struct timespec req = {0, 10 * 1000 * 1000};
                int i;
 
                for (i = 0; i < 1200; i++) {
                        if (waitpid(pid, NULL, WNOHANG) > 0)
                                break;
-                       usleep(10 * 1000);
+                       nanosleep(&req, NULL);
                        watchdog_ping();
                }
        }
index 123e11460598d6f3c3d064accb34b23df1375634..dcf9d30486fc83acd023b84a4b9dae6f0bf0b151 100644 (file)
@@ -26,11 +26,4 @@ void preinit(void);
 void early(void);
 int mkdev(const char *progname, int progmode);
 
-#ifdef ZRAM_TMPFS
-int mount_zram_on_tmp(void);
-#else
-static inline int mount_zram_on_tmp(void) {
-       return -ENOSYS;
-}
-#endif
 #endif
index 44101aa12df5e5a102b626d2eb41f1a10a40e2ad..f0a1e84fc4dec585bb80be591662ca69c3bc291a 100644 (file)
@@ -82,7 +82,7 @@ static void find_devs(bool block)
                        continue;
 
                strcpy(path, dp->d_name);
-               len = readlink(buf2, buf, sizeof(buf));
+               len = readlink(buf2, buf, sizeof(buf) - 1);
                if (len <= 0)
                        continue;
 
index fbb36df9e5adb2e668f0b033bbc2fda0f1e20ee0..46411aa413a2a65614cfc765d3d6a42dee200532 100644 (file)
@@ -23,7 +23,7 @@
 #include <libubus.h>
 
 #include <stdio.h>
-
+#include <stdlib.h>
 #include <unistd.h>
 
 #include "init.h"
@@ -75,7 +75,7 @@ check_sysupgrade(void)
 
        fclose(sysupgrade);
 
-       sysupgrade_exec_upgraded(prefix, path, command);
+       sysupgrade_exec_upgraded(prefix, path, NULL, command, NULL);
 
        while (true)
                sleep(1);
@@ -97,7 +97,6 @@ spawn_procd(struct uloop_process *proc, int ret)
        if (plugd_proc.pid > 0)
                kill(plugd_proc.pid, SIGKILL);
 
-       unsetenv("INITRAMFS");
        unsetenv("PREINIT");
        unlink("/tmp/.preinit");
 
@@ -135,7 +134,7 @@ preinit(void)
        if (!plugd_proc.pid) {
                execvp(plug[0], plug);
                ERROR("Failed to start plugd: %m\n");
-               exit(-1);
+               exit(EXIT_FAILURE);
        }
        if (plugd_proc.pid <= 0) {
                ERROR("Failed to start new plugd instance: %m\n");
@@ -157,7 +156,7 @@ preinit(void)
        if (!preinit_proc.pid) {
                execvp(init[0], init);
                ERROR("Failed to start preinit: %m\n");
-               exit(-1);
+               exit(EXIT_FAILURE);
        }
        if (preinit_proc.pid <= 0) {
                ERROR("Failed to start new preinit instance: %m\n");
diff --git a/initd/zram.c b/initd/zram.c
deleted file mode 100644 (file)
index 487d3d6..0000000
+++ /dev/null
@@ -1,137 +0,0 @@
-#include <stdio.h>
-#include <unistd.h>
-#include <stdlib.h>
-#include <string.h>
-#include <errno.h>
-#include <fcntl.h>
-
-#include <sys/utsname.h>
-#include <sys/mount.h>
-#include <sys/types.h>
-#include <sys/wait.h>
-#include <sys/stat.h>
-
-#include "../log.h"
-#include "../container.h"
-
-#include "init.h"
-
-#define KB(x) (x * 1024)
-
-#define ZRAM_MOD_PATH "/lib/modules/%s/zram.ko"
-#define EXT4_MOD_PATH "/lib/modules/%s/ext4.ko"
-
-static long
-proc_meminfo(void)
-{
-       FILE *fp;
-       char line[256];
-       char *key;
-       long val = KB(16);
-
-       fp = fopen("/proc/meminfo", "r");
-       if (fp == NULL) {
-               ERROR("Can't open /proc/meminfo: %m\n");
-               return errno;
-       }
-
-       while (fgets(line, sizeof(line), fp)) {
-               key = strtok(line, ":");
-               if (strcasecmp(key, "MemTotal"))
-                       continue;
-               val = atol(strtok(NULL, " kB\n"));
-               break;
-       }
-       fclose(fp);
-
-       if (val > KB(32))
-               val = KB(32);
-
-       return val;
-}
-
-static int
-early_insmod(char *module)
-{
-       pid_t pid = fork();
-       char *modprobe[] = { "/sbin/modprobe", NULL, NULL };
-
-       if (!pid) {
-               char *path;
-               struct utsname ver;
-
-               uname(&ver);
-               path = alloca(strlen(module) + strlen(ver.release) + 1);
-               sprintf(path, module, ver.release);
-               modprobe[1] = path;
-               execvp(modprobe[0], modprobe);
-               ERROR("Can't exec %s: %m\n", modprobe[0]);
-               exit(-1);
-       }
-
-       if (pid <= 0) {
-               ERROR("Can't exec %s: %m\n", modprobe[0]);
-               return -1;
-       } else {
-               waitpid(pid, NULL, 0);
-       }
-
-       return 0;
-}
-
-
-int
-mount_zram_on_tmp(void)
-{
-       char *mkfs[] = { "/usr/sbin/mkfs.ext4", "-b", "4096", "-F", "-L", "TEMP", "-m", "0", "/dev/zram0", NULL };
-       FILE *fp;
-       long zramsize;
-       pid_t pid;
-       int ret;
-
-       if (early_insmod(ZRAM_MOD_PATH) || early_insmod(EXT4_MOD_PATH)) {
-               ERROR("failed to insmod zram support\n");
-               return -1;
-       }
-
-       mkdev("*", 0600);
-
-       zramsize = proc_meminfo() / 2;
-       fp = fopen("/sys/block/zram0/disksize", "r+");
-       if (fp == NULL) {
-               ERROR("Can't open /sys/block/zram0/disksize: %m\n");
-               return errno;
-       }
-       fprintf(fp, "%ld", KB(zramsize));
-       fclose(fp);
-
-       pid = fork();
-       if (!pid) {
-               execvp(mkfs[0], mkfs);
-               ERROR("Can't exec %s: %m\n", mkfs[0]);
-               exit(-1);
-       } else if (pid <= 0) {
-               ERROR("Can't exec %s: %m\n", mkfs[0]);
-               return -1;
-       } else {
-               waitpid(pid, NULL, 0);
-       }
-
-       if (!is_container()) {
-               ret = mount("/dev/zram0", "/tmp", "ext4", MS_NOSUID | MS_NODEV | MS_NOATIME, "errors=continue,noquota");
-               if (ret < 0) {
-                       ERROR("Can't mount /dev/zram0 on /tmp: %m\n");
-                       return errno;
-               }
-       }
-
-       LOG("Using up to %ld kB of RAM as ZRAM storage on /mnt\n", zramsize);
-
-       ret = chmod("/tmp", 01777);
-       if (ret < 0) {
-               ERROR("Can't set /tmp mode to 1777: %m\n");
-               return errno;
-       }
-
-       return 0;
-}
index 55554b944d74f36068afe03f7ab64140da8564dc..73a2174a97b6eb36778c1790e853c9ad18c000e8 100644 (file)
--- a/inittab.c
+++ b/inittab.c
@@ -120,14 +120,25 @@ static void child_exit(struct uloop_process *proc, int ret)
 {
        struct init_action *a = container_of(proc, struct init_action, proc);
 
-       DEBUG(4, "pid:%d\n", proc->pid);
-        uloop_timeout_set(&a->tout, a->respawn);
+       DEBUG(4, "pid:%d, exitcode:%d\n", proc->pid, ret);
+       /* mark the action as having no running worker; respawn() and the
+        * handlers only fork when proc.pid is zero */
+       proc->pid = 0;
+
+       /* a negative respawn value disables respawning
+        * (procd_inittab_kill() sets it to -1) */
+       if (a->respawn < 0)
+               return;
+
+       /* NOTE(review): a->id is presumably the tty device name - confirm */
+       if (!dev_exist(a->id)) {
+               DEBUG(4, "Skipping respawn: device '%s' does not exist anymore\n", a->id);
+               return;
+       }
+
+       /* schedule respawn() via the action's timeout, using a->respawn as delay */
+       uloop_timeout_set(&a->tout, a->respawn);
 }
 
 static void respawn(struct uloop_timeout *tout)
 {
        struct init_action *a = container_of(tout, struct init_action, tout);
-       fork_worker(a);
+       /* timeout callback: fork a new worker only when none is recorded
+        * as running (child_exit() resets proc.pid to 0) */
+       if (!a->proc.pid)
+               fork_worker(a);
 }
 
 static void rcdone(struct runqueue *q)
@@ -157,13 +168,17 @@ static void askfirst(struct init_action *a)
        }
 
        a->tout.cb = respawn;
-       for (i = MAX_ARGS - 1; i >= 1; i--)
-               a->argv[i] = a->argv[i - 1];
-       a->argv[0] = ask;
+       /* shift arguments only if not yet done */
+       if (a->argv[0] != ask) {
+               for (i = MAX_ARGS - 1; i >= 1; i--)
+                       a->argv[i] = a->argv[i - 1];
+               a->argv[0] = ask;
+       }
        a->respawn = 500;
 
        a->proc.cb = child_exit;
-       fork_worker(a);
+       if (!a->proc.pid)
+               fork_worker(a);
 }
 
 static void askconsole(struct init_action *a)
@@ -171,7 +186,19 @@ static void askconsole(struct init_action *a)
        char line[256], *tty, *split;
        int i;
 
+       /* First, try console= on the kernel command line,
+        * then fallback to /sys/class/tty/console/active,
+        * which should work when linux,stdout-path (or equivalent)
+        * is in the device tree
+        */
        tty = get_cmdline_val("console", line, sizeof(line));
+       if (tty == NULL ||
+           get_cmdline_val_offset("console", line, sizeof(line), 1)) {
+               if (dev_exist("console"))
+                       tty = "console";
+               else
+                       tty = get_active_console(line, sizeof(line));
+       }
        if (tty != NULL) {
                split = strchr(tty, ',');
                if (split != NULL)
@@ -191,13 +218,17 @@ static void askconsole(struct init_action *a)
        }
 
        a->tout.cb = respawn;
-       for (i = MAX_ARGS - 1; i >= 1; i--)
-               a->argv[i] = a->argv[i - 1];
-       a->argv[0] = ask;
+       /* shift arguments only if not yet done */
+       if (a->argv[0] != ask) {
+               for (i = MAX_ARGS - 1; i >= 1; i--)
+                       a->argv[i] = a->argv[i - 1];
+               a->argv[0] = ask;
+       }
        a->respawn = 500;
 
        a->proc.cb = child_exit;
-       fork_worker(a);
+       if (!a->proc.pid)
+               fork_worker(a);
 }
 
 static void rcrespawn(struct init_action *a)
@@ -206,7 +237,8 @@ static void rcrespawn(struct init_action *a)
        a->respawn = 500;
 
        a->proc.cb = child_exit;
-       fork_worker(a);
+       if (!a->proc.pid)
+               fork_worker(a);
 }
 
 static struct init_handler handlers[] = {
@@ -259,15 +291,23 @@ void procd_inittab_run(const char *handler)
 
        list_for_each_entry(a, &actions, list)
                if (!strcmp(a->handler->name, handler)) {
-                       if (a->handler->multi) {
-                               a->handler->cb(a);
-                               continue;
-                       }
                        a->handler->cb(a);
-                       break;
+                       if (!a->handler->multi)
+                               break;
                }
 }
 
+/*
+ * Disable respawning for every inittab action and SIGKILL the workers
+ * that are currently recorded as running.
+ */
+void procd_inittab_kill(void)
+{
+       struct init_action *action;
+
+       list_for_each_entry(action, &actions, list) {
+               /* a negative delay makes child_exit() skip the respawn */
+               action->respawn = -1;
+
+               /* pid 0 means no worker is running for this action */
+               if (!action->proc.pid)
+                       continue;
+
+               kill(action->proc.pid, SIGKILL);
+       }
+}
+
 void procd_inittab(void)
 {
 #define LINE_LEN       128
index 76e06a6d5c7a60511adbe9c52d89982a78049b24..434fc2f77e266bfc2b5e945a26a176b68c797039 100644 (file)
@@ -1,5 +1,6 @@
 /*
  * Copyright (C) 2015 Etienne CHAMPETIER <champetier.etienne@gmail.com>
+ * Copyright (C) 2020 Daniel Golle <daniel@makrotopia.org>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License version 2.1
@@ -14,7 +15,6 @@
 #define _GNU_SOURCE 1
 #include <syslog.h>
 #include <sys/prctl.h>
-
 #include <libubox/blobmsg.h>
 #include <libubox/blobmsg_json.h>
 
 #include "../capabilities-names.h"
 #include "capabilities.h"
 
+#define JAIL_CAP_ERROR (1LLU << (CAP_LAST_CAP+1))
+#define JAIL_CAP_ALL (0xffffffffffffffffLLU)
+
+/*
+ * Look up a capability by name in capabilities_names[].
+ * Returns the index of the matching entry (0..CAP_LAST_CAP) or -1 when
+ * no entry matches.
+ */
 static int find_capabilities(const char *name)
 {
        int i;
 
        for (i = 0; i <= CAP_LAST_CAP; i++)
-               if (capabilities_names[i] && !strcmp(capabilities_names[i], name))
+               /* NULL table entries are skipped; the comparison ignores case */
+               if (capabilities_names[i] && !strcasecmp(capabilities_names[i], name))
                        return i;
 
        return -1;
 }
 
-int drop_capabilities(const char *file)
+/* indices into oci_capabilities_policy[] and the parsed attribute table */
+enum {
+       OCI_CAPABILITIES_BOUNDING,
+       OCI_CAPABILITIES_EFFECTIVE,
+       OCI_CAPABILITIES_INHERITABLE,
+       OCI_CAPABILITIES_PERMITTED,
+       OCI_CAPABILITIES_AMBIENT,
+       __OCI_CAPABILITIES_MAX
+};
+
+/* blobmsg policy: one optional array attribute per capability set */
+static const struct blobmsg_policy oci_capabilities_policy[] = {
+       [OCI_CAPABILITIES_BOUNDING] = { "bounding", BLOBMSG_TYPE_ARRAY },
+       [OCI_CAPABILITIES_EFFECTIVE] = { "effective", BLOBMSG_TYPE_ARRAY },
+       [OCI_CAPABILITIES_INHERITABLE] = { "inheritable", BLOBMSG_TYPE_ARRAY },
+       [OCI_CAPABILITIES_PERMITTED] = { "permitted", BLOBMSG_TYPE_ARRAY },
+       [OCI_CAPABILITIES_AMBIENT] = { "ambient", BLOBMSG_TYPE_ARRAY },
+};
+
+/*
+ * Convert one capability array into a bit mask.
+ *
+ * msg: array attribute holding capability names, or NULL when the set is
+ *      absent from the configuration.
+ *
+ * Returns JAIL_CAP_ALL when msg is NULL, JAIL_CAP_ERROR when an entry is
+ * not a string or names an unknown capability, otherwise a mask with bit
+ * N set for every listed capability number N.
+ */
+static uint64_t parseOCIcap(struct blob_attr *msg)
 {
-       enum {
-               CAP_KEEP,
-               CAP_DROP,
-               __CAP_MAX
-       };
-       static const struct blobmsg_policy policy[__CAP_MAX] = {
-               [CAP_KEEP] = { .name = "cap.keep", .type = BLOBMSG_TYPE_ARRAY },
-               [CAP_DROP] = { .name = "cap.drop", .type = BLOBMSG_TYPE_ARRAY },
-       };
-       struct blob_buf b = { 0 };
-       struct blob_attr *tb[__CAP_MAX];
        struct blob_attr *cur;
-       int rem, cap;
-       char *name;
-       uint64_t capdrop = 0LLU;
+       int rem;
+       uint64_t caps = 0;
+       int capnum;
 
-       DEBUG("dropping capabilities\n");
+       /* each capset is optional, set all-1 mask if absent */
+       if (!msg)
+               return JAIL_CAP_ALL;
 
-       blob_buf_init(&b, 0);
-       if (!blobmsg_add_json_from_file(&b, file)) {
-               ERROR("failed to load %s\n", file);
-               return -1;
+       blobmsg_for_each_attr(cur, msg, rem) {
+               /* the policy only guarantees that msg is an array; reject
+                * non-string entries instead of reading their payload as
+                * a capability name */
+               if (blobmsg_type(cur) != BLOBMSG_TYPE_STRING)
+                       return JAIL_CAP_ERROR;
+
+               capnum = find_capabilities(blobmsg_get_string(cur));
+               if (capnum < 0)
+                       return JAIL_CAP_ERROR;
+
+               caps |= (1LLU << capnum);
+       }
+
+       return caps;
+}
+
+/*
+ * Fill capset from an OCI capabilities object.
+ *
+ * Each of the five sets is parsed with parseOCIcap() in the order
+ * bounding, effective, inheritable, permitted, ambient. Parsing stops at
+ * the first invalid set and returns EINVAL; sets parsed before that have
+ * already been stored. On success capset->apply is set to 1 and 0 is
+ * returned.
+ */
+int parseOCIcapabilities(struct jail_capset *capset, struct blob_attr *msg)
+{
+       struct blob_attr *tb[__OCI_CAPABILITIES_MAX];
+       /* destination field for every policy index */
+       uint64_t *const field[__OCI_CAPABILITIES_MAX] = {
+               [OCI_CAPABILITIES_BOUNDING] = &capset->bounding,
+               [OCI_CAPABILITIES_EFFECTIVE] = &capset->effective,
+               [OCI_CAPABILITIES_INHERITABLE] = &capset->inheritable,
+               [OCI_CAPABILITIES_PERMITTED] = &capset->permitted,
+               [OCI_CAPABILITIES_AMBIENT] = &capset->ambient,
+       };
+       uint64_t mask;
+       int idx;
+
+       blobmsg_parse(oci_capabilities_policy, __OCI_CAPABILITIES_MAX, tb, blobmsg_data(msg), blobmsg_len(msg));
+
+       for (idx = 0; idx < __OCI_CAPABILITIES_MAX; idx++) {
+               mask = parseOCIcap(tb[idx]);
+               if (mask == JAIL_CAP_ERROR)
+                       return EINVAL;
+
+               *field[idx] = mask;
+       }
+
+       capset->apply = 1;
+
+       return 0;
+}
+
+
+/*
+ * Apply a parsed capability set to the calling process.
+ *
+ * ocicapset: masks produced by parseOCIcapabilities(); a mask equal to
+ *            JAIL_CAP_ALL leaves the corresponding set untouched.
+ * retain:    extra capabilities OR-ed into the bounding, effective,
+ *            permitted and inheritable masks (not into the ambient set).
+ *
+ * Returns 0 on success or when ocicapset.apply is not set, errno when a
+ * prctl() call fails and -1 when capget()/capset() fail.
+ */
+int applyOCIcapabilities(struct jail_capset ocicapset, uint64_t retain)
+{
+       struct __user_cap_header_struct uh = {};
+       struct __user_cap_data_struct ud[2];
+       int cap;
+       int is_set;
+
+       if (!ocicapset.apply)
+               return 0;
+
+       /* drop from bounding set */
+       if (ocicapset.bounding != JAIL_CAP_ALL) {
+               for (cap = 0; cap <= CAP_LAST_CAP; cap++) {
+                       if (!prctl(PR_CAPBSET_READ, cap, 0, 0, 0)) {
+                               /* can't raise */
+                               if (ocicapset.bounding & (1LLU << cap))
+                                       ERROR("capability %s (%d) is not in bounding set\n", capabilities_names[cap], cap);
+
+                               continue;
+                       }
+                       if ( ((ocicapset.bounding | retain) & (1LLU << cap)) == 0) {
+                               DEBUG("dropping capability %s (%d) from bounding set\n", capabilities_names[cap], cap);
+                               if (prctl(PR_CAPBSET_DROP, cap, 0, 0, 0)) {
+                                       ERROR("prctl(PR_CAPBSET_DROP, %d) failed: %m\n", cap);
+                                       return errno;
+                               }
+                       } else {
+                               DEBUG("keeping capability %s (%d)\n", capabilities_names[cap], cap);
+                       }
+               }
        }
 
-       blobmsg_parse(policy, __CAP_MAX, tb, blob_data(b.head), blob_len(b.head));
-       if (!tb[CAP_KEEP] && !tb[CAP_DROP]) {
-               ERROR("failed to parse %s\n", file);
+       /* set effective, permitted and inheritable */
+       uh.version = _LINUX_CAPABILITY_VERSION_3;
+       uh.pid = getpid();
+
+       /* read the current sets first so that untouched ones are written back unchanged */
+       if (capget(&uh, ud)) {
+               ERROR("capget() failed\n");
                return -1;
        }
 
-       blobmsg_for_each_attr(cur, tb[CAP_KEEP], rem) {
-               name = blobmsg_get_string(cur);
-               if (!name) {
-                       ERROR("invalid capability name in cap.keep\n");
-                       return -1;
-               }
-               cap = find_capabilities(name);
-               if (cap == -1) {
-                       ERROR("unknown capability %s in cap.keep\n", name);
-                       return -1;
-               }
-               capdrop |= (1LLU << cap);
+       DEBUG("old capabilities: Pe=%016llx Pp=%016llx Pi=%016llx\n",
+               0LLU | ud[0].effective | (0LLU | ud[1].effective) << 32,
+               0LLU | ud[0].permitted | (0LLU | ud[1].permitted) << 32,
+               0LLU | ud[0].inheritable | (0LLU | ud[1].inheritable) << 32);
+
+       /* ud[0] carries the low 32 bits of each mask, ud[1] the high 32 bits */
+       if (ocicapset.effective != JAIL_CAP_ALL) {
+               ud[0].effective = (ocicapset.effective | retain) & 0xFFFFFFFFU;
+               ud[1].effective = ((ocicapset.effective | retain) >> 32) & 0xFFFFFFFFU;
        }
 
-       if (capdrop == 0LLU) {
-               DEBUG("cap.keep empty -> only dropping capabilities from cap.drop (blacklist)\n");
-               capdrop = 0xffffffffffffffffLLU;
-       } else {
-               DEBUG("cap.keep has at least one capability -> dropping every capabilities not in cap.keep (whitelist)\n");
+       if (ocicapset.permitted != JAIL_CAP_ALL) {
+               ud[0].permitted = (ocicapset.permitted | retain) & 0xFFFFFFFFU;
+               ud[1].permitted = ((ocicapset.permitted | retain) >> 32) & 0xFFFFFFFFU;
        }
 
-       blobmsg_for_each_attr(cur, tb[CAP_DROP], rem) {
-               name = blobmsg_get_string(cur);
-               if (!name) {
-                       ERROR("invalid capability name in cap.drop\n");
-                       return -1;
-               }
-               cap = find_capabilities(name);
-               if (cap == -1) {
-                       ERROR("unknown capability %s in cap.drop\n", name);
-                       return -1;
-               }
-               capdrop &= ~(1LLU << cap);
+       if (ocicapset.inheritable != JAIL_CAP_ALL) {
+               ud[0].inheritable = (ocicapset.inheritable | retain) & 0xFFFFFFFFU;
+               ud[1].inheritable = ((ocicapset.inheritable | retain) >> 32) & 0xFFFFFFFFU;
+       }
+
+       DEBUG("new capabilities: Pe=%016llx Pp=%016llx Pi=%016llx\n",
+               0LLU | ud[0].effective | (0LLU | ud[1].effective) << 32,
+               0LLU | ud[0].permitted | (0LLU | ud[1].permitted) << 32,
+               0LLU | ud[0].inheritable | (0LLU | ud[1].inheritable) << 32);
+
+       if (capset(&uh, ud)) {
+               ERROR("capset() failed\n");
+               return -1;
        }
 
-       for (cap = 0; cap <= CAP_LAST_CAP; cap++) {
-               if ( (capdrop & (1LLU << cap)) == 0) {
-                       DEBUG("dropping capability %s (%d)\n", capabilities_names[cap], cap);
-                       if (prctl(PR_CAPBSET_DROP, cap, 0, 0, 0)) {
-                               ERROR("prctl(PR_CAPBSET_DROP, %d) failed: %m\n", cap);
-                               return errno;
+       /* edit ambient set */
+       if (ocicapset.ambient != JAIL_CAP_ALL) {
+               for (cap = 0; cap <= CAP_LAST_CAP; cap++) {
+                       is_set = prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_IS_SET, cap, 0, 0);
+                       if ( (ocicapset.ambient & (1LLU << cap)) == 0) {
+                               if (is_set) {
+                                       DEBUG("dropping capability %s (%d) from ambient set\n", capabilities_names[cap], cap);
+                                       if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_LOWER, cap, 0, 0)) {
+                                               ERROR("prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_LOWER, %d, 0, 0) failed: %m\n", cap);
+                                               return errno;
+                                       }
+                               }
+                       } else {
+                               if (!is_set) {
+                                       DEBUG("raising capability %s (%d) to ambient set\n", capabilities_names[cap], cap);
+                                       if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, cap, 0, 0)) {
+                                               ERROR("prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, %d, 0, 0) failed: %m\n", cap);
+                                               return errno;
+                                       }
+                               }
                        }
-               } else {
-                       DEBUG("keeping capability %s (%d)\n", capabilities_names[cap], cap);
                }
        }
 
        return 0;
 }
+
+/*
+ * Load a JSON file into a temporary blob buffer and hand it to
+ * parseOCIcapabilities().
+ *
+ * Returns 1 when the file cannot be loaded, otherwise the result of
+ * parseOCIcapabilities(). The buffer is released on every path.
+ */
+int parseOCIcapabilities_from_file(struct jail_capset *capset, const char *file)
+{
+       struct blob_buf buf = { 0 };
+       int result;
+
+       blob_buf_init(&buf, 0);
+
+       if (blobmsg_add_json_from_file(&buf, file)) {
+               result = parseOCIcapabilities(capset, buf.head);
+       } else {
+               ERROR("failed to load %s\n", file);
+               result = 1;
+       }
+
+       blob_buf_free(&buf);
+
+       return result;
+}
index 11b8cc27e9d54b0cbc0e89e34153a8e97c031ee4..d8c6b8d60b56e5285b54df2c45a2de9f9e87d1d9 100644 (file)
 #ifndef _JAIL_CAPABILITIES_H_
 #define _JAIL_CAPABILITIES_H_
 
-int drop_capabilities(const char *file);
+#include <libubox/blobmsg.h>
+#include <linux/capability.h>
+
+/*
+ * Capability masks for a jailed process, one bit per capability number.
+ * parseOCIcapabilities() stores an all-ones mask for a set that is absent
+ * from the configuration, and applyOCIcapabilities() leaves such a set
+ * unchanged.
+ */
+struct jail_capset {
+       uint64_t bounding;
+       uint64_t effective;
+       uint64_t inheritable;
+       uint64_t permitted;
+       uint64_t ambient;
+       /* set to 1 by parseOCIcapabilities(); applyOCIcapabilities() is a no-op while it is 0 */
+       uint8_t apply;
+};
+
+int parseOCIcapabilities(struct jail_capset *capset, struct blob_attr *msg);
+int parseOCIcapabilities_from_file(struct jail_capset *capset, const char *file);
+int applyOCIcapabilities(struct jail_capset capset, uint64_t retain);
+
+/* capget/capset syscall wrappers are provided by libc */
+extern int capget(cap_user_header_t header, cap_user_data_t data);
+extern int capset(cap_user_header_t header, const cap_user_data_t data);
 
 #endif
diff --git a/jail/cgroups-bpf.c b/jail/cgroups-bpf.c
new file mode 100644 (file)
index 0000000..f1bbeff
--- /dev/null
@@ -0,0 +1,455 @@
+/*
+ * Copyright (C) 2021 Daniel Golle <daniel@makrotopia.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License version 2.1
+ * as published by the Free Software Foundation
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * somehow emulate devices.allow/devices.deny using eBPF
+ *
+ * OCI run-time spec defines the syntax for allowing/denying access
+ * to devices according to the definition of cgroup-v1 in the Kernel
+ * as described in Documentation/admin-guide/cgroup-v1.
+ */
+
+#include <assert.h>
+#include <linux/bpf.h>
+#ifdef __GLIBC__
+#include <sys/cdefs.h>
+#else
+#include <sys/reg.h>
+#endif
+#include <sys/syscall.h>
+
+#include <libubox/blobmsg.h>
+#include <libubox/blobmsg_json.h>
+#include <libubox/list.h>
+
+#include "cgroups.h"
+#include "cgroups-bpf.h"
+#include "log.h"
+
+static struct bpf_insn *program = NULL;
+static int bpf_total_insn = 0;
+static const char *license = "GPL";
+
+/* Invoke the raw bpf(2) system call; the result is narrowed to int. */
+static int
+syscall_bpf (int cmd, union bpf_attr *attr, unsigned int size)
+{
+       long rc;
+
+       rc = syscall (__NR_bpf, cmd, attr, size);
+
+       return (int) rc;
+}
+
+/* from crun/src/libcrun/ebpf.c */
+/*
+ * Each macro below expands to a 'struct bpf_insn' compound literal.
+ * Variants taking IMM store it in .imm; variants taking SRC store it in
+ * .src_reg. OFF is stored in .off.
+ */
+
+/* ALU-class operation OP on DST with an immediate operand */
+#define BPF_ALU32_IMM(OP, DST, IMM) \
+       ((struct bpf_insn){ .code = BPF_ALU | BPF_OP (OP) | BPF_K, .dst_reg = DST, .src_reg = 0, .off = 0, .imm = IMM })
+
+/* load of width SIZE into DST from memory at SRC + OFF */
+#define BPF_LDX_MEM(SIZE, DST, SRC, OFF) \
+       ((struct bpf_insn){                    \
+               .code = BPF_LDX | BPF_SIZE (SIZE) | BPF_MEM, .dst_reg = DST, .src_reg = SRC, .off = OFF, .imm = 0 })
+
+/* ALU64-class move of register SRC into DST */
+#define BPF_MOV64_REG(DST, SRC) \
+       ((struct bpf_insn){ .code = BPF_ALU64 | BPF_MOV | BPF_X, .dst_reg = DST, .src_reg = SRC, .off = 0, .imm = 0 })
+
+/* unconditional jump by OFF instructions */
+#define BPF_JMP_A(OFF) \
+       ((struct bpf_insn){ .code = BPF_JMP | BPF_JA, .dst_reg = 0, .src_reg = 0, .off = OFF, .imm = 0 })
+
+/* conditional jump by OFF, comparing DST against an immediate with OP */
+#define BPF_JMP_IMM(OP, DST, IMM, OFF) \
+       ((struct bpf_insn){ .code = BPF_JMP | BPF_OP (OP) | BPF_K, .dst_reg = DST, .src_reg = 0, .off = OFF, .imm = IMM })
+
+/* conditional jump by OFF, comparing DST against register SRC with OP */
+#define BPF_JMP_REG(OP, DST, SRC, OFF) \
+       ((struct bpf_insn){ .code = BPF_JMP | BPF_OP (OP) | BPF_X, .dst_reg = DST, .src_reg = SRC, .off = OFF, .imm = 0 })
+
+/* ALU64-class move of an immediate into DST */
+#define BPF_MOV64_IMM(DST, IMM) \
+       ((struct bpf_insn){ .code = BPF_ALU64 | BPF_MOV | BPF_K, .dst_reg = DST, .src_reg = 0, .off = 0, .imm = IMM })
+
+/* ALU-class move of register SRC into DST */
+#define BPF_MOV32_REG(DST, SRC) \
+       ((struct bpf_insn){ .code = BPF_ALU | BPF_MOV | BPF_X, .dst_reg = DST, .src_reg = SRC, .off = 0, .imm = 0 })
+
+/* program exit */
+#define BPF_EXIT_INSN() \
+       ((struct bpf_insn){ .code = BPF_JMP | BPF_EXIT, .dst_reg = 0, .src_reg = 0, .off = 0, .imm = 0 })
+
+/* taken from systemd.  */
+/*
+ * Prologue copied to the start of every generated program. It reads
+ * three 32-bit words through the pointer in R1 (offsets 0, 4 and 8) and
+ * leaves the values the per-rule checks compare against in R2..R5.
+ */
+static const struct bpf_insn pre_insn[] = {
+       /* type -> R2.  */
+       /* low 16 bits of the word at offset 0 */
+       BPF_LDX_MEM (BPF_W, BPF_REG_2, BPF_REG_1, 0),
+       BPF_ALU32_IMM (BPF_AND, BPF_REG_2, 0xFFFF),
+       /* access -> R3.  */
+       /* high 16 bits of the same word */
+       BPF_LDX_MEM (BPF_W, BPF_REG_3, BPF_REG_1, 0),
+       BPF_ALU32_IMM (BPF_RSH, BPF_REG_3, 16),
+       /* major -> R4.  */
+       BPF_LDX_MEM (BPF_W, BPF_REG_4, BPF_REG_1, 4),
+       /* minor -> R5.  */
+       BPF_LDX_MEM (BPF_W, BPF_REG_5, BPF_REG_1, 8),
+};
+
+/* indices into oci_linux_cgroups_devices_policy[] and the parsed attribute table */
+enum {
+       OCI_LINUX_CGROUPS_DEVICES_ALLOW,
+       OCI_LINUX_CGROUPS_DEVICES_TYPE,
+       OCI_LINUX_CGROUPS_DEVICES_MAJOR,
+       OCI_LINUX_CGROUPS_DEVICES_MINOR,
+       OCI_LINUX_CGROUPS_DEVICES_ACCESS,
+       __OCI_LINUX_CGROUPS_DEVICES_MAX,
+};
+
+/*
+ * blobmsg policy for one device rule. parseOCIlinuxcgroups_devices()
+ * requires "allow" and treats every other attribute as optional.
+ */
+static const struct blobmsg_policy oci_linux_cgroups_devices_policy[] = {
+       [OCI_LINUX_CGROUPS_DEVICES_ALLOW] = { "allow", BLOBMSG_TYPE_BOOL },
+       [OCI_LINUX_CGROUPS_DEVICES_TYPE] = { "type", BLOBMSG_TYPE_STRING },
+       [OCI_LINUX_CGROUPS_DEVICES_MAJOR] = { "major", BLOBMSG_CAST_INT64 },
+       [OCI_LINUX_CGROUPS_DEVICES_MINOR] = { "minor", BLOBMSG_CAST_INT64 },
+       [OCI_LINUX_CGROUPS_DEVICES_ACCESS] = { "access", BLOBMSG_TYPE_STRING },
+};
+
+/*
+ * cgroup-v1 devices got a (default) behaviour and a list of exceptions.
+ * define datatypes similar to the legacy kernel code.
+ */
+#define DEVCG_DEV_ALL (BPF_DEVCG_DEV_BLOCK | BPF_DEVCG_DEV_CHAR)
+#define DEVCG_ACC_ALL (BPF_DEVCG_ACC_READ | BPF_DEVCG_ACC_WRITE | BPF_DEVCG_ACC_MKNOD)
+
+/* verdict emitted by the generated program when no exception matches */
+enum devcg_behavior {
+       DEVCG_DEFAULT_NONE,
+       DEVCG_DEFAULT_ALLOW,
+       DEVCG_DEFAULT_DENY,
+};
+
+/* one device rule; rules are chained through 'list' */
+struct dev_exception_item {
+       /* ~0 acts as a wildcard: no major/minor comparison is generated */
+       uint32_t                major, minor;
+       /* BPF_DEVCG_DEV_* value */
+       short                   type;
+       /* mask of BPF_DEVCG_ACC_* bits */
+       short                   access;
+       struct list_head        list;
+       /* verdict returned when the rule matches: true = allow, false = deny */
+       bool                    allow;
+};
+
+/*
+ * add a bunch of default rules
+ */
+static int add_default_exceptions(struct list_head *exceptions)
+{
+       struct dev_exception_item *item;
+       size_t idx;
+       /* from crun/src/libcrun/cgroup.c */
+       const struct dev_exception_item defrules[] = {
+               /* always allow mknod */
+               { .allow = true, .type = BPF_DEVCG_DEV_CHAR,  .major = ~0,  .minor = ~0,  .access = BPF_DEVCG_ACC_MKNOD },
+               { .allow = true, .type = BPF_DEVCG_DEV_BLOCK, .major = ~0,  .minor = ~0,  .access = BPF_DEVCG_ACC_MKNOD },
+               /* /dev/null */
+               { .allow = true, .type = BPF_DEVCG_DEV_CHAR,  .major = 1,   .minor = 3,   .access = DEVCG_ACC_ALL },
+               /* /dev/random */
+               { .allow = true, .type = BPF_DEVCG_DEV_CHAR,  .major = 1,   .minor = 8,   .access = DEVCG_ACC_ALL },
+               /* /dev/full */
+               { .allow = true, .type = BPF_DEVCG_DEV_CHAR,  .major = 1,   .minor = 7,   .access = DEVCG_ACC_ALL },
+               /* /dev/tty */
+               { .allow = true, .type = BPF_DEVCG_DEV_CHAR,  .major = 5,   .minor = 0,   .access = DEVCG_ACC_ALL },
+               /* /dev/zero */
+               { .allow = true, .type = BPF_DEVCG_DEV_CHAR,  .major = 1,   .minor = 5,   .access = DEVCG_ACC_ALL },
+               /* /dev/urandom */
+               { .allow = true, .type = BPF_DEVCG_DEV_CHAR,  .major = 1,   .minor = 9,   .access = DEVCG_ACC_ALL },
+               /* /dev/console */
+               { .allow = true, .type = BPF_DEVCG_DEV_CHAR,  .major = 5,   .minor = 1,   .access = DEVCG_ACC_ALL },
+               /* /dev/pts/[0-255] */
+               { .allow = true, .type = BPF_DEVCG_DEV_CHAR,  .major = 136, .minor = ~0,  .access = DEVCG_ACC_ALL },
+               /* /dev/ptmx */
+               { .allow = true, .type = BPF_DEVCG_DEV_CHAR,  .major = 5,   .minor = 2,   .access = DEVCG_ACC_ALL },
+               /* /dev/net/tun */
+               { .allow = true, .type = BPF_DEVCG_DEV_CHAR,  .major = 10,  .minor = 200, .access = DEVCG_ACC_ALL },
+       };
+       const size_t count = sizeof(defrules) / sizeof(defrules[0]);
+
+       /*
+        * Copy every default rule onto the heap and push it with list_add(),
+        * so the last table entry ends up first in the list. On allocation
+        * failure the entries added so far stay in the list and ENOMEM is
+        * returned.
+        */
+       for (idx = 0; idx < count; idx++) {
+               item = malloc(sizeof(*item));
+               if (!item)
+                       return ENOMEM;
+
+               memcpy(item, &defrules[idx], sizeof(*item));
+               list_add(&item->list, exceptions);
+       }
+
+       return 0;
+}
+
+/*
+ * free all exceptions in the list
+ */
+static void flush_exceptions(struct list_head *freelist)
+{
+       struct dev_exception_item *item, *next;
+
+       /* the safe iterator does nothing on an empty list, so no
+        * separate emptiness check is needed */
+       list_for_each_entry_safe(item, next, freelist, list) {
+               /* unlink before freeing; 'next' was fetched beforehand */
+               list_del(&item->list);
+               free(item);
+       }
+}
+
+/*
+ * parse OCI cgroups devices and translate into cgroups-v2 eBPF program
+ */
+/*
+ * msg: array of device rule objects (see oci_linux_cgroups_devices_policy).
+ *
+ * On success the generated instructions are stored in the file-level
+ * 'program' buffer and their count in 'bpf_total_insn'.
+ *
+ * Returns 0 on success, EINVAL for a malformed rule and ENOMEM when an
+ * allocation fails.
+ */
+int parseOCIlinuxcgroups_devices(struct blob_attr *msg)
+{
+       struct blob_attr *tb[__OCI_LINUX_CGROUPS_DEVICES_MAX];
+       struct blob_attr *cur;
+       int rem, ret = 0;
+       int bpf_type, bpf_access;
+       unsigned char acidx;
+       bool allow = false,
+            has_access = false,
+            has_type = false,
+            has_major = false,
+            has_minor = false;
+       int total_ins = 0,
+           cur_ins = 0,
+           pre_insn_len = sizeof(pre_insn) / sizeof(struct bpf_insn),
+           next_ins;
+       char *access, *devtype;
+       uint32_t devmajor, devminor;
+       struct dev_exception_item *dl;
+       struct list_head exceptions;
+       enum devcg_behavior behavior = DEVCG_DEFAULT_ALLOW;
+       INIT_LIST_HEAD(&exceptions);
+
+       /* parse according to OCI spec */
+       blobmsg_for_each_attr(cur, msg, rem) {
+               blobmsg_parse(oci_linux_cgroups_devices_policy, __OCI_LINUX_CGROUPS_DEVICES_MAX,
+                             tb, blobmsg_data(cur), blobmsg_len(cur));
+
+               /* "allow" is the only mandatory attribute of a rule */
+               if (!tb[OCI_LINUX_CGROUPS_DEVICES_ALLOW]) {
+                       ret = EINVAL;
+                       goto out;
+               }
+
+               allow = blobmsg_get_bool(tb[OCI_LINUX_CGROUPS_DEVICES_ALLOW]);
+
+               /* "access" is a string of 1..3 characters out of 'r', 'w', 'm' */
+               bpf_access = 0;
+               if (tb[OCI_LINUX_CGROUPS_DEVICES_ACCESS]) {
+                       access = blobmsg_get_string(tb[OCI_LINUX_CGROUPS_DEVICES_ACCESS]);
+                       if ((strlen(access) > 3) || (strlen(access) == 0)) {
+                               ret = EINVAL;
+                               goto out;
+                       }
+
+                       for (acidx = 0; acidx < strlen(access); ++acidx) {
+                               switch (access[acidx]) {
+                                       case 'r':
+                                               bpf_access |= BPF_DEVCG_ACC_READ;
+                                               break;
+                                       case 'w':
+                                               bpf_access |= BPF_DEVCG_ACC_WRITE;
+                                               break;
+                                       case 'm':
+                                               bpf_access |= BPF_DEVCG_ACC_MKNOD;
+                                               break;
+                                       default:
+                                               ret = EINVAL;
+                                               goto out;
+                               }
+                       }
+               }
+
+               /* no "access" attribute means all access types */
+               if (!bpf_access)
+                       bpf_access = DEVCG_ACC_ALL;
+
+               /* only the first character of "type" is examined */
+               bpf_type = 0;
+               if (tb[OCI_LINUX_CGROUPS_DEVICES_TYPE]) {
+                       devtype = blobmsg_get_string(tb[OCI_LINUX_CGROUPS_DEVICES_TYPE]);
+
+                       switch (devtype[0]) {
+                               case 'c':
+                                       bpf_type = BPF_DEVCG_DEV_CHAR;
+                                       break;
+                               case 'b':
+                                       bpf_type = BPF_DEVCG_DEV_BLOCK;
+                                       break;
+                               case 'a':
+                                       bpf_type = DEVCG_DEV_ALL;
+                                       break;
+                               default:
+                                       ret = EINVAL;
+                                       goto out;
+                       }
+               }
+
+               /* no "type" attribute means all device types */
+               if (!bpf_type)
+                       bpf_type = DEVCG_DEV_ALL;
+
+               /* a missing major/minor is stored as the ~0 wildcard */
+               if (tb[OCI_LINUX_CGROUPS_DEVICES_MAJOR])
+                       devmajor = blobmsg_cast_u64(tb[OCI_LINUX_CGROUPS_DEVICES_MAJOR]);
+               else
+                       devmajor = ~0;
+
+               if (tb[OCI_LINUX_CGROUPS_DEVICES_MINOR])
+                       devminor = blobmsg_cast_u64(tb[OCI_LINUX_CGROUPS_DEVICES_MINOR]);
+               else
+                       devminor = ~0;
+
+               if (bpf_type == DEVCG_DEV_ALL) {
+                       /* wildcard => change default policy and flush all existing rules */
+                       flush_exceptions(&exceptions);
+                       behavior = allow?DEVCG_DEFAULT_ALLOW:DEVCG_DEFAULT_DENY;
+               } else {
+                       /* allocate and populate record for exception */
+                       dl = malloc(sizeof(struct dev_exception_item));
+                       if (!dl) {
+                               /* report allocation failure as ENOMEM, like the other allocations in this file */
+                               ret = ENOMEM;
+                               break;
+                       }
+                       dl->allow = allow;
+                       dl->type = bpf_type;
+                       dl->access = bpf_access;
+                       dl->major = devmajor;
+                       dl->minor = devminor;
+
+                       /* push to exceptions list, last goes first */
+                       list_add(&dl->list, &exceptions);
+               }
+       }
+       if (ret)
+               goto out;
+
+       /* add default rules */
+       ret = add_default_exceptions(&exceptions);
+       if (ret)
+               goto out;
+
+       /* calculate number of instructions to allocate */
+       list_for_each_entry(dl, &exceptions, list) {
+               has_access = dl->access != DEVCG_ACC_ALL;
+               has_type = dl->type != DEVCG_DEV_ALL;
+               has_major = dl->major != ~0;
+               has_minor = dl->minor != ~0;
+
+               /* per rule: optional checks plus the two-instruction verdict (mov + exit) */
+               total_ins += (has_type ? 1 : 0) + (has_access ? 3 : 0) + (has_major ? 1 : 0) + (has_minor ? 1 : 0) + 2;
+       }
+
+       /* account for loader instructions */
+       total_ins += pre_insn_len;
+
+       /* final accept/deny block */
+       total_ins += 2;
+
+       /* allocate memory for eBPF program */
+       program = calloc(total_ins, sizeof(struct bpf_insn));
+       if (!program) {
+               ret = ENOMEM;
+               goto out;
+       }
+
+       /* copy program loader instructions */
+       memcpy(program, &pre_insn, sizeof(pre_insn));
+       cur_ins = pre_insn_len;
+
+       /* generate eBPF program */
+       list_for_each_entry(dl, &exceptions, list) {
+               has_access = dl->access != DEVCG_ACC_ALL;
+               has_type = dl->type != DEVCG_DEV_ALL;
+               has_major = dl->major != ~0;
+               has_minor = dl->minor != ~0;
+
+               /*
+                * next_ins is the jump offset that skips the rest of this
+                * rule, including its verdict; it shrinks as the rule's
+                * checks are emitted.
+                */
+               next_ins = (has_type ? 1 : 0) + (has_access ? 3 : 0) + (has_major ? 1 : 0) + (has_minor ? 1 : 0) + 1;
+
+               if (has_type) {
+                       program[cur_ins++] = BPF_JMP_IMM(BPF_JNE, BPF_REG_2, dl->type, next_ins);
+                       --next_ins;
+               }
+
+               if (has_access) {
+                       /* skip the rule unless (requested & rule access) == requested */
+                       program[cur_ins++] = BPF_MOV32_REG(BPF_REG_1, BPF_REG_3);
+                       program[cur_ins++] = BPF_ALU32_IMM(BPF_AND, BPF_REG_1, dl->access);
+                       program[cur_ins++] = BPF_JMP_REG(BPF_JNE, BPF_REG_1, BPF_REG_3, next_ins - 2);
+                       next_ins -= 3;
+               }
+
+               if (has_major) {
+                       program[cur_ins++] = BPF_JMP_IMM(BPF_JNE, BPF_REG_4, dl->major, next_ins);
+                       --next_ins;
+               }
+
+               if (has_minor) {
+                       program[cur_ins++] = BPF_JMP_IMM(BPF_JNE, BPF_REG_5, dl->minor, next_ins);
+                       --next_ins;
+               }
+
+               program[cur_ins++] = BPF_MOV64_IMM(BPF_REG_0, dl->allow ? 1 : 0);
+               program[cur_ins++] = BPF_EXIT_INSN();
+       }
+
+       /* default behavior */
+       program[cur_ins++] = BPF_MOV64_IMM(BPF_REG_0, (behavior == DEVCG_DEFAULT_ALLOW)?1:0);
+       program[cur_ins++] = BPF_EXIT_INSN();
+
+       /* check the instruction count before the debug dump below reuses cur_ins as its loop index */
+       assert(cur_ins == total_ins);
+
+       if (debug) {
+               fprintf(stderr, "cgroup devices:\na > devices.%s\n",
+                       (behavior == DEVCG_DEFAULT_ALLOW)?"allow":"deny");
+
+               list_for_each_entry(dl, &exceptions, list)
+                       fprintf(stderr, "%c %d:%d %s%s%s > devices.%s\n",
+                               (dl->type == DEVCG_DEV_ALL)?'a':
+                                       (dl->type == BPF_DEVCG_DEV_CHAR)?'c':'b',
+                               (dl->major == ~0)?-1:dl->major,
+                               (dl->minor == ~0)?-1:dl->minor,
+                               (dl->access & BPF_DEVCG_ACC_READ)?"r":"",
+                               (dl->access & BPF_DEVCG_ACC_WRITE)?"w":"",
+                               (dl->access & BPF_DEVCG_ACC_MKNOD)?"m":"",
+                               (dl->allow)?"allow":"deny");
+
+               fprintf(stderr, "generated cgroup-devices eBPF program:\n");
+               fprintf(stderr, " [idx]\tcode\t dest\t src\t off\t imm\n");
+               for (cur_ins=0; cur_ins<total_ins; cur_ins++)
+                       fprintf(stderr, " [%03d]\t%02hhx\t%3hhu\t%3hhu\t%04hx\t%d\n", cur_ins,
+                               program[cur_ins].code,
+                               program[cur_ins].dst_reg,
+                               program[cur_ins].src_reg,
+                               program[cur_ins].off,
+                               program[cur_ins].imm);
+       }
+
+       bpf_total_insn = total_ins;
+       ret = 0;
+
+out:
+       /* the rule list is only needed while generating the program */
+       flush_exceptions(&exceptions);
+       return ret;
+}
+
+/*
+ * attach eBPF program to cgroup
+ */
+int attach_cgroups_ebpf(int cgroup_dirfd) {
+       /*
+        * Load the program built by parseOCIlinuxcgroups_devices() and
+        * attach it to the cgroup directory referred to by cgroup_dirfd.
+        *
+        * Returns 0 when no program was generated, EIO when loading
+        * fails, otherwise the result of the BPF_PROG_ATTACH call.
+        */
+       int prog_fd;
+       /* bpf_attr carries pointers as integers; uintptr_t converts them
+        * without depending on __WORDSIZE */
+       union bpf_attr load_attr = {
+               .prog_type = BPF_PROG_TYPE_CGROUP_DEVICE,
+               .license   = (uintptr_t)license,
+               .insns     = (uintptr_t)program,
+               .insn_cnt  = bpf_total_insn,
+       };
+
+       /* nothing to attach when no device rules were parsed */
+       if (!program)
+               return 0;
+
+       prog_fd = syscall_bpf(BPF_PROG_LOAD, &load_attr, sizeof(load_attr));
+       if (prog_fd < 0)
+               return EIO;
+
+       union bpf_attr attach_attr = {
+               .attach_type = BPF_CGROUP_DEVICE,
+               .target_fd = cgroup_dirfd,
+               .attach_bpf_fd = prog_fd,
+       };
+
+       return syscall_bpf(BPF_PROG_ATTACH, &attach_attr, sizeof (attach_attr));
+}
diff --git a/jail/cgroups-bpf.h b/jail/cgroups-bpf.h
new file mode 100644 (file)
index 0000000..7f33d85
--- /dev/null
@@ -0,0 +1,20 @@
+/*
+ * Copyright (C) 2021 Daniel Golle <daniel@makrotopia.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License version 2.1
+ * as published by the Free Software Foundation
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef _JAIL_CGROUPS_BPF_H
+#define _JAIL_CGROUPS_BPF_H
+
+/*
+ * NOTE(review): struct blob_attr is not declared here; presumably the
+ * includer is expected to pull in <libubox/blobmsg.h> first -- confirm.
+ */
+
+/*
+ * parse the OCI "devices" cgroup array; a non-zero return value is treated
+ * as an error by parseOCIlinuxcgroups()
+ */
+int parseOCIlinuxcgroups_devices(struct blob_attr *msg);
+
+/* load the cgroup-devices eBPF program and attach it to the cgroup directory fd */
+int attach_cgroups_ebpf(int cgroup_dirfd);
+
+#endif
diff --git a/jail/cgroups.c b/jail/cgroups.c
new file mode 100644 (file)
index 0000000..2d3dce4
--- /dev/null
@@ -0,0 +1,863 @@
+/*
+ * Copyright (C) 2020 Daniel Golle <daniel@makrotopia.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License version 2.1
+ * as published by the Free Software Foundation
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * reads unified cgroup config as proposed in
+ * https://github.com/opencontainers/runtime-spec/pull/1040
+ * attempt conversion from cgroup1 -> cgroup2
+ * https://github.com/containers/crun/blob/0.14.1/crun.1.md#cgroup-v2
+ *
+ * ToDo:
+ *  - convert cgroup1 net_prio and net_cls to eBPF program
+ *  - rdma (anyone?) intelrdt (anyone?)
+ */
+
+#define _GNU_SOURCE
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <unistd.h>
+#include <libgen.h>
+#include <inttypes.h>
+
+#include <libubox/avl.h>
+#include <libubox/avl-cmp.h>
+#include <libubox/blobmsg.h>
+#include <libubox/list.h>
+#include <libubox/utils.h>
+
+#include "log.h"
+#include "cgroups.h"
+#include "cgroups-bpf.h"
+
+#define CGROUP_ROOT "/sys/fs/cgroup/"
+#define CGROUP_IO_WEIGHT_MAX 10000
+
+/* one pending cgroup2 assignment: avl.key holds the file name, val the content */
+struct cgval {
+       struct avl_node avl;
+       char *val;
+};
+
+/* pending assignments, keyed by cgroup file name (compared with avl_strcmp) */
+struct avl_tree cgvals;
+/* private copy of the cgroup directory path passed to cgroups_init() */
+static char *cgroup_path;
+/* true once cgroups_init() has run; checked by cgroups_free() */
+static bool initialized;
+
+/* flag the cgroups state as not set up, which turns cgroups_free() into a no-op */
+void cgroups_prepare(void)
+{
+       initialized = false;
+}
+
+/*
+ * set up the (empty) tree of pending cgroup2 assignments and keep a private
+ * copy of the cgroup directory path p
+ *
+ * exits with ENOMEM if the path cannot be duplicated, like the other
+ * allocation failures in this file
+ */
+void cgroups_init(const char *p) {
+       avl_init(&cgvals, avl_strcmp, false, NULL);
+
+       cgroup_path = strdup(p);
+       if (!cgroup_path)
+               exit(ENOMEM);
+
+       initialized = true;
+}
+
+/*
+ * record the assignment key=val for later application by cgroups_apply()
+ *
+ * key: name of the cgroup2 file, val: content to write to it; both are
+ * copied. An existing assignment for the same key is replaced.
+ *
+ * exits with ENOMEM on any allocation failure
+ */
+static void cgroups_set(const char *key, const char *val)
+{
+       struct cgval *valp;
+       char *newval;
+
+       /* duplicate first so that a failure cannot leave a half-built entry */
+       newval = strdup(val);
+       if (!newval)
+               exit(ENOMEM);
+
+       valp = avl_find_element(&cgvals, key, valp, avl);
+       if (!valp) {
+               valp = malloc(sizeof(struct cgval));
+               if (!valp)
+                       exit(ENOMEM);
+
+               valp->avl.key = strdup(key);
+               if (!valp->avl.key)
+                       exit(ENOMEM);
+
+               avl_insert(&cgvals, &valp->avl);
+       } else {
+               DEBUG("overwriting previous cgroup2 assignment %s=\"%s\"!\n", key, valp->val);
+               free(valp->val);
+       }
+
+       valp->val = newval;
+}
+
+/*
+ * release all pending assignments and the stored cgroup path
+ *
+ * does nothing unless cgroups_init() has been called; the state is reset
+ * afterwards so that calling this function again is harmless
+ */
+void cgroups_free(void)
+{
+       struct cgval *valp, *tmp;
+
+       if (!initialized)
+               return;
+
+       avl_remove_all_elements(&cgvals, valp, avl, tmp) {
+               free((void *)(valp->avl.key));
+               free(valp->val);
+               free(valp);
+       }
+
+       free(cgroup_path);
+       /* avoid a dangling pointer and a double free on a repeated call */
+       cgroup_path = NULL;
+       initialized = false;
+}
+
+/*
+ * create the cgroup directory, enable the controllers required by the
+ * pending assignments in every parent cgroup, write the assignments, attach
+ * the devices eBPF program and finally move process pid into the cgroup
+ *
+ * failures to open or write individual files are logged and otherwise
+ * ignored; exits with ENOMEM if the path buffer cannot be allocated
+ *
+ * NOTE(review): assumes cgroup_path starts with CGROUP_ROOT -- confirm
+ * against the callers of cgroups_init()
+ */
+void cgroups_apply(pid_t pid)
+{
+       struct cgval *valp;
+       char *cdir, *ent;
+       int fd, cgfd;
+       size_t maxlen = strlen("cgroup.subtree_control");
+       size_t len;
+
+       bool cpuset = false,
+            cpu = false,
+            hugetlb = false,
+            io = false,
+            memory = false,
+            pids = false,
+            rdma = false;
+
+       /* large enough for all seven "+controller " tokens (46 bytes) */
+       char subtree_control[64] = { 0 };
+
+       DEBUG("using cgroup path %s\n", cgroup_path);
+       mkdir_p(cgroup_path, 0700);
+
+       /* find which controllers need to be enabled */
+       avl_for_each_element(&cgvals, valp, avl) {
+               ent = (char *)valp->avl.key;
+               if (strlen(ent) > maxlen)
+                       maxlen = strlen(ent);
+
+               if (!strncmp("cpuset.", ent, 7))
+                       cpuset = true;
+               else if (!strncmp("cpu.", ent, 4))
+                       cpu = true;
+               else if (!strncmp("hugetlb.", ent, 8))
+                       hugetlb = true;
+               else if (!strncmp("io.", ent, 3))
+                       io = true;
+               else if (!strncmp("memory.", ent, 7))
+                       memory = true;
+               else if (!strncmp("pids.", ent, 5))
+                       pids = true;
+               else if (!strncmp("rdma.", ent, 5))
+                       rdma = true;
+       }
+
+       /* room for "<cgroup_path>/<longest file name>\0" */
+       maxlen += strlen(cgroup_path) + 2;
+
+       if (cpuset)
+               strcat(subtree_control, "+cpuset ");
+
+       if (cpu)
+               strcat(subtree_control, "+cpu ");
+
+       if (hugetlb)
+               strcat(subtree_control, "+hugetlb ");
+
+       if (io)
+               strcat(subtree_control, "+io ");
+
+       if (memory)
+               strcat(subtree_control, "+memory ");
+
+       if (pids)
+               strcat(subtree_control, "+pids ");
+
+       if (rdma)
+               strcat(subtree_control, "+rdma ");
+
+       /*
+        * remove trailing space; the string is empty if no controller is
+        * needed, in which case there is nothing to strip
+        */
+       len = strlen(subtree_control);
+       if (len)
+               subtree_control[len - 1] = '\0';
+
+       ent = malloc(maxlen);
+       if (!ent)
+               exit(ENOMEM);
+
+       DEBUG("recursively applying cgroup.subtree_control = \"%s\"\n", subtree_control);
+       /* start so that the first '/' found is the one terminating CGROUP_ROOT */
+       cdir = &cgroup_path[strlen(CGROUP_ROOT) - 2];
+       while ((cdir = strchr(cdir + 1, '/'))) {
+               /* temporarily cut the path at this level to name the parent cgroup */
+               *cdir = '\0';
+               snprintf(ent, maxlen, "%s/cgroup.subtree_control", cgroup_path);
+               /*
+                * restore the separator right away so that none of the error
+                * paths below can leave cgroup_path truncated
+                */
+               *cdir = '/';
+
+               DEBUG(" * %s\n", ent);
+               fd = open(ent, O_WRONLY);
+               if (fd < 0) {
+                       ERROR("can't open %s: %m\n", ent);
+                       continue;
+               }
+
+               if (write(fd, subtree_control, strlen(subtree_control)) == -1)
+                       ERROR("can't write to %s: %m\n", ent);
+
+               close(fd);
+       }
+
+       avl_for_each_element(&cgvals, valp, avl) {
+               DEBUG("applying cgroup2 %s=\"%s\"\n", (char *)valp->avl.key, valp->val);
+               snprintf(ent, maxlen, "%s/%s", cgroup_path, (char *)valp->avl.key);
+               fd = open(ent, O_WRONLY);
+               if (fd < 0) {
+                       ERROR("can't open %s: %m\n", ent);
+                       continue;
+               }
+               if (dprintf(fd, "%s", valp->val) < 0)
+                       ERROR("can't write to %s: %m\n", ent);
+
+               close(fd);
+       }
+
+       cgfd = open(cgroup_path, O_DIRECTORY);
+       if (cgfd < 0) {
+               ERROR("can't open %s: %m\n", cgroup_path);
+       } else {
+               attach_cgroups_ebpf(cgfd);
+               close(cgfd);
+       }
+
+       snprintf(ent, maxlen, "%s/%s", cgroup_path, "cgroup.procs");
+       fd = open(ent, O_WRONLY);
+       if (fd < 0) {
+               /* report the file that failed, not just the directory */
+               ERROR("can't open %s: %m\n", ent);
+       } else {
+               if (dprintf(fd, "%d", pid) < 0)
+                       ERROR("can't write to %s: %m\n", ent);
+
+               close(fd);
+       }
+
+       free(ent);
+}
+
+/* indices of the fields of one entry of the blockIO "weightDevice" array */
+enum {
+       OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_MAJOR,
+       OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_MINOR,
+       OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_WEIGHT,
+       OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_LEAFWEIGHT,
+       __OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_MAX,
+};
+
+/* blobmsg policy matching the indices above: JSON key name and expected type */
+static const struct blobmsg_policy oci_linux_cgroups_blockio_weightdevice_policy[] = {
+       [OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_MAJOR] = { "major", BLOBMSG_CAST_INT64 },
+       [OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_MINOR] = { "minor", BLOBMSG_CAST_INT64 },
+       [OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_WEIGHT] = { "weight", BLOBMSG_TYPE_INT32 },
+       [OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_LEAFWEIGHT] = { "leafWeight", BLOBMSG_TYPE_INT32 },
+};
+
+/* indices of the fields of one entry of any of the blockIO "throttle*Device" arrays */
+enum {
+       OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAJOR,
+       OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MINOR,
+       OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_RATE,
+       __OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAX,
+};
+
+/* blobmsg policy shared by all four throttle arrays */
+static const struct blobmsg_policy oci_linux_cgroups_blockio_throttledevice_policy[] = {
+       [OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAJOR] = { "major", BLOBMSG_CAST_INT64 },
+       [OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MINOR] = { "minor", BLOBMSG_CAST_INT64 },
+       [OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_RATE] = { "rate", BLOBMSG_CAST_INT64 },
+};
+
+/* indices of the members of the "blockIO" table */
+enum {
+       OCI_LINUX_CGROUPS_BLOCKIO_WEIGHT,
+       OCI_LINUX_CGROUPS_BLOCKIO_LEAFWEIGHT,
+       OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE,
+       OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEREADBPSDEVICE,
+       OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEWRITEBPSDEVICE,
+       OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEREADIOPSDEVICE,
+       OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEWRITEIOPSDEVICE,
+       __OCI_LINUX_CGROUPS_BLOCKIO_MAX,
+};
+
+/* blobmsg policy for the "blockIO" table, consumed by parseOCIlinuxcgroups_legacy_blockio() */
+static const struct blobmsg_policy oci_linux_cgroups_blockio_policy[] = {
+       [OCI_LINUX_CGROUPS_BLOCKIO_WEIGHT] = { "weight", BLOBMSG_TYPE_INT32 },
+       [OCI_LINUX_CGROUPS_BLOCKIO_LEAFWEIGHT] = { "leafWeight", BLOBMSG_TYPE_INT32 },
+       [OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE] = { "weightDevice", BLOBMSG_TYPE_ARRAY },
+       [OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEREADBPSDEVICE] = { "throttleReadBpsDevice", BLOBMSG_TYPE_ARRAY },
+       [OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEWRITEBPSDEVICE] = { "throttleWriteBpsDevice", BLOBMSG_TYPE_ARRAY },
+       [OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEREADIOPSDEVICE] = { "throttleReadIOPSDevice", BLOBMSG_TYPE_ARRAY },
+       [OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEWRITEIOPSDEVICE] = { "throttleWriteIOPSDevice", BLOBMSG_TYPE_ARRAY },
+};
+
+/* device number pair, used as the key of the iomax tree */
+struct posix_dev {
+       uint64_t major;
+       uint64_t minor;
+};
+
+/*
+ * collected throttle limits of one device; each limit is initialised to -1
+ * by get_iomax_line() and left out of the generated line while it keeps
+ * that value
+ */
+struct iomax_line {
+       struct avl_node avl;
+       struct posix_dev dev;
+       uint64_t rbps;
+       uint64_t wbps;
+       uint64_t riops;
+       uint64_t wiops;
+};
+
+/*
+ * AVL comparator for struct posix_dev keys: orders by major number first
+ * and by minor number second; ptr is unused
+ */
+static int avl_devcmp(const void *k1, const void *k2, void *ptr)
+{
+       const struct posix_dev *lhs = k1;
+       const struct posix_dev *rhs = k2;
+
+       if (lhs->major != rhs->major)
+               return (lhs->major < rhs->major) ? -1 : 1;
+
+       if (lhs->minor != rhs->minor)
+               return (lhs->minor < rhs->minor) ? -1 : 1;
+
+       return 0;
+}
+
+/*
+ * return the line for device major:minor from the iomax tree, inserting a
+ * new one with all four limits set to -1 if the device is not present yet
+ *
+ * exits with ENOMEM if a new line cannot be allocated
+ */
+static struct iomax_line *get_iomax_line(struct avl_tree *iomax, uint64_t major, uint64_t minor)
+{
+       struct posix_dev key = { .major = major, .minor = minor };
+       struct iomax_line *line;
+
+       line = avl_find_element(iomax, &key, line, avl);
+       if (line)
+               return line;
+
+       line = malloc(sizeof(struct iomax_line));
+       if (!line)
+               exit(ENOMEM);
+
+       /* the tree key points into the element itself */
+       line->dev = key;
+       line->avl.key = &line->dev;
+
+       line->rbps = -1;
+       line->wbps = -1;
+       line->riops = -1;
+       line->wiops = -1;
+
+       avl_insert(iomax, &line->avl);
+
+       return line;
+}
+
+/* selects which limit of a struct iomax_line a throttle array applies to */
+enum iomax_field {
+       IOMAX_FIELD_RBPS,
+       IOMAX_FIELD_WBPS,
+       IOMAX_FIELD_RIOPS,
+       IOMAX_FIELD_WIOPS,
+};
+
+/* free every string of a NULL-terminated vector and the vector itself */
+static void blockio_free_strv(char **strv)
+{
+       char **cur;
+
+       if (!strv)
+               return;
+
+       for (cur = strv; *cur; ++cur)
+               free(*cur);
+
+       free(strv);
+}
+
+/*
+ * concatenate a NULL-terminated vector of strings, appending a newline to
+ * each entry; the returned buffer is owned by the caller
+ *
+ * exits with ENOMEM if the buffer cannot be allocated
+ */
+static char *blockio_join_lines(char **strv)
+{
+       size_t totlen = 1; /* 1 accounts for \0 at end of string */
+       char **cur, *joined;
+
+       for (cur = strv; *cur; ++cur)
+               totlen += strlen(*cur) + 1; /* +1 accounts for \n at end of line */
+
+       joined = calloc(totlen, sizeof(char));
+       if (!joined)
+               exit(ENOMEM);
+
+       for (cur = strv; *cur; ++cur) {
+               strcat(joined, *cur);
+               strcat(joined, "\n");
+       }
+
+       return joined;
+}
+
+/* remove and free all lines collected in the iomax tree */
+static void blockio_flush_iomax(struct avl_tree *iomax)
+{
+       struct iomax_line *cur, *tmp;
+
+       avl_remove_all_elements(iomax, cur, avl, tmp)
+               free(cur);
+}
+
+/*
+ * walk one of the throttle*Device arrays (arr may be NULL) and store the
+ * rate of each entry in the selected field of the matching per-device line
+ *
+ * returns 0 on success, ENODATA if an entry lacks major, minor or rate
+ */
+static int blockio_parse_throttle(struct avl_tree *iomax, struct blob_attr *arr, enum iomax_field field)
+{
+       struct blob_attr *tbtd[__OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAX], *cur;
+       struct iomax_line *l;
+       uint64_t rate;
+       int rem;
+
+       blobmsg_for_each_attr(cur, arr, rem) {
+               blobmsg_parse(oci_linux_cgroups_blockio_throttledevice_policy, __OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAX, tbtd, blobmsg_data(cur), blobmsg_len(cur));
+
+               if (!tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAJOR] ||
+                   !tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MINOR] ||
+                   !tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_RATE])
+                       return ENODATA;
+
+               l = get_iomax_line(iomax,
+                                  blobmsg_cast_u64(tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MAJOR]),
+                                  blobmsg_cast_u64(tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_MINOR]));
+
+               rate = blobmsg_cast_u64(tbtd[OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEDEVICE_RATE]);
+
+               switch (field) {
+               case IOMAX_FIELD_RBPS:
+                       l->rbps = rate;
+                       break;
+               case IOMAX_FIELD_WBPS:
+                       l->wbps = rate;
+                       break;
+               case IOMAX_FIELD_RIOPS:
+                       l->riops = rate;
+                       break;
+               case IOMAX_FIELD_WIOPS:
+                       l->wiops = rate;
+                       break;
+               }
+       }
+
+       return 0;
+}
+
+/*
+ * convert the OCI (cgroup1 style) "blockIO" table into the cgroup2
+ * assignments io.bfq.weight and io.max
+ *
+ * returns 0 on success, ERANGE if a weight exceeds CGROUP_IO_WEIGHT_MAX,
+ * ENODATA if a device entry lacks a mandatory member, ENOTSUP if a device
+ * entry carries a leafWeight and ENOMEM if formatting a line fails; other
+ * allocation failures exit with ENOMEM
+ */
+static int parseOCIlinuxcgroups_legacy_blockio(struct blob_attr *msg)
+{
+       /* the four throttle arrays and the limit each of them feeds */
+       static const struct {
+               int attr;
+               enum iomax_field field;
+       } throttles[] = {
+               { OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEREADBPSDEVICE, IOMAX_FIELD_RBPS },
+               { OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEWRITEBPSDEVICE, IOMAX_FIELD_WBPS },
+               { OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEREADIOPSDEVICE, IOMAX_FIELD_RIOPS },
+               { OCI_LINUX_CGROUPS_BLOCKIO_THROTTLEWRITEIOPSDEVICE, IOMAX_FIELD_WIOPS },
+       };
+       struct blob_attr *tb[__OCI_LINUX_CGROUPS_BLOCKIO_MAX],
+                        *tbwd[__OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_MAX],
+                        *cur;
+       int rem, ret = 0;
+       int weight = -1, leafweight = -1;
+       size_t numweightstrs = 0, numiomaxstrs = 0, i;
+       char **weightstrs = NULL, **iomaxstrs = NULL;
+       char *joined;
+       struct avl_tree iomax;
+       struct iomax_line *curiomax;
+
+       blobmsg_parse(oci_linux_cgroups_blockio_policy, __OCI_LINUX_CGROUPS_BLOCKIO_MAX, tb, blobmsg_data(msg), blobmsg_len(msg));
+
+       if (tb[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHT]) {
+               weight = blobmsg_get_u32(tb[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHT]);
+               ++numweightstrs;
+       }
+
+       if (weight > CGROUP_IO_WEIGHT_MAX)
+               return ERANGE;
+
+       /* read leafWeight from its own attribute, which may exist without weight */
+       if (tb[OCI_LINUX_CGROUPS_BLOCKIO_LEAFWEIGHT])
+               leafweight = blobmsg_get_u32(tb[OCI_LINUX_CGROUPS_BLOCKIO_LEAFWEIGHT]);
+
+       if (leafweight > CGROUP_IO_WEIGHT_MAX)
+               return ERANGE;
+
+       /* set up before the first jump to the common cleanup below */
+       avl_init(&iomax, avl_devcmp, false, NULL);
+
+       blobmsg_for_each_attr(cur, tb[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE], rem)
+               ++numweightstrs;
+
+       /* one extra slot keeps the vector NULL-terminated */
+       weightstrs = calloc(numweightstrs + 1, sizeof(char *));
+       if (!weightstrs)
+               exit(ENOMEM);
+
+       numweightstrs = 0;
+
+       if (weight > -1) {
+               if (asprintf(&weightstrs[numweightstrs], "default %d", weight) < 0) {
+                       /* the pointer is undefined after a failed asprintf() */
+                       weightstrs[numweightstrs] = NULL;
+                       ret = ENOMEM;
+                       goto out;
+               }
+               ++numweightstrs;
+       }
+
+       blobmsg_for_each_attr(cur, tb[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE], rem) {
+               uint64_t major, minor;
+               int devweight = weight, devleafweight = leafweight;
+
+               blobmsg_parse(oci_linux_cgroups_blockio_weightdevice_policy, __OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_MAX, tbwd, blobmsg_data(cur), blobmsg_len(cur));
+               if (!tbwd[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_MAJOR] ||
+                   !tbwd[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_MINOR]) {
+                       ret = ENODATA;
+                       goto out;
+               }
+
+               if (!tbwd[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_WEIGHT] &&
+                   !tbwd[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_LEAFWEIGHT]) {
+                       ret = ENODATA;
+                       goto out;
+               }
+
+               if (tbwd[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_WEIGHT])
+                       devweight = blobmsg_get_u32(tbwd[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_WEIGHT]);
+
+               if (devweight > CGROUP_IO_WEIGHT_MAX) {
+                       ret = ERANGE;
+                       goto out;
+               }
+
+               if (tbwd[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_LEAFWEIGHT])
+                       devleafweight = blobmsg_get_u32(tbwd[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_LEAFWEIGHT]);
+
+               if (devleafweight > CGROUP_IO_WEIGHT_MAX) {
+                       ret = ERANGE;
+                       goto out;
+               }
+
+               /* a per-device leafWeight is validated above but cannot be applied */
+               if (tbwd[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_LEAFWEIGHT]) {
+                       ret = ENOTSUP;
+                       goto out;
+               }
+
+               major = blobmsg_cast_u64(tbwd[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_MAJOR]);
+               minor = blobmsg_cast_u64(tbwd[OCI_LINUX_CGROUPS_BLOCKIO_WEIGHTDEVICE_MINOR]);
+
+               if (asprintf(&weightstrs[numweightstrs], "%" PRIu64 ":%" PRIu64 " %u", major, minor, (unsigned int)devweight) < 0) {
+                       weightstrs[numweightstrs] = NULL;
+                       ret = ENOMEM;
+                       goto out;
+               }
+               ++numweightstrs;
+       }
+
+       if (numweightstrs) {
+               joined = blockio_join_lines(weightstrs);
+               cgroups_set("io.bfq.weight", joined);
+               free(joined);
+       }
+
+       for (i = 0; i < sizeof(throttles) / sizeof(throttles[0]); ++i) {
+               ret = blockio_parse_throttle(&iomax, tb[throttles[i].attr], throttles[i].field);
+               if (ret)
+                       goto out;
+       }
+
+       avl_for_each_element(&iomax, curiomax, avl)
+               ++numiomaxstrs;
+
+       if (!numiomaxstrs)
+               goto out;
+
+       iomaxstrs = calloc(numiomaxstrs + 1, sizeof(char *));
+       if (!iomaxstrs)
+               exit(ENOMEM);
+
+       numiomaxstrs = 0;
+
+       avl_for_each_element(&iomax, curiomax, avl) {
+               /* the longest possible line needs 150 bytes, so nothing below truncates */
+               char iomaxlstr[160];
+               int len;
+
+               len = snprintf(iomaxlstr, sizeof(iomaxlstr), "%" PRIu64 ":%" PRIu64 " ", curiomax->dev.major, curiomax->dev.minor);
+
+               /* UINT64_MAX is the "-1" stored by get_iomax_line() for unset limits */
+               if (curiomax->rbps != UINT64_MAX)
+                       len += snprintf(iomaxlstr + len, sizeof(iomaxlstr) - len, "rbps=%" PRIu64 " ", curiomax->rbps);
+
+               if (curiomax->wbps != UINT64_MAX)
+                       len += snprintf(iomaxlstr + len, sizeof(iomaxlstr) - len, "wbps=%" PRIu64 " ", curiomax->wbps);
+
+               if (curiomax->riops != UINT64_MAX)
+                       len += snprintf(iomaxlstr + len, sizeof(iomaxlstr) - len, "riops=%" PRIu64 " ", curiomax->riops);
+
+               if (curiomax->wiops != UINT64_MAX)
+                       len += snprintf(iomaxlstr + len, sizeof(iomaxlstr) - len, "wiops=%" PRIu64 " ", curiomax->wiops);
+
+               iomaxstrs[numiomaxstrs] = strdup(iomaxlstr);
+               if (!iomaxstrs[numiomaxstrs])
+                       exit(ENOMEM);
+
+               ++numiomaxstrs;
+       }
+
+       joined = blockio_join_lines(iomaxstrs);
+       cgroups_set("io.max", joined);
+       free(joined);
+
+out:
+       /* single cleanup path, reached on success and on every error above */
+       blockio_free_strv(weightstrs);
+       blockio_free_strv(iomaxstrs);
+       blockio_flush_iomax(&iomax);
+
+       return ret;
+}
+
+
+/* indices of the members of the "cpu" table */
+enum {
+       OCI_LINUX_CGROUPS_CPU_SHARES,
+       OCI_LINUX_CGROUPS_CPU_PERIOD,
+       OCI_LINUX_CGROUPS_CPU_QUOTA,
+       OCI_LINUX_CGROUPS_CPU_REALTIMERUNTIME,
+       OCI_LINUX_CGROUPS_CPU_REALTIMEPERIOD,
+       OCI_LINUX_CGROUPS_CPU_CPUS,
+       OCI_LINUX_CGROUPS_CPU_MEMS,
+       __OCI_LINUX_CGROUPS_CPU_MAX,
+};
+
+/*
+ * blobmsg policy for the "cpu" table; the designated initializers make the
+ * differing order of the two realtime entries irrelevant
+ */
+static const struct blobmsg_policy oci_linux_cgroups_cpu_policy[] = {
+       [OCI_LINUX_CGROUPS_CPU_SHARES] = { "shares", BLOBMSG_CAST_INT64 },
+       [OCI_LINUX_CGROUPS_CPU_PERIOD] = { "period", BLOBMSG_CAST_INT64 },
+       [OCI_LINUX_CGROUPS_CPU_QUOTA] = { "quota", BLOBMSG_CAST_INT64 }, /* signed int64! */
+       [OCI_LINUX_CGROUPS_CPU_REALTIMEPERIOD] = { "realtimePeriod", BLOBMSG_CAST_INT64 },
+       [OCI_LINUX_CGROUPS_CPU_REALTIMERUNTIME] = { "realtimeRuntime", BLOBMSG_CAST_INT64 },
+       [OCI_LINUX_CGROUPS_CPU_CPUS] = { "cpus", BLOBMSG_TYPE_STRING },
+       [OCI_LINUX_CGROUPS_CPU_MEMS] = { "mems", BLOBMSG_TYPE_STRING },
+};
+
+/*
+ * convert the OCI (cgroup1 style) "cpu" table into the cgroup2 assignments
+ * cpu.weight, cpu.max, cpuset.cpus and cpuset.mems
+ *
+ * returns 0 on success, ENOTSUP if a realtime setting is present and
+ * ERANGE if shares is outside of 2..262144
+ */
+static int parseOCIlinuxcgroups_legacy_cpu(struct blob_attr *msg)
+{
+       struct blob_attr *tb[__OCI_LINUX_CGROUPS_CPU_MAX];
+       uint64_t shares, period = 0;
+       int64_t quota = -2; /* unset */
+       /* "<int64> <uint64>" needs up to 41 bytes including \0, so 32 would truncate */
+       char tmp[48] = { 0 };
+
+       blobmsg_parse(oci_linux_cgroups_cpu_policy, __OCI_LINUX_CGROUPS_CPU_MAX, tb, blobmsg_data(msg), blobmsg_len(msg));
+
+       if (tb[OCI_LINUX_CGROUPS_CPU_REALTIMEPERIOD] ||
+           tb[OCI_LINUX_CGROUPS_CPU_REALTIMERUNTIME])
+               return ENOTSUP; /* no equivalent in cgroup2 */
+
+       if (tb[OCI_LINUX_CGROUPS_CPU_SHARES]) {
+               shares = blobmsg_cast_u64(tb[OCI_LINUX_CGROUPS_CPU_SHARES]);
+               if ((shares < 2) || (shares > 262144))
+                       return ERANGE;
+
+               /* map shares 2..262144 linearly onto weight 1..10000 */
+               snprintf(tmp, sizeof(tmp), "%" PRIu64, (((uint64_t)1) + ((shares - 2) * 9999) / 262142));
+               cgroups_set("cpu.weight", tmp);
+               tmp[0] = '\0';
+       }
+
+       if (tb[OCI_LINUX_CGROUPS_CPU_QUOTA])
+               quota = blobmsg_cast_s64(tb[OCI_LINUX_CGROUPS_CPU_QUOTA]);
+
+       if (tb[OCI_LINUX_CGROUPS_CPU_PERIOD])
+               period = blobmsg_cast_u64(tb[OCI_LINUX_CGROUPS_CPU_PERIOD]);
+
+       if (period) {
+               if (quota >= 0)
+                       snprintf(tmp, sizeof(tmp), "%" PRId64 " %" PRIu64 , quota, period);
+               else
+                       snprintf(tmp, sizeof(tmp), "max %" PRIu64, period); /* assume default */
+       } else if (quota >= 0) {
+               snprintf(tmp, sizeof(tmp), "%" PRId64, quota);
+       } else if (quota == -1) {
+               strcpy(tmp, "max");
+       }
+
+       /* tmp stays empty when neither quota nor period yields a value */
+       if (tmp[0])
+               cgroups_set("cpu.max", tmp);
+
+       if (tb[OCI_LINUX_CGROUPS_CPU_CPUS])
+               cgroups_set("cpuset.cpus", blobmsg_get_string(tb[OCI_LINUX_CGROUPS_CPU_CPUS]));
+
+       if (tb[OCI_LINUX_CGROUPS_CPU_MEMS])
+               cgroups_set("cpuset.mems", blobmsg_get_string(tb[OCI_LINUX_CGROUPS_CPU_MEMS]));
+
+       return 0;
+}
+
+
+/* indices of the members of the "memory" table */
+enum {
+       OCI_LINUX_CGROUPS_MEMORY_LIMIT,
+       OCI_LINUX_CGROUPS_MEMORY_RESERVATION,
+       OCI_LINUX_CGROUPS_MEMORY_SWAP,
+       OCI_LINUX_CGROUPS_MEMORY_KERNEL,
+       OCI_LINUX_CGROUPS_MEMORY_KERNELTCP,
+       OCI_LINUX_CGROUPS_MEMORY_SWAPPINESS,
+       OCI_LINUX_CGROUPS_MEMORY_DISABLEOOMKILLER,
+       OCI_LINUX_CGROUPS_MEMORY_USEHIERARCHY,
+       __OCI_LINUX_CGROUPS_MEMORY_MAX,
+};
+
+/* blobmsg policy for the "memory" table, consumed by parseOCIlinuxcgroups_legacy_memory() */
+static const struct blobmsg_policy oci_linux_cgroups_memory_policy[] = {
+       [OCI_LINUX_CGROUPS_MEMORY_LIMIT] = { "limit", BLOBMSG_CAST_INT64 }, /* signed int64! */
+       [OCI_LINUX_CGROUPS_MEMORY_RESERVATION] = { "reservation", BLOBMSG_CAST_INT64 }, /* signed int64! */
+       [OCI_LINUX_CGROUPS_MEMORY_SWAP] = { "swap", BLOBMSG_CAST_INT64 }, /* signed int64! */
+       [OCI_LINUX_CGROUPS_MEMORY_KERNEL] = { "kernel", BLOBMSG_CAST_INT64 }, /* signed int64! ignored */
+       [OCI_LINUX_CGROUPS_MEMORY_KERNELTCP] = { "kernelTCP", BLOBMSG_CAST_INT64 }, /* signed int64! ignored */
+       [OCI_LINUX_CGROUPS_MEMORY_SWAPPINESS] = { "swappiness", BLOBMSG_CAST_INT64 },
+       [OCI_LINUX_CGROUPS_MEMORY_DISABLEOOMKILLER] = { "disableOOMKiller", BLOBMSG_TYPE_BOOL },
+       /* standard C99 designated initializer, with the '=' the other entries use */
+       [OCI_LINUX_CGROUPS_MEMORY_USEHIERARCHY] = { "useHierarchy", BLOBMSG_TYPE_BOOL },
+};
+
+/*
+ * convert the OCI (cgroup1 style) "memory" table into the cgroup2
+ * assignments memory.max, memory.low and memory.swap.max
+ *
+ * returns 0 on success, ENOTSUP if swappiness, disableOOMKiller or
+ * useHierarchy is present and EINVAL if the memory+swap limit is smaller
+ * than the memory limit
+ */
+static int parseOCIlinuxcgroups_legacy_memory(struct blob_attr *msg)
+{
+       struct blob_attr *tb[__OCI_LINUX_CGROUPS_MEMORY_MAX];
+       char tmp[32] = { 0 };
+       int64_t limit = -1, swap, reservation;
+
+       blobmsg_parse(oci_linux_cgroups_memory_policy, __OCI_LINUX_CGROUPS_MEMORY_MAX, tb, blobmsg_data(msg), blobmsg_len(msg));
+
+       /*
+        * not all properties of the OCI memory section can be mapped to cgroup2
+        * kernel memory accounting is always enabled and included in the set
+        *   memory limit, hence these options can be ignored
+        * disableOOMKiller could be emulated using oom_score_adj + seccomp eBPF
+        *   preventing self-upgrade (but allow downgrade)
+        *
+        * see also https://github.com/opencontainers/runtime-spec/issues/1005
+        */
+       if (tb[OCI_LINUX_CGROUPS_MEMORY_SWAPPINESS] ||
+           tb[OCI_LINUX_CGROUPS_MEMORY_DISABLEOOMKILLER] ||
+           tb[OCI_LINUX_CGROUPS_MEMORY_USEHIERARCHY])
+               return ENOTSUP;
+
+       if (tb[OCI_LINUX_CGROUPS_MEMORY_LIMIT]) {
+               limit = blobmsg_cast_s64(tb[OCI_LINUX_CGROUPS_MEMORY_LIMIT]);
+               if (limit == -1)
+                       strcpy(tmp, "max");
+               else
+                       snprintf(tmp, sizeof(tmp), "%" PRId64, limit);
+
+               cgroups_set("memory.max", tmp);
+       }
+
+       if (tb[OCI_LINUX_CGROUPS_MEMORY_RESERVATION]) {
+               reservation = blobmsg_cast_s64(tb[OCI_LINUX_CGROUPS_MEMORY_RESERVATION]);
+
+               if (reservation == -1)
+                       strcpy(tmp, "max");
+               else
+                       snprintf(tmp, sizeof(tmp), "%" PRId64, reservation);
+
+               cgroups_set("memory.low", tmp);
+       }
+
+       /*
+        * OCI 'swap' accounts for memory+swap while cgroup2 limits swap on
+        * its own, hence the memory limit has to be subtracted if there is one
+        */
+       if (tb[OCI_LINUX_CGROUPS_MEMORY_SWAP]) {
+               swap = blobmsg_cast_s64(tb[OCI_LINUX_CGROUPS_MEMORY_SWAP]);
+
+               if (swap == -1)
+                       strcpy(tmp, "max");
+               else if (limit == -1)
+                       /* no (or unlimited) memory limit: nothing to subtract */
+                       snprintf(tmp, sizeof(tmp), "%" PRId64, swap);
+               else if (swap >= limit)
+                       snprintf(tmp, sizeof(tmp), "%" PRId64, swap - limit);
+               else
+                       /* memory+swap below the memory limit is contradictory */
+                       return EINVAL;
+
+               /* the cgroup2 file is named memory.swap.max */
+               cgroups_set("memory.swap.max", tmp);
+       }
+
+       return 0;
+}
+
+
+/* indices of the members of the "pids" table */
+enum {
+       OCI_LINUX_CGROUPS_PIDS_LIMIT,
+       __OCI_LINUX_CGROUPS_PIDS_MAX,
+};
+
+/* blobmsg policy for the "pids" table, consumed by parseOCIlinuxcgroups_legacy_pids() */
+static const struct blobmsg_policy oci_linux_cgroups_pids_policy[] = {
+       [OCI_LINUX_CGROUPS_PIDS_LIMIT] = { "limit", BLOBMSG_CAST_INT64 },
+};
+
+/*
+ * convert the OCI "pids" table into the cgroup2 assignment pids.max
+ *
+ * a negative limit is taken as "no limit" and written as "max"
+ *
+ * returns 0 on success, EINVAL if "limit" is missing
+ */
+static int parseOCIlinuxcgroups_legacy_pids(struct blob_attr *msg)
+{
+       /* sized with the pids (not the memory) table size */
+       struct blob_attr *tb[__OCI_LINUX_CGROUPS_PIDS_MAX];
+       char tmp[32] = { 0 };
+       int64_t limit;
+
+       blobmsg_parse(oci_linux_cgroups_pids_policy, __OCI_LINUX_CGROUPS_PIDS_MAX, tb, blobmsg_data(msg), blobmsg_len(msg));
+
+       if (!tb[OCI_LINUX_CGROUPS_PIDS_LIMIT])
+               return EINVAL;
+
+       /* the limit is a signed value; printing -1 as unsigned yields a huge number */
+       limit = blobmsg_cast_s64(tb[OCI_LINUX_CGROUPS_PIDS_LIMIT]);
+       if (limit < 0)
+               strcpy(tmp, "max");
+       else
+               snprintf(tmp, sizeof(tmp), "%" PRId64, limit);
+
+       cgroups_set("pids.max", tmp);
+
+       return 0;
+}
+
+/*
+ * take over the string members of the "unified" table as cgroup2
+ * assignments, file name = member name
+ *
+ * returns 0 on success, EINVAL if a member is not a string, if its name
+ * contains a '/' or if it names one of the reserved cgroup.* files
+ */
+static int parseOCIlinuxcgroups_unified(struct blob_attr *msg)
+{
+       /* files that must not be set through the unified table */
+       static const char *const reserved[] = {
+               "cgroup.subtree_control",
+               "cgroup.procs",
+               "cgroup.threads",
+               "cgroup.freeze",
+       };
+       struct blob_attr *attr;
+       const char *name;
+       size_t i;
+       int rem;
+
+       blobmsg_for_each_attr(attr, msg, rem) {
+               if (blobmsg_type(attr) != BLOBMSG_TYPE_STRING)
+                       return EINVAL;
+
+               name = blobmsg_name(attr);
+
+               /* restrict keys */
+               if (strchr(name, '/'))
+                       return EINVAL;
+
+               for (i = 0; i < sizeof(reserved) / sizeof(reserved[0]); ++i)
+                       if (!strcmp(name, reserved[i]))
+                               return EINVAL;
+
+               cgroups_set(name, blobmsg_get_string(attr));
+       }
+
+       return 0;
+}
+
+/* indices of the members of the table handled by parseOCIlinuxcgroups() */
+enum {
+       OCI_LINUX_CGROUPS_BLOCKIO,
+       OCI_LINUX_CGROUPS_CPU,
+       OCI_LINUX_CGROUPS_DEVICES,
+       OCI_LINUX_CGROUPS_HUGEPAGELIMITS,
+       OCI_LINUX_CGROUPS_INTELRDT,
+       OCI_LINUX_CGROUPS_MEMORY,
+       OCI_LINUX_CGROUPS_NETWORK,
+       OCI_LINUX_CGROUPS_PIDS,
+       OCI_LINUX_CGROUPS_RDMA,
+       OCI_LINUX_CGROUPS_UNIFIED,
+       __OCI_LINUX_CGROUPS_MAX,
+};
+
+/*
+ * blobmsg policy matching the indices above; hugepageLimits, intelRdt,
+ * network and rdma are only parsed in order to be rejected with ENOTSUP
+ */
+static const struct blobmsg_policy oci_linux_cgroups_policy[] = {
+       [OCI_LINUX_CGROUPS_BLOCKIO] = { "blockIO", BLOBMSG_TYPE_TABLE },
+       [OCI_LINUX_CGROUPS_CPU] = { "cpu", BLOBMSG_TYPE_TABLE },
+       [OCI_LINUX_CGROUPS_DEVICES] = { "devices", BLOBMSG_TYPE_ARRAY },
+       [OCI_LINUX_CGROUPS_HUGEPAGELIMITS] = { "hugepageLimits", BLOBMSG_TYPE_ARRAY },
+       [OCI_LINUX_CGROUPS_INTELRDT] = { "intelRdt", BLOBMSG_TYPE_TABLE },
+       [OCI_LINUX_CGROUPS_MEMORY] = { "memory", BLOBMSG_TYPE_TABLE },
+       [OCI_LINUX_CGROUPS_NETWORK] = { "network", BLOBMSG_TYPE_TABLE },
+       [OCI_LINUX_CGROUPS_PIDS] = { "pids", BLOBMSG_TYPE_TABLE },
+       [OCI_LINUX_CGROUPS_RDMA] = { "rdma", BLOBMSG_TYPE_TABLE },
+       [OCI_LINUX_CGROUPS_UNIFIED] = { "unified", BLOBMSG_TYPE_TABLE },
+};
+
+/*
+ * parse the cgroups table msg and hand each present section to its parser,
+ * in the order blockIO, cpu, devices, memory, pids, unified
+ *
+ * returns 0 on success, ENOTSUP if hugepageLimits, intelRdt, network or
+ * rdma is present, otherwise the non-zero result of the first section
+ * parser that fails (later sections are not parsed in that case)
+ */
+int parseOCIlinuxcgroups(struct blob_attr *msg)
+{
+       struct blob_attr *tb[__OCI_LINUX_CGROUPS_MAX];
+       int ret = 0;
+
+       blobmsg_parse(oci_linux_cgroups_policy, __OCI_LINUX_CGROUPS_MAX, tb, blobmsg_data(msg), blobmsg_len(msg));
+
+       if (tb[OCI_LINUX_CGROUPS_HUGEPAGELIMITS] ||
+           tb[OCI_LINUX_CGROUPS_INTELRDT] ||
+           tb[OCI_LINUX_CGROUPS_NETWORK] ||
+           tb[OCI_LINUX_CGROUPS_RDMA])
+               return ENOTSUP;
+
+       /* every step is skipped as soon as an earlier one has failed */
+       if (tb[OCI_LINUX_CGROUPS_BLOCKIO])
+               ret = parseOCIlinuxcgroups_legacy_blockio(tb[OCI_LINUX_CGROUPS_BLOCKIO]);
+
+       if (!ret && tb[OCI_LINUX_CGROUPS_CPU])
+               ret = parseOCIlinuxcgroups_legacy_cpu(tb[OCI_LINUX_CGROUPS_CPU]);
+
+       if (!ret && tb[OCI_LINUX_CGROUPS_DEVICES])
+               ret = parseOCIlinuxcgroups_devices(tb[OCI_LINUX_CGROUPS_DEVICES]);
+
+       if (!ret && tb[OCI_LINUX_CGROUPS_MEMORY])
+               ret = parseOCIlinuxcgroups_legacy_memory(tb[OCI_LINUX_CGROUPS_MEMORY]);
+
+       if (!ret && tb[OCI_LINUX_CGROUPS_PIDS])
+               ret = parseOCIlinuxcgroups_legacy_pids(tb[OCI_LINUX_CGROUPS_PIDS]);
+
+       if (!ret && tb[OCI_LINUX_CGROUPS_UNIFIED])
+               ret = parseOCIlinuxcgroups_unified(tb[OCI_LINUX_CGROUPS_UNIFIED]);
+
+       return ret;
+}
diff --git a/jail/cgroups.h b/jail/cgroups.h
new file mode 100644 (file)
index 0000000..4c8f968
--- /dev/null
@@ -0,0 +1,23 @@
+/*
+ * Copyright (C) 2020 Daniel Golle <daniel@makrotopia.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License version 2.1
+ * as published by the Free Software Foundation
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef _JAIL_CGROUPS_H
+#define _JAIL_CGROUPS_H
+
+/*
+ * The prototypes below use pid_t and struct blob_attr; include their
+ * declarations here so this header does not depend on include order.
+ */
+#include <sys/types.h>
+#include <libubox/blobmsg.h>
+
+void cgroups_init(const char *p);
+/* returns 0 on success, ENOTSUP for unsupported sections, or a parser error */
+int parseOCIlinuxcgroups(struct blob_attr *msg);
+void cgroups_apply(pid_t pid);
+void cgroups_free(void);
+void cgroups_prepare(void);
+
+#endif
index 7c6076afb6a7f305695163ede2a796af73f9cd5e..978fc6e113514578d302e343b9bb7807f53bb941 100644 (file)
@@ -102,7 +102,7 @@ const char* find_lib(const char *file)
        return l->path;
 }
 
-static int elf64_find_section(const char *map, unsigned int type, unsigned int *offset, unsigned int *size, unsigned int *vaddr)
+static int elf64_find_section(const char *map, unsigned int type, unsigned long *offset, unsigned long *size, unsigned long *vaddr)
 {
        Elf64_Ehdr *e;
        Elf64_Phdr *ph;
@@ -125,7 +125,7 @@ static int elf64_find_section(const char *map, unsigned int type, unsigned int *
        return -1;
 }
 
-static int elf32_find_section(const char *map, unsigned int type, unsigned int *offset, unsigned int *size, unsigned int *vaddr)
+static int elf32_find_section(const char *map, unsigned int type, unsigned long *offset, unsigned long *size, unsigned long *vaddr)
 {
        Elf32_Ehdr *e;
        Elf32_Phdr *ph;
@@ -148,7 +148,7 @@ static int elf32_find_section(const char *map, unsigned int type, unsigned int *
        return -1;
 }
 
-static int elf_find_section(const char *map, unsigned int type, unsigned int *offset, unsigned int *size, unsigned int *vaddr)
+static int elf_find_section(const char *map, unsigned int type, unsigned long *offset, unsigned long *size, unsigned long *vaddr)
 {
        int clazz = map[EI_CLASS];
 
@@ -162,7 +162,7 @@ static int elf_find_section(const char *map, unsigned int type, unsigned int *of
        return -1;
 }
 
-static int elf32_scan_dynamic(const char *map, int dyn_offset, int dyn_size, int load_offset)
+static int elf32_scan_dynamic(const char *map, unsigned long dyn_offset, unsigned long dyn_size, long load_offset)
 {
        Elf32_Dyn *dynamic = (Elf32_Dyn *) (map + dyn_offset);
        const char *strtab = NULL;
@@ -196,7 +196,7 @@ static int elf32_scan_dynamic(const char *map, int dyn_offset, int dyn_size, int
        return 0;
 }
 
-static int elf64_scan_dynamic(const char *map, int dyn_offset, int dyn_size, int load_offset)
+static int elf64_scan_dynamic(const char *map, unsigned long dyn_offset, unsigned long dyn_size, long load_offset)
 {
        Elf64_Dyn *dynamic = (Elf64_Dyn *) (map + dyn_offset);
        const char *strtab = NULL;
@@ -232,22 +232,22 @@ static int elf64_scan_dynamic(const char *map, int dyn_offset, int dyn_size, int
 
 int elf_load_deps(const char *path, const char *map)
 {
-       unsigned int dyn_offset, dyn_size;
-       unsigned int load_offset, load_vaddr;
-       unsigned int interp_offset;
+       unsigned long dyn_offset, dyn_size;
+       unsigned long load_offset, load_vaddr;
+       unsigned long interp_offset;
 
-       if (elf_find_section(map, PT_LOAD, &load_offset, NULL, &load_vaddr)) {
-               ERROR("failed to load the .load section from %s\n", path);
-               return -1;
+       if (elf_find_section(map, PT_INTERP, &interp_offset, NULL, NULL) == 0) {
+               add_path_and_deps(map+interp_offset, 1, -1, 0);
        }
 
-       if (elf_find_section(map, PT_DYNAMIC, &dyn_offset, &dyn_size, NULL)) {
-               ERROR("failed to load the .dynamic section from %s\n", path);
-               return -1;
+       if (elf_find_section(map, PT_LOAD, &load_offset, NULL, &load_vaddr)) {
+               DEBUG("failed to load the .load section from %s\n", path);
+               return 0;
        }
 
-       if (elf_find_section(map, PT_INTERP, &interp_offset, NULL, NULL) == 0) {
-               add_path_and_deps(map+interp_offset, 1, -1, 0);
+       if (elf_find_section(map, PT_DYNAMIC, &dyn_offset, &dyn_size, NULL)) {
+               DEBUG("failed to load the .dynamic section from %s\n", path);
+               return 0;
        }
 
        int clazz = map[EI_CLASS];
@@ -315,3 +315,15 @@ void init_library_search(void)
        alloc_library_path("/usr/lib");
        load_ldso_conf("/etc/ld.so.conf");
 }
+
+/*
+ * Release every entry of the library_paths list and every node of the
+ * libraries tree. Both containers are left empty afterwards.
+ */
+void free_library_search(void)
+{
+       struct library_path *p, *ptmp;
+       struct library *l, *tmp;
+
+       list_for_each_entry_safe(p, ptmp, &library_paths, list) {
+               /* unlink before freeing so the list head never references freed memory */
+               list_del(&p->list);
+               free(p);
+       }
+
+       avl_remove_all_elements(&libraries, l, avl, tmp)
+               free(l);
+}
index 78fedcd3323010ab90cea934a0bed530a0185d87..11fd7e08a96d28bf3f79aa43c1518c9cd859573b 100644 (file)
@@ -34,5 +34,6 @@ int elf_load_deps(const char *path, const char *map);
 const char* find_lib(const char *file);
 void init_library_search(void);
 int lib_open(char **fullpath, const char *file);
+void free_library_search(void);
 
 #endif
index 8cc47d32f2b768b7fca9a0ddb511a0a5f718fe3e..423878b3398f1424ec823a75882de99e874243d6 100644 (file)
--- a/jail/fs.c
+++ b/jail/fs.c
@@ -1,6 +1,7 @@
 /*
  * Copyright (C) 2015 John Crispin <blogic@openwrt.org>
  * Copyright (C) 2015 Etienne Champetier <champetier.etienne@gmail.com>
+ * Copyright (C) 2020 Daniel Golle <daniel@makrotopia.org>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License version 2.1
 #include <fcntl.h>
 #include <linux/limits.h>
 #include <stdlib.h>
+#include <stdio.h>
 #include <string.h>
 #include <sys/stat.h>
 #include <sys/mman.h>
 #include <unistd.h>
+#include <libgen.h>
 
 #include <libubox/avl.h>
 #include <libubox/avl-cmp.h>
+#include <libubox/blobmsg.h>
+#include <libubox/list.h>
+#include <libubox/utils.h>
 
 #include "elf.h"
 #include "fs.h"
 #include "jail.h"
 #include "log.h"
 
+#define UJAIL_NOAFILE "/tmp/.ujailnoafile"
+
 struct mount {
-        struct avl_node avl;
-        const char *path;
-        int readonly;
-        int error;
+       struct avl_node avl;
+       const char *source;
+       const char *target;
+       const char *filesystemtype;
+       unsigned long mountflags;
+       unsigned long propflags;
+       const char *optstr;
+       int error;
+       bool inner;
 };
 
 struct avl_tree mounts;
 
-int add_mount(const char *path, int readonly, int error)
+/*
+ * Carry out one queued mount below the jail root.
+ *
+ * root            prefix prepended to the target (and to the source if inner)
+ * orig_source     mount source; NULL for none, (void *)-1 to mask the target
+ * target          path inside the jail; the source path is used if NULL
+ * filesystemtype  type passed to mount(2); "cgroup" is mounted as "cgroup2"
+ * orig_mountflags mount(2) flags; MS_BIND selects the bind + remount sequence
+ * propflags       propagation flags applied with a separate mount(2) call
+ * optstr          filesystem specific data string
+ * error           value returned on failure; 0 makes failures silent/non-fatal
+ * inner           source is itself located below root
+ *
+ * Returns 0 on success, ENOMEM, an errno value when the bind target cannot
+ * be created, or `error` for every other failure.
+ */
+static int do_mount(const char *root, const char *orig_source, const char *target, const char *filesystemtype,
+                   unsigned long orig_mountflags, unsigned long propflags, const char *optstr, int error, bool inner)
 {
-       assert(path != NULL);
+       struct stat s;
+       char new[PATH_MAX];
+       char *source = (char *)orig_source;
+       int fd, ret = 0;
+       bool is_bind = (orig_mountflags & MS_BIND);
+       bool is_mask = (source == (void *)(-1));
+       unsigned long mountflags = orig_mountflags;
+
+       assert(!(inner && is_mask));
+       assert(!(inner && !orig_source));
+
+       /* s is only valid (and only consulted) for bind mounts with a source */
+       if (source && is_bind && stat(source, &s)) {
+               ERROR("stat(%s) failed: %m\n", source);
+               return error;
+       }
+
+       /* from here on `source` is heap memory when inner is set */
+       if (inner)
+               if (asprintf(&source, "%s%s", root, orig_source) < 0)
+                       return ENOMEM;
+
+       snprintf(new, sizeof(new), "%s%s", root, target?target:source);
+
+       if (is_mask) {
+               if (stat(new, &s))
+                       return 0; /* doesn't exists, nothing to mask */
+
+               if (S_ISDIR(s.st_mode)) {/* use empty 0-sized tmpfs for directories */
+                       if (mount("none", new, "tmpfs", MS_RDONLY | MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_NOATIME, "size=0,mode=000"))
+                               return error;
+               } else {
+                       /* mount-bind 0-sized file having mode 000 */
+                       if (mount(UJAIL_NOAFILE, new, "bind", MS_BIND, NULL))
+                               return error;
+
+                       if (mount(UJAIL_NOAFILE, new, "bind", MS_REMOUNT | MS_BIND | MS_RDONLY | MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_NOATIME, NULL))
+                               return error;
+               }
+
+               DEBUG("masked path %s\n", new);
+               return 0;
+       }
+
+
+       if (!is_bind || (source && S_ISDIR(s.st_mode))) {
+               mkdir_p(new, 0755);
+       } else if (is_bind && source) {
+               /* dirname() may modify its argument, so `new` is rebuilt afterwards */
+               mkdir_p(dirname(new), 0755);
+               snprintf(new, sizeof(new), "%s%s", root, target?target:source);
+               fd = open(new, O_CREAT|O_WRONLY|O_TRUNC|O_EXCL, 0644);
+               if (fd >= 0)
+                       close(fd);
+
+               if (error && fd < 0 && errno != EEXIST) {
+                       /* latch errno first: the logging call may overwrite it */
+                       ret = errno;
+                       ERROR("failed to create mount target %s: %m\n", new);
+
+                       goto free_source_out;
+               }
+       }
+
+       if (is_bind) {
+               /* step 1: plain bind; the remaining flags are applied by the remount below */
+               if (mount(source?:new, new, filesystemtype?:"bind", MS_BIND | (mountflags & MS_REC), optstr)) {
+                       if (error)
+                               ERROR("failed to mount -B %s %s: %m\n", source, new);
+
+                       ret = error;
+                       goto free_source_out;
+               }
+               mountflags |= MS_REMOUNT;
+       }
+
+       /* a requested "cgroup" filesystem is mounted as "cgroup2" */
+       const char *hack_fstype = ((!filesystemtype || strcmp(filesystemtype, "cgroup"))?filesystemtype:"cgroup2");
+       if (mount(source?:(is_bind?new:NULL), new, hack_fstype?:"none", mountflags, optstr)) {
+               if (error)
+                       ERROR("failed to mount %s %s: %m\n", source, new);
 
-       if (avl_find(&mounts, path))
+               ret = error;
+               goto free_source_out;
+       }
+
+       DEBUG("mount %s%s %s (%s)\n", (mountflags & MS_BIND)?"-B ":"", source, new,
+             (mountflags & MS_RDONLY)?"ro":"rw");
+
+       if (propflags && mount("none", new, "none", propflags, NULL)) {
+               if (error)
+                       ERROR("failed to mount --make-... %s \n", new);
+
+               ret = error;
+       }
+
+free_source_out:
+       if (inner)
+               free(source);
+
+       return ret;
+}
+
+/*
+ * Queue a mount in the `mounts` tree, keyed by its target path.
+ *
+ * All strings are duplicated; source may be NULL or the special value
+ * (void *)-1, which is stored as-is and marks a mask entry.
+ *
+ * Returns 0 when queued, 1 when an entry for target already exists and
+ * ENOMEM when memory for the entry or one of its strings is unavailable.
+ */
+static int _add_mount(const char *source, const char *target, const char *filesystemtype,
+                     unsigned long mountflags, unsigned long propflags, const char *optstr,
+                     int error, bool inner)
+{
+       assert(target != NULL);
+
+       if (avl_find(&mounts, target))
                return 1;
 
        struct mount *m;
        m = calloc(1, sizeof(struct mount));
-       assert(m != NULL);
-       m->avl.key = m->path = strdup(path);
-       m->readonly = readonly;
+       if (!m)
+               return ENOMEM;
+
+       /* the tree key must never be NULL, so a failed strdup() aborts the insert */
+       m->avl.key = m->target = strdup(target);
+       if (!m->target)
+               goto oom;
+
+       if (source) {
+               if (source != (void*)(-1)) {
+                       m->source = strdup(source);
+                       if (!m->source)
+                               goto oom;
+               } else {
+                       m->source = (void*)(-1);
+               }
+       }
+       if (filesystemtype) {
+               m->filesystemtype = strdup(filesystemtype);
+               if (!m->filesystemtype)
+                       goto oom;
+       }
+
+       if (optstr) {
+               m->optstr = strdup(optstr);
+               if (!m->optstr)
+                       goto oom;
+       }
+
+       m->mountflags = mountflags;
+       m->propflags = propflags;
        m->error = error;
+       m->inner = inner;
 
        avl_insert(&mounts, &m->avl);
-       DEBUG("adding mount %s ro(%d) err(%d)\n", m->path, m->readonly, m->error != 0);
+       DEBUG("adding mount %s %s bind(%d) ro(%d) err(%d)\n", (m->source == (void*)(-1))?"mask":m->source, m->target,
+               !!(m->mountflags & MS_BIND), !!(m->mountflags & MS_RDONLY), m->error != 0);
+
+       return 0;
+
+oom:
+       /* the mask marker is not heap memory and must not be passed to free() */
+       if (m->source != (void*)(-1))
+               free((void*)m->source);
+       free((void*)m->target);
+       free((void*)m->filesystemtype);
+       free((void*)m->optstr);
+       free(m);
+
+       return ENOMEM;
+}
+
+/* Queue a mount via _add_mount() with inner = false; returns its result. */
+int add_mount(const char *source, const char *target, const char *filesystemtype,
+             unsigned long mountflags, unsigned long propflags, const char *optstr, int error)
+{
+       return _add_mount(source, target, filesystemtype, mountflags, propflags, optstr, error, false);
+}
+
+/* Queue a mount via _add_mount() with inner = true; returns its result. */
+int add_mount_inner(const char *source, const char *target, const char *filesystemtype,
+             unsigned long mountflags, unsigned long propflags, const char *optstr, int error)
+{
+       return _add_mount(source, target, filesystemtype, mountflags, propflags, optstr, error, true);
+}
+
+/*
+ * Queue a bind mount of path onto path2, read-only when readonly is
+ * non-zero. No filesystem type, propagation flags or option string are set.
+ */
+static int _add_mount_bind(const char *path, const char *path2, int readonly, int error)
+{
+       const unsigned long flags = readonly ? (MS_BIND | MS_RDONLY) : MS_BIND;
+
+       return add_mount(path, path2, NULL, flags, 0, NULL, error);
+}
+
+/* Queue a bind mount whose source and target are the same path. */
+int add_mount_bind(const char *path, int readonly, int error)
+{
+       return _add_mount_bind(path, path, readonly, error);
+}
+
+/* indices into the attribute table filled by blobmsg_parse() in parseOCImount() */
+enum {
+       OCI_MOUNT_SOURCE,
+       OCI_MOUNT_DESTINATION,
+       OCI_MOUNT_TYPE,
+       OCI_MOUNT_OPTIONS,
+       __OCI_MOUNT_MAX,
+};
+
+/* key name and expected blobmsg type of each mount attribute */
+static const struct blobmsg_policy oci_mount_policy[] = {
+       [OCI_MOUNT_SOURCE] = { "source", BLOBMSG_TYPE_STRING },
+       [OCI_MOUNT_DESTINATION] = { "destination", BLOBMSG_TYPE_STRING },
+       [OCI_MOUNT_TYPE] = { "type", BLOBMSG_TYPE_STRING },
+       [OCI_MOUNT_OPTIONS] = { "options", BLOBMSG_TYPE_ARRAY },
+};
+
+/* list node collecting one filesystem-specific option in parseOCImountopts() */
+struct mount_opt {
+       struct list_head list;
+       char *optstr; /* points into the parsed blobmsg, not owned by the node */
+};
+
+#ifndef MS_LAZYTIME
+#define MS_LAZYTIME (1 << 25)
+#endif
+
+/* one option keyword and the bits it sets and clears in a flag word */
+struct mount_flag_opt {
+       const char *name;
+       unsigned long set;
+       unsigned long clear;
+};
+
+/* keywords mapped to mount(2) flags; "defaults" clears every flag */
+static const struct mount_flag_opt oci_mount_flag_opts[] = {
+       { "ro", MS_RDONLY, 0 },
+       { "rw", 0, MS_RDONLY },
+       { "bind", MS_BIND, 0 },
+       { "rbind", MS_BIND | MS_REC, 0 },
+       { "sync", MS_SYNCHRONOUS, 0 },
+       { "async", 0, MS_SYNCHRONOUS },
+       { "atime", 0, MS_NOATIME },
+       { "noatime", MS_NOATIME, 0 },
+       { "defaults", 0, ~0UL }, /* rw, suid, dev, exec, auto, nouser, and async */
+       { "dev", 0, MS_NODEV },
+       { "nodev", MS_NODEV, 0 },
+       { "iversion", MS_I_VERSION, 0 },
+       { "noiversion", 0, MS_I_VERSION },
+       { "diratime", 0, MS_NODIRATIME },
+       { "nodiratime", MS_NODIRATIME, 0 },
+       { "dirsync", MS_DIRSYNC, 0 },
+       { "exec", 0, MS_NOEXEC },
+       { "noexec", MS_NOEXEC, 0 },
+       { "mand", MS_MANDLOCK, 0 },
+       { "nomand", 0, MS_MANDLOCK },
+       { "relatime", MS_RELATIME, 0 },
+       { "norelatime", 0, MS_RELATIME },
+       { "strictatime", MS_STRICTATIME, 0 },
+       { "nostrictatime", 0, MS_STRICTATIME },
+       { "lazytime", MS_LAZYTIME, 0 },
+       { "nolazytime", 0, MS_LAZYTIME },
+       { "suid", 0, MS_NOSUID },
+       { "nosuid", MS_NOSUID, 0 },
+       { "remount", MS_REMOUNT, 0 },
+};
+
+/* keywords mapped to mount propagation flags */
+static const struct mount_flag_opt oci_mount_prop_opts[] = {
+       { "private", MS_PRIVATE, 0 },
+       { "rprivate", MS_PRIVATE | MS_REC, 0 },
+       { "slave", MS_SLAVE, 0 },
+       { "rslave", MS_SLAVE | MS_REC, 0 },
+       { "shared", MS_SHARED, 0 },
+       { "rshared", MS_SHARED | MS_REC, 0 },
+       { "unbindable", MS_UNBINDABLE, 0 },
+       { "runbindable", MS_UNBINDABLE | MS_REC, 0 },
+};
+
+/* look name up in tbl and update *flags; returns false if name is not listed */
+static bool apply_mount_flag_opt(const struct mount_flag_opt *tbl, size_t n,
+                                const char *name, unsigned long *flags)
+{
+       size_t i;
+
+       for (i = 0; i < n; ++i) {
+               if (strcmp(tbl[i].name, name))
+                       continue;
+
+               *flags = (*flags & ~tbl[i].clear) | tbl[i].set;
+               return true;
+       }
+
+       return false;
+}
+
+/* unlink and free every node of a mount_opt list (the strings are not owned) */
+static void free_mount_opts(struct list_head *opts)
+{
+       struct mount_opt *opt, *tmpopt;
+
+       list_for_each_entry_safe(opt, tmpopt, opts, list) {
+               list_del(&opt->list);
+               free(opt);
+       }
+}
+
+/*
+ * Translate an OCI mount "options" array.
+ *
+ * msg               blobmsg array of option strings
+ * mount_flags       out: mount(2) flags
+ * propagation_flags out: propagation flags
+ * mount_data        out: newly allocated comma separated list of all
+ *                   options that are not recognised here (caller frees);
+ *                   left untouched when there are none
+ * error             set to 0 when "nofail" is present, otherwise untouched
+ *
+ * Options are processed in order. "bind" now ORs MS_BIND into the flags
+ * like "rbind" does instead of discarding flags set by earlier options.
+ * Empty option strings are skipped.
+ *
+ * Returns 0 on success or ENOMEM.
+ */
+static int parseOCImountopts(struct blob_attr *msg, unsigned long *mount_flags, unsigned long *propagation_flags, char **mount_data, int *error)
+{
+       static const char * const ignored[] = { "auto", "noauto", "user", "group", "_netdev" };
+       struct blob_attr *cur;
+       int rem;
+       unsigned long mf = 0;
+       unsigned long pf = 0;
+       char *tmp;
+       struct list_head fsopts = LIST_HEAD_INIT(fsopts);
+       size_t len = 0, i;
+       struct mount_opt *opt;
+       bool first = true;
+
+       blobmsg_for_each_attr(cur, msg, rem) {
+               tmp = blobmsg_get_string(cur);
+
+               /* an empty entry carries no option and would break the length math below */
+               if (!*tmp)
+                       continue;
+
+               if (apply_mount_flag_opt(oci_mount_flag_opts, ARRAY_SIZE(oci_mount_flag_opts), tmp, &mf) ||
+                   apply_mount_flag_opt(oci_mount_prop_opts, ARRAY_SIZE(oci_mount_prop_opts), tmp, &pf))
+                       continue;
+
+               /* special case: 'nofail' */
+               if (!strcmp("nofail", tmp)) {
+                       *error = 0;
+                       continue;
+               }
+
+               for (i = 0; i < ARRAY_SIZE(ignored); ++i)
+                       if (!strcmp(ignored[i], tmp))
+                               break;
+
+               if (i < ARRAY_SIZE(ignored)) {
+                       DEBUG("ignoring built-in mount option %s\n", tmp);
+                       continue;
+               }
+
+               /* filesystem-specific free-form option */
+               opt = calloc(1, sizeof(*opt));
+               if (!opt) {
+                       free_mount_opts(&fsopts);
+                       return ENOMEM;
+               }
+
+               opt->optstr = tmp;
+               list_add_tail(&opt->list, &fsopts);
+       }
+
+       *mount_flags = mf;
+       *propagation_flags = pf;
+
+       /* total length: every option plus one separator between neighbours */
+       list_for_each_entry(opt, &fsopts, list) {
+               if (len)
+                       ++len;
+
+               len += strlen(opt->optstr);
+       }
+
+       if (len) {
+               *mount_data = calloc(len + 1, sizeof(char));
+               if (!(*mount_data)) {
+                       free_mount_opts(&fsopts);
+                       return ENOMEM;
+               }
+
+               list_for_each_entry(opt, &fsopts, list) {
+                       if (!first)
+                               strcat(*mount_data, ",");
+
+                       strcat(*mount_data, opt->optstr);
+                       first = false;
+               }
+       }
+
+       /* the nodes are scratch data and are released on every path */
+       free_mount_opts(&fsopts);
+
+       DEBUG("mount flags(%08lx) propagation(%08lx) fsopts(\"%s\")\n", mf, pf, *mount_data?:"");
+
        return 0;
 }
 
+/*
+ * Parse one OCI mount object and queue it with add_mount().
+ *
+ * Returns EINVAL when "destination" is missing, the error of the option
+ * parser if that fails, otherwise the result of add_mount().
+ */
+int parseOCImount(struct blob_attr *msg)
+{
+       struct blob_attr *attr[__OCI_MOUNT_MAX];
+       unsigned long flags = 0, propagation = 0;
+       const char *src = NULL, *fstype = NULL;
+       char *fsdata = NULL;
+       int rc, onerror = -1;
+
+       blobmsg_parse(oci_mount_policy, __OCI_MOUNT_MAX, attr, blobmsg_data(msg), blobmsg_len(msg));
+
+       if (!attr[OCI_MOUNT_DESTINATION])
+               return EINVAL;
+
+       if (attr[OCI_MOUNT_OPTIONS]) {
+               rc = parseOCImountopts(attr[OCI_MOUNT_OPTIONS], &flags, &propagation, &fsdata, &onerror);
+               if (rc)
+                       return rc;
+       }
+
+       /* source and type are optional and stay NULL when absent */
+       if (attr[OCI_MOUNT_SOURCE])
+               src = blobmsg_get_string(attr[OCI_MOUNT_SOURCE]);
+
+       if (attr[OCI_MOUNT_TYPE])
+               fstype = blobmsg_get_string(attr[OCI_MOUNT_TYPE]);
+
+       rc = add_mount(src, blobmsg_get_string(attr[OCI_MOUNT_DESTINATION]), fstype,
+                      flags, propagation, fsdata, onerror);
+
+       /* add_mount() keeps its own copy of the option string */
+       free(fsdata);
+
+       return rc;
+}
+
+/* Create (or truncate) UJAIL_NOAFILE with mode 000; failure is ignored. */
+static void build_noafile(void) {
+       int fd = creat(UJAIL_NOAFILE, 0000);
+
+       if (fd >= 0)
+               close(fd);
+}
+
 int mount_all(const char *jailroot) {
        struct library *l;
        struct mount *m;
 
+       build_noafile();
+
        avl_for_each_element(&libraries, l, avl)
-               add_mount(l->path, 1, -1);
+               add_mount_bind(l->path, 1, -1);
 
        avl_for_each_element(&mounts, m, avl)
-               if (mount_bind(jailroot, m->path, m->readonly, m->error))
+               if (do_mount(jailroot, m->source, m->target, m->filesystemtype, m->mountflags,
+                            m->propflags, m->optstr, m->error, m->inner))
                        return -1;
 
        return 0;
 }
 
+/* Empty the `mounts` tree, releasing each entry together with its strings. */
+void mount_free(void) {
+       struct mount *entry, *next;
+
+       avl_remove_all_elements(&mounts, entry, avl, next) {
+               /* (void*)-1 is the mask marker, not an allocation */
+               const bool is_mask = (entry->source == (void*)(-1));
+
+               if (!is_mask)
+                       free((void*)entry->source);
+
+               free((void*)entry->target);
+               free((void*)entry->filesystemtype);
+               free((void*)entry->optstr);
+               free(entry);
+       }
+}
+
 void mount_list_init(void) {
        avl_init(&mounts, avl_strcmp, false, NULL);
 }
@@ -102,9 +483,10 @@ static int add_script_interp(const char *path, const char *map, int size)
        return add_path_and_deps(buf, 1, -1, 0);
 }
 
-int add_path_and_deps(const char *path, int readonly, int error, int lib)
+int add_2paths_and_deps(const char *path, const char *path2, int readonly, int error, int lib)
 {
        assert(path != NULL);
+       assert(path2 != NULL);
 
        if (lib == 0 && path[0] != '/') {
                ERROR("%s is not an absolute path\n", path);
@@ -114,18 +496,18 @@ int add_path_and_deps(const char *path, int readonly, int error, int lib)
        char *map = NULL;
        int fd, ret = -1;
        if (path[0] == '/') {
-               if (avl_find(&mounts, path))
+               if (avl_find(&mounts, path2))
                        return 0;
                fd = open(path, O_RDONLY|O_CLOEXEC);
-               if (fd == -1)
+               if (fd < 0)
                        return error;
-               add_mount(path, readonly, error);
+               _add_mount_bind(path, path2, readonly, error);
        } else {
                if (avl_find(&libraries, path))
                        return 0;
                char *fullpath;
                fd = lib_open(&fullpath, path);
-               if (fd == -1)
+               if (fd < 0)
                        return error;
                if (fullpath) {
                        alloc_library(fullpath, path);
index 343335fbee8581e0b70c2cd4279e635234abb6f4..541030f7eb7c2046e18bc1fc554601f04e29768a 100644 (file)
--- a/jail/fs.h
+++ b/jail/fs.h
 #ifndef _JAIL_FS_H_
 #define _JAIL_FS_H_
 
-int add_mount(const char *path, int readonly, int error);
-int add_path_and_deps(const char *path, int readonly, int error, int lib);
+#include <sys/mount.h>
+#include <libubox/blobmsg.h>
+
+int add_mount(const char *source, const char *target, const char *filesystemtype,
+             unsigned long mountflags, unsigned long propflags, const char *optstr, int error);
+int add_mount_inner(const char *source, const char *target, const char *filesystemtype,
+             unsigned long mountflags, unsigned long propflags, const char *optstr, int error);
+int add_mount_bind(const char *path, int readonly, int error);
+int parseOCImount(struct blob_attr *msg);
+int add_2paths_and_deps(const char *path, const char *path2, int readonly, int error, int lib);
+
+/* Compatibility wrapper: add_2paths_and_deps() with path used for both paths. */
+static inline int add_path_and_deps(const char *path, int readonly, int error, int lib)
+{
+       return add_2paths_and_deps(path, path, readonly, error, lib);
+}
+
 int mount_all(const char *jailroot);
 void mount_list_init(void);
+void mount_free(void);
 
 #endif
index 54e78419b539ae6b618bc3680c01fdd9582ea318..09780ac7e2b732cd6456c835f93c29816ad3dc89 100644 (file)
@@ -1,5 +1,6 @@
 /*
  * Copyright (C) 2015 John Crispin <blogic@openwrt.org>
+ * Copyright (C) 2020 Daniel Golle <daniel@makrotopia.org>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License version 2.1
 #include <sys/mount.h>
 #include <sys/prctl.h>
 #include <sys/wait.h>
-
+#include <sys/types.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+#include <sys/stat.h>
+#include <sys/sysmacros.h>
+
+/* musl only defined 15 limit types, make sure all 16 are supported */
+#ifndef RLIMIT_RTTIME
+#define RLIMIT_RTTIME 15
+#undef RLIMIT_NLIMITS
+#define RLIMIT_NLIMITS 16
+#undef RLIM_NLIMITS
+#define RLIM_NLIMITS 16
+#endif
+
+#include <assert.h>
 #include <stdlib.h>
 #include <unistd.h>
 #include <errno.h>
+#include <pwd.h>
+#include <grp.h>
 #include <string.h>
-#include <sys/stat.h>
 #include <fcntl.h>
-#include <libgen.h>
 #include <sched.h>
+#include <linux/filter.h>
 #include <linux/limits.h>
+#include <linux/nsfs.h>
+#include <linux/securebits.h>
 #include <signal.h>
+#include <inttypes.h>
 
 #include "capabilities.h"
 #include "elf.h"
 #include "fs.h"
 #include "jail.h"
 #include "log.h"
-
+#include "seccomp-oci.h"
+#include "cgroups.h"
+#include "netifd.h"
+
+#include <libubox/blobmsg.h>
+#include <libubox/blobmsg_json.h>
+#include <libubox/list.h>
+#include <libubox/vlist.h>
 #include <libubox/uloop.h>
+#include <libubox/utils.h>
+#include <libubus.h>
+
+#ifndef CLONE_NEWCGROUP
+#define CLONE_NEWCGROUP 0x02000000
+#endif
 
 #define STACK_SIZE     (1024 * 1024)
-#define OPT_ARGS       "S:C:n:h:r:w:d:psuloc"
+#define OPT_ARGS       "cC:d:e:EfFG:h:ij:J:ln:NoO:pP:r:R:sS:uU:w:t:T:y"
+
+#define OCI_VERSION_STRING "1.0.2"
+
+/*
+ * One hook entry; file, argv and envp are heap memory released by
+ * free_hooklist(). NOTE(review): the field names suggest execvpe()
+ * arguments and timeout presumably bounds the hook's run time -- confirm
+ * against the code that runs the hooks.
+ */
+struct hook_execvpe {
+       char *file;
+       char **argv;
+       char **envp;
+       int timeout;
+};
+
+/* one sysctl setting; both strings are heap memory released by free_sysctl() */
+struct sysctl_val {
+       char *entry;
+       char *value;
+};
+
+/*
+ * Description of one device node; path is heap memory released by
+ * free_devices(). NOTE(review): presumably consumed by mknod()/chown()
+ * when devices are created -- confirm at the call site.
+ */
+struct mknod_args {
+       char *path;
+       mode_t mode;
+       dev_t dev;
+       uid_t uid;
+       gid_t gid;
+};
 
 static struct {
        char *name;
        char *hostname;
        char **jail_argv;
+       char *cwd;
        char *seccomp;
+       struct sock_fprog *ociseccomp;
        char *capabilities;
+       struct jail_capset capset;
+       char *user;
+       char *group;
+       char *extroot;
+       char *overlaydir;
+       char *tmpoverlaysize;
+       char **envp;
+       char *uidmap;
+       char *gidmap;
+       char *pidfile;
+       struct sysctl_val **sysctl;
        int no_new_privs;
        int namespace;
+       struct {
+               int pid;
+               int net;
+               int ns;
+               int ipc;
+               int uts;
+               int user;
+               int cgroup;
+#ifdef CLONE_NEWTIME
+               int time;
+#endif
+       } setns;
        int procfs;
        int ronly;
        int sysfs;
+       int console;
+       int pw_uid;
+       int pw_gid;
+       int gr_gid;
+       int root_map_uid;
+       gid_t *additional_gids;
+       size_t num_additional_gids;
+       mode_t umask;
+       bool set_umask;
+       int require_jail;
+       struct {
+               struct hook_execvpe **createRuntime;
+               struct hook_execvpe **createContainer;
+               struct hook_execvpe **startContainer;
+               struct hook_execvpe **poststart;
+               struct hook_execvpe **poststop;
+       } hooks;
+       struct rlimit *rlimits[RLIM_NLIMITS];
+       int oom_score_adj;
+       bool set_oom_score_adj;
+       struct mknod_args **devices;
+       char *ocibundle;
+       bool immediately;
+       struct blob_attr *annotations;
+       int term_timeout;
 } opts;
 
+static struct blob_buf ocibuf;
+
 extern int pivot_root(const char *new_root, const char *put_old);
 
 int debug = 0;
 
 static char child_stack[STACK_SIZE];
 
-static int mkdir_p(char *dir, mode_t mask)
+static struct ubus_context *parent_ctx;
+
+int console_fd;
+
+
+static inline bool has_namespaces(void)
 {
-       char *l = strrchr(dir, '/');
-       int ret;
+return ((opts.setns.pid != -1) ||
+       (opts.setns.net != -1) ||
+       (opts.setns.ns != -1) ||
+       (opts.setns.ipc != -1) ||
+       (opts.setns.uts != -1) ||
+       (opts.setns.user != -1) ||
+       (opts.setns.cgroup != -1) ||
+#ifdef CLONE_NEWTIME
+       (opts.setns.time != -1) ||
+#endif
+       opts.namespace);
+}
 
-       if (!l)
-               return 0;
+static void free_oci_envp(char **p) {
+       char **tmp;
 
-       *l = '\0';
+       if (p) {
+               tmp = p;
+               while (*tmp)
+                       free(*(tmp++));
 
-       if (mkdir_p(dir, mask))
-               return -1;
+               free(p);
+       }
+}
 
-       *l = '/';
+/*
+ * Release a hook list: every entry with its argv, envp and file, then the
+ * array itself. A NULL hooklist is accepted.
+ *
+ * The list is walked as an array of entry pointers, the same way
+ * free_devices() walks opts.devices. Stepping a struct pointer with cur++
+ * (as before) moved by sizeof(struct hook_execvpe) through unrelated
+ * memory and could never reach NULL.
+ * NOTE(review): assumes the array is NULL-terminated -- confirm where the
+ * hook lists are built.
+ */
+static void free_hooklist(struct hook_execvpe **hooklist)
+{
+       struct hook_execvpe **cur;
 
-       ret = mkdir(dir, mask);
-       if (ret && errno == EEXIST)
-               return 0;
+       if (!hooklist)
+               return;
+
+       for (cur = hooklist; *cur; ++cur) {
+               free_oci_envp((*cur)->argv);
+               free_oci_envp((*cur)->envp);
+               free((*cur)->file);
+               free(*cur);
+       }
+       free(hooklist);
+}
+
+/*
+ * Release opts.sysctl: every entry with its two strings, then the array.
+ *
+ * The array of entry pointers is walked element by element; advancing a
+ * struct pointer with cur++ (as before) stepped through unrelated memory
+ * and never terminated.
+ * NOTE(review): assumes the array is NULL-terminated -- confirm where
+ * opts.sysctl is built.
+ */
+static void free_sysctl(void) {
+       struct sysctl_val **cur;
+
+       if (!opts.sysctl)
+               return;
+
+       for (cur = opts.sysctl; *cur; ++cur) {
+               free((*cur)->entry);
+               free((*cur)->value);
+               free(*cur);
+       }
+       free(opts.sysctl);
+}
+
+/* Release the NULL-terminated opts.devices array and everything it references. */
+static void free_devices(void) {
+       struct mknod_args **dev;
+
+       if (opts.devices == NULL)
+               return;
+
+       for (dev = opts.devices; *dev != NULL; ++dev) {
+               free((*dev)->path);
+               free(*dev);
+       }
+
+       free(opts.devices);
+}
+
+/* Free all RLIM_NLIMITS slots of opts.rlimits; unset slots are NULL. */
+static void free_rlimits(void) {
+       struct rlimit **slot = opts.rlimits;
+       struct rlimit **const end = opts.rlimits + RLIM_NLIMITS;
+
+       while (slot != end)
+               free(*slot++);
+}
+
+/*
+ * Release the state collected in `opts` plus the library search, mount and
+ * cgroups state. With parent == false the seccomp filter, jail_argv and
+ * envp are kept.
+ *
+ * NOTE(review): name, seccomp, capabilities, user, group, tmpoverlaysize,
+ * pidfile, ocibundle and additional_gids are not released here -- confirm
+ * whether any of them is heap-allocated.
+ */
+static void free_opts(bool parent) {
+
+       free_library_search();
+       mount_free();
+       cgroups_free();
+
+       /* we need to keep argv, envp and seccomp filter in child */
+       if (parent) { /* parent-only */
+               if (opts.ociseccomp) {
+                       free(opts.ociseccomp->filter);
+                       free(opts.ociseccomp);
+               }
+
+               free_oci_envp(opts.jail_argv);
+               free_oci_envp(opts.envp);
+       }
 
-       if (ret)
-               ERROR("mkdir(%s, %d) failed: %m\n", dir, mask);
+       free_rlimits();
+       free_sysctl();
+       free_devices();
+       free(opts.hostname);
+       free(opts.cwd);
+       free(opts.uidmap);
+       free(opts.gidmap);
+       free(opts.annotations);
+       free(opts.extroot);
+       free(opts.overlaydir);
+       free_hooklist(opts.hooks.createRuntime);
+       free_hooklist(opts.hooks.createContainer);
+       free_hooklist(opts.hooks.startContainer);
+       free_hooklist(opts.hooks.poststart);
+       free_hooklist(opts.hooks.poststop);
+}
+
+/*
+ * Mount an overlay filesystem on top of jail_root, using jail_root as the
+ * lower layer and <overlaydir>/upper and <overlaydir>/work as upper and
+ * work directories (both are created if needed). etc/resolv.conf is created
+ * in the upper layer beforehand.
+ *
+ * Returns 0 on success and -1 on any allocation, mkdir or mount failure.
+ * The labels at the end free the strings in reverse order of allocation.
+ */
+static int mount_overlay(char *jail_root, char *overlaydir) {
+       char *upperdir, *workdir, *optsstr, *upperetc, *upperresolvconf;
+       const char mountoptsformat[] = "lowerdir=%s,upperdir=%s,workdir=%s";
+       int ret = -1, fd;
+
+       if (asprintf(&upperdir, "%s%s", overlaydir, "/upper") < 0)
+               goto out;
+
+       if (asprintf(&workdir, "%s%s", overlaydir, "/work") < 0)
+               goto upper_printf;
+
+       if (asprintf(&optsstr, mountoptsformat, jail_root, upperdir, workdir) < 0)
+               goto work_printf;
+
+       if (mkdir_p(upperdir, 0755) || mkdir_p(workdir, 0755))
+               goto opts_printf;
+
+/*
+ * make sure /etc/resolv.conf exists in overlay and is owned by jail userns root
+ * this is to work-around a bug in overlayfs described in the overlayfs-userns
+ * patch:
+ * 3. modification of a file 'hithere' which is in l but not yet
+ * in u, and which is not owned by T, is not allowed, even if
+ * writes to u are allowed.  This may be a bug in overlayfs,
+ * but it is safe behavior.
+ */
+       if (asprintf(&upperetc, "%s/etc", upperdir) < 0)
+               goto opts_printf;
+
+       if (mkdir_p(upperetc, 0755))
+               goto upper_etc_printf;
 
+       if (asprintf(&upperresolvconf, "%s/resolv.conf", upperetc) < 0)
+               goto upper_etc_printf;
+
+       /*
+        * NOTE(review): creat() implies O_TRUNC and does not fail with EEXIST,
+        * so an existing resolv.conf in the upper dir is emptied here and the
+        * EEXIST test below can never match -- confirm whether
+        * open(..., O_CREAT|O_WRONLY|O_EXCL, 0644) was intended.
+        */
+       fd = creat(upperresolvconf, 0644);
+       if (fd < 0) {
+               if (errno != EEXIST)
+                       ERROR("creat(%s) failed: %m\n", upperresolvconf);
+       } else {
+               close(fd);
+       }
+       DEBUG("mount -t overlay %s %s (%s)\n", jail_root, jail_root, optsstr);
+
+       if (mount(jail_root, jail_root, "overlay", MS_NOATIME, optsstr))
+               goto upper_resolvconf_printf;
+
+       ret = 0;
+
+upper_resolvconf_printf:
+       free(upperresolvconf);
+upper_etc_printf:
+       free(upperetc);
+opts_printf:
+       free(optsstr);
+work_printf:
+       free(workdir);
+upper_printf:
+       free(upperdir);
+out:
        return ret;
 }
 
-int mount_bind(const char *root, const char *path, int readonly, int error)
+/*
+ * Send console_fd to the "container" ubus object by invoking its
+ * "console_set" method with the jail name. The descriptor is closed here
+ * only when the request succeeded; on failure it is left open.
+ */
+static void pass_console(int console_fd)
+{
+       static struct blob_buf req;
+       struct ubus_context *child_ctx;
+       uint32_t id;
+       int failed;
+
+       child_ctx = ubus_connect(NULL);
+       if (!child_ctx)
+               return;
+
+       blob_buf_init(&req, 0);
+       blobmsg_add_string(&req, "name", opts.name);
+
+       /* the invoke is only attempted when the object lookup worked */
+       failed = ubus_lookup_id(child_ctx, "container", &id);
+       if (!failed)
+               failed = ubus_invoke_fd(child_ctx, id, "console_set", req.head, NULL, NULL, 3000, console_fd);
+
+       if (failed)
+               INFO("ubus request failed\n");
+       else
+               close(console_fd);
+
+       blob_buf_free(&req);
+       ubus_free(child_ctx);
+}
+
+/*
+ * Allocate a pseudo-terminal for the jail: the master side is handed to
+ * pass_console(), the slave side is bind-mounted to <jail_root>/dev/console
+ * and duplicated onto stdin, stdout and stderr.
+ *
+ * Returns 0 on success, -1 if no PTY master could be opened and 1 on any
+ * later failure.
+ */
+static int create_dev_console(const char *jail_root)
+{
+       char *console_fname;
+       char dev_console_path[PATH_MAX];
+       int slave_console_fd, dev_console_dummy;
+
+       /* Open UNIX/98 virtual console */
+       console_fd = posix_openpt(O_RDWR | O_NOCTTY);
+       if (console_fd < 0)
+               return -1;
+
+       /* validate the name before logging it: NULL must not reach a %s */
+       console_fname = ptsname(console_fd);
+       if (!console_fname)
+               goto no_console;
+
+       DEBUG("got console fd %d and PTS client name %s\n", console_fd, console_fname);
+
+       /* a PTY that could not be granted or unlocked is of no use */
+       if (grantpt(console_fd) || unlockpt(console_fd))
+               goto no_console;
+
+       /* pass PTY master to procd */
+       pass_console(console_fd);
+
+       /* mount-bind PTY slave to /dev/console in jail */
+       snprintf(dev_console_path, sizeof(dev_console_path), "%s/dev/console", jail_root);
+       dev_console_dummy = creat(dev_console_path, 0620);
+       if (dev_console_dummy < 0)
+               goto no_console;
+
+       close(dev_console_dummy);
+
+       if (mount(console_fname, dev_console_path, "bind", MS_BIND, NULL))
+               goto no_console;
+
+       /* use PTY slave for stdio */
+       slave_console_fd = open(console_fname, O_RDWR); /* | O_NOCTTY */
+       if (slave_console_fd < 0)
+               goto no_console;
+
+       dup2(slave_console_fd, 0);
+       dup2(slave_console_fd, 1);
+       dup2(slave_console_fd, 2);
+       close(slave_console_fd);
+
+       INFO("using guest console %s\n", console_fname);
+
+       return 0;
+
+no_console:
+       /*
+        * NOTE(review): pass_console() already closes the descriptor when the
+        * ubus request succeeded, so this close() may then act on a stale
+        * fd number - confirm whether that needs tracking.
+        */
+       close(console_fd);
+       return 1;
+}
+
+/* set to 1 by run_hooklist() before forking a hook, cleared once it ended */
+static int hook_running = 0;
+/* exit status or terminating signal number of the most recent hook */
+static int hook_return_code = 0;
+/* cursor into the NULL-terminated hook list currently being executed */
+static struct hook_execvpe **current_hook = NULL;
+typedef void (*hook_return_handler)(void);
+/* continuation called by run_hooklist() when the cursor reaches the end */
+static hook_return_handler hook_return_cb = NULL;
+
+static void hook_process_timeout_cb(struct uloop_timeout *t);
+/* timer used to SIGKILL a hook process that does not finish in time */
+static struct uloop_timeout hook_process_timeout = {
+       .cb = hook_process_timeout_cb,
+};
+
+static void run_hooklist(void);
+/*
+ * Completion handler for a hook process (installed as hook_process.cb).
+ * Stores the exit status or terminating signal in hook_return_code, logs
+ * it, then moves the cursor to the next hook and continues the list.
+ */
+static void hook_process_handler(struct uloop_process *c, int ret)
+{
+       const int exited = WIFEXITED(ret);
+
+       uloop_timeout_cancel(&hook_process_timeout);
+
+       hook_return_code = exited ? WEXITSTATUS(ret) : WTERMSIG(ret);
+
+       if (!exited)
+               ERROR("hook (%d) exited with signal: %d\n", c->pid, hook_return_code);
+       else if (hook_return_code)
+               ERROR("hook (%d) exited with exit: %d\n", c->pid, hook_return_code);
+       else
+               DEBUG("hook (%d) exited with exit: %d\n", c->pid, hook_return_code);
+
+       hook_running = 0;
+       ++current_hook;
+       run_hooklist();
+}
+
+/* process record of the running hook; hook_process_handler() is its callback */
+static struct uloop_process hook_process = {
+       .cb = hook_process_handler,
+};
+
+/* Timer callback: the hook process is still around, send it SIGKILL. */
+static void hook_process_timeout_cb(struct uloop_timeout *t)
+{
+       const pid_t victim = hook_process.pid;
+
+       DEBUG("hook process failed to stop, sending SIGKILL\n");
+       kill(victim, SIGKILL);
+}
+
+/*
+ * Execute the hook at *current_hook, or call hook_return_cb() when the
+ * end of the list has been reached.
+ *
+ * A hook that cannot be stat()ed, is not executable or cannot be forked is
+ * reported through hook_process_handler(), which itself advances to the
+ * next hook; this function must therefore return right after that call
+ * instead of carrying on with the failed hook.
+ */
+static void run_hooklist(void)
 {
+       struct hook_execvpe *hook = *current_hook;
        struct stat s;
-       char new[PATH_MAX];
-       int fd;
 
-       if (stat(path, &s)) {
-               ERROR("stat(%s) failed: %m\n", path);
-               return error;
+       if (!hook)
+               return hook_return_cb();
+
+       DEBUG("executing hook %s\n", hook->file);
+
+       if (stat(hook->file, &s)) {
+               hook_process_handler(&hook_process, ENOENT);
+               return; /* s is not valid, the handler moved on already */
+       }
+
+       if (!((unsigned long)s.st_mode & (S_IXUSR | S_IXGRP | S_IXOTH))) {
+               hook_process_handler(&hook_process, EPERM);
+               return; /* never fork a hook that is not executable */
+       }
+
+       hook_running = 1;
+       hook_process.pid = fork();
+       if (hook_process.pid == 0) {
+               /* child */
+               execve(hook->file, hook->argv, hook->envp);
+               ERROR("execve error %m\n");
+               _exit(errno);
+       } else if (hook_process.pid < 0) {
+               /* fork error */
+               ERROR("hook fork error\n");
+               hook_running = 0;
+               hook_process_handler(&hook_process, errno);
+               return; /* there is no child process to register or wait for */
        }
 
-       snprintf(new, sizeof(new), "%s%s", root, path);
-       if (S_ISDIR(s.st_mode)) {
-               mkdir_p(new, 0755);
-       } else {
-               mkdir_p(dirname(new), 0755);
-               snprintf(new, sizeof(new), "%s%s", root, path);
-               fd = creat(new, 0644);
-               if (fd == -1) {
-                       ERROR("creat(%s) failed: %m\n", new);
-                       return -1;
+       /* parent */
+       uloop_process_add(&hook_process);
+
+       if (hook->timeout > 0)
+               uloop_timeout_set(&hook_process_timeout, 1000 * hook->timeout);
+
+       uloop_run();
+       if (hook_running) {
+               DEBUG("uloop interrupted, killing jail process\n");
+               kill(hook_process.pid, SIGTERM);
+               uloop_timeout_set(&hook_process_timeout, 1000);
+               uloop_run();
+       }
+}
+
+/*
+ * Run all hooks of hooklist one after another and call return_cb once the
+ * last one has finished. Without a hook list return_cb is called directly.
+ */
+static void run_hooks(struct hook_execvpe **hooklist, hook_return_handler return_cb)
+{
+       if (!hooklist) {
+               return_cb();
+               /* run_hooklist() would dereference the NULL list otherwise */
+               return;
+       }
+
+       current_hook = hooklist;
+       hook_return_cb = return_cb;
+
+       run_hooklist();
+}
+
+/*
+ * Write every entry of opts.sysctl to <jail_root>/proc/sys/<entry>, using
+ * a procfs instance that is mounted only for the duration of this call.
+ *
+ * Returns 0 on success (or when there is nothing to apply), otherwise an
+ * errno-style value. The temporary mount, its directory and the path
+ * buffer are released on every path once the mount itself succeeded.
+ */
+static int apply_sysctl(const char *jail_root)
+{
+       struct sysctl_val **cur;
+       char *procdir, *fname;
+       int f, ret = 0;
+
+       if (!opts.sysctl)
+               return 0;
+
+       if (asprintf(&procdir, "%s/proc", jail_root) < 0)
+               return ENOMEM;
+
+       mkdir(procdir, 0700);
+       if (mount("proc", procdir, "proc", MS_NOATIME | MS_NODEV | MS_NOEXEC | MS_NOSUID, 0)) {
+               rmdir(procdir);
+               free(procdir);
+               return EPERM;
+       }
+
+       cur = opts.sysctl;
+
+       while (*cur) {
+               if (asprintf(&fname, "%s/sys/%s", procdir, (*cur)->entry) < 0) {
+                       ret = ENOMEM;
+                       break;
+               }
+
+               DEBUG("sysctl: writing '%s' to %s\n", (*cur)->value, fname);
+
+               f = open(fname, O_WRONLY);
+               if (f < 0) {
+                       /* capture errno before logging/cleanup can change it */
+                       ret = errno;
+                       ERROR("sysctl: can't open %s\n", fname);
+                       free(fname);
+                       break;
                }
-               close(fd);
+               if (write(f, (*cur)->value, strlen((*cur)->value)) < 0) {
+                       ret = errno;
+                       ERROR("sysctl: write to %s\n", fname);
+                       free(fname);
+                       close(f);
+                       break;
+               }
+
+               free(fname);
+               close(f);
+               ++cur;
        }
+       umount(procdir);
+       rmdir(procdir);
+       free(procdir);
 
-       if (mount(path, new, NULL, MS_BIND, NULL)) {
-               ERROR("failed to mount -B %s %s: %m\n", path, new);
-               return -1;
+       return ret;
+}
+
+/* glibc defines makedev calling a function. make sure it's a pure macro */
+#if defined(__GLIBC__)
+#undef makedev
+/* from musl's sys/sysmacros.h */
+#define makedev(x,y) ( \
+       (((x)&0xfffff000ULL) << 32) | \
+       (((x)&0x00000fffULL) << 8) | \
+       (((y)&0xffffff00ULL) << 12) | \
+       (((y)&0x000000ffULL)) )
+#endif
+
+/*
+ * Character devices created by create_devices() in every jail; the list
+ * ends with an all-zero entry (path == NULL).
+ */
+static struct mknod_args default_devices[] = {
+       { .path = "/dev/null", .mode = (S_IFCHR|S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH|S_IWOTH), .dev = makedev(1, 3) },
+       { .path = "/dev/zero", .mode = (S_IFCHR|S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH|S_IWOTH), .dev = makedev(1, 5) },
+       { .path = "/dev/full", .mode = (S_IFCHR|S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH|S_IWOTH), .dev = makedev(1, 7) },
+       { .path = "/dev/random", .mode = (S_IFCHR|S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH|S_IWOTH), .dev = makedev(1, 8) },
+       { .path = "/dev/urandom", .mode = (S_IFCHR|S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH|S_IWOTH), .dev = makedev(1, 9) },
+       { .path = "/dev/tty", .mode = (S_IFCHR|S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP), .dev = makedev(5, 0), .gid = 5 },
+       { 0 },
+};
+
+/*
+ * Create the device nodes listed in opts.devices (if any), then the
+ * entries of default_devices and finally the /dev symlinks.
+ *
+ * Returns 0 on success, EPERM for a device path outside of /dev/, or the
+ * errno of a failed mknod()/chown(). Default devices that cannot be
+ * created are skipped; failing symlinks only produce a warning.
+ */
+static int create_devices(void)
+{
+       struct mknod_args **cur, *curdef;
+       char *path, *tmp;
+       int ret;
+
+       if (!opts.devices)
+               goto only_default_devices;
+
+       cur = opts.devices;
+
+       while (*cur) {
+               path = (*cur)->path;
+               /*
+                * don't allow devices outside of /dev; the trailing slash
+                * matters, a bare "/dev" prefix also matches "/devfoo/x"
+                */
+               if (strncmp(path, "/dev/", 5))
+                       return EPERM;
+
+               /* make sure parent folder exists */
+               tmp = strrchr(path, '/');
+               if (!tmp)
+                       return EINVAL;
+
+               *tmp = '\0';
+               if (strcmp(path, "/dev")) {
+                       DEBUG("creating directory %s\n", path);
+
+                       mkdir_p(path, 0755);
+               }
+               *tmp = '/';
+
+               DEBUG("creating %s (mode=%08o)\n", path, (*cur)->mode);
+
+               /* create device */
+               if (mknod(path, (*cur)->mode, (*cur)->dev))
+                       return errno;
+
+               /* change owner, if needed */
+               if (((*cur)->uid || (*cur)->gid) &&
+                   chown(path, (*cur)->uid, (*cur)->gid))
+                       return errno;
+
+               ++cur;
        }
 
-       if (readonly && mount(NULL, new, NULL, MS_BIND | MS_REMOUNT | MS_RDONLY, NULL)) {
-               ERROR("failed to remount ro %s: %m\n", new);
-               return -1;
+only_default_devices:
+       curdef = default_devices;
+       while(curdef->path) {
+               DEBUG("creating %s (mode=%08o)\n", curdef->path, curdef->mode);
+               if (mknod(curdef->path, curdef->mode, curdef->dev)) {
+                       ++curdef;
+                       continue; /* may already exist, eg. due to a bind-mount */
+               }
+               if ((curdef->uid || curdef->gid) &&
+                   chown(curdef->path, curdef->uid, curdef->gid))
+                       return errno;
+
+               ++curdef;
        }
 
-       DEBUG("mount -B %s %s (%s)\n", path, new, readonly?"ro":"rw");
+       /* Dev symbolic links as defined in OCI spec */
+       ret = symlink("/dev/pts/ptmx", "/dev/ptmx");
+       if (ret < 0)
+               WARNING("symlink() failed to create link to /dev/pts/ptmx\n");
+
+       ret = symlink("/proc/self/fd", "/dev/fd");
+       if (ret < 0)
+               WARNING("symlink() failed to create link to /proc/self/fd\n");
+
+       ret = symlink("/proc/self/fd/0", "/dev/stdin");
+       if (ret < 0)
+               WARNING("symlink() failed to create link to /proc/self/fd/0\n");
+
+       ret = symlink("/proc/self/fd/1", "/dev/stdout");
+       if (ret < 0)
+               WARNING("symlink() failed to create link to /proc/self/fd/1\n");
+
+       ret = symlink("/proc/self/fd/2", "/dev/stderr");
+       if (ret < 0)
+               WARNING("symlink() failed to create link to /proc/self/fd/2\n");
 
        return 0;
 }
 
+/* mkdtemp() templates, filled in by build_jail_fs() */
+static char jail_root[] = "/tmp/ujail-XXXXXX";
+static char tmpovdir[] = "/tmp/ujail-overlay-XXXXXX";
+/* umask saved by build_jail_fs() and restored by enter_jail_fs() */
+static mode_t old_umask;
+static void enter_jail_fs(void);
+/*
+ * Prepare the jail root: create the temporary root directory, apply the
+ * sysctl values, mount the root (extroot bind mount or tmpfs), optionally
+ * stack an overlay on top, set up console and resolv.conf and then run
+ * the createContainer hooks with enter_jail_fs() as continuation.
+ *
+ * Returns 0 once the hooks have been started and non-zero on failure.
+ */
 static int build_jail_fs(void)
 {
-       char jail_root[] = "/tmp/ujail-XXXXXX";
+       char *overlaydir = NULL;
+       int ret;
+
+       old_umask = umask(0);
+
        if (mkdtemp(jail_root) == NULL) {
                ERROR("mkdtemp(%s) failed: %m\n", jail_root);
                return -1;
        }
 
+       if (apply_sysctl(jail_root)) {
+               ERROR("failed to apply sysctl values\n");
+               return -1;
+       }
+
        /* oldroot can't be MS_SHARED else pivot_root() fails */
-       if (mount("none", "/", NULL, MS_REC|MS_PRIVATE, NULL)) {
+       if (mount("none", "/", "none", MS_REC|MS_PRIVATE, NULL)) {
                ERROR("private mount failed %m\n");
                return -1;
        }
 
-       if (mount("tmpfs", jail_root, "tmpfs", MS_NOATIME, "mode=0755")) {
-               ERROR("tmpfs mount failed %m\n");
-               return -1;
+       if (opts.extroot) {
+               if (mount(opts.extroot, jail_root, "bind", MS_BIND, NULL)) {
+                       ERROR("extroot mount failed %m\n");
+                       return -1;
+               }
+       } else {
+               if (mount("tmpfs", jail_root, "tmpfs", MS_NOATIME, "mode=0755")) {
+                       ERROR("tmpfs mount failed %m\n");
+                       return -1;
+               }
+       }
+
+       if (opts.tmpoverlaysize) {
+               /* room for sizes longer than 8 characters, checked below */
+               char mountoptsstr[sizeof("mode=0755,size=") + 32];
+
+               /* refuse to mount with a silently shortened size value */
+               if (snprintf(mountoptsstr, sizeof(mountoptsstr),
+                            "mode=0755,size=%s", opts.tmpoverlaysize) >= (int)sizeof(mountoptsstr)) {
+                       ERROR("tmpfs overlay size '%s' is too long\n", opts.tmpoverlaysize);
+                       return -1;
+               }
+               if (mkdtemp(tmpovdir) == NULL) {
+                       ERROR("mkdtemp(%s) failed: %m\n", tmpovdir);
+                       return -1;
+               }
+               if (mount("tmpfs", tmpovdir, "tmpfs", MS_NOATIME,
+                         mountoptsstr)) {
+                       ERROR("failed to mount tmpfs for overlay (size=%s)\n", opts.tmpoverlaysize);
+                       return -1;
+               }
+               overlaydir = tmpovdir;
+       }
+
+       if (opts.overlaydir)
+               overlaydir = opts.overlaydir;
+
+       if (overlaydir) {
+               ret = mount_overlay(jail_root, overlaydir);
+               if (ret)
+                       return ret;
        }
 
        if (chdir(jail_root)) {
@@ -151,197 +747,1911 @@ static int build_jail_fs(void)
                return -1;
        }
 
+       if (opts.console)
+               create_dev_console(jail_root);
+
+       /* make sure /etc/resolv.conf exists if in new network namespace */
+       if (opts.namespace & CLONE_NEWNET) {
+               char jailetc[PATH_MAX], jaillink[PATH_MAX];
+
+               snprintf(jailetc, PATH_MAX, "%s/etc", jail_root);
+               mkdir_p(jailetc, 0755);
+               snprintf(jaillink, PATH_MAX, "%s/etc/resolv.conf", jail_root);
+               if (overlaydir)
+                       unlink(jaillink);
+
+               ret = symlink("../dev/resolv.conf.d/resolv.conf.auto", jaillink);
+               if (ret < 0)
+                       WARNING("symlink() failed to create link to ../dev/resolv.conf.d/resolv.conf.auto\n");
+       }
+
+       run_hooks(opts.hooks.createContainer, enter_jail_fs);
+
+       return 0;
+}
+
+/* set by exec_jail() so that the cleanup below knows it runs in the child */
+static bool exit_from_child;
+
+/*
+ * Release global resources and terminate the process with exit code ret.
+ * The cgroups and parent ubus context are only released outside the child.
+ */
+static void free_and_exit(int ret)
+{
+       const bool in_parent = !exit_from_child;
+
+       if (in_parent) {
+               if (opts.ocibundle)
+                       cgroups_free();
+
+               if (parent_ctx)
+                       ubus_free(parent_ctx);
+       }
+
+       free_opts(in_parent);
+
+       exit(ret);
+}
+
+static void post_jail_fs(void);
+/*
+ * Switch the root filesystem to jail_root via pivot_root(), detach and
+ * remove the old root, create the device nodes, optionally remount the
+ * new root read-only, restore the saved umask and continue with
+ * post_jail_fs(). Failures of pivot_root(), chdir() or create_devices()
+ * end the process through free_and_exit(-1).
+ */
+static void enter_jail_fs(void)
+{
        char dirbuf[sizeof(jail_root) + 4];
+
        snprintf(dirbuf, sizeof(dirbuf), "%s/old", jail_root);
        mkdir(dirbuf, 0755);
 
        if (pivot_root(jail_root, dirbuf) == -1) {
                ERROR("pivot_root(%s, %s) failed: %m\n", jail_root, dirbuf);
-               return -1;
+               free_and_exit(-1);
        }
        if (chdir("/")) {
                ERROR("chdir(/) (after pivot_root) failed: %m\n");
+               free_and_exit(-1);
+       }
+
+       /* the former jail_root path is now reachable below /old */
+       snprintf(dirbuf, sizeof(dirbuf), "/old%s", jail_root);
+       umount2(dirbuf, MNT_DETACH);
+       rmdir(dirbuf);
+       if (opts.tmpoverlaysize) {
+               char tmpdirbuf[sizeof(tmpovdir) + 4];
+               snprintf(tmpdirbuf, sizeof(tmpdirbuf), "/old%s", tmpovdir);
+               umount2(tmpdirbuf, MNT_DETACH);
+               rmdir(tmpdirbuf);
+       }
+
+       /* drop the old root itself */
+       umount2("/old", MNT_DETACH);
+       rmdir("/old");
+
+       if (create_devices()) {
+               ERROR("create_devices() failed\n");
+               free_and_exit(-1);
+       }
+       /* result of the read-only remount is not checked */
+       if (opts.ronly)
+               mount(NULL, "/", "bind", MS_REMOUNT | MS_BIND | MS_RDONLY, 0);
+
+       umask(old_umask);
+       post_jail_fs();
+}
+
+/*
+ * Write mapstr to /proc/<child_pid>/gid_map (gidmap true) or
+ * /proc/<child_pid>/uid_map (gidmap false).
+ *
+ * Returns 0 on success and -1 if the file cannot be opened or written.
+ */
+static int write_uid_gid_map(pid_t child_pid, bool gidmap, char *mapstr)
+{
+       int map_file;
+       char map_path[64];
+
+       if (snprintf(map_path, sizeof(map_path), "/proc/%d/%s",
+               child_pid, gidmap?"gid_map":"uid_map") < 0)
+               return -1;
+
+       if ((map_file = open(map_path, O_WRONLY)) < 0)
+               return -1;
+
+       /*
+        * dprintf() returns the number of bytes written, so a plain
+        * truth test would report every successful write as an error
+        */
+       if (dprintf(map_file, "%s", mapstr) < 0) {
+               close(map_file);
+               return -1;
+       }
+
+       close(map_file);
+       return 0;
+}
+
+/*
+ * Write the single mapping "0 <id> 1" to the gid_map (gidmap true) or
+ * uid_map (gidmap false) file of child_pid below /proc.
+ *
+ * Returns 0 on success and -1 on any failure.
+ */
+static int write_single_uid_gid_map(pid_t child_pid, bool gidmap, int id)
+{
+       char map_path[64];
+       const char *leaf = gidmap ? "gid_map" : "uid_map";
+       int map_file, written;
+
+       if (snprintf(map_path, sizeof(map_path), "/proc/%d/%s", child_pid, leaf) < 0)
+               return -1;
+
+       map_file = open(map_path, O_WRONLY);
+       if (map_file < 0)
+               return -1;
+
+       written = dprintf(map_file, "%d %d %d\n", 0, id, 1);
+       close(map_file);
+
+       return (written < 0) ? -1 : 0;
+}
+
+/*
+ * Write "allow" (allow true) or "deny" (allow false) to
+ * /proc/<child_pid>/setgroups.
+ *
+ * Returns 0 on success and -1 if the file cannot be opened or written.
+ */
+static int write_setgroups(pid_t child_pid, bool allow)
+{
+       int setgroups_file;
+       char setgroups_path[64];
+
+       if (snprintf(setgroups_path, sizeof(setgroups_path), "/proc/%d/setgroups",
+               child_pid) < 0) {
+               return -1;
+       }
+
+       if ((setgroups_file = open(setgroups_path, O_WRONLY)) < 0) {
                return -1;
        }
 
-       snprintf(dirbuf, sizeof(dirbuf), "/old%s", jail_root);
-       rmdir(dirbuf);
-       umount2("/old", MNT_DETACH);
-       rmdir("/old");
+       if (dprintf(setgroups_file, "%s", allow?"allow":"deny") == -1) {
+               close(setgroups_file);
+               return -1;
+       }
+
+       close(setgroups_file);
+       return 0;
+}
+
+/*
+ * Resolve opts.user and opts.group to numeric ids.
+ *
+ * user, user_gid: receive uid and primary gid of opts.user, or -1 each
+ *                 when no user was requested.
+ * gr_gid:         receives the gid of opts.group, or -1 when no group was
+ *                 requested.
+ *
+ * A name that cannot be resolved ends the process via free_and_exit().
+ */
+static void get_jail_user(int *user, int *user_gid, int *gr_gid)
+{
+       struct passwd *p = NULL;
+       struct group *g = NULL;
+
+       if (opts.user) {
+               /* an unknown name need not set errno; don't report a stale one */
+               errno = 0;
+               p = getpwnam(opts.user);
+               if (!p) {
+                       ERROR("failed to get uid/gid for user %s: %d (%s)\n",
+                             opts.user, errno, strerror(errno));
+                       free_and_exit(EXIT_FAILURE);
+               }
+               *user = p->pw_uid;
+               *user_gid = p->pw_gid;
+       } else {
+               *user = -1;
+               *user_gid = -1;
+       }
+
+       if (opts.group) {
+               errno = 0;
+               g = getgrnam(opts.group);
+               if (!g) {
+                       ERROR("failed to get gid for group %s: %m\n", opts.group);
+                       free_and_exit(EXIT_FAILURE);
+               }
+               *gr_gid = g->gr_gid;
+       } else {
+               *gr_gid = -1;
+       }
+}
+
+/*
+ * Switch to the requested identity: supplementary groups of opts.user
+ * first, then the group id, then the user id. An id of -1 means "leave
+ * unchanged"; any failing step ends the process via free_and_exit().
+ */
+static void set_jail_user(int pw_uid, int user_gid, int gr_gid)
+{
+       if (opts.user && user_gid != -1) {
+               if (initgroups(opts.user, user_gid)) {
+                       ERROR("failed to initgroups() for user %s: %m\n", opts.user);
+                       free_and_exit(EXIT_FAILURE);
+               }
+       }
+
+       if (gr_gid != -1) {
+               if (setregid(gr_gid, gr_gid)) {
+                       ERROR("failed to set group id %d: %m\n", gr_gid);
+                       free_and_exit(EXIT_FAILURE);
+               }
+       }
+
+       if (pw_uid != -1) {
+               if (setreuid(pw_uid, pw_uid)) {
+                       ERROR("failed to set user id %d: %m\n", pw_uid);
+                       free_and_exit(EXIT_FAILURE);
+               }
+       }
+}
+
+/*
+ * Apply every limit present in opts.rlimits with setrlimit().
+ * Returns 0 on success or the errno of the first failing call.
+ */
+static int apply_rlimits(void)
+{
+       int resource = 0;
+
+       while (resource < RLIM_NLIMITS) {
+               if (opts.rlimits[resource]) {
+                       DEBUG("applying limits to resource %u\n", resource);
+
+                       if (setrlimit(resource, opts.rlimits[resource]))
+                               return errno;
+               }
+               ++resource;
+       }
+
+       return 0;
+}
+
+/* capacity of the environment array, including the terminating NULL */
+#define MAX_ENVP       64
+/*
+ * Assemble the environment of the jailed process in a static array.
+ *
+ * seccomp: seccomp filter file; when set, SECCOMP_FILE, SECCOMP_DEBUG and
+ *          LD_PRELOAD (pointing to libpreload-seccomp.so) are added.
+ * ocienvp: optional NULL-terminated list of additional entries.
+ *
+ * Returns the NULL-terminated array, or NULL when the preload library is
+ * needed but was not found. Entries that do not fit are dropped.
+ */
+static char** build_envp(const char *seccomp, char **ocienvp)
+{
+       static char *envp[MAX_ENVP];
+       static char preload_var[PATH_MAX];
+       static char seccomp_var[PATH_MAX];
+       static char seccomp_debug_var[20];
+       static char debug_var[] = "LD_DEBUG=all";
+       static char container_var[] = "container=ujail";
+       const char *preload_lib = find_lib("libpreload-seccomp.so");
+       char **addenv;
+
+       int count = 0;
+
+       if (seccomp && !preload_lib) {
+               ERROR("failed to add preload-lib to env\n");
+               return NULL;
+       }
+       if (seccomp) {
+               snprintf(seccomp_var, sizeof(seccomp_var), "SECCOMP_FILE=%s", seccomp);
+               envp[count++] = seccomp_var;
+               snprintf(seccomp_debug_var, sizeof(seccomp_debug_var), "SECCOMP_DEBUG=%2d", debug);
+               envp[count++] = seccomp_debug_var;
+               snprintf(preload_var, sizeof(preload_var), "LD_PRELOAD=%s", preload_lib);
+               envp[count++] = preload_var;
+       }
+
+       envp[count++] = container_var;
+
+       if (debug > 1)
+               envp[count++] = debug_var;
+
+       for (addenv = ocienvp; addenv && *addenv; ++addenv) {
+               /* the last slot is reserved for the terminating NULL */
+               if (count >= MAX_ENVP - 1) {
+                       ERROR("environment limited to %d extra records, truncating\n", MAX_ENVP - 1);
+                       break;
+               }
+               envp[count++] = *addenv;
+       }
+
+       /* terminate explicitly: the array is static and may hold old entries */
+       envp[count] = NULL;
+
+       return envp;
+}
+
+/* Print the command line help and the security warning to stderr. */
+static void usage(void)
+{
+       fprintf(stderr, "ujail <options> -- <binary> <params ...>\n");
+       fprintf(stderr, "  -d <num>\tshow debug log (increase num to increase verbosity)\n");
+       fprintf(stderr, "  -S <file>\tseccomp filter config\n");
+       fprintf(stderr, "  -C <file>\tcapabilities drop config\n");
+       fprintf(stderr, "  -c\t\tset PR_SET_NO_NEW_PRIVS\n");
+       fprintf(stderr, "  -n <name>\tthe name of the jail\n");
+       fprintf(stderr, "  -e <var>\timport environment variable\n");
+       fprintf(stderr, "namespace jail options:\n");
+       fprintf(stderr, "  -h <hostname>\tchange the hostname of the jail\n");
+       fprintf(stderr, "  -N\t\tjail has network namespace\n");
+       fprintf(stderr, "  -f\t\tjail has user namespace\n");
+       fprintf(stderr, "  -F\t\tjail has cgroups namespace\n");
+       fprintf(stderr, "  -r <file>\treadonly files that should be staged\n");
+       fprintf(stderr, "  -w <file>\twriteable files that should be staged\n");
+       fprintf(stderr, "  -p\t\tjail has /proc\n");
+       fprintf(stderr, "  -s\t\tjail has /sys\n");
+       fprintf(stderr, "  -l\t\tjail has /dev/log\n");
+       fprintf(stderr, "  -u\t\tjail has a ubus socket\n");
+       fprintf(stderr, "  -U <name>\tuser to run jailed process\n");
+       fprintf(stderr, "  -G <name>\tgroup to run jailed process\n");
+       fprintf(stderr, "  -o\t\tremount jail root (/) read only\n");
+       fprintf(stderr, "  -R <dir>\texternal jail rootfs (system container)\n");
+       fprintf(stderr, "  -O <dir>\tdirectory for r/w overlayfs\n");
+       fprintf(stderr, "  -T <size>\tuse tmpfs r/w overlayfs with <size>\n");
+       fprintf(stderr, "  -E\t\tfail if jail cannot be setup\n");
+       fprintf(stderr, "  -y\t\tprovide jail console\n");
+       fprintf(stderr, "  -J <dir>\tcreate container from OCI bundle\n");
+       fprintf(stderr, "  -i\t\tstart container immediately\n");
+       fprintf(stderr, "  -P <pidfile>\tcreate <pidfile>\n");
+       fprintf(stderr, "\nWarning: by default root inside the jail is the same\n\
+and he has the same powers as root outside the jail,\n\
+thus he can escape the jail and/or break stuff.\n\
+Please use seccomp/capabilities (-S/-C) to restrict his powers\n\n\
+If you use none of the namespace jail options,\n\
+ujail will not use namespace/build a jail,\n\
+and will only drop capabilities/apply seccomp filter.\n\n");
+}
+
+/*
+ * Map a CLONE_NEW* namespace flag to the matching descriptor slot in
+ * opts.setns. Returns NULL for a flag that has no slot.
+ */
+static int* get_namespace_fd(const unsigned int nstype)
+{
+       int *slot = NULL;
+
+       switch (nstype) {
+       case CLONE_NEWPID:
+               slot = &opts.setns.pid;
+               break;
+       case CLONE_NEWNET:
+               slot = &opts.setns.net;
+               break;
+       case CLONE_NEWNS:
+               slot = &opts.setns.ns;
+               break;
+       case CLONE_NEWIPC:
+               slot = &opts.setns.ipc;
+               break;
+       case CLONE_NEWUTS:
+               slot = &opts.setns.uts;
+               break;
+       case CLONE_NEWUSER:
+               slot = &opts.setns.user;
+               break;
+       case CLONE_NEWCGROUP:
+               slot = &opts.setns.cgroup;
+               break;
+#ifdef CLONE_NEWTIME
+       case CLONE_NEWTIME:
+               slot = &opts.setns.time;
+               break;
+#endif
+       default:
+               break;
+       }
+
+       return slot;
+}
+
+/*
+ * Join the namespace whose descriptor is stored for nstype, if one is set
+ * (slot value >= 0), and close that descriptor afterwards.
+ *
+ * Returns 0 on success or when there is nothing to join, otherwise the
+ * errno of the failed setns() call.
+ */
+static int setns_open(unsigned long nstype)
+{
+       int *fd = get_namespace_fd(nstype);
+       int err = 0;
+
+       assert(fd != NULL);
+
+       if (*fd < 0)
+               return 0;
+
+       /* take errno right away, close() below may overwrite it */
+       if (setns(*fd, nstype) == -1)
+               err = errno;
+
+       close(*fd);
+       return err;
+}
+
+/* non-zero while the jailed process runs; cleared by jail_process_handler() */
+static int jail_running = 0;
+/* exit status or terminating signal number of the jailed process */
+static int jail_return_code = 0;
+
+static void jail_process_timeout_cb(struct uloop_timeout *t);
+/* timer used to SIGKILL a jailed process that ignores SIGTERM */
+static struct uloop_timeout jail_process_timeout = {
+       .cb = jail_process_timeout_cb,
+};
+static void poststop(void);
+/*
+ * Completion handler for the jailed process (installed as jail_process.cb):
+ * stores exit status or terminating signal in jail_return_code, marks the
+ * jail as stopped and continues with poststop().
+ */
+static void jail_process_handler(struct uloop_process *c, int ret)
+{
+       const int exited = WIFEXITED(ret);
+
+       uloop_timeout_cancel(&jail_process_timeout);
+
+       jail_return_code = exited ? WEXITSTATUS(ret) : WTERMSIG(ret);
+
+       if (exited)
+               INFO("jail (%d) exited with exit: %d\n", c->pid, jail_return_code);
+       else
+               INFO("jail (%d) exited with signal: %d\n", c->pid, jail_return_code);
+
+       jail_running = 0;
+       poststop();
+}
+
+/* process record of the jailed process; jail_process_handler() is its callback */
+static struct uloop_process jail_process = {
+       .cb = jail_process_handler,
+};
+
+/* Timer callback: the jailed process is still around, send it SIGKILL. */
+static void jail_process_timeout_cb(struct uloop_timeout *t)
+{
+       const pid_t victim = jail_process.pid;
+
+       DEBUG("jail process failed to stop, sending SIGKILL\n");
+       kill(victim, SIGKILL);
+}
+
+/*
+ * Signal handler: relay signo to the running hook and/or jailed process.
+ * For SIGTERM a timer of opts.term_timeout seconds is armed whose callback
+ * sends SIGKILL.
+ */
+static void jail_handle_signal(int signo)
+{
+       const int is_term = (signo == SIGTERM);
+
+       if (hook_running) {
+               DEBUG("forwarding signal %d to the hook process\n", signo);
+               kill(hook_process.pid, signo);
+               /* arm the SIGKILL fallback for a hook that ignores SIGTERM */
+               if (is_term)
+                       uloop_timeout_set(&hook_process_timeout, opts.term_timeout * 1000);
+       }
+
+       if (jail_running) {
+               DEBUG("forwarding signal %d to the jailed process\n", signo);
+               kill(jail_process.pid, signo);
+               /* arm the SIGKILL fallback for a jail that ignores SIGTERM */
+               if (is_term)
+                       uloop_timeout_set(&jail_process_timeout, opts.term_timeout * 1000);
+       }
+}
+
+/*
+ * Install jail_handle_signal() for all signal numbers below _NSIG, except
+ * SIGCHLD, SIGPIPE, SIGSEGV, SIGSTOP and SIGKILL.
+ */
+static void signals_init(void)
+{
+       sigset_t all;
+       int signo;
+
+       sigfillset(&all);
+       for (signo = 0; signo < _NSIG; signo++) {
+               struct sigaction act = { 0 };
+
+               if (sigismember(&all, signo) == 0)
+                       continue;
+
+               switch (signo) {
+               case SIGCHLD:
+               case SIGPIPE:
+               case SIGSEGV:
+               case SIGSTOP:
+               case SIGKILL:
+                       /* these keep their current disposition */
+                       continue;
+               default:
+                       break;
+               }
+
+               act.sa_handler = jail_handle_signal;
+               sigaction(signo, &act, NULL);
+       }
+}
+
+static void pre_exec_jail(struct uloop_timeout *t);
+/* timer added by exec_jail() so that pre_exec_jail() runs from within uloop */
+static struct uloop_timeout pre_exec_timeout = {
+       .cb = pre_exec_jail,
+};
+
+/*
+ * pipe descriptors shared with the parent; as used by exec_jail() the child
+ * writes to pipes[1], reads from pipes[2] and closes pipes[0] and pipes[3]
+ */
+int pipes[4];
+/*
+ * Entry point of the child process: joins the requested namespaces,
+ * synchronises with the parent over the pipes, sets up ids and hostname
+ * and then enters uloop, where pre_exec_jail() continues the setup.
+ *
+ * arg is not used. Returns EXIT_FAILURE when the handshake with the
+ * parent fails; the end of the function is only reached when uloop_run()
+ * returns.
+ */
+static int exec_jail(void *arg)
+{
+       char buf[1];
+
+       /* makes free_and_exit() skip the parent-only cleanup */
+       exit_from_child = true;
+       prctl(PR_SET_SECUREBITS, 0);
+
+       uloop_init();
+       signals_init();
+
+       /* these two pipe ends are not used in the child */
+       close(pipes[0]);
+       close(pipes[3]);
+
+       setns_open(CLONE_NEWUSER);
+       setns_open(CLONE_NEWNET);
+       setns_open(CLONE_NEWNS);
+       setns_open(CLONE_NEWIPC);
+       setns_open(CLONE_NEWUTS);
+
+       /* handshake: send 'i' to the parent, then wait for its 'O' */
+       buf[0] = 'i';
+       if (write(pipes[1], buf, 1) < 1) {
+               ERROR("can't write to parent\n");
+               return EXIT_FAILURE;
+       }
+       close(pipes[1]);
+       if (read(pipes[2], buf, 1) < 1) {
+               ERROR("can't read from parent\n");
+               return EXIT_FAILURE;
+       }
+       if (buf[0] != 'O') {
+               ERROR("parent had an error, child exiting\n");
+               return EXIT_FAILURE;
+       }
+
+       if (opts.namespace & CLONE_NEWCGROUP)
+               unshare(CLONE_NEWCGROUP);
+
+       setns_open(CLONE_NEWCGROUP);
+
+       /* inside a user namespace: become uid/gid 0 and drop extra groups */
+       if ((opts.namespace & CLONE_NEWUSER) || (opts.setns.user != -1)) {
+               if (setregid(0, 0) < 0) {
+                       ERROR("setgid\n");
+                       free_and_exit(EXIT_FAILURE);
+               }
+               if (setreuid(0, 0) < 0) {
+                       ERROR("setuid\n");
+                       free_and_exit(EXIT_FAILURE);
+               }
+               if (setgroups(0, NULL) < 0) {
+                       ERROR("setgroups\n");
+                       free_and_exit(EXIT_FAILURE);
+               }
+       }
+
+       if (opts.namespace && opts.hostname && strlen(opts.hostname) > 0
+                       && sethostname(opts.hostname, strlen(opts.hostname))) {
+               ERROR("sethostname(%s) failed: %m\n", opts.hostname);
+               free_and_exit(EXIT_FAILURE);
+       }
+
+       /* continue in pre_exec_jail() once the event loop is running */
+       uloop_timeout_add(&pre_exec_timeout);
+       uloop_run();
+
+       free_and_exit(-1);
+       return -1;
+}
+
+/*
+ * Timer callback scheduled by exec_jail(): builds the jail filesystem when
+ * a mount namespace is requested. Without one, or when build_jail_fs()
+ * returned 0, the createContainer hooks are run with post_jail_fs() as
+ * continuation; a build failure ends the process.
+ */
+static void pre_exec_jail(struct uloop_timeout *t)
+{
+       if (!(opts.namespace & CLONE_NEWNS) || !build_jail_fs()) {
+               run_hooks(opts.hooks.createContainer, post_jail_fs);
+               return;
+       }
+
+       ERROR("failed to build jail fs\n");
+       free_and_exit(EXIT_FAILURE);
+}
+
+static void post_start_hook(void);
+/*
+ * Wait for the '!' byte from the parent on pipes[2], close that pipe end
+ * and run the startContainer hooks with post_start_hook() as continuation.
+ * Any other outcome of the read ends the process.
+ */
+static void post_jail_fs(void)
+{
+       char token;
+
+       if (read(pipes[2], &token, 1) < 1) {
+               ERROR("can't read from parent\n");
+               free_and_exit(EXIT_FAILURE);
+       }
+
+       if (token != '!') {
+               ERROR("parent had an error, child exiting\n");
+               free_and_exit(EXIT_FAILURE);
+       }
+
+       close(pipes[2]);
+
+       run_hooks(opts.hooks.startContainer, post_start_hook);
+}
+
+/*
+ * Last stage before exec, handed to run_hooks() as the follow-up to the
+ * startContainer hooks. The steps are order-sensitive:
+ *  1. if a capability set was specified, set SECBIT_NO_SETUID_FIXUP so the
+ *     uid/gid switch does not drop capabilities,
+ *  2. apply the capability set while retaining SETGID/SETUID/SETPCAP,
+ *  3. switch user/group, set supplementary groups, apply the umask,
+ *  4. restore the securebits (locked unless a user namespace is used),
+ *  5. apply the capability set without the helper capabilities and
+ *     optionally set no_new_privs,
+ *  6. build the environment, change directory, apply the OCI seccomp filter,
+ *  7. exec the jailed program.
+ * Every failure ends the process via free_and_exit(EXIT_FAILURE).
+ */
+static void post_start_hook(void)
+{
+       int pw_uid, pw_gid, gr_gid;
+
+       /*
+        * make sure setuid/setgid won't drop capabilities in case capabilities
+        * have been specified explicitly.
+        */
+       if (opts.capset.apply) {
+               if (prctl(PR_SET_SECUREBITS, SECBIT_NO_SETUID_FIXUP)) {
+                       ERROR("prctl(PR_SET_SECUREBITS) failed: %m\n");
+                       free_and_exit(EXIT_FAILURE);
+               }
+       }
+
+       /* drop capabilities, retain those still needed to further setup jail */
+       if (applyOCIcapabilities(opts.capset, (1LLU << CAP_SETGID) | (1LLU << CAP_SETUID) | (1LLU << CAP_SETPCAP)))
+               free_and_exit(EXIT_FAILURE);
+
+       /* use either cmdline-supplied user/group or uid/gid from OCI spec */
+       get_jail_user(&pw_uid, &pw_gid, &gr_gid);
+       set_jail_user(opts.pw_uid?:pw_uid, opts.pw_gid?:pw_gid, opts.gr_gid?:gr_gid);
+
+       if (opts.additional_gids &&
+           (setgroups(opts.num_additional_gids, opts.additional_gids) < 0)) {
+               ERROR("setgroups failed: %m\n");
+               free_and_exit(EXIT_FAILURE);
+       }
+
+       if (opts.set_umask)
+               umask(opts.umask);
+
+       /* restore securebits back to normal (and lock them if not in userns) */
+       if (opts.capset.apply) {
+               if (prctl(PR_SET_SECUREBITS, (opts.namespace & CLONE_NEWUSER)?0:
+                   SECBIT_KEEP_CAPS_LOCKED|SECBIT_NO_SETUID_FIXUP_LOCKED|SECBIT_NOROOT_LOCKED)) {
+                       ERROR("prctl(PR_SET_SECUREBITS) failed: %m\n");
+                       free_and_exit(EXIT_FAILURE);
+               }
+       }
+
+       /* drop remaining capabilities to end up with specified sets */
+       if (applyOCIcapabilities(opts.capset, 0))
+               free_and_exit(EXIT_FAILURE);
+
+       if (opts.no_new_privs && prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
+               ERROR("prctl(PR_SET_NO_NEW_PRIVS) failed: %m\n");
+               free_and_exit(EXIT_FAILURE);
+       }
+
+       char **envp = build_envp(opts.seccomp, opts.envp);
+       if (!envp)
+               free_and_exit(EXIT_FAILURE);
+
+       /* report why we bail out instead of exiting without any message */
+       if (opts.cwd && chdir(opts.cwd)) {
+               ERROR("chdir(%s) failed: %m\n", opts.cwd);
+               free_and_exit(EXIT_FAILURE);
+       }
+
+       if (opts.ociseccomp && applyOCIlinuxseccomp(opts.ociseccomp))
+               free_and_exit(EXIT_FAILURE);
+
+       uloop_end();
+       /*
+        * NOTE(review): opts.jail_argv and opts.envp are still read below, so
+        * free_opts(false) is assumed to leave them intact -- confirm.
+        */
+       free_opts(false);
+       INFO("exec-ing %s\n", *opts.jail_argv);
+       if (opts.envp) /* respect PATH if potentially set in ENV */
+               execvpe(*opts.jail_argv, opts.jail_argv, envp);
+       else
+               execve(*opts.jail_argv, opts.jail_argv, envp);
+
+       /* we get there only if execve fails */
+       ERROR("failed to execve %s: %m\n", *opts.jail_argv);
+       exit(EXIT_FAILURE);
+}
+
+/*
+ * Open the namespace file "/proc/<target_ns>/ns/<nstype>" read-only.
+ * Returns the descriptor from open(), i.e. -1 on failure.
+ */
+int ns_open_pid(const char *nstype, const pid_t target_ns)
+{
+       char ns_file[PATH_MAX];
+       int fd;
+
+       snprintf(ns_file, sizeof(ns_file), "/proc/%u/ns/%s", target_ns, nstype);
+       fd = open(ns_file, O_RDONLY);
+
+       return fd;
+}
+
+/*
+ * Copy the entries of a blobmsg array into a newly allocated,
+ * NULL-terminated string vector.
+ *
+ * msg:  blobmsg array whose entries are read with blobmsg_get_string()
+ * envp: receives the vector; set to NULL when the array is empty or when
+ *       an allocation fails
+ *
+ * Returns 0 on success (including the empty case) or ENOMEM. On failure
+ * nothing allocated here is left behind.
+ */
+static int parseOCIenvarray(struct blob_attr *msg, char ***envp)
+{
+       struct blob_attr *cur;
+       char **list;
+       int sz = 0, rem;
+
+       *envp = NULL;
+
+       blobmsg_for_each_attr(cur, msg, rem)
+               ++sz;
+
+       if (!sz)
+               return 0;
+
+       /* one extra zeroed slot serves as the terminating NULL */
+       list = calloc(1 + sz, sizeof(char*));
+       if (!list)
+               return ENOMEM;
+
+       sz = 0;
+       blobmsg_for_each_attr(cur, msg, rem) {
+               list[sz] = strdup(blobmsg_get_string(cur));
+               if (!list[sz]) {
+                       /* a NULL here would silently truncate the vector */
+                       while (sz > 0)
+                               free(list[--sz]);
+                       free(list);
+                       return ENOMEM;
+               }
+               ++sz;
+       }
+
+       *envp = list;
+
+       return 0;
+}
+
+enum {
+       OCI_ROOT_PATH,
+       OCI_ROOT_READONLY,
+       __OCI_ROOT_MAX,
+};
+
+static const struct blobmsg_policy oci_root_policy[] = {
+       [OCI_ROOT_PATH] = { "path", BLOBMSG_TYPE_STRING },
+       [OCI_ROOT_READONLY] = { "readonly", BLOBMSG_TYPE_BOOL },
+};
+
+/*
+ * Parse the "root" object: resolve the root filesystem path into
+ * opts.extroot and pick up the optional "readonly" flag into opts.ronly.
+ *
+ * jsonfile: path of the JSON file; its directory part is prepended to a
+ *           relative root path
+ * msg:      blobmsg table holding "path" and optionally "readonly"
+ *
+ * Returns 0 on success, ENODATA without "path", ENOTDIR when jsonfile has
+ * no directory component, ENAMETOOLONG when the combined path does not fit
+ * into PATH_MAX, or the errno left by realpath().
+ */
+static int parseOCIroot(const char *jsonfile, struct blob_attr *msg)
+{
+       char extroot[PATH_MAX] = { 0 };
+       struct blob_attr *tb[__OCI_ROOT_MAX];
+       char *cur;
+       char *root_path;
+
+       blobmsg_parse(oci_root_policy, __OCI_ROOT_MAX, tb, blobmsg_data(msg), blobmsg_len(msg));
+
+       if (!tb[OCI_ROOT_PATH])
+               return ENODATA;
+
+       root_path = blobmsg_get_string(tb[OCI_ROOT_PATH]);
+
+       /* prepend bundle directory in case of relative paths */
+       if (root_path[0] != '/') {
+               /* refuse to truncate: a cut-off path may name another directory */
+               if (strlen(jsonfile) >= sizeof(extroot))
+                       return ENAMETOOLONG;
+
+               strcpy(extroot, jsonfile);
+
+               cur = strrchr(extroot, '/');
+
+               if (!cur)
+                       return ENOTDIR;
+
+               /* keep everything up to and including the last '/' */
+               *(++cur) = '\0';
+       }
+
+       if (strlen(extroot) + strlen(root_path) >= sizeof(extroot))
+               return ENAMETOOLONG;
+
+       strcat(extroot, root_path);
+
+       /* follow symbolic link(s) */
+       opts.extroot = realpath(extroot, NULL);
+       if (!opts.extroot)
+               return errno;
+
+       if (tb[OCI_ROOT_READONLY])
+               opts.ronly = blobmsg_get_bool(tb[OCI_ROOT_READONLY]);
+
+       return 0;
+}
+
+
+enum {
+       OCI_HOOK_PATH,
+       OCI_HOOK_ARGS,
+       OCI_HOOK_ENV,
+       OCI_HOOK_TIMEOUT,
+       __OCI_HOOK_MAX,
+};
+
+static const struct blobmsg_policy oci_hook_policy[] = {
+       [OCI_HOOK_PATH] = { "path", BLOBMSG_TYPE_STRING },
+       [OCI_HOOK_ARGS] = { "args", BLOBMSG_TYPE_ARRAY },
+       [OCI_HOOK_ENV] = { "env", BLOBMSG_TYPE_ARRAY },
+       [OCI_HOOK_TIMEOUT] = { "timeout", BLOBMSG_TYPE_INT32 },
+};
+
+
+/*
+ * Turn a blobmsg array of hook objects into a NULL-terminated array of
+ * struct hook_execvpe pointers stored in *hooklist.
+ *
+ * Each hook needs "path"; "args", "env" and "timeout" are optional. Without
+ * "args" the argument vector consists of the path alone.
+ *
+ * Returns 0 on success (an empty array leaves *hooklist untouched), EINVAL
+ * for a hook without "path", ENOMEM on allocation failure or the error of
+ * parseOCIenvarray(). On error the list is released with free_hooklist()
+ * and *hooklist is reset to NULL.
+ */
+static int parseOCIhook(struct hook_execvpe ***hooklist, struct blob_attr *msg)
+{
+       struct blob_attr *tb[__OCI_HOOK_MAX];
+       struct blob_attr *cur;
+       struct hook_execvpe *hook;
+       int rem, ret = 0;
+       int idx = 0;
+
+       blobmsg_for_each_attr(cur, msg, rem)
+               ++idx;
+
+       if (!idx)
+               return 0;
+
+       *hooklist = calloc(idx + 1, sizeof(struct hook_execvpe *));
+       idx = 0;
+
+       if (!(*hooklist))
+               return ENOMEM;
+
+       blobmsg_for_each_attr(cur, msg, rem) {
+               blobmsg_parse(oci_hook_policy, __OCI_HOOK_MAX, tb, blobmsg_data(cur), blobmsg_len(cur));
+
+               if (!tb[OCI_HOOK_PATH]) {
+                       ret = EINVAL;
+                       goto errout;
+               }
+
+               hook = calloc(1, sizeof(struct hook_execvpe));
+               if (!hook) {
+                       ret = ENOMEM;
+                       goto errout;
+               }
+
+               /* publish the entry right away so errout can release it */
+               (*hooklist)[idx] = hook;
+
+               if (tb[OCI_HOOK_ARGS]) {
+                       ret = parseOCIenvarray(tb[OCI_HOOK_ARGS], &hook->argv);
+                       if (ret)
+                               goto errout;
+               } else {
+                       /* zeroed two-slot vector: argv[1] is the terminator */
+                       hook->argv = calloc(2, sizeof(char *));
+                       if (!hook->argv) {
+                               ret = ENOMEM;
+                               goto errout;
+                       }
+
+                       hook->argv[0] = strdup(blobmsg_get_string(tb[OCI_HOOK_PATH]));
+                       if (!hook->argv[0]) {
+                               ret = ENOMEM;
+                               goto errout;
+                       }
+               }
+
+               if (tb[OCI_HOOK_ENV]) {
+                       ret = parseOCIenvarray(tb[OCI_HOOK_ENV], &hook->envp);
+                       if (ret)
+                               goto errout;
+               }
+
+               if (tb[OCI_HOOK_TIMEOUT])
+                       hook->timeout = blobmsg_get_u32(tb[OCI_HOOK_TIMEOUT]);
+
+               hook->file = strdup(blobmsg_get_string(tb[OCI_HOOK_PATH]));
+               if (!hook->file) {
+                       ret = ENOMEM;
+                       goto errout;
+               }
+
+               ++idx;
+       }
+
+       (*hooklist)[idx] = NULL;
+
+       DEBUG("added %d hooks\n", idx);
+
+       return 0;
+
+errout:
+       /*
+        * NOTE(review): relies on free_hooklist() coping with a partly
+        * filled last entry, as the pre-existing error paths already did.
+        */
+       free_hooklist(*hooklist);
+       *hooklist = NULL;
+
+       return ret;
+}
+
+
+enum {
+       OCI_HOOKS_PRESTART,
+       OCI_HOOKS_CREATERUNTIME,
+       OCI_HOOKS_CREATECONTAINER,
+       OCI_HOOKS_STARTCONTAINER,
+       OCI_HOOKS_POSTSTART,
+       OCI_HOOKS_POSTSTOP,
+       __OCI_HOOKS_MAX,
+};
+
+static const struct blobmsg_policy oci_hooks_policy[] = {
+       [OCI_HOOKS_PRESTART] = { "prestart", BLOBMSG_TYPE_ARRAY },
+       [OCI_HOOKS_CREATERUNTIME] = { "createRuntime", BLOBMSG_TYPE_ARRAY },
+       [OCI_HOOKS_CREATECONTAINER] = { "createContainer", BLOBMSG_TYPE_ARRAY },
+       [OCI_HOOKS_STARTCONTAINER] = { "startContainer", BLOBMSG_TYPE_ARRAY },
+       [OCI_HOOKS_POSTSTART] = { "poststart", BLOBMSG_TYPE_ARRAY },
+       [OCI_HOOKS_POSTSTOP] = { "poststop", BLOBMSG_TYPE_ARRAY },
+};
+
+/*
+ * Parse the "hooks" object into the opts.hooks.* lists. A "prestart" entry
+ * is only reported as deprecated and otherwise ignored.
+ *
+ * Returns 0 on success or the error of the first parseOCIhook() call that
+ * fails. In that case the lists of the earlier stages are released and
+ * their opts.hooks.* pointers reset to NULL, so that no stale pointer is
+ * left behind for later cleanup code.
+ */
+static int parseOCIhooks(struct blob_attr *msg)
+{
+       struct blob_attr *tb[__OCI_HOOKS_MAX];
+       int ret;
+
+       blobmsg_parse(oci_hooks_policy, __OCI_HOOKS_MAX, tb, blobmsg_data(msg), blobmsg_len(msg));
+
+       if (tb[OCI_HOOKS_PRESTART])
+               INFO("warning: ignoring deprecated prestart hook\n");
+
+       if (tb[OCI_HOOKS_CREATERUNTIME]) {
+               ret = parseOCIhook(&opts.hooks.createRuntime, tb[OCI_HOOKS_CREATERUNTIME]);
+               if (ret)
+                       return ret;
+       }
+
+       if (tb[OCI_HOOKS_CREATECONTAINER]) {
+               ret = parseOCIhook(&opts.hooks.createContainer, tb[OCI_HOOKS_CREATECONTAINER]);
+               if (ret)
+                       goto out_createruntime;
+       }
+
+       if (tb[OCI_HOOKS_STARTCONTAINER]) {
+               ret = parseOCIhook(&opts.hooks.startContainer, tb[OCI_HOOKS_STARTCONTAINER]);
+               if (ret)
+                       goto out_createcontainer;
+       }
+
+       if (tb[OCI_HOOKS_POSTSTART]) {
+               ret = parseOCIhook(&opts.hooks.poststart, tb[OCI_HOOKS_POSTSTART]);
+               if (ret)
+                       goto out_startcontainer;
+       }
+
+       if (tb[OCI_HOOKS_POSTSTOP]) {
+               ret = parseOCIhook(&opts.hooks.poststop, tb[OCI_HOOKS_POSTSTOP]);
+               if (ret)
+                       goto out_poststart;
+       }
+
+       return 0;
+
+       /* unwind in reverse order; clear each pointer once its list is gone */
+out_poststart:
+       free_hooklist(opts.hooks.poststart);
+       opts.hooks.poststart = NULL;
+out_startcontainer:
+       free_hooklist(opts.hooks.startContainer);
+       opts.hooks.startContainer = NULL;
+out_createcontainer:
+       free_hooklist(opts.hooks.createContainer);
+       opts.hooks.createContainer = NULL;
+out_createruntime:
+       free_hooklist(opts.hooks.createRuntime);
+       opts.hooks.createRuntime = NULL;
+
+       return ret;
+}
+
+
+enum {
+       OCI_PROCESS_USER_UID,
+       OCI_PROCESS_USER_GID,
+       OCI_PROCESS_USER_UMASK,
+       OCI_PROCESS_USER_ADDITIONALGIDS,
+       __OCI_PROCESS_USER_MAX,
+};
+
+static const struct blobmsg_policy oci_process_user_policy[] = {
+       [OCI_PROCESS_USER_UID] = { "uid", BLOBMSG_TYPE_INT32 },
+       [OCI_PROCESS_USER_GID] = { "gid", BLOBMSG_TYPE_INT32 },
+       [OCI_PROCESS_USER_UMASK] = { "umask", BLOBMSG_TYPE_INT32 },
+       [OCI_PROCESS_USER_ADDITIONALGIDS] = { "additionalGids", BLOBMSG_TYPE_ARRAY },
+};
+
+/*
+ * Parse the process "user" object into opts: uid, gid (stored as both
+ * pw_gid and gr_gid), the supplementary group list and the umask.
+ *
+ * When a primary gid is given it is always placed first in the
+ * supplementary list and duplicates of it in "additionalGids" are skipped.
+ *
+ * Returns 0 on success or ENOMEM if the group list cannot be allocated.
+ */
+static int parseOCIprocessuser(struct blob_attr *msg) {
+       struct blob_attr *tb[__OCI_PROCESS_USER_MAX];
+       struct blob_attr *cur;
+       int rem;
+       int has_gid = 0;
+
+       blobmsg_parse(oci_process_user_policy, __OCI_PROCESS_USER_MAX, tb, blobmsg_data(msg), blobmsg_len(msg));
+
+       if (tb[OCI_PROCESS_USER_UID])
+               opts.pw_uid = blobmsg_get_u32(tb[OCI_PROCESS_USER_UID]);
+
+       if (tb[OCI_PROCESS_USER_GID]) {
+               opts.pw_gid = blobmsg_get_u32(tb[OCI_PROCESS_USER_GID]);
+               opts.gr_gid = blobmsg_get_u32(tb[OCI_PROCESS_USER_GID]);
+               has_gid = 1;
+       }
+
+       if (tb[OCI_PROCESS_USER_ADDITIONALGIDS]) {
+               size_t gidcnt = 0;
+
+               /* upper bound: every entry, duplicates of the primary gid included */
+               blobmsg_for_each_attr(cur, tb[OCI_PROCESS_USER_ADDITIONALGIDS], rem)
+                       ++gidcnt;
+
+               if (gidcnt) {
+                       opts.additional_gids = calloc(gidcnt + has_gid, sizeof(gid_t));
+                       if (!opts.additional_gids)
+                               return ENOMEM;
+
+                       gidcnt = 0;
+
+                       /* always add primary GID to set of GIDs if set */
+                       if (has_gid)
+                               opts.additional_gids[gidcnt++] = opts.gr_gid;
+
+                       blobmsg_for_each_attr(cur, tb[OCI_PROCESS_USER_ADDITIONALGIDS], rem) {
+                               if (has_gid && (blobmsg_get_u32(cur) == opts.gr_gid))
+                                       continue;
+                               opts.additional_gids[gidcnt++] = blobmsg_get_u32(cur);
+                       }
+                       opts.num_additional_gids = gidcnt;
+               }
+               DEBUG("read %zu additional groups\n", gidcnt);
+       }
+
+       if (tb[OCI_PROCESS_USER_UMASK]) {
+               opts.umask = blobmsg_get_u32(tb[OCI_PROCESS_USER_UMASK]);
+               opts.set_umask = true;
+       }
+
+       return 0;
+}
+
+/* indices into the parsed attributes of a single "rlimits" entry */
+enum {
+       OCI_PROCESS_RLIMIT_TYPE,
+       OCI_PROCESS_RLIMIT_SOFT,
+       OCI_PROCESS_RLIMIT_HARD,
+       __OCI_PROCESS_RLIMIT_MAX,
+};
+
+/* "soft" and "hard" use the casting integer policy type */
+static const struct blobmsg_policy oci_process_rlimit_policy[] = {
+       [OCI_PROCESS_RLIMIT_TYPE] = {
+               .name = "type",
+               .type = BLOBMSG_TYPE_STRING,
+       },
+       [OCI_PROCESS_RLIMIT_SOFT] = {
+               .name = "soft",
+               .type = BLOBMSG_CAST_INT64,
+       },
+       [OCI_PROCESS_RLIMIT_HARD] = {
+               .name = "hard",
+               .type = BLOBMSG_CAST_INT64,
+       },
+};
+
+/* resource names as listed in GETRLIMIT(2), without the "RLIMIT_" prefix */
+static const char* const rlimit_names[RLIM_NLIMITS] = {
+       [RLIMIT_AS] = "AS",
+       [RLIMIT_CORE] = "CORE",
+       [RLIMIT_CPU] = "CPU",
+       [RLIMIT_DATA] = "DATA",
+       [RLIMIT_FSIZE] = "FSIZE",
+       [RLIMIT_LOCKS] = "LOCKS",
+       [RLIMIT_MEMLOCK] = "MEMLOCK",
+       [RLIMIT_MSGQUEUE] = "MSGQUEUE",
+       [RLIMIT_NICE] = "NICE",
+       [RLIMIT_NOFILE] = "NOFILE",
+       [RLIMIT_NPROC] = "NPROC",
+       [RLIMIT_RSS] = "RSS",
+       [RLIMIT_RTPRIO] = "RTPRIO",
+       [RLIMIT_RTTIME] = "RTTIME",
+       [RLIMIT_SIGPENDING] = "SIGPENDING",
+       [RLIMIT_STACK] = "STACK",
+};
+
+/*
+ * Map a name of the form "RLIMIT_<NAME>" to its resource number.
+ * Returns -1 when the prefix is missing or the name is not in the table.
+ */
+static int resolve_rlimit(char *type) {
+       const char *suffix;
+       unsigned int idx;
+
+       /* the prefix test is the same for every table entry: do it once */
+       if (strncmp("RLIMIT_", type, 7))
+               return -1;
+
+       suffix = type + 7;
+
+       for (idx = 0; idx < RLIM_NLIMITS; ++idx) {
+               /* slots without a name stay NULL in the table */
+               if (!rlimit_names[idx])
+                       continue;
+
+               if (!strcmp(rlimit_names[idx], suffix))
+                       return idx;
+       }
+
+       return -1;
+}
+
+
+/*
+ * Parse one "rlimits" entry and store it in opts.rlimits[] at the index of
+ * the named resource.
+ *
+ * Returns 0 on success, ENODATA when "type", "soft" or "hard" is missing,
+ * EINVAL for an unknown type, ENOTUNIQ when that resource was already set
+ * and ENOMEM on allocation failure.
+ */
+static int parseOCIrlimit(struct blob_attr *msg)
+{
+       struct blob_attr *tb[__OCI_PROCESS_RLIMIT_MAX];
+       int limtype = -1;
+       struct rlimit *curlim;
+
+       blobmsg_parse(oci_process_rlimit_policy, __OCI_PROCESS_RLIMIT_MAX, tb, blobmsg_data(msg), blobmsg_len(msg));
+
+       if (!tb[OCI_PROCESS_RLIMIT_TYPE] ||
+           !tb[OCI_PROCESS_RLIMIT_SOFT] ||
+           !tb[OCI_PROCESS_RLIMIT_HARD])
+               return ENODATA;
+
+       limtype = resolve_rlimit(blobmsg_get_string(tb[OCI_PROCESS_RLIMIT_TYPE]));
+
+       if (limtype < 0)
+               return EINVAL;
+
+       if (opts.rlimits[limtype])
+               return ENOTUNIQ;
+
+       curlim = malloc(sizeof(struct rlimit));
+       if (!curlim)
+               return ENOMEM;
+
+       curlim->rlim_cur = blobmsg_cast_u64(tb[OCI_PROCESS_RLIMIT_SOFT]);
+       curlim->rlim_max = blobmsg_cast_u64(tb[OCI_PROCESS_RLIMIT_HARD]);
+
+       opts.rlimits[limtype] = curlim;
+
+       return 0;
+}
+
+enum {
+       OCI_PROCESS_ARGS,
+       OCI_PROCESS_CAPABILITIES,
+       OCI_PROCESS_CWD,
+       OCI_PROCESS_ENV,
+       OCI_PROCESS_OOMSCOREADJ,
+       OCI_PROCESS_NONEWPRIVILEGES,
+       OCI_PROCESS_RLIMITS,
+       OCI_PROCESS_TERMINAL,
+       OCI_PROCESS_USER,
+       __OCI_PROCESS_MAX,
+};
+
+static const struct blobmsg_policy oci_process_policy[] = {
+       [OCI_PROCESS_ARGS] = { "args", BLOBMSG_TYPE_ARRAY },
+       [OCI_PROCESS_CAPABILITIES] = { "capabilities", BLOBMSG_TYPE_TABLE },
+       [OCI_PROCESS_CWD] = { "cwd", BLOBMSG_TYPE_STRING },
+       [OCI_PROCESS_ENV] = { "env", BLOBMSG_TYPE_ARRAY },
+       [OCI_PROCESS_OOMSCOREADJ] = { "oomScoreAdj", BLOBMSG_TYPE_INT32 },
+       [OCI_PROCESS_NONEWPRIVILEGES] = { "noNewPrivileges", BLOBMSG_TYPE_BOOL },
+       [OCI_PROCESS_RLIMITS] = { "rlimits", BLOBMSG_TYPE_ARRAY },
+       [OCI_PROCESS_TERMINAL] = { "terminal", BLOBMSG_TYPE_BOOL },
+       [OCI_PROCESS_USER] = { "user", BLOBMSG_TYPE_TABLE },
+};
+
+
+/*
+ * Parse the "process" object into opts: argument vector (mandatory),
+ * terminal and noNewPrivileges flags, working directory, environment,
+ * user, capabilities, rlimits and the OOM score adjustment.
+ *
+ * Returns 0 on success, ENOENT without "args", ENOMEM when the working
+ * directory cannot be duplicated, or the error of the failing sub-parser.
+ */
+static int parseOCIprocess(struct blob_attr *msg)
+{
+       struct blob_attr *tb[__OCI_PROCESS_MAX], *cur;
+       int rem, res;
+
+       blobmsg_parse(oci_process_policy, __OCI_PROCESS_MAX, tb, blobmsg_data(msg), blobmsg_len(msg));
+
+       if (!tb[OCI_PROCESS_ARGS])
+               return ENOENT;
+
+       res = parseOCIenvarray(tb[OCI_PROCESS_ARGS], &opts.jail_argv);
+       if (res)
+               return res;
+
+       if (tb[OCI_PROCESS_TERMINAL])
+               opts.console = blobmsg_get_bool(tb[OCI_PROCESS_TERMINAL]);
+
+       if (tb[OCI_PROCESS_NONEWPRIVILEGES])
+               opts.no_new_privs = blobmsg_get_bool(tb[OCI_PROCESS_NONEWPRIVILEGES]);
+
+       if (tb[OCI_PROCESS_CWD]) {
+               opts.cwd = strdup(blobmsg_get_string(tb[OCI_PROCESS_CWD]));
+               /* a NULL cwd would make the later chdir() be skipped silently */
+               if (!opts.cwd)
+                       return ENOMEM;
+       }
+
+       if (tb[OCI_PROCESS_ENV]) {
+               res = parseOCIenvarray(tb[OCI_PROCESS_ENV], &opts.envp);
+               if (res)
+                       return res;
+       }
+
+       if (tb[OCI_PROCESS_USER] && (res = parseOCIprocessuser(tb[OCI_PROCESS_USER])))
+               return res;
+
+       if (tb[OCI_PROCESS_CAPABILITIES] &&
+           (res = parseOCIcapabilities(&opts.capset, tb[OCI_PROCESS_CAPABILITIES])))
+               return res;
+
+       if (tb[OCI_PROCESS_RLIMITS]) {
+               blobmsg_for_each_attr(cur, tb[OCI_PROCESS_RLIMITS], rem) {
+                       res = parseOCIrlimit(cur);
+                       if (res)
+                               return res;
+               }
+       }
+
+       if (tb[OCI_PROCESS_OOMSCOREADJ]) {
+               opts.oom_score_adj = blobmsg_get_u32(tb[OCI_PROCESS_OOMSCOREADJ]);
+               opts.set_oom_score_adj = true;
+       }
+
+       return 0;
+}
+
+/* indices into the parsed attributes of a single "namespaces" entry */
+enum {
+       OCI_LINUX_NAMESPACE_TYPE,
+       OCI_LINUX_NAMESPACE_PATH,
+       __OCI_LINUX_NAMESPACE_MAX,
+};
+
+static const struct blobmsg_policy oci_linux_namespace_policy[] = {
+       [OCI_LINUX_NAMESPACE_TYPE] = {
+               .name = "type",
+               .type = BLOBMSG_TYPE_STRING,
+       },
+       [OCI_LINUX_NAMESPACE_PATH] = {
+               .name = "path",
+               .type = BLOBMSG_TYPE_STRING,
+       },
+};
+
+static int resolve_nstype(char *type) {
+       if (!strcmp("pid", type))
+               return CLONE_NEWPID;
+       else if (!strcmp("network", type))
+               return CLONE_NEWNET;
+       else if (!strcmp("net", type))
+               return CLONE_NEWNET;
+       else if (!strcmp("mount", type))
+               return CLONE_NEWNS;
+       else if (!strcmp("ipc", type))
+               return CLONE_NEWIPC;
+       else if (!strcmp("uts", type))
+               return CLONE_NEWUTS;
+       else if (!strcmp("user", type))
+               return CLONE_NEWUSER;
+       else if (!strcmp("cgroup", type))
+               return CLONE_NEWCGROUP;
+#ifdef CLONE_NEWTIME
+       else if (!strcmp("time", type))
+               return CLONE_NEWTIME;
+#endif
+       else
+               return 0;
+}
+
+/*
+ * Handle one "namespaces" entry. Without "path" the namespace flag is added
+ * to opts.namespace. With "path" the file is opened, checked with
+ * NS_GET_NSTYPE against the requested type, and its descriptor stored in
+ * the slot returned by get_namespace_fd().
+ *
+ * Returns 0 on success, EINVAL for a missing/unknown type or a type
+ * mismatch, ENOTUNIQ when the namespace was already requested or assigned,
+ * EFAULT when no descriptor slot exists, or the errno of open() (ESTALE if
+ * errno is 0).
+ */
+static int parseOCIlinuxns(struct blob_attr *msg)
+{
+       struct blob_attr *tb[__OCI_LINUX_NAMESPACE_MAX];
+       char *type_name, *ns_path;
+       int flag, handle;
+       int *slot;
+
+       blobmsg_parse(oci_linux_namespace_policy, __OCI_LINUX_NAMESPACE_MAX, tb, blobmsg_data(msg), blobmsg_len(msg));
+
+       if (!tb[OCI_LINUX_NAMESPACE_TYPE])
+               return EINVAL;
+
+       type_name = blobmsg_get_string(tb[OCI_LINUX_NAMESPACE_TYPE]);
+
+       flag = resolve_nstype(type_name);
+       if (!flag)
+               return EINVAL;
+
+       if (opts.namespace & flag)
+               return ENOTUNIQ;
+
+       slot = get_namespace_fd(flag);
+       if (!slot)
+               return EFAULT;
+
+       if (*slot != -1)
+               return ENOTUNIQ;
+
+       /* no path given: request a fresh namespace of this type */
+       if (!tb[OCI_LINUX_NAMESPACE_PATH]) {
+               opts.namespace |= flag;
+               return 0;
+       }
+
+       ns_path = blobmsg_get_string(tb[OCI_LINUX_NAMESPACE_PATH]);
+
+       DEBUG("opening existing %s namespace from path %s\n",
+               type_name,
+               ns_path);
+
+       handle = open(ns_path, O_RDONLY);
+       if (handle < 0)
+               return errno?:ESTALE;
+
+       /* the file must really be a namespace of the announced type */
+       if (ioctl(handle, NS_GET_NSTYPE) != flag) {
+               close(handle);
+               return EINVAL;
+       }
+
+       DEBUG("opened existing %s namespace got filehandler %u\n",
+               type_name,
+               handle);
+
+       *slot = handle;
+
+       return 0;
+}
+
+/*
+ * join namespace of existing PID
+ * The string argument is the reference PID followed by ':' and a
+ * ',' separated list of namespaces to join.
+ *
+ * The argument is modified in place (separators are overwritten with NUL).
+ * Every name accepted by resolve_nstype() may be used; "network" and
+ * "mount" are translated to the procfs entries "net" and "mnt".
+ *
+ * Returns 0 on success, EINVAL for a malformed argument or unknown type,
+ * ENOTUNIQ when a namespace is requested twice, EFAULT when no descriptor
+ * slot exists, ENOMEM on allocation failure, or the errno of open()
+ * (ESTALE if errno is 0).
+ */
+static int jail_join_ns(char *arg)
+{
+       pid_t pid;
+       int fd;
+       int nstype;
+       char *tmp, *etmp, *nspath;
+       const char *procname;
+       int *setns;
+
+       tmp = strchr(arg, ':');
+       if (!tmp)
+               return EINVAL;
+
+       *tmp = '\0';
+       pid = atoi(arg);
+
+       do {
+               ++tmp;
+               etmp = strchr(tmp, ',');
+               if (etmp)
+                       *etmp = '\0';
+
+               nstype = resolve_nstype(tmp);
+               if (!nstype)
+                       return EINVAL;
+
+               if (opts.namespace & nstype)
+                       return ENOTUNIQ;
+
+               setns = get_namespace_fd(nstype);
+
+               if (!setns)
+                       return EFAULT;
+
+               if (*setns != -1)
+                       return ENOTUNIQ;
+
+               /*
+                * /proc/<pid>/ns/ has "net" and "mnt", not the long names
+                * "network" and "mount" which resolve_nstype() also accepts.
+                */
+               if (nstype == CLONE_NEWNET)
+                       procname = "net";
+               else if (nstype == CLONE_NEWNS)
+                       procname = "mnt";
+               else
+                       procname = tmp;
+
+               if (asprintf(&nspath, "/proc/%d/ns/%s", pid, procname) < 0)
+                       return ENOMEM;
+
+               fd = open(nspath, O_RDONLY);
+               free(nspath);
+
+               if (fd < 0)
+                       return errno?:ESTALE;
+
+               *setns = fd;
+
+               /* continue behind the ',' or stop after the last name */
+               if (etmp)
+                       tmp = etmp;
+               else
+                       tmp = NULL;
+       } while (tmp);
+
+       return 0;
+}
+
+/*
+ * Record in opts.root_map_uid the host ID of a UID mapping that starts at
+ * container ID 0 and has a size of at least 1. GID mappings and all other
+ * UID mappings leave opts untouched.
+ */
+static void get_jail_root_user(bool is_gidmap, uint32_t container_id, uint32_t host_id, uint32_t size)
+{
+       if (is_gidmap || container_id != 0 || size < 1)
+               return;
+
+       opts.root_map_uid = host_id;
+}
+
+enum {
+       OCI_LINUX_UIDGIDMAP_CONTAINERID,
+       OCI_LINUX_UIDGIDMAP_HOSTID,
+       OCI_LINUX_UIDGIDMAP_SIZE,
+       __OCI_LINUX_UIDGIDMAP_MAX,
+};
+
+static const struct blobmsg_policy oci_linux_uidgidmap_policy[] = {
+       [OCI_LINUX_UIDGIDMAP_CONTAINERID] = { "containerID", BLOBMSG_TYPE_INT32 },
+       [OCI_LINUX_UIDGIDMAP_HOSTID] = { "hostID", BLOBMSG_TYPE_INT32 },
+       [OCI_LINUX_UIDGIDMAP_SIZE] = { "size", BLOBMSG_TYPE_INT32 },
+};
+
+/*
+ * Render an array of ID mappings as one string with a line
+ * "<containerID> <hostID> <size>\n" per entry and store it in opts.gidmap
+ * (is_gidmap set) or opts.uidmap. Each entry is also passed to
+ * get_jail_root_user().
+ *
+ * The values are read as unsigned 32-bit numbers and printed as such. An
+ * empty array yields an empty, properly terminated string.
+ *
+ * Returns 0 on success, EINVAL when an entry lacks one of the three
+ * fields, or ENOMEM.
+ */
+static int parseOCIuidgidmappings(struct blob_attr *msg, bool is_gidmap)
+{
+       struct blob_attr *tb[__OCI_LINUX_UIDGIDMAP_MAX];
+       struct blob_attr *cur;
+       int rem;
+       char *map;
+       size_t len, pos, totallen = 0;
+
+       /* first pass: validate the entries and sum up the space needed */
+       blobmsg_for_each_attr(cur, msg, rem) {
+               blobmsg_parse(oci_linux_uidgidmap_policy, __OCI_LINUX_UIDGIDMAP_MAX, tb, blobmsg_data(cur), blobmsg_len(cur));
+
+               if (!tb[OCI_LINUX_UIDGIDMAP_CONTAINERID] ||
+                   !tb[OCI_LINUX_UIDGIDMAP_HOSTID] ||
+                   !tb[OCI_LINUX_UIDGIDMAP_SIZE])
+                       return EINVAL;
+
+               /* count length */
+               totallen += snprintf(NULL, 0, "%u %u %u\n",
+                        blobmsg_get_u32(tb[OCI_LINUX_UIDGIDMAP_CONTAINERID]),
+                        blobmsg_get_u32(tb[OCI_LINUX_UIDGIDMAP_HOSTID]),
+                        blobmsg_get_u32(tb[OCI_LINUX_UIDGIDMAP_SIZE]));
+       }
+
+       /* allocate combined mapping string */
+       map = malloc(totallen + 1);
+       if (!map)
+               return ENOMEM;
+
+       /* stays the final content when the array has no entries */
+       map[0] = '\0';
+
+       pos = 0;
+       blobmsg_for_each_attr(cur, msg, rem) {
+               blobmsg_parse(oci_linux_uidgidmap_policy, __OCI_LINUX_UIDGIDMAP_MAX, tb, blobmsg_data(cur), blobmsg_len(cur));
+
+               get_jail_root_user(is_gidmap, blobmsg_get_u32(tb[OCI_LINUX_UIDGIDMAP_CONTAINERID]),
+                        blobmsg_get_u32(tb[OCI_LINUX_UIDGIDMAP_HOSTID]),
+                        blobmsg_get_u32(tb[OCI_LINUX_UIDGIDMAP_SIZE]));
+
+               /* write mapping line into pre-allocated string */
+               len = snprintf(&map[pos], totallen + 1, "%u %u %u\n",
+                        blobmsg_get_u32(tb[OCI_LINUX_UIDGIDMAP_CONTAINERID]),
+                        blobmsg_get_u32(tb[OCI_LINUX_UIDGIDMAP_HOSTID]),
+                        blobmsg_get_u32(tb[OCI_LINUX_UIDGIDMAP_SIZE]));
+               pos += len;
+               /* totallen now counts the bytes still to be written */
+               totallen -= len;
+       }
+
+       assert(totallen == 0);
+
+       if (is_gidmap)
+               opts.gidmap = map;
+       else
+               opts.uidmap = map;
+
+       return 0;
+}
+
+/* indices into the parsed attributes of a single "devices" entry */
+enum {
+       OCI_DEVICES_TYPE,
+       OCI_DEVICES_PATH,
+       OCI_DEVICES_MAJOR,
+       OCI_DEVICES_MINOR,
+       OCI_DEVICES_FILEMODE,
+       OCI_DEVICES_UID,
+       OCI_DEVICES_GID,
+       __OCI_DEVICES_MAX,
+};
+
+/*
+ * Attribute names of a device entry. The group ID is read from "gid"; it
+ * used to be keyed "uid" as well, so a configured gid never reached
+ * OCI_DEVICES_GID.
+ */
+static const struct blobmsg_policy oci_devices_policy[] = {
+       [OCI_DEVICES_TYPE] = { "type", BLOBMSG_TYPE_STRING },
+       [OCI_DEVICES_PATH] = { "path", BLOBMSG_TYPE_STRING },
+       [OCI_DEVICES_MAJOR] = { "major", BLOBMSG_TYPE_INT32 },
+       [OCI_DEVICES_MINOR] = { "minor", BLOBMSG_TYPE_INT32 },
+       [OCI_DEVICES_FILEMODE] = { "fileMode", BLOBMSG_TYPE_INT32 },
+       [OCI_DEVICES_UID] = { "uid", BLOBMSG_TYPE_INT32 },
+       [OCI_DEVICES_GID] = { "gid", BLOBMSG_TYPE_INT32 },
+};
+
+/*
+ * Map a one-letter device type to its file type bits:
+ * "c" and "u" give S_IFCHR, "b" gives S_IFBLK, "p" gives S_IFIFO.
+ * Any other string (including the empty one) yields 0.
+ */
+static mode_t resolve_devtype(char *tstr)
+{
+       /* only strings of exactly one character can match */
+       if (tstr[0] == '\0' || tstr[1] != '\0')
+               return 0;
+
+       switch (tstr[0]) {
+       case 'c':
+       case 'u':
+               return S_IFCHR;
+       case 'b':
+               return S_IFBLK;
+       case 'p':
+               return S_IFIFO;
+       default:
+               return 0;
+       }
+}
+
+/*
+ * Parse the "devices" array into opts.devices, a NULL-terminated array of
+ * struct mknod_args.
+ *
+ * Each entry needs "type" and "path"; every type except the FIFO type also
+ * needs "major" and "minor". "fileMode" may only contain permission bits
+ * and defaults to 0600; uid and gid default to -1.
+ *
+ * Returns 0 on success, ENODATA for a missing mandatory field, EINVAL for
+ * an unknown type or an invalid file mode, or ENOMEM. Entries stored before
+ * an error stay in opts.devices.
+ */
+static int parseOCIdevices(struct blob_attr *msg)
+{
+       struct blob_attr *tb[__OCI_DEVICES_MAX];
+       struct blob_attr *cur;
+       int rem;
+       size_t cnt = 0;
+       struct mknod_args *tmp;
+
+       blobmsg_for_each_attr(cur, msg, rem)
+               ++cnt;
+
+       /* zeroed, so the array is NULL-terminated at every point in time */
+       opts.devices = calloc(cnt + 1, sizeof(struct mknod_args *));
+       if (!opts.devices)
+               return ENOMEM;
+
+       cnt = 0;
+       blobmsg_for_each_attr(cur, msg, rem) {
+               blobmsg_parse(oci_devices_policy, __OCI_DEVICES_MAX, tb, blobmsg_data(cur), blobmsg_len(cur));
+               if (!tb[OCI_DEVICES_TYPE] ||
+                   !tb[OCI_DEVICES_PATH])
+                       return ENODATA;
+
+               tmp = calloc(1, sizeof(struct mknod_args));
+               if (!tmp)
+                       return ENOMEM;
+
+               tmp->mode = resolve_devtype(blobmsg_get_string(tb[OCI_DEVICES_TYPE]));
+               if (!tmp->mode) {
+                       free(tmp);
+                       return EINVAL;
+               }
+
+               /* a FIFO has no device number */
+               if (tmp->mode != S_IFIFO) {
+                       if (!tb[OCI_DEVICES_MAJOR] || !tb[OCI_DEVICES_MINOR]) {
+                               free(tmp);
+                               return ENODATA;
+                       }
+
+                       tmp->dev = makedev(blobmsg_get_u32(tb[OCI_DEVICES_MAJOR]),
+                                          blobmsg_get_u32(tb[OCI_DEVICES_MINOR]));
+               }
+
+               if (tb[OCI_DEVICES_FILEMODE]) {
+                       /* reject anything beyond the rwx bits for user/group/other */
+                       if (~(S_IRWXU|S_IRWXG|S_IRWXO) & blobmsg_get_u32(tb[OCI_DEVICES_FILEMODE])) {
+                               free(tmp);
+                               return EINVAL;
+                       }
+
+                       tmp->mode |= blobmsg_get_u32(tb[OCI_DEVICES_FILEMODE]);
+               } else {
+                       tmp->mode |= (S_IRUSR|S_IWUSR); /* 0600 */
+               }
+
+               tmp->path = strdup(blobmsg_get_string(tb[OCI_DEVICES_PATH]));
+               if (!tmp->path) {
+                       free(tmp);
+                       return ENOMEM;
+               }
+
+               if (tb[OCI_DEVICES_UID])
+                       tmp->uid = blobmsg_get_u32(tb[OCI_DEVICES_UID]);
+               else
+                       tmp->uid = -1;
+
+               if (tb[OCI_DEVICES_GID])
+                       tmp->gid = blobmsg_get_u32(tb[OCI_DEVICES_GID]);
+               else
+                       tmp->gid = -1;
+
+               DEBUG("read device %s (%s)\n", blobmsg_get_string(tb[OCI_DEVICES_PATH]), blobmsg_get_string(tb[OCI_DEVICES_TYPE]));
+               opts.devices[cnt++] = tmp;
+       }
+
+       opts.devices[cnt] = NULL;
+
+       return 0;
+}
+
+/*
+ * Parse the "sysctl" table into opts.sysctl, a NULL-terminated array of
+ * struct sysctl_val. In each entry name every '.' is replaced by '/'; the
+ * value is copied as is.
+ *
+ * Returns 0 on success (nothing is allocated for an empty table), EINVAL
+ * when an entry has no name or string value, or ENOMEM. Only completely
+ * built entries are ever stored in opts.sysctl.
+ */
+static int parseOCIsysctl(struct blob_attr *msg)
+{
+       struct blob_attr *cur;
+       struct sysctl_val *sv;
+       int rem;
+       char *tc;
+       size_t cnt = 0;
+
+       blobmsg_for_each_attr(cur, msg, rem) {
+               if (!blobmsg_name(cur) || !blobmsg_get_string(cur))
+                       return EINVAL;
+
+               ++cnt;
+       }
+
+       if (!cnt)
+               return 0;
+
+       /* zeroed, so the array is NULL-terminated at every point in time */
+       opts.sysctl = calloc(cnt + 1, sizeof(struct sysctl_val *));
+       if (!opts.sysctl)
+               return ENOMEM;
+
+       cnt = 0;
+       blobmsg_for_each_attr(cur, msg, rem) {
+               sv = calloc(1, sizeof(struct sysctl_val));
+               if (!sv)
+                       return ENOMEM;
+
+               sv->entry = strdup(blobmsg_name(cur));
+               sv->value = strdup(blobmsg_get_string(cur));
+               if (!sv->entry || !sv->value) {
+                       free(sv->entry);
+                       free(sv->value);
+                       free(sv);
+                       return ENOMEM;
+               }
+
+               /* replace '.' with '/' in entry name */
+               tc = sv->entry;
+               while ((tc = strchr(tc, '.')))
+                       *tc = '/';
+
+               opts.sysctl[cnt++] = sv;
+       }
+
+       opts.sysctl[cnt] = NULL;
+
+       return 0;
+}
+
+
+/* indices into the parsed attributes of the "linux" object */
+enum {
+       OCI_LINUX_CGROUPSPATH,
+       OCI_LINUX_RESOURCES,
+       OCI_LINUX_SECCOMP,
+       OCI_LINUX_SYSCTL,
+       OCI_LINUX_NAMESPACES,
+       OCI_LINUX_DEVICES,
+       OCI_LINUX_UIDMAPPINGS,
+       OCI_LINUX_GIDMAPPINGS,
+       OCI_LINUX_MASKEDPATHS,
+       OCI_LINUX_READONLYPATHS,
+       OCI_LINUX_ROOTFSPROPAGATION,
+       __OCI_LINUX_MAX,
+};
+
+/* attribute name and expected blobmsg type for each index above */
+static const struct blobmsg_policy oci_linux_policy[] = {
+       [OCI_LINUX_CGROUPSPATH] = {
+               .name = "cgroupsPath",
+               .type = BLOBMSG_TYPE_STRING,
+       },
+       [OCI_LINUX_RESOURCES] = {
+               .name = "resources",
+               .type = BLOBMSG_TYPE_TABLE,
+       },
+       [OCI_LINUX_SECCOMP] = {
+               .name = "seccomp",
+               .type = BLOBMSG_TYPE_TABLE,
+       },
+       [OCI_LINUX_SYSCTL] = {
+               .name = "sysctl",
+               .type = BLOBMSG_TYPE_TABLE,
+       },
+       [OCI_LINUX_NAMESPACES] = {
+               .name = "namespaces",
+               .type = BLOBMSG_TYPE_ARRAY,
+       },
+       [OCI_LINUX_DEVICES] = {
+               .name = "devices",
+               .type = BLOBMSG_TYPE_ARRAY,
+       },
+       [OCI_LINUX_UIDMAPPINGS] = {
+               .name = "uidMappings",
+               .type = BLOBMSG_TYPE_ARRAY,
+       },
+       [OCI_LINUX_GIDMAPPINGS] = {
+               .name = "gidMappings",
+               .type = BLOBMSG_TYPE_ARRAY,
+       },
+       [OCI_LINUX_MASKEDPATHS] = {
+               .name = "maskedPaths",
+               .type = BLOBMSG_TYPE_ARRAY,
+       },
+       [OCI_LINUX_READONLYPATHS] = {
+               .name = "readonlyPaths",
+               .type = BLOBMSG_TYPE_ARRAY,
+       },
+       [OCI_LINUX_ROOTFSPROPAGATION] = {
+               .name = "rootfsPropagation",
+               .type = BLOBMSG_TYPE_STRING,
+       },
+};
+
+/*
+ * Parse the "linux" table of an OCI runtime configuration.
+ *
+ * Processes, in this order: namespaces, uidMappings, gidMappings,
+ * readonlyPaths, maskedPaths, sysctl, seccomp and devices. It then builds
+ * the cgroup directory path below /sys/fs/cgroup, hands it to
+ * cgroups_init() and finally parses "resources".
+ *
+ * Returns 0 on success. On failure returns the first non-zero result of a
+ * sub-parser or add_mount(), EINVAL when the seccomp table cannot be
+ * parsed, or E2BIG when the cgroup path would not fit into the local buffer.
+ */
+static int parseOCIlinux(struct blob_attr *msg)
+{
+       struct blob_attr *tb[__OCI_LINUX_MAX];
+       struct blob_attr *cur;
+       int rem;
+       int res = 0;
+       char *cgpath;
+       char cgfullpath[256] = "/sys/fs/cgroup";
+
+       blobmsg_parse(oci_linux_policy, __OCI_LINUX_MAX, tb, blobmsg_data(msg), blobmsg_len(msg));
+
+       if (tb[OCI_LINUX_NAMESPACES]) {
+               blobmsg_for_each_attr(cur, tb[OCI_LINUX_NAMESPACES], rem) {
+                       res = parseOCIlinuxns(cur);
+                       if (res)
+                               return res;
+               }
+       }
+
+       if (tb[OCI_LINUX_UIDMAPPINGS]) {
+               /*
+                * Hand over the uid table itself. Passing the gid table here
+                * would parse the wrong data, or an unset entry when only
+                * uidMappings is present in the config.
+                */
+               res = parseOCIuidgidmappings(tb[OCI_LINUX_UIDMAPPINGS], 0);
+               if (res)
+                       return res;
+       }
+
+       if (tb[OCI_LINUX_GIDMAPPINGS]) {
+               res = parseOCIuidgidmappings(tb[OCI_LINUX_GIDMAPPINGS], 1);
+               if (res)
+                       return res;
+       }
+
+       if (tb[OCI_LINUX_READONLYPATHS]) {
+               /* each listed path becomes a recursive read-only bind mount */
+               blobmsg_for_each_attr(cur, tb[OCI_LINUX_READONLYPATHS], rem) {
+                       res = add_mount(NULL, blobmsg_get_string(cur), NULL, MS_BIND | MS_REC | MS_RDONLY, 0, NULL, 0);
+                       if (res)
+                               return res;
+               }
+       }
+
+       if (tb[OCI_LINUX_MASKEDPATHS]) {
+               /*
+                * NOTE(review): (void *)(-1) is a sentinel source value;
+                * presumably add_mount() treats it as "mask this path".
+                * Confirm against add_mount()'s implementation.
+                */
+               blobmsg_for_each_attr(cur, tb[OCI_LINUX_MASKEDPATHS], rem) {
+                       res = add_mount((void *)(-1), blobmsg_get_string(cur), NULL, 0, 0, NULL, 0);
+                       if (res)
+                               return res;
+               }
+       }
+
+       if (tb[OCI_LINUX_SYSCTL]) {
+               res = parseOCIsysctl(tb[OCI_LINUX_SYSCTL]);
+               if (res)
+                       return res;
+       }
+
+       if (tb[OCI_LINUX_SECCOMP]) {
+               opts.ociseccomp = parseOCIlinuxseccomp(tb[OCI_LINUX_SECCOMP]);
+               if (!opts.ociseccomp)
+                       return EINVAL;
+       }
+
+       if (tb[OCI_LINUX_DEVICES]) {
+               res = parseOCIdevices(tb[OCI_LINUX_DEVICES]);
+               if (res)
+                       return res;
+       }
+
+       /*
+        * Build the cgroup path:
+        *  - absolute cgroupsPath: /sys/fs/cgroup<cgroupsPath>
+        *  - relative cgroupsPath: /sys/fs/cgroup/containers/<name>/<cgroupsPath>
+        *  - no cgroupsPath:       /sys/fs/cgroup/containers/<name>/<name>
+        * Every length check runs before the variable-length parts are
+        * appended, so strcat() cannot overflow cgfullpath.
+        */
+       if (tb[OCI_LINUX_CGROUPSPATH]) {
+               cgpath = blobmsg_get_string(tb[OCI_LINUX_CGROUPSPATH]);
+               if (cgpath[0] == '/') {
+                       if (strlen(cgpath) + 1 >= (sizeof(cgfullpath) - strlen(cgfullpath)))
+                               return E2BIG;
+
+                       strcat(cgfullpath, cgpath);
+               } else {
+                       strcat(cgfullpath, "/containers/");
+                       if (strlen(opts.name) + strlen(cgpath) + 2 >= (sizeof(cgfullpath) - strlen(cgfullpath)))
+                               return E2BIG;
+
+                       strcat(cgfullpath, opts.name); /* should be container name rather than jail name */
+                       strcat(cgfullpath, "/");
+                       strcat(cgfullpath, cgpath);
+               }
+       } else {
+               strcat(cgfullpath, "/containers/");
+               if (2 * strlen(opts.name) + 2 >= (sizeof(cgfullpath) - strlen(cgfullpath)))
+                       return E2BIG;
+
+               strcat(cgfullpath, opts.name); /* should be container name rather than jail name */
+               strcat(cgfullpath, "/");
+               strcat(cgfullpath, opts.name); /* should be container instance name rather than jail name */
+       }
+
+       cgroups_init(cgfullpath);
+
+       if (tb[OCI_LINUX_RESOURCES]) {
+               res = parseOCIlinuxcgroups(tb[OCI_LINUX_RESOURCES]);
+               if (res)
+                       return res;
+       }
+
+       return 0;
+}
+
+/* Indices into the attribute table filled from oci_policy in parseOCI(). */
+enum {
+       OCI_VERSION,
+       OCI_HOSTNAME,
+       OCI_PROCESS,
+       OCI_ROOT,
+       OCI_MOUNTS,
+       OCI_HOOKS,
+       OCI_LINUX,
+       OCI_ANNOTATIONS,
+       __OCI_MAX,
+};
+
+/* Top-level keys of the OCI config: JSON key name and expected blobmsg type. */
+static const struct blobmsg_policy oci_policy[] = {
+       [OCI_VERSION] = { "ociVersion", BLOBMSG_TYPE_STRING },
+       [OCI_HOSTNAME] = { "hostname", BLOBMSG_TYPE_STRING },
+       [OCI_PROCESS] = { "process", BLOBMSG_TYPE_TABLE },
+       [OCI_ROOT] = { "root", BLOBMSG_TYPE_TABLE },
+       [OCI_MOUNTS] = { "mounts", BLOBMSG_TYPE_ARRAY },
+       [OCI_HOOKS] = { "hooks", BLOBMSG_TYPE_TABLE },
+       [OCI_LINUX] = { "linux", BLOBMSG_TYPE_TABLE },
+       [OCI_ANNOTATIONS] = { "annotations", BLOBMSG_TYPE_TABLE },
+};
+
+/*
+ * Load the OCI runtime configuration from jsonfile and apply it to opts.
+ *
+ * "ociVersion" (must start with "1.0"), "process", "root" and "mounts" are
+ * mandatory; "hostname", "linux", "hooks" and "annotations" are optional.
+ *
+ * Returns 0 on success, otherwise:
+ *   ENOENT  - the file could not be loaded as JSON
+ *   ENOMSG  - "ociVersion" is missing
+ *   ENOTSUP - "ociVersion" does not start with "1.0"
+ *   ENODATA - "process", "root" or "mounts" is missing
+ *   or the non-zero result of one of the section parsers.
+ * ocibuf is freed on every path before returning.
+ */
+static int parseOCI(const char *jsonfile)
+{
+       struct blob_attr *tb[__OCI_MAX];
+       struct blob_attr *cur;
+       int rem;
+       int res;
+
+       blob_buf_init(&ocibuf, 0);
+
+       if (!blobmsg_add_json_from_file(&ocibuf, jsonfile)) {
+               res=ENOENT;
+               goto errout;
+       }
+
+       blobmsg_parse(oci_policy, __OCI_MAX, tb, blob_data(ocibuf.head), blob_len(ocibuf.head));
+
+       if (!tb[OCI_VERSION]) {
+               res=ENOMSG;
+               goto errout;
+       }
+
+       /* only the first three characters are compared, so any "1.0*" passes */
+       if (strncmp("1.0", blobmsg_get_string(tb[OCI_VERSION]), 3)) {
+               ERROR("unsupported ociVersion %s\n", blobmsg_get_string(tb[OCI_VERSION]));
+               res=ENOTSUP;
+               goto errout;
+       }
+
+       /* copied because ocibuf is freed before this function returns */
+       if (tb[OCI_HOSTNAME])
+               opts.hostname = strdup(blobmsg_get_string(tb[OCI_HOSTNAME]));
+
+       if (!tb[OCI_PROCESS]) {
+               res=ENODATA;
+               goto errout;
+       }
+
+       if ((res = parseOCIprocess(tb[OCI_PROCESS])))
+               goto errout;
 
-       if (opts.procfs) {
-               mkdir("/proc", 0755);
-               mount("proc", "/proc", "proc", MS_NOATIME | MS_NODEV | MS_NOEXEC | MS_NOSUID, 0);
+       if (!tb[OCI_ROOT]) {
+               res=ENODATA;
+               goto errout;
        }
-       if (opts.sysfs) {
-               mkdir("/sys", 0755);
-               mount("sysfs", "/sys", "sysfs", MS_NOATIME | MS_NODEV | MS_NOEXEC | MS_NOSUID, 0);
+       if ((res = parseOCIroot(jsonfile, tb[OCI_ROOT])))
+               goto errout;
+
+       if (!tb[OCI_MOUNTS]) {
+               res=ENODATA;
+               goto errout;
        }
-       if (opts.ronly)
-               mount(NULL, "/", NULL, MS_RDONLY | MS_REMOUNT, 0);
 
-       return 0;
+       blobmsg_for_each_attr(cur, tb[OCI_MOUNTS], rem)
+               if ((res = parseOCImount(cur)))
+                       goto errout;
+
+       if (tb[OCI_LINUX] && (res = parseOCIlinux(tb[OCI_LINUX])))
+               goto errout;
+
+       if (tb[OCI_HOOKS] && (res = parseOCIhooks(tb[OCI_HOOKS])))
+               goto errout;
+
+       /* duplicated so the annotations outlive ocibuf */
+       if (tb[OCI_ANNOTATIONS])
+               opts.annotations = blob_memdup(tb[OCI_ANNOTATIONS]);
+
+       /* reached on success as well; res is 0 in that case */
+errout:
+       blob_buf_free(&ocibuf);
+
+       return res;
 }
 
-#define MAX_ENVP       8
-static char** build_envp(const char *seccomp)
+/*
+ * Write opts.oom_score_adj to /proc/<pid>/oom_score_adj of the jailed
+ * process (jail_process.pid).
+ *
+ * Does nothing and returns 0 when opts.set_oom_score_adj is not set.
+ * Returns errno when the proc file cannot be opened, 0 otherwise.
+ * NOTE(review): the result of dprintf() is not checked, so a rejected
+ * value is not reported to the caller.
+ */
+static int set_oom_score_adj(void)
 {
-       static char *envp[MAX_ENVP];
-       static char preload_var[PATH_MAX];
-       static char seccomp_var[PATH_MAX];
-       static char debug_var[] = "LD_DEBUG=all";
-       const char *preload_lib = find_lib("libpreload-seccomp.so");
-       int count = 0;
+       int f;
+       char fname[32];
 
-       if (seccomp && !preload_lib) {
-               ERROR("failed to add preload-lib to env\n");
-               return NULL;
-       }
-       if (seccomp) {
-               snprintf(seccomp_var, sizeof(seccomp_var), "SECCOMP_FILE=%s", seccomp);
-               envp[count++] = seccomp_var;
-               snprintf(preload_var, sizeof(preload_var), "LD_PRELOAD=%s", preload_lib);
-               envp[count++] = preload_var;
-       }
-       if (debug > 1)
-               envp[count++] = debug_var;
+       if (!opts.set_oom_score_adj)
+               return 0;
 
-       return envp;
+       snprintf(fname, sizeof(fname), "/proc/%u/oom_score_adj", jail_process.pid);
+       f = open(fname, O_WRONLY | O_TRUNC);
+       if (f < 0)
+               return errno;
+
+       dprintf(f, "%d", opts.oom_score_adj);
+       close(f);
+
+       return 0;
 }
 
-static void usage(void)
+
+/* Container states reported by the ubus "state" method (see handle_state()). */
+enum {
+       OCI_STATE_CREATING,
+       OCI_STATE_CREATED,
+       OCI_STATE_RUNNING,
+       OCI_STATE_STOPPED,
+};
+
+/* Current container state; initialised to "created". */
+static int jail_oci_state = OCI_STATE_CREATED;
+static void pipe_send_start_container(struct uloop_timeout *t);
+/* Timeout armed by handle_start(); its callback is pipe_send_start_container(). */
+static struct uloop_timeout start_container_timeout = {
+       .cb = pipe_send_start_container,
+};
+
+/*
+ * ubus "start" method handler.
+ *
+ * Only accepted while the container is in OCI_STATE_CREATED; any other
+ * state yields UBUS_STATUS_INVALID_ARGUMENT. On success it adds
+ * start_container_timeout to uloop and returns UBUS_STATUS_OK.
+ * The request arguments are not inspected.
+ */
+static int handle_start(struct ubus_context *ctx, struct ubus_object *obj,
+                       struct ubus_request_data *req, const char *method,
+                       struct blob_attr *msg)
 {
-       fprintf(stderr, "ujail <options> -- <binary> <params ...>\n");
-       fprintf(stderr, "  -d <num>\tshow debug log (increase num to increase verbosity)\n");
-       fprintf(stderr, "  -S <file>\tseccomp filter config\n");
-       fprintf(stderr, "  -C <file>\tcapabilities drop config\n");
-       fprintf(stderr, "  -c\t\tset PR_SET_NO_NEW_PRIVS\n");
-       fprintf(stderr, "  -n <name>\tthe name of the jail\n");
-       fprintf(stderr, "namespace jail options:\n");
-       fprintf(stderr, "  -h <hostname>\tchange the hostname of the jail\n");
-       fprintf(stderr, "  -r <file>\treadonly files that should be staged\n");
-       fprintf(stderr, "  -w <file>\twriteable files that should be staged\n");
-       fprintf(stderr, "  -p\t\tjail has /proc\n");
-       fprintf(stderr, "  -s\t\tjail has /sys\n");
-       fprintf(stderr, "  -l\t\tjail has /dev/log\n");
-       fprintf(stderr, "  -u\t\tjail has a ubus socket\n");
-       fprintf(stderr, "  -o\t\tremont jail root (/) read only\n");
-       fprintf(stderr, "\nWarning: by default root inside the jail is the same\n\
-and he has the same powers as root outside the jail,\n\
-thus he can escape the jail and/or break stuff.\n\
-Please use seccomp/capabilities (-S/-C) to restrict his powers\n\n\
-If you use none of the namespace jail options,\n\
-ujail will not use namespace/build a jail,\n\
-and will only drop capabilities/apply seccomp filter.\n\n");
+       if (jail_oci_state != OCI_STATE_CREATED)
+               return UBUS_STATUS_INVALID_ARGUMENT;
+
+       uloop_timeout_add(&start_container_timeout);
+
+       return UBUS_STATUS_OK;
 }
 
-static int exec_jail(void *_notused)
+/* Reply buffer reused by handle_state() on every call. */
+static struct blob_buf bb;
+/*
+ * ubus "state" method handler.
+ *
+ * Replies with "ociVersion", "id" (opts.name), "status" (textual form of
+ * jail_oci_state), "pid" (only in the created and running states),
+ * "bundle" (opts.ocibundle) and, when present, the stored annotations blob.
+ * Always returns UBUS_STATUS_OK.
+ */
+static int handle_state(struct ubus_context *ctx, struct ubus_object *obj,
+                       struct ubus_request_data *req, const char *method,
+                       struct blob_attr *msg)
 {
-       if (opts.capabilities && drop_capabilities(opts.capabilities))
-               exit(EXIT_FAILURE);
+       char *statusstr;
 
-       if (opts.no_new_privs && prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
-                ERROR("prctl(PR_SET_NO_NEW_PRIVS) failed: %m\n");
-               exit(EXIT_FAILURE);
+       /* map the numeric state to the status string sent in the reply */
+       switch (jail_oci_state) {
+               case OCI_STATE_CREATING:
+                       statusstr = "creating";
+                       break;
+               case OCI_STATE_CREATED:
+                       statusstr = "created";
+                       break;
+               case OCI_STATE_RUNNING:
+                       statusstr = "running";
+                       break;
+               case OCI_STATE_STOPPED:
+                       statusstr = "stopped";
+                       break;
+               default:
+                       statusstr = "unknown";
        }
 
-       if (opts.namespace && opts.hostname && strlen(opts.hostname) > 0
-                       && sethostname(opts.hostname, strlen(opts.hostname))) {
-               ERROR("sethostname(%s) failed: %m\n", opts.hostname);
-               exit(EXIT_FAILURE);
-       }
+       blob_buf_init(&bb, 0);
+       blobmsg_add_string(&bb, "ociVersion", OCI_VERSION_STRING);
+       blobmsg_add_string(&bb, "id", opts.name);
+       blobmsg_add_string(&bb, "status", statusstr);
+       /* the pid is only reported in the created and running states */
+       if (jail_oci_state == OCI_STATE_CREATED ||
+           jail_oci_state == OCI_STATE_RUNNING)
+               blobmsg_add_u32(&bb, "pid", jail_process.pid);
 
-       if (opts.namespace && build_jail_fs()) {
-               ERROR("failed to build jail fs\n");
-               exit(EXIT_FAILURE);
-       }
+       blobmsg_add_string(&bb, "bundle", opts.ocibundle);
 
-       char **envp = build_envp(opts.seccomp);
-       if (!envp)
-               exit(EXIT_FAILURE);
+       if (opts.annotations)
+               blobmsg_add_blob(&bb, opts.annotations);
 
-       INFO("exec-ing %s\n", *opts.jail_argv);
-       execve(*opts.jail_argv, opts.jail_argv, envp);
-       /* we get there only if execve fails */
-       ERROR("failed to execve %s: %m\n", *opts.jail_argv);
-       exit(EXIT_FAILURE);
+       ubus_send_reply(ctx, req, bb.head);
+
+       return UBUS_STATUS_OK;
 }
 
-static int jail_running = 1;
-static int jail_return_code = 0;
+/* Argument indices of the ubus "kill" method. */
+enum {
+       CONTAINER_KILL_ATTR_SIGNAL,
+       __CONTAINER_KILL_ATTR_MAX,
+};
 
-static void jail_process_timeout_cb(struct uloop_timeout *t);
-static struct uloop_timeout jail_process_timeout = {
-       .cb = jail_process_timeout_cb,
+/* "kill" accepts one optional 32-bit integer argument named "signal". */
+static const struct blobmsg_policy container_kill_attrs[__CONTAINER_KILL_ATTR_MAX] = {
+       [CONTAINER_KILL_ATTR_SIGNAL] = { "signal", BLOBMSG_TYPE_INT32 },
 };
 
-static void jail_process_handler(struct uloop_process *c, int ret)
+/*
+ * ubus "kill" method handler: send a signal to the jailed process.
+ *
+ * The signal defaults to SIGTERM and can be overridden with the "signal"
+ * argument. While the container is still in OCI_STATE_CREATING the call
+ * returns UBUS_STATUS_NOT_FOUND. Returns 0 when kill() succeeds; otherwise
+ * EINVAL, EPERM and ESRCH are mapped to the matching ubus status and every
+ * other errno to UBUS_STATUS_UNKNOWN_ERROR.
+ *
+ * NOTE(review): the signal number is not range-checked here, and the
+ * stopped state is not rejected before kill() is called - confirm that
+ * jail_process.pid cannot refer to an unrelated process at that point.
+ */
+static int
+container_handle_kill(struct ubus_context *ctx, struct ubus_object *obj,
+                   struct ubus_request_data *req, const char *method,
+                   struct blob_attr *msg)
 {
-       uloop_timeout_cancel(&jail_process_timeout);
-       if (WIFEXITED(ret)) {
-               jail_return_code = WEXITSTATUS(ret);
-               INFO("jail (%d) exited with exit: %d\n", c->pid, jail_return_code);
-       } else {
-               jail_return_code = WTERMSIG(ret);
-               INFO("jail (%d) exited with signal: %d\n", c->pid, jail_return_code);
+       struct blob_attr *tb[__CONTAINER_KILL_ATTR_MAX], *cur;
+       int sig = SIGTERM;
+
+       blobmsg_parse(container_kill_attrs, __CONTAINER_KILL_ATTR_MAX, tb, blobmsg_data(msg), blobmsg_data_len(msg));
+
+       cur = tb[CONTAINER_KILL_ATTR_SIGNAL];
+       if (cur)
+               sig = blobmsg_get_u32(cur);
+
+       if (jail_oci_state == OCI_STATE_CREATING)
+               return UBUS_STATUS_NOT_FOUND;
+
+       if (kill(jail_process.pid, sig) == 0)
+               return 0;
+
+       /* translate the kill() failure into a ubus status code */
+       switch (errno) {
+       case EINVAL: return UBUS_STATUS_INVALID_ARGUMENT;
+       case EPERM:  return UBUS_STATUS_PERMISSION_DENIED;
+       case ESRCH:  return UBUS_STATUS_NOT_FOUND;
        }
-       jail_running = 0;
-       uloop_end();
-}
 
-static struct uloop_process jail_process = {
-       .cb = jail_process_handler,
-};
+       return UBUS_STATUS_UNKNOWN_ERROR;
+}
 
-static void jail_process_timeout_cb(struct uloop_timeout *t)
+/*
+ * Write pid, followed by a newline, to the file named by opts.pidfile.
+ *
+ * Does nothing and returns 0 when no pidfile was configured.
+ * Returns 0 on success, or the errno of the fopen(), fprintf() or fclose()
+ * call that failed.
+ */
+static int
+jail_writepid(pid_t pid)
 {
-       DEBUG("jail process failed to stop, sending SIGKILL\n");
-       kill(jail_process.pid, SIGKILL);
+       FILE *_pidfile;
+       int err;
+
+       if (!opts.pidfile)
+               return 0;
+
+       _pidfile = fopen(opts.pidfile, "w");
+       if (_pidfile == NULL)
+               return errno;
+
+       if (fprintf(_pidfile, "%d\n", pid) < 0) {
+               /* save errno first: fclose() may overwrite it */
+               err = errno;
+               fclose(_pidfile);
+               return err;
+       }
+
+       /* buffered data is flushed here, so a write error can surface late */
+       if (fclose(_pidfile))
+               return errno;
+
+       return 0;
 }
 
-static void jail_handle_signal(int signo)
+/*
+ * Check that path can be opened as a directory.
+ *
+ * Returns 0 when open() with O_DIRECTORY succeeds (the descriptor is closed
+ * again right away), or -1 after logging the failure.
+ */
+static int checkpath(const char *path)
 {
-       DEBUG("forwarding signal %d to the jailed process\n", signo);
-       kill(jail_process.pid, signo);
+       int dirfd = open(path, O_RDONLY | O_DIRECTORY | O_CLOEXEC);
+       if (dirfd < 0) {
+               ERROR("path %s open failed %m\n", path);
+               return -1;
+       }
+       close(dirfd);
+
+       return 0;
 }
 
+/* ubus methods of the container object: "start" and "state" take no arguments, "kill" uses container_kill_attrs. */
+static struct ubus_method container_methods[] = {
+       UBUS_METHOD_NOARG("start", handle_start),
+       UBUS_METHOD_NOARG("state", handle_state),
+       UBUS_METHOD("kill", container_handle_kill, container_kill_attrs),
+};
+
+static struct ubus_object_type container_object_type =
+       UBUS_OBJECT_TYPE("container", container_methods);
+
+/* The object name is not set here; main() assigns it before registering the object. */
+static struct ubus_object container_object = {
+       .type = &container_object_type,
+       .methods = container_methods,
+       .n_methods = ARRAY_SIZE(container_methods),
+};
+
+static void post_main(struct uloop_timeout *t);
+/* Timeout added by main() just before uloop_run(); its callback is post_main(). */
+static struct uloop_timeout post_main_timeout = {
+       .cb = post_main,
+};
+/* NOTE(review): presumably file descriptors of namespaces; pidns_fd is assigned in post_main() - confirm the others. */
+static int netns_fd;
+static int pidns_fd;
+#ifdef CLONE_NEWTIME
+static int timens_fd;
+#endif
+static void post_create_runtime(void);
+
+/* List node holding one '-e' command line argument (see main()). */
+struct env_e {
+       struct list_head list;
+       char *envarg;
+};
+
 int main(int argc, char **argv)
 {
-       sigset_t sigmask;
        uid_t uid = getuid();
-       char log[] = "/dev/log";
-       char ubus[] = "/var/run/ubus.sock";
-       int ch, i;
+       const char log[] = "/dev/log";
+       const char ubus[] = "/var/run/ubus/ubus.sock";
+       int ret = EXIT_FAILURE;
+       int ch;
+       char *tmp;
+       struct list_head envl = LIST_HEAD_INIT(envl);
+       struct env_e *enve, *tmpenve;
+       unsigned short int envn = 0, envc = 0;
 
        if (uid) {
                ERROR("not root, aborting: %m\n");
                return EXIT_FAILURE;
        }
 
+       /* those are filehandlers, so -1 indicates unused */
+       opts.setns.pid = -1;
+       opts.setns.net = -1;
+       opts.setns.ns = -1;
+       opts.setns.ipc = -1;
+       opts.setns.uts = -1;
+       opts.setns.user = -1;
+       opts.setns.cgroup = -1;
+#ifdef CLONE_NEWTIME
+       opts.setns.time = -1;
+#endif
+
+       /* default 5 seconds timeout after SIGTERM before SIGKILL is sent */
+       opts.term_timeout = 5;
+
        umask(022);
        mount_list_init();
        init_library_search();
+       cgroups_prepare();
+       exit_from_child = false;
 
        while ((ch = getopt(argc, argv, OPT_ARGS)) != -1) {
                switch (ch) {
                case 'd':
                        debug = atoi(optarg);
                        break;
+               case 'e':
+                       enve = calloc(1, sizeof(*enve));
+                       enve->envarg = optarg;
+                       list_add_tail(&enve->list, &envl);
+                       break;
                case 'p':
-                       opts.namespace = 1;
+                       opts.namespace |= CLONE_NEWNS;
                        opts.procfs = 1;
                        break;
                case 'o':
-                       opts.namespace = 1;
+                       opts.namespace |= CLONE_NEWNS;
                        opts.ronly = 1;
                        break;
+               case 'f':
+                       opts.namespace |= CLONE_NEWUSER;
+                       break;
+               case 'F':
+                       opts.namespace |= CLONE_NEWCGROUP;
+                       break;
+               case 'R':
+                       opts.extroot = realpath(optarg, NULL);
+                       break;
                case 's':
-                       opts.namespace = 1;
+                       opts.namespace |= CLONE_NEWNS;
                        opts.sysfs = 1;
                        break;
                case 'S':
                        opts.seccomp = optarg;
-                       add_mount(optarg, 1, -1);
+                       add_mount_bind(optarg, 1, -1);
                        break;
                case 'C':
                        opts.capabilities = optarg;
@@ -352,104 +2662,516 @@ int main(int argc, char **argv)
                case 'n':
                        opts.name = optarg;
                        break;
+               case 'N':
+                       opts.namespace |= CLONE_NEWNET;
+                       break;
                case 'h':
-                       opts.hostname = optarg;
+                       opts.namespace |= CLONE_NEWUTS;
+                       opts.hostname = strdup(optarg);
+                       break;
+               case 'j':
+                       jail_join_ns(optarg);
                        break;
                case 'r':
-                       opts.namespace = 1;
-                       add_path_and_deps(optarg, 1, 0, 0);
+                       opts.namespace |= CLONE_NEWNS;
+                       tmp = strchr(optarg, ':');
+                       if (tmp) {
+                               *(tmp++) = '\0';
+                               add_2paths_and_deps(optarg, tmp, 1, 0, 0);
+                       } else {
+                               add_path_and_deps(optarg, 1, 0, 0);
+                       }
                        break;
                case 'w':
-                       opts.namespace = 1;
-                       add_path_and_deps(optarg, 0, 0, 0);
+                       opts.namespace |= CLONE_NEWNS;
+                       tmp = strchr(optarg, ':');
+                       if (tmp) {
+                               *(tmp++) = '\0';
+                               add_2paths_and_deps(optarg, tmp, 0, 0, 0);
+                       } else {
+                               add_path_and_deps(optarg, 0, 0, 0);
+                       }
                        break;
                case 'u':
-                       opts.namespace = 1;
-                       add_mount(ubus, 0, -1);
+                       opts.namespace |= CLONE_NEWNS;
+                       add_mount_bind(ubus, 0, -1);
                        break;
                case 'l':
-                       opts.namespace = 1;
-                       add_mount(log, 0, -1);
+                       opts.namespace |= CLONE_NEWNS;
+                       add_mount_bind(log, 0, -1);
+                       break;
+               case 'U':
+                       opts.user = optarg;
+                       break;
+               case 'G':
+                       opts.group = optarg;
                        break;
+               case 'O':
+                       opts.overlaydir = realpath(optarg, NULL);
+                       break;
+               case 't':
+                       opts.term_timeout = atoi(optarg);
+                       break;
+               case 'T':
+                       opts.tmpoverlaysize = optarg;
+                       break;
+               case 'E':
+                       opts.require_jail = 1;
+                       break;
+               case 'y':
+                       opts.console = 1;
+                       break;
+               case 'J':
+                       opts.ocibundle = optarg;
+                       break;
+               case 'i':
+                       opts.immediately = true;
+                       break;
+               case 'P':
+                       opts.pidfile = optarg;
+                       break;
+               }
+       }
+
+       if (opts.namespace && !opts.ocibundle)
+               opts.namespace |= CLONE_NEWIPC | CLONE_NEWPID;
+
+       /*
+        * env import from cmdline is not available for OCI containers
+        */
+       if (opts.ocibundle && !list_empty(&envl)) {
+               ret=-ENOTSUP;
+               goto errout;
+       }
+
+       /*
+        * prepare list of env variables to import for slim containers
+        */
+       if (!list_empty(&envl)) {
+               list_for_each_entry(enve, &envl, list)
+                       ++envn;
+
+               opts.envp = calloc(1 + envn, sizeof(char*));
+               list_for_each_entry_safe(enve, tmpenve, &envl, list) {
+                       tmp = getenv(enve->envarg);
+                       if (tmp) {
+                               ret = asprintf(&opts.envp[envc++], "%s=%s", enve->envarg, tmp);
+                               if (ret < 0) {
+                                       ERROR("filed to handle envargs %s\n", tmp);
+                                       free(enve);
+                                       goto errout;
+                               }
+                       }
+
+                       list_del(&enve->list);
+                       free(enve);
+               }
+
+               opts.envp[envc] = NULL;
+       }
+
+       /*
+        * uid in parent user namespace representing root user in new
+        * user namespace, defaults to nobody unless specified in uidMappings
+        */
+       opts.root_map_uid = 65534;
+
+       if (opts.capabilities && parseOCIcapabilities_from_file(&opts.capset, opts.capabilities)) {
+               ERROR("failed to read capabilities from file %s\n", opts.capabilities);
+               ret=-1;
+               goto errout;
+       }
+
+       if (opts.ocibundle) {
+               char *jsonfile;
+               int ocires;
+
+               if (!opts.name) {
+                       ERROR("OCI bundle needs a named jail\n");
+                       ret=-1;
+                       goto errout;
+               }
+               if (asprintf(&jsonfile, "%s/config.json", opts.ocibundle) < 0) {
+                       ret=-ENOMEM;
+                       goto errout;
+               }
+               ocires = parseOCI(jsonfile);
+               free(jsonfile);
+               if (ocires) {
+                       ERROR("parsing of OCI JSON spec has failed: %s (%d)\n", strerror(ocires), ocires);
+                       ret=ocires;
+                       goto errout;
                }
        }
 
+       if (opts.namespace & CLONE_NEWNET) {
+               if (!opts.name) {
+                       ERROR("netns needs a named jail\n");
+                       ret=-1;
+                       goto errout;
+               }
+       }
+
+
+       if (opts.tmpoverlaysize && strlen(opts.tmpoverlaysize) > 8) {
+               ERROR("size parameter too long: \"%s\"\n", opts.tmpoverlaysize);
+               ret=-1;
+               goto errout;
+       }
+
+       if (opts.extroot && checkpath(opts.extroot)) {
+               ERROR("invalid rootfs path '%s'", opts.extroot);
+               ret=-1;
+               goto errout;
+       }
+
+       if (opts.overlaydir && checkpath(opts.overlaydir)) {
+               ERROR("invalid rootfs overlay path '%s'", opts.overlaydir);
+               ret=-1;
+               goto errout;
+       }
+
        /* no <binary> param found */
-       if (argc - optind < 1) {
+       if (!opts.ocibundle && (argc - optind < 1)) {
                usage();
-               return EXIT_FAILURE;
+               ret=EXIT_FAILURE;
+               goto errout;
        }
-       if (!(opts.namespace||opts.capabilities||opts.seccomp)) {
+       if (!(opts.ocibundle||opts.namespace||opts.capabilities||opts.seccomp||
+               (opts.setns.net != -1) ||
+               (opts.setns.ns != -1) ||
+               (opts.setns.ipc != -1) ||
+               (opts.setns.uts != -1) ||
+               (opts.setns.user != -1) ||
+               (opts.setns.cgroup != -1))) {
                ERROR("Not using namespaces, capabilities or seccomp !!!\n\n");
                usage();
-               return EXIT_FAILURE;
+               ret=EXIT_FAILURE;
+               goto errout;
        }
-       DEBUG("Using namespaces(%d), capabilities(%d), seccomp(%d)\n",
+       DEBUG("Using namespaces(0x%08x), capabilities(%d), seccomp(%d)\n",
                opts.namespace,
-               opts.capabilities != 0,
-               opts.seccomp != 0);
+               opts.capset.apply,
+               opts.seccomp != 0 || opts.ociseccomp != 0);
 
-       opts.jail_argv = &argv[optind];
+       uloop_init();
+       signals_init();
 
-       if (opts.namespace && add_path_and_deps(*opts.jail_argv, 1, -1, 0)) {
-               ERROR("failed to load dependencies\n");
-               return -1;
+       parent_ctx = ubus_connect(NULL);
+       ubus_add_uloop(parent_ctx);
+
+       if (opts.ocibundle) {
+               char *objname;
+               if (asprintf(&objname, "container.%s", opts.name) < 0) {
+                       ret=-ENOMEM;
+                       goto errout;
+               }
+
+               container_object.name = objname;
+               ret = ubus_add_object(parent_ctx, &container_object);
+               if (ret) {
+                       ERROR("Failed to add object: %s\n", ubus_strerror(ret));
+                       ret=-1;
+                       goto errout;
+               }
+       }
+
+       /* deliberately not using 'else' on unrelated conditional branches */
+       if (!opts.ocibundle) {
+               /* allocate NULL-terminated array for argv */
+               opts.jail_argv = calloc(1 + argc - optind, sizeof(void *));
+               if (!opts.jail_argv) {
+                       ret=EXIT_FAILURE;
+                       goto errout;
+               }
+               for (size_t s = optind; s < argc; s++)
+                       opts.jail_argv[s - optind] = strdup(argv[s]);
+
+               if (opts.namespace & CLONE_NEWUSER)
+                       get_jail_user(&opts.pw_uid, &opts.pw_gid, &opts.gr_gid);
+       }
+
+       if (!opts.extroot) {
+               if (opts.namespace && add_path_and_deps(*opts.jail_argv, 1, -1, 0)) {
+                       ERROR("failed to load dependencies\n");
+                       ret=-1;
+                       goto errout;
+               }
        }
 
        if (opts.namespace && opts.seccomp && add_path_and_deps("libpreload-seccomp.so", 1, -1, 1)) {
                ERROR("failed to load libpreload-seccomp.so\n");
-               return -1;
+               opts.seccomp = 0;
+               if (opts.require_jail) {
+                       ret=-1;
+                       goto errout;
+               }
        }
 
-       if (opts.name)
-               prctl(PR_SET_NAME, opts.name, NULL, NULL, NULL);
+       uloop_timeout_add(&post_main_timeout);
+       uloop_run();
 
-       uloop_init();
+errout:
+       if (opts.ocibundle)
+               cgroups_free();
 
-       sigfillset(&sigmask);
-       for (i = 0; i < _NSIG; i++) {
-               struct sigaction s = { 0 };
+       free_opts(true);
 
-               if (!sigismember(&sigmask, i))
-                       continue;
-               if ((i == SIGCHLD) || (i == SIGPIPE))
-                       continue;
+       return ret;
+}
 
-               s.sa_handler = jail_handle_signal;
-               sigaction(i, &s, NULL);
+static void post_main(struct uloop_timeout *t)
+{
+       if (apply_rlimits()) {
+               ERROR("error applying resource limits\n");
+               free_and_exit(EXIT_FAILURE);
        }
 
-       if (opts.namespace) {
-               add_mount("/dev/full", 0, -1);
-               add_mount("/dev/null", 0, -1);
-               add_mount("/dev/urandom", 0, -1);
-               add_mount("/dev/zero", 0, -1);
+       if (opts.name)
+               prctl(PR_SET_NAME, opts.name, NULL, NULL, NULL);
+
+       if (pipe(&pipes[0]) < 0 || pipe(&pipes[2]) < 0)
+               free_and_exit(-1);
+
+       if (has_namespaces()) {
+               if (opts.namespace & CLONE_NEWNS) {
+                       if (!opts.extroot && (opts.user || opts.group)) {
+                               add_mount_bind("/etc/passwd", 1, -1);
+                               add_mount_bind("/etc/group", 1, -1);
+                       }
+
+#if defined(__GLIBC__)
+                       if (!opts.extroot)
+                               add_mount_bind("/etc/nsswitch.conf", 1, -1);
+#endif
+                       if (opts.setns.ns == -1) {
+                               if (!(opts.namespace & CLONE_NEWNET)) {
+                                       add_mount_bind("/etc/resolv.conf", 1, 0);
+                               } else {
+                                       /* new mount namespace to provide /dev/resolv.conf.d */
+                                       char hostdir[PATH_MAX];
+
+                                       snprintf(hostdir, PATH_MAX, "/tmp/resolv.conf-%s.d", opts.name);
+                                       mkdir_p(hostdir, 0755);
+                                       add_mount(hostdir, "/dev/resolv.conf.d", NULL,
+                                               MS_BIND | MS_NOEXEC | MS_NOATIME | MS_NOSUID | MS_NODEV | MS_RDONLY, 0, NULL, 0);
+                               }
+                       }
+                       /* default mounts */
+                       add_mount(NULL, "/dev", "tmpfs", MS_NOATIME | MS_NOEXEC | MS_NOSUID, 0, "size=1M", -1);
+                       add_mount(NULL, "/dev/pts", "devpts", MS_NOATIME | MS_NOEXEC | MS_NOSUID, 0, "newinstance,ptmxmode=0666,mode=0620,gid=5", 0);
+
+                       if (opts.procfs || opts.ocibundle) {
+                               add_mount("proc", "/proc", "proc", MS_NOATIME | MS_NODEV | MS_NOEXEC | MS_NOSUID, 0, NULL, -1);
+
+                               /*
+                                * hack to make /proc/sys/net read-write while the rest of /proc/sys is read-only
+                                * which cannot be expressed with OCI spec, but happens to be very useful.
+                                * Only apply it if '/proc/sys' is not already listed as mount, maskedPath or
+                                * readonlyPath.
+                                * If not running in a new network namespace, only make /proc/sys read-only.
+                                * If running in a new network namespace, temporarily stash (ie. mount-bind)
+                                * /proc/sys/net into (totally unrelated, but surely existing) /proc/self/net.
+                                * Then we mount-bind /proc/sys read-only and then mount-move /proc/self/net into
+                                * /proc/sys/net.
+                                * This works because mounts are executed in incrementing strcmp() order and
+                                * /proc/self/net appears there before /proc/sys/net and hence the operation
+                                * succeeds as the bind-mount of /proc/self/net is performed first and then
+                                * move-mount of /proc/sys/net follows because 'e' precedes 'y' in the ASCII
+                                * table (and in the alphabet).
+                                */
+                               if (!add_mount(NULL, "/proc/sys", NULL, MS_BIND | MS_RDONLY, 0, NULL, -1))
+                                       if (opts.namespace & CLONE_NEWNET)
+                                               if (!add_mount_inner("/proc/self/net", "/proc/sys/net", NULL, MS_MOVE, 0, NULL, -1))
+                                                       add_mount_inner("/proc/sys/net", "/proc/self/net", NULL, MS_BIND, 0, NULL, -1);
+
+                       }
+                       if (opts.sysfs || opts.ocibundle)
+                               add_mount("sysfs", "/sys", "sysfs", MS_RELATIME | MS_NODEV | MS_NOEXEC | MS_NOSUID | MS_RDONLY, 0, NULL, -1);
+
+                       if (opts.ocibundle)
+                               add_mount("shm", "/dev/shm", "tmpfs", MS_NOSUID | MS_NOEXEC | MS_NODEV, 0, "mode=1777", -1);
+
+               }
+
+               if (opts.setns.pid != -1) {
+                       pidns_fd = ns_open_pid("pid", getpid());
+                       setns_open(CLONE_NEWPID);
+               } else {
+                       pidns_fd = -1;
+               }
+
+#ifdef CLONE_NEWTIME
+               if (opts.setns.time != -1) {
+                       timens_fd = ns_open_pid("time", getpid());
+                       setns_open(CLONE_NEWTIME);
+               } else {
+                       timens_fd = -1;
+               }
+#endif
+
+               if (opts.namespace & CLONE_NEWUSER) {
+                       if (prctl(PR_SET_SECUREBITS, SECBIT_NO_SETUID_FIXUP)) {
+                               ERROR("prctl(PR_SET_SECUREBITS) failed: %m\n");
+                               free_and_exit(EXIT_FAILURE);
+                       }
+                       if (seteuid(opts.root_map_uid)) {
+                               ERROR("seteuid(%d) failed: %m\n", opts.root_map_uid);
+                               free_and_exit(EXIT_FAILURE);
+                       }
+               }
 
-               int flags = CLONE_NEWPID | CLONE_NEWNS | CLONE_NEWIPC | SIGCHLD;
-               if (opts.hostname)
-                       flags |= CLONE_NEWUTS;
-               jail_process.pid = clone(exec_jail, child_stack + STACK_SIZE, flags, NULL);
+               jail_process.pid = clone(exec_jail, child_stack + STACK_SIZE, SIGCHLD | (opts.namespace & (~CLONE_NEWCGROUP)), NULL);
        } else {
                jail_process.pid = fork();
        }
 
        if (jail_process.pid > 0) {
                /* parent process */
+               char sig_buf[1];
+
                uloop_process_add(&jail_process);
-               uloop_run();
-               if (jail_running) {
-                       DEBUG("uloop interrupted, killing jail process\n");
-                       kill(jail_process.pid, SIGTERM);
-                       uloop_timeout_set(&jail_process_timeout, 1000);
-                       uloop_run();
-               }
-               uloop_done();
-               return jail_return_code;
+               jail_running = 1;
+               if (seteuid(0)) {
+                       ERROR("seteuid(%d) failed: %m\n", opts.root_map_uid);
+                       free_and_exit(EXIT_FAILURE);
+               }
+
+               prctl(PR_SET_SECUREBITS, 0);
+
+               if (pidns_fd != -1) {
+                       setns(pidns_fd, CLONE_NEWPID);
+                       close(pidns_fd);
+               }
+#ifdef CLONE_NEWTIME
+               if (timens_fd != -1) {
+                       setns(timens_fd, CLONE_NEWTIME);
+                       close(timens_fd);
+               }
+#endif
+               if (opts.setns.net != -1)
+                       close(opts.setns.net);
+               if (opts.setns.ns != -1)
+                       close(opts.setns.ns);
+               if (opts.setns.ipc != -1)
+                       close(opts.setns.ipc);
+               if (opts.setns.uts != -1)
+                       close(opts.setns.uts);
+               if (opts.setns.user != -1)
+                       close(opts.setns.user);
+               if (opts.setns.cgroup != -1)
+                       close(opts.setns.cgroup);
+               close(pipes[1]);
+               close(pipes[2]);
+               if (read(pipes[0], sig_buf, 1) < 1) {
+                       ERROR("can't read from child\n");
+                       free_and_exit(-1);
+               }
+               close(pipes[0]);
+               set_oom_score_adj();
+
+               if (opts.ocibundle)
+                       cgroups_apply(jail_process.pid);
+
+               if (opts.namespace & CLONE_NEWUSER) {
+                       if (write_setgroups(jail_process.pid, true)) {
+                               ERROR("can't write setgroups\n");
+                               free_and_exit(-1);
+                       }
+                       if (!opts.uidmap) {
+                               bool has_gr = (opts.gr_gid != -1);
+                               if (opts.pw_uid != -1) {
+                                       write_single_uid_gid_map(jail_process.pid, 0, opts.pw_uid);
+                                       write_single_uid_gid_map(jail_process.pid, 1, has_gr?opts.gr_gid:opts.pw_gid);
+                               } else {
+                                       write_single_uid_gid_map(jail_process.pid, 0, 65534);
+                                       write_single_uid_gid_map(jail_process.pid, 1, has_gr?opts.gr_gid:65534);
+                               }
+                       } else {
+                               write_uid_gid_map(jail_process.pid, 0, opts.uidmap);
+                               if (opts.gidmap)
+                                       write_uid_gid_map(jail_process.pid, 1, opts.gidmap);
+                       }
+               }
+
+               if (opts.namespace & CLONE_NEWNET)
+                       jail_network_start(parent_ctx, opts.name, jail_process.pid);
+
+               if (jail_writepid(jail_process.pid)) {
+                       ERROR("failed to write pidfile: %m\n");
+                       free_and_exit(-1);
+               }
        } else if (jail_process.pid == 0) {
                /* fork child process */
-               return exec_jail(NULL);
+               free_and_exit(exec_jail(NULL));
        } else {
                ERROR("failed to clone/fork: %m\n");
-               return EXIT_FAILURE;
+               free_and_exit(EXIT_FAILURE);
+       }
+       run_hooks(opts.hooks.createRuntime, post_create_runtime);
+}
+
+static void post_poststart(void);
+/*
+ * Continuation passed to run_hooks() for the createRuntime hooks.
+ * Writes a single 'O' byte to the child via pipes[3], records the OCI
+ * 'created' state and then either waits in uloop for the 'start' command
+ * (OCI bundle which is not to be started immediately) or starts the
+ * container right away.
+ */
+static void post_create_runtime(void)
+{
+       const char created_sig = 'O';
+       bool wait_for_start;
+
+       if (write(pipes[3], &created_sig, 1) < 0) {
+               ERROR("can't write to child\n");
+               free_and_exit(-1);
+       }
+
+       jail_oci_state = OCI_STATE_CREATED;
+
+       wait_for_start = opts.ocibundle && !opts.immediately;
+       if (!wait_for_start) {
+               pipe_send_start_container(NULL);
+               return;
+       }
+
+       /* wait for 'start' command via ubus */
+       uloop_run();
+}
+
+/*
+ * Tell the child to start the container: mark the OCI state as 'running',
+ * write a single '!' byte to pipes[3], close that pipe end and run the
+ * poststart hooks with post_poststart() as continuation.
+ * The uloop_timeout argument is unused.
+ */
+static void pipe_send_start_container(struct uloop_timeout *t)
+{
+       const char start_sig = '!';
+
+       jail_oci_state = OCI_STATE_RUNNING;
+
+       if (write(pipes[3], &start_sig, 1) < 0) {
+               ERROR("can't write to child\n");
+               free_and_exit(-1);
+       }
+
+       /* nothing more is sent over this pipe end */
+       close(pipes[3]);
+
+       run_hooks(opts.hooks.poststart, post_poststart);
+}
+
+/*
+ * Continuation passed to run_hooks() for the poststart hooks: run the main
+ * loop while the jail is alive, then tear everything down via poststop().
+ */
+static void post_poststart(void)
+{
+       uloop_run(); /* idle here while jail is running */
+       /*
+        * uloop returned although the jail is still flagged as running:
+        * ask the jail process to terminate, arm jail_process_timeout with
+        * 1000 ms and re-enter the loop.
+        */
+       if (jail_running) {
+               DEBUG("uloop interrupted, killing jail process\n");
+               kill(jail_process.pid, SIGTERM);
+               uloop_timeout_set(&jail_process_timeout, 1000);
+               uloop_run();
+       }
+       uloop_done();
+       poststop();
+}
+
+static void post_poststop(void);
+/*
+ * Tear-down step run after the main loop has finished.
+ * For jails with their own network namespace: switch to the namespace
+ * referred to by netns_fd, stop the jail's network services and close the
+ * fd. Then run the poststop hooks with post_poststop() as continuation.
+ * NOTE(review): netns_fd is opened outside this chunk -- presumably it refers
+ * to the original (host) network namespace; confirm against its opener.
+ */
+static void poststop(void) {
+       if (opts.namespace & CLONE_NEWNET) {
+               setns(netns_fd, CLONE_NEWNET);
+               jail_network_stop();
+               close(netns_fd);
        }
+       run_hooks(opts.hooks.poststop, post_poststop);
+}
+
+/*
+ * Continuation passed to run_hooks() for the poststop hooks: release the
+ * parsed options and the parent ubus context (if any), then exit the process
+ * with jail_return_code. Does not return.
+ */
+static void post_poststop(void)
+{
+       free_opts(true);
+       if (parent_ctx)
+               ubus_free(parent_ctx);
+
+       exit(jail_return_code);
 }
index 5739d3d269020fff849eb2be4119008bfa7cc3ee..158d73b7299c178293ddf82e6641892ad1ddbfbe 100644 (file)
@@ -14,5 +14,6 @@
 #define _JAIL_JAIL_H_
 
 int mount_bind(const char *root, const char *path, int readonly, int error);
+int ns_open_pid(const char *nstype, const pid_t target_ns);
 
 #endif
index 74a9f11f62b4218d002015603d00cb663a21a06b..a1091d49f2e72269b83d0f363bde72a1a6444bed 100644 (file)
@@ -20,6 +20,10 @@ extern int debug;
 #define INFO(fmt, ...) do { \
        printf("jail: "fmt, ## __VA_ARGS__); \
        } while (0)
+#define WARNING(fmt, ...) do { \
+       syslog(LOG_WARNING, "jail: "fmt, ## __VA_ARGS__); \
+       printf("jail: "fmt, ## __VA_ARGS__); \
+       } while (0)
 #define ERROR(fmt, ...) do { \
        syslog(LOG_ERR, "jail: "fmt, ## __VA_ARGS__); \
        fprintf(stderr,"jail: "fmt, ## __VA_ARGS__); \
diff --git a/jail/netifd.c b/jail/netifd.c
new file mode 100644 (file)
index 0000000..6f9cd72
--- /dev/null
@@ -0,0 +1,552 @@
+/*
+ * Copyright (C) 2021 Daniel Golle <daniel@makrotopia.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License version 2.1
+ * as published by the Free Software Foundation
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * launch private ubus and netifd instances for containers with managed
+ * network namespace.
+ */
+
+#define _GNU_SOURCE         /* See feature_test_macros(7) */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <libgen.h>
+#include <fcntl.h>
+
+#include <sys/inotify.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+
+#include <pwd.h>
+
+#include <linux/limits.h>
+
+#include <libubox/uloop.h>
+#include <libubox/utils.h>
+#include <libubus.h>
+#include <libubox/blobmsg.h>
+#include <libubox/blobmsg_json.h>
+#include <uci.h>
+
+#include "netifd.h"
+#include "log.h"
+#include "jail.h"
+
+#define INOTIFY_SZ (sizeof(struct inotify_event) + PATH_MAX + 1)
+
+static const char ubusd_path[] = "/sbin/ubusd";
+static const char netifd_path[] = "/sbin/netifd";
+static const char uci_net[] = "network";
+static const char ubus_sock_name[] = "ubus.sock";
+
+static char *jail_name, *ubus_sock_path, *ubus_sock_dir, *uci_config_network = NULL;
+
+static char *inotify_buffer;
+static struct uloop_fd fd_inotify_read;
+static struct passwd *ubus_pw;
+static pid_t ns_pid;
+
+static struct ubus_context *host_ubus_ctx = NULL;
+static struct ubus_context *jail_ubus_ctx = NULL;
+
+static struct ubus_subscriber config_watch_subscribe;
+
+/*
+ * Generate /etc/config/network for the jail'ed netifd.
+ *
+ * Loads the 'network' UCI package, removes every section whose 'jail' option
+ * does not name this jail and, in matching sections, removes the 'jail'
+ * option and renames 'jail_device' (or legacy 'jail_ifname') to 'device'
+ * while the previous 'device' (or legacy 'ifname') becomes 'host_device'.
+ * A 'loopback' interface section is added unless one of the remaining
+ * interfaces already uses device 'lo'. The result is exported to the file
+ * named by uci_config_network.
+ *
+ * Returns 0 on success or when no network configuration is active,
+ * otherwise an errno-style value (or the result of uci_export()).
+ */
+static int gen_jail_uci_network(void)
+{
+       struct uci_context *uci_ctx = uci_alloc_context();
+       struct uci_package *pkg = NULL;
+       struct uci_element *e, *t;
+       bool has_loopback = false;
+       int ret = 0;
+       FILE *ucinetf;
+
+       /* without a context there is nothing to work with and nothing to free */
+       if (!uci_ctx)
+               return ENOMEM;
+
+       /* if no network configuration is active just return */
+       if (!uci_config_network)
+               goto uci_out;
+
+       /* open output uci network config file */
+       ucinetf = fopen(uci_config_network, "w");
+       if (!ucinetf) {
+               ret = errno;
+               goto uci_out;
+       }
+
+       /* load network uci package */
+       if (uci_load(uci_ctx, uci_net, &pkg) != UCI_OK) {
+               char *err;
+               uci_get_errorstr(uci_ctx, &err, uci_net);
+               fprintf(stderr, "unable to load configuration (%s)\n", err);
+               free(err);
+               ret = EIO;
+               goto ucinetf_out;
+       }
+
+       /* remove all sections which don't match jail */
+       uci_foreach_element_safe(&pkg->sections, t, e) {
+               struct uci_section *s = uci_to_section(e);
+               struct uci_option *o = uci_lookup_option(uci_ctx, s, "jail");
+               struct uci_ptr ptr = { .p = pkg, .s = s };
+
+               /* keep match, but remove 'jail' option and rename 'jail_ifname' */
+               if (o && o->type == UCI_TYPE_STRING && !strcmp(o->v.string, jail_name)) {
+                       ptr.o = o;
+                       struct uci_option *jio = uci_lookup_option(uci_ctx, s, "jail_device");
+                       if (!jio)
+                               jio = uci_lookup_option(uci_ctx, s, "jail_ifname");
+
+                       if (jio) {
+                               struct uci_ptr ren_ptr = { .p = pkg, .s = s, .o = jio, .value = "device" };
+                               struct uci_option *host_device = uci_lookup_option(uci_ctx, s, "device");
+                               struct uci_option *legacy_ifname = uci_lookup_option(uci_ctx, s, "ifname");
+                               /* 'device' wins over legacy 'ifname' when both are present */
+                               if (host_device && legacy_ifname) {
+                                       struct uci_ptr delif_ptr = { .p = pkg, .s = s, .o = legacy_ifname };
+                                       uci_delete(uci_ctx, &delif_ptr);
+                               }
+
+                               struct uci_ptr renif_ptr = { .p = pkg, .s = s, .o = host_device?:legacy_ifname, .value = "host_device" };
+                               uci_rename(uci_ctx, &renif_ptr);
+                               uci_rename(uci_ctx, &ren_ptr);
+                       }
+               }
+
+               /*
+                * ptr.o is only set for a matching section: there this removes
+                * the 'jail' option, everywhere else the whole section.
+                */
+               uci_delete(uci_ctx, &ptr);
+       }
+
+       /* check if device 'lo' is defined by any remaining interfaces */
+       uci_foreach_element(&pkg->sections, e) {
+               struct uci_section *s = uci_to_section(e);
+               if (strcmp(s->type, "interface"))
+                       continue;
+
+               const char *devname = uci_lookup_option_string(uci_ctx, s, "device");
+               if (devname && !strcmp(devname, "lo")) {
+                       has_loopback = true;
+                       break;
+               }
+       }
+
+       /* create loopback interface section if not defined */
+       if (!has_loopback) {
+               struct uci_ptr ptr = { .p = pkg, .section = "loopback", .value = "interface" };
+
+               /* ptr.s is only usable below if the section was really created */
+               if (uci_set(uci_ctx, &ptr) != UCI_OK || !ptr.s) {
+                       ret = EIO;
+                       goto ucinetf_out;
+               }
+               uci_reorder_section(uci_ctx, ptr.s, 0);
+               struct uci_ptr ptr1 = { .p = pkg, .s = ptr.s, .option = "device", .value = "lo" };
+               struct uci_ptr ptr2 = { .p = pkg, .s = ptr.s, .option = "proto", .value = "static" };
+               struct uci_ptr ptr3 = { .p = pkg, .s = ptr.s, .option = "ipaddr", .value = "127.0.0.1" };
+               struct uci_ptr ptr4 = { .p = pkg, .s = ptr.s, .option = "netmask", .value = "255.0.0.0" };
+               uci_set(uci_ctx, &ptr1);
+               uci_set(uci_ctx, &ptr2);
+               uci_set(uci_ctx, &ptr3);
+               uci_set(uci_ctx, &ptr4);
+       }
+
+       ret = uci_export(uci_ctx, ucinetf, pkg, false);
+
+ucinetf_out:
+       fclose(ucinetf);
+
+uci_out:
+       uci_free_context(uci_ctx);
+
+       return ret;
+}
+
+/*
+ * uloop timeout callback: build an "add" request for the host's "container"
+ * ubus object which adds an instance named "ubus" to the container named
+ * after this jail. The instance command is ubusd with '-s <ubus_sock_path>';
+ * user and group 'ubus' are requested if that user exists on the host.
+ * The timeout argument is unused.
+ */
+static void run_ubusd(struct uloop_timeout *t)
+{
+       static struct blob_buf msg;
+       void *instances_tbl, *ubus_tbl, *argv_arr;
+       uint32_t container_id;
+
+       blob_buf_init(&msg, 0);
+       blobmsg_add_string(&msg, "name", jail_name);
+
+       instances_tbl = blobmsg_open_table(&msg, "instances");
+       ubus_tbl = blobmsg_open_table(&msg, "ubus");
+
+       /* command line of the private ubusd */
+       argv_arr = blobmsg_open_array(&msg, "command");
+       blobmsg_add_string(&msg, "", ubusd_path);
+       blobmsg_add_string(&msg, "", "-s");
+       blobmsg_add_string(&msg, "", ubus_sock_path);
+       blobmsg_close_array(&msg, argv_arr);
+
+       /* only request the dedicated account if getpwnam() found it */
+       if (ubus_pw != NULL) {
+               blobmsg_add_string(&msg, "user", "ubus");
+               blobmsg_add_string(&msg, "group", "ubus");
+       }
+
+       blobmsg_close_table(&msg, ubus_tbl);
+       blobmsg_close_table(&msg, instances_tbl);
+
+       /* the result of the request itself is not evaluated */
+       if (ubus_lookup_id(host_ubus_ctx, "container", &container_id) == 0)
+               ubus_invoke(host_ubus_ctx, container_id, "add", msg.head, NULL, NULL, 3000);
+
+       blob_buf_free(&msg);
+}
+
+/*
+ * uloop timeout callback, scheduled by inotify_read_handler() once the
+ * jail's ubus socket has shown up.
+ *
+ * Stops watching the socket directory, connects to the jail's private ubus,
+ * generates the jail's network configuration in a fresh temporary directory
+ * and asks the host's "container" ubus object to add a jailed "netifd"
+ * instance which joins the net, ipc and uts namespaces of ns_pid.
+ *
+ * Every path through this function ends the main loop with uloop_end() so
+ * that the uloop_run() in jail_network_start() returns; this includes a
+ * failed ubus_connect(), after which no further attempt is possible because
+ * the inotify watch is already gone.
+ *
+ * On failure the temporary UCI file and directory are removed again and
+ * uci_config_network is reset to NULL, as jail_network_stop() and
+ * gen_jail_uci_network() test that pointer later on.
+ * The timeout argument is unused.
+ */
+static void run_netifd(struct uloop_timeout *t)
+{
+       static struct blob_buf req;
+       void *ins, *in, *cmd, *jail, *setns, *setnso, *namespaces, *mount, *pathenv;
+       char *resolvconf_dir, *resolvconf, *ucimount, *ubusmount;
+       char uci_dir[] = "/var/containers/ujail-uci-XXXXXX";
+
+       uint32_t id;
+       bool running = false;
+
+       /* the socket exists now, no need to watch its directory any longer */
+       uloop_fd_delete(&fd_inotify_read);
+       close(fd_inotify_read.fd);
+
+       jail_ubus_ctx = ubus_connect(ubus_sock_path);
+       if (!jail_ubus_ctx)
+               goto netifd_out;
+
+       if (asprintf(&resolvconf_dir, "/tmp/resolv.conf-%s.d", jail_name) == -1)
+               goto netifd_out;
+
+       if (asprintf(&resolvconf, "%s/resolv.conf.auto", resolvconf_dir) == -1)
+               goto netifd_out_resolvconf_dir;
+
+       if (!mkdtemp(uci_dir))
+               goto netifd_out_resolvconf;
+
+       if (asprintf(&uci_config_network, "%s/network", uci_dir) == -1) {
+               /* pointer contents are undefined after a failed asprintf() */
+               uci_config_network = NULL;
+               goto netifd_out_ucidir;
+       }
+
+       if (asprintf(&ucimount, "%s:/etc/config", uci_dir) == -1)
+               goto netifd_out_ucinetconf;
+
+       if (asprintf(&ubusmount, "%s:/var/run/ubus", ubus_sock_dir) == -1)
+               goto netifd_out_ucimount;
+
+       if (gen_jail_uci_network())
+               goto netifd_out_ubusmount;
+
+       blob_buf_init(&req, 0);
+       blobmsg_add_string(&req, "name", jail_name);
+       ins = blobmsg_open_table(&req, "instances");
+       in = blobmsg_open_table(&req, "netifd");
+
+       cmd = blobmsg_open_array(&req, "command");
+       blobmsg_add_string(&req, "", netifd_path);
+       blobmsg_add_string(&req, "", "-r");
+       blobmsg_add_string(&req, "", resolvconf);
+       blobmsg_close_array(&req, cmd);
+
+       pathenv = blobmsg_open_table(&req, "env");
+       blobmsg_add_string(&req, "PATH", "/usr/sbin:/usr/bin:/sbin:/bin");
+       blobmsg_close_table(&req, pathenv);
+
+       jail = blobmsg_open_table(&req, "jail");
+
+       /* join the namespaces of the container's process */
+       setns = blobmsg_open_array(&req, "setns");
+       setnso = blobmsg_open_table(&req, "");
+       blobmsg_add_u32(&req, "pid", ns_pid);
+       namespaces = blobmsg_open_array(&req, "namespaces");
+       blobmsg_add_string(&req, "", "net");
+       blobmsg_add_string(&req, "", "ipc");
+       blobmsg_add_string(&req, "", "uts");
+       blobmsg_close_array(&req, namespaces);
+       blobmsg_close_table(&req, setnso);
+       blobmsg_close_array(&req, setns);
+
+       mount = blobmsg_open_table(&req, "mount");
+       blobmsg_add_string(&req, ubusmount, "1");
+       blobmsg_add_string(&req, resolvconf_dir, "1");
+       blobmsg_add_string(&req, ucimount, "0");
+       blobmsg_add_string(&req, "/bin/cat", "0");
+       blobmsg_add_string(&req, "/bin/ipcalc.sh", "0");
+       blobmsg_add_string(&req, "/bin/kill", "0");
+       blobmsg_add_string(&req, "/bin/ubus", "0");
+       blobmsg_add_string(&req, "/etc/hotplug.d", "0");
+       blobmsg_add_string(&req, "/lib/functions", "0");
+       blobmsg_add_string(&req, "/lib/functions.sh", "0");
+       blobmsg_add_string(&req, "/lib/netifd", "0");
+       blobmsg_add_string(&req, "/lib/network", "0");
+       blobmsg_add_string(&req, "/usr/bin/awk", "0");
+       blobmsg_add_string(&req, "/usr/bin/killall", "0");
+       blobmsg_add_string(&req, "/usr/bin/logger", "0");
+       blobmsg_add_string(&req, "/usr/bin/jshn", "0");
+       blobmsg_add_string(&req, "/usr/share/libubox/jshn.sh", "0");
+       blobmsg_add_string(&req, "/sbin/hotplug-call", "0");
+       blobmsg_add_string(&req, "/sbin/udhcpc", "0");
+       blobmsg_close_table(&req, mount);
+
+       blobmsg_add_u8(&req, "log", 1);
+       blobmsg_add_u8(&req, "procfs", 1);
+       blobmsg_add_u8(&req, "sysfs", 1);
+
+       blobmsg_add_u8(&req, "requirejail", 1);
+
+       blobmsg_close_table(&req, jail);
+
+       blobmsg_add_u8(&req, "stdout", 1);
+       blobmsg_add_u8(&req, "stderr", 1);
+
+       blobmsg_close_table(&req, in);
+       blobmsg_close_table(&req, ins);
+
+       if (!ubus_lookup_id(host_ubus_ctx, "container", &id))
+               running = !ubus_invoke(host_ubus_ctx, id, "add", req.head, NULL, NULL, 3000);
+
+       /* the request is complete at this point, success or not */
+       blob_buf_free(&req);
+netifd_out_ubusmount:
+       free(ubusmount);
+netifd_out_ucimount:
+       free(ucimount);
+netifd_out_ucinetconf:
+       if (!running) {
+               unlink(uci_config_network);
+               free(uci_config_network);
+               /* don't leave a dangling pointer for jail_network_stop() */
+               uci_config_network = NULL;
+       }
+netifd_out_ucidir:
+       if (!running)
+               rmdir(uci_dir);
+netifd_out_resolvconf:
+       free(resolvconf);
+netifd_out_resolvconf_dir:
+       free(resolvconf_dir);
+netifd_out:
+       uloop_end();
+}
+
+static struct uloop_timeout netifd_start_timeout = { .cb = run_netifd, };
+
+/*
+ * uloop fd callback for the inotify descriptor watching the ubus socket
+ * directory. Reads one batch of events into inotify_buffer and schedules
+ * netifd_start_timeout for every event whose name matches ubus_sock_name.
+ * The 'events' argument is not evaluated.
+ */
+static void inotify_read_handler(struct uloop_fd *u, unsigned int events)
+{
+       int nread;
+       char *cursor;
+       struct inotify_event *ev;
+
+       /* read inotify events, retrying when interrupted by a signal */
+       do {
+               nread = read(u->fd, inotify_buffer, INOTIFY_SZ);
+       } while (nread == -1 && errno == EINTR);
+
+       if (nread <= 0)
+               return;
+
+       /* walk the buffer as long as a complete event header is left */
+       cursor = inotify_buffer;
+       while (nread - (cursor - inotify_buffer) >= (int)sizeof(struct inotify_event)) {
+               ev = (struct inotify_event *)cursor;
+               /* advance first: header plus the variable-length name field */
+               cursor += sizeof(struct inotify_event) + ev->len;
+
+               /* events with a name field shorter than 4 bytes are ignored */
+               if (ev->len >= 4 && !strncmp(ubus_sock_name, ev->name, ev->len))
+                       uloop_timeout_add(&netifd_start_timeout);
+       }
+}
+
+/*
+ * Invoke 'netns_updown' on the "network" object reachable via 'ubus',
+ * passing netns_fd along with the request. The message carries the boolean
+ * 'start' and, if 'name' is not NULL, the jail name as "jail".
+ * Does nothing without a ubus context; a failed request is only logged.
+ */
+static void netns_updown(struct ubus_context *ubus, const char *name, bool start, int netns_fd)
+{
+       static struct blob_buf msg;
+       uint32_t network_id;
+       int err;
+
+       if (ubus == NULL)
+               return;
+
+       blob_buf_init(&msg, 0);
+       if (name != NULL)
+               blobmsg_add_string(&msg, "jail", name);
+
+       blobmsg_add_u8(&msg, "start", start);
+
+       /* the request is only sent if the object lookup succeeded */
+       err = ubus_lookup_id(ubus, "network", &network_id);
+       if (!err)
+               err = ubus_invoke_fd(ubus, network_id, "netns_updown", msg.head, NULL, NULL, 3000, netns_fd);
+
+       if (err)
+               INFO("ubus request failed\n");
+
+       blob_buf_free(&msg);
+}
+
+/*
+ * uloop timeout callback: regenerate the jail's network configuration and
+ * invoke 'reload' on the "network" object of the jail's private ubus.
+ * Silently gives up if there is no connection to the jail's ubus, if the
+ * configuration cannot be generated or if the object cannot be found.
+ * The timeout argument is unused.
+ */
+static void jail_network_reload(struct uloop_timeout *t)
+{
+       uint32_t network_id;
+
+       /* evaluated left to right, stopping at the first failing step */
+       if (!jail_ubus_ctx ||
+           gen_jail_uci_network() ||
+           ubus_lookup_id(jail_ubus_ctx, "network", &network_id))
+               return;
+
+       ubus_invoke(jail_ubus_ctx, network_id, "reload", NULL, NULL, NULL, 3000);
+}
+
+static const struct blobmsg_policy service_watch_policy = { "config", BLOBMSG_TYPE_STRING };
+static struct uloop_timeout jail_network_reload_timeout = { .cb = jail_network_reload, };
+
+/*
+ * ubus notification handler of config_watch_subscribe.
+ * Schedules a reload of the jail's network configuration when a
+ * "config.change" notification names the "network" config.
+ * Returns 1 if the notification carries no "config" string, 0 otherwise.
+ */
+static int config_watch_notify_cb(struct ubus_context *ctx, struct ubus_object *obj,
+                          struct ubus_request_data *req, const char *method,
+                          struct blob_attr *msg)
+{
+       struct blob_attr *cfg_attr;
+
+       /* not interested in any other notification */
+       if (strcmp(method, "config.change") != 0)
+               return 0;
+
+       blobmsg_parse(&service_watch_policy, 1, &cfg_attr, blob_data(msg), blob_len(msg));
+       if (cfg_attr == NULL)
+               return 1;
+
+       if (strcmp(blobmsg_get_string(cfg_attr), "network") == 0)
+               uloop_timeout_add(&jail_network_reload_timeout);
+
+       return 0;
+}
+
+/*
+ * Subscribe to notifications of the host's "service" ubus object so that
+ * config_watch_notify_cb() gets called on configuration changes.
+ * Failing to register or to subscribe is logged; a missing "service" object
+ * is silently ignored.
+ */
+static void watch_ubus_service(void)
+{
+       uint32_t id;
+
+       config_watch_subscribe.cb = config_watch_notify_cb;
+       if (ubus_register_subscriber(host_ubus_ctx, &config_watch_subscribe)) {
+               ERROR("failed to register ubus subscriber\n");
+               return;
+       }
+
+       if (ubus_lookup_id(host_ubus_ctx, "service", &id))
+               return;
+
+       if (!ubus_subscribe(host_ubus_ctx, &config_watch_subscribe, id))
+               return;
+
+       /* id is an unsigned 32 bit value, %d would print large ids as negative */
+       ERROR("failed to subscribe %u\n", id);
+}
+
+static struct uloop_timeout ubus_start_timeout = { .cb = run_ubusd, };
+
+/*
+ * Set up the managed network of a jail.
+ *
+ * new_ctx        ubus context connected to the host's ubus
+ * new_jail_name  name of the jail; the pointer is stored, not copied
+ * new_ns_pid     pid of the process whose network namespace is used
+ *
+ * Creates /var/containers/ubus-<jail> (owned by user 'ubus' if that user
+ * exists), watches it with inotify for the ubus socket to appear, subscribes
+ * to configuration changes, invokes 'netns_updown' on the host for the jail's
+ * network namespace and then schedules run_ubusd() and runs the main loop
+ * until it is ended (run_netifd() does so).
+ *
+ * Returns 0 once the main loop has returned, otherwise an errno-style value.
+ * On failure everything allocated here is released and the file-scope
+ * pointers are reset to NULL: jail_network_stop() frees them as well and the
+ * pointers must not be left dangling for it.
+ */
+int jail_network_start(struct ubus_context *new_ctx, char *new_jail_name, pid_t new_ns_pid)
+{
+       ubus_pw = getpwnam("ubus");
+       int ret = 0;
+       int netns_fd;
+
+       host_ubus_ctx = new_ctx;
+       ns_pid = new_ns_pid;
+       jail_name = new_jail_name;
+
+       if (asprintf(&ubus_sock_dir, "/var/containers/ubus-%s", jail_name) == -1) {
+               /* pointer contents are undefined after a failed asprintf() */
+               ubus_sock_dir = NULL;
+               ret = ENOMEM;
+               goto errout_dir;
+       }
+
+       if (asprintf(&ubus_sock_path, "%s/%s", ubus_sock_dir, ubus_sock_name) == -1) {
+               ubus_sock_path = NULL;
+               ret = ENOMEM;
+               goto errout_path;
+       }
+
+       mkdir_p(ubus_sock_dir, 0755);
+       if (ubus_pw) {
+               ret = chown(ubus_sock_dir, ubus_pw->pw_uid, ubus_pw->pw_gid);
+               if (ret) {
+                       ret = errno;
+                       goto errout;
+               }
+       }
+
+       fd_inotify_read.fd = inotify_init1(IN_NONBLOCK | IN_CLOEXEC);
+       fd_inotify_read.cb = inotify_read_handler;
+       if (fd_inotify_read.fd == -1) {
+               ERROR("failed to initialize inotify handler\n");
+               ret = EIO;
+               goto errout;
+       }
+       uloop_fd_add(&fd_inotify_read, ULOOP_READ);
+
+       inotify_buffer = calloc(1, INOTIFY_SZ);
+       if (!inotify_buffer) {
+               ret = ENOMEM;
+               goto errout_inotify;
+       }
+
+       if (inotify_add_watch(fd_inotify_read.fd, ubus_sock_dir, IN_CREATE) == -1) {
+               ERROR("failed to add inotify watch on %s\n", ubus_sock_dir);
+               ret = EIO;
+               goto errout_buffer;
+       }
+
+       watch_ubus_service();
+
+       netns_fd = ns_open_pid("net", ns_pid);
+       if (netns_fd < 0) {
+               ret = ESRCH;
+               goto errout_buffer;
+       }
+
+       netns_updown(host_ubus_ctx, jail_name, true, netns_fd);
+
+       close(netns_fd);
+       uloop_timeout_add(&ubus_start_timeout);
+       uloop_run();
+
+       return 0;
+
+errout_buffer:
+       free(inotify_buffer);
+       inotify_buffer = NULL;
+errout_inotify:
+       /* unregister from uloop before the descriptor goes away */
+       uloop_fd_delete(&fd_inotify_read);
+       close(fd_inotify_read.fd);
+errout:
+       free(ubus_sock_path);
+       ubus_sock_path = NULL;
+errout_path:
+       free(ubus_sock_dir);
+       ubus_sock_dir = NULL;
+errout_dir:
+       return ret;
+}
+
+/*
+ * Ask the host's "container" ubus object to delete the named instance of
+ * this jail's container.
+ * Returns -1 if the "container" object cannot be found, otherwise the result
+ * of the ubus request.
+ */
+static int jail_delete_instance(const char *instance)
+{
+       static struct blob_buf req;
+       uint32_t id;
+       int ret;
+
+       if (ubus_lookup_id(host_ubus_ctx, "container", &id))
+               return -1;
+
+       blob_buf_init(&req, 0);
+       blobmsg_add_string(&req, "name", jail_name);
+       blobmsg_add_string(&req, "instance", instance);
+
+       ret = ubus_invoke(host_ubus_ctx, id, "delete", req.head, NULL, NULL, 3000);
+
+       /* release the request buffer as the other request helpers here do */
+       blob_buf_free(&req);
+
+       return ret;
+}
+
+/*
+ * Tear down the managed network of the jail: invoke 'netns_updown' with
+ * start=false on the jail's private ubus (passing the current network
+ * namespace), drop the connection to it, delete the "netifd" and "ubus"
+ * container instances and remove the generated UCI config, its temporary
+ * directory and the ubus socket directory.
+ *
+ * Returns 0, or the errno of the failed open() of /proc/self/ns/net. In the
+ * latter case only the 'netns_updown' request is skipped; the instances and
+ * files are cleaned up regardless so that nothing is left behind.
+ */
+int jail_network_stop(void)
+{
+       int host_netns = open("/proc/self/ns/net", O_RDONLY);
+       int ret = 0;
+
+       if (host_netns < 0) {
+               /* remember the error, but still clean up below */
+               ret = errno;
+       } else {
+               netns_updown(jail_ubus_ctx, NULL, false, host_netns);
+               close(host_netns);
+       }
+
+       if (jail_ubus_ctx) {
+               ubus_free(jail_ubus_ctx);
+               jail_ubus_ctx = NULL;
+       }
+
+       jail_delete_instance("netifd");
+       jail_delete_instance("ubus");
+
+       if (uci_config_network) {
+               unlink(uci_config_network);
+               /* dirname() may modify the string, it is freed right after */
+               rmdir(dirname(uci_config_network));
+               free(uci_config_network);
+               uci_config_network = NULL;
+       }
+
+       free(ubus_sock_path);
+       ubus_sock_path = NULL;
+
+       if (ubus_sock_dir) {
+               rmdir(ubus_sock_dir);
+               free(ubus_sock_dir);
+               ubus_sock_dir = NULL;
+       }
+
+       return ret;
+}
diff --git a/jail/netifd.h b/jail/netifd.h
new file mode 100644 (file)
index 0000000..589ed14
--- /dev/null
@@ -0,0 +1,21 @@
+/*
+ * Copyright (C) 2021 Daniel Golle <daniel@makrotopia.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License version 2.1
+ * as published by the Free Software Foundation
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef _JAIL_NETIFD_H
+#define _JAIL_NETIFD_H
+#include <libubus.h>
+
+int jail_network_start(struct ubus_context *new_ctx, char *new_jail_name, pid_t new_ns_pid);
+int jail_network_stop(void);
+
+#endif
index 24358c6bc30482a97a26c6b4999004ca3d2d822c..351a9f81368bfd563c83e1a0e06409385dbe99de 100644 (file)
 #include <string.h>
 #include <dlfcn.h>
 
+#include "log.h"
 #include "seccomp.h"
 #include "../preload.h"
 
 static main_t __main__;
+int debug;
 
 static int __preload_main__(int argc, char **argv, char **envp)
 {
        char *env_file = getenv("SECCOMP_FILE");
+       char *env_debug = getenv("SECCOMP_DEBUG");
 
        if (!env_file || !env_file[0]) {
                ERROR("SECCOMP_FILE not specified\n");
                return -1;
        }
 
+       if (env_debug)
+               debug = atoi(env_debug);
+       else
+               debug = 0;
+
        if (install_syscall_filter(*argv, env_file))
                return -1;
 
        unsetenv("LD_PRELOAD");
+       unsetenv("SECCOMP_DEBUG");
        unsetenv("SECCOMP_FILE");
 
        return (*__main__)(argc, argv, envp);
@@ -53,8 +62,10 @@ int __libc_start_main(main_t main,
        start_main_t __start_main__;
 
        __start_main__ = dlsym(RTLD_NEXT, "__libc_start_main");
-       if (!__start_main__)
+       if (!__start_main__) {
                INFO("failed to find __libc_start_main %s\n", dlerror());
+               return -1;
+       }
 
        __main__ = main;
 
@@ -73,8 +84,10 @@ void __uClibc_main(main_t main,
        uClibc_main __start_main__;
 
        __start_main__ = dlsym(RTLD_NEXT, "__uClibc_main");
-       if (!__start_main__)
+       if (!__start_main__) {
                INFO("failed to find __uClibc_main %s\n", dlerror());
+               return;
+       }
 
        __main__ = main;
 
index fd6b3e257c385c4e3b03ccb9ed89fe67fec70ff4..077483f3bf10524306a5343fd5b0135fb591b3bb 100644 (file)
 #define SECCOMP_RET_TRAP       0x00030000U /* disallow and force a SIGSYS */
 #define SECCOMP_RET_ERRNO      0x00050000U /* returns an errno */
 #define SECCOMP_RET_LOG                0x00070000U
+#define SECCOMP_RET_LOGALLOW   0x7ffc0000U
 #define SECCOMP_RET_TRACE      0x7ff00000U /* pass to a tracer or disallow */
 #define SECCOMP_RET_ALLOW      0x7fff0000U /* allow */
+#define SECCOMP_RET_KILLPROCESS        0x80000000U
 #define SECCOMP_RET_ERROR(x)   (SECCOMP_RET_ERRNO | ((x) & 0x0000ffffU))
 #define SECCOMP_RET_LOGGER(x)  (SECCOMP_RET_LOG | ((x) & 0x0000ffffU))
 
@@ -60,20 +62,14 @@ struct seccomp_data {
 
 #define syscall_nr (offsetof(struct seccomp_data, nr))
 #define arch_nr (offsetof(struct seccomp_data, arch))
+#define syscall_arg(x) (offsetof(struct seccomp_data, args[x]))
 
-#if defined(__i386__)
-# define REG_SYSCALL   REG_EAX
-# define ARCH_NR       AUDIT_ARCH_I386
-#elif defined(__x86_64__)
+#if defined(__aarch64__)
+# define REG_SYSCALL   regs.regs[8]
+# define ARCH_NR       AUDIT_ARCH_AARCH64
+#elif defined(__amd64__)
 # define REG_SYSCALL   REG_RAX
 # define ARCH_NR       AUDIT_ARCH_X86_64
-#elif defined(__mips__)
-# define REG_SYSCALL   regs[2]
-# if __BYTE_ORDER == __LITTLE_ENDIAN
-#  define ARCH_NR      AUDIT_ARCH_MIPSEL
-# else
-#  define ARCH_NR      AUDIT_ARCH_MIPS
-# endif
 #elif defined(__arm__) && (defined(__ARM_EABI__) || defined(__thumb__))
 # define REG_SYSCALL   regs.uregs[7]
 # if __BYTE_ORDER == __LITTLE_ENDIAN
@@ -81,6 +77,16 @@ struct seccomp_data {
 # else
 #  define ARCH_NR      AUDIT_ARCH_ARMEB
 # endif
+#elif defined(__i386__)
+# define REG_SYSCALL   REG_EAX
+# define ARCH_NR       AUDIT_ARCH_I386
+#elif defined(__mips__)
+# define REG_SYSCALL   regs[2]
+# if __BYTE_ORDER == __LITTLE_ENDIAN
+#  define ARCH_NR      AUDIT_ARCH_MIPSEL
+# else
+#  define ARCH_NR      AUDIT_ARCH_MIPS
+# endif
 #elif defined(__PPC__)
 # define REG_SYSCALL   regs.gpr[0]
 # define ARCH_NR       AUDIT_ARCH_PPC
diff --git a/jail/seccomp-oci.c b/jail/seccomp-oci.c
new file mode 100644 (file)
index 0000000..f089ac6
--- /dev/null
@@ -0,0 +1,449 @@
+/*
+ * parse and setup OCI seccomp filter
+ * Copyright (c) 2020 Daniel Golle <daniel@makrotopia.org>
+ * seccomp example with syscall reporting
+ * Copyright (c) 2012 The Chromium OS Authors <chromium-os-dev@chromium.org>
+ * Authors:
+ *  Kees Cook <keescook@chromium.org>
+ *  Will Drewry <wad@chromium.org>
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ *
+ * BPF control flow
+ *
+ * (check_arch)<t>---(check_syscall)<f>---+----[...]<f>---(return default_action)
+ *       |<f>                |<t>         |
+ *      KILL         (check_argument)<f>--+
+ *                           |<t>
+ *                         [...]
+ *                           |<t>
+ *                    (return action)
+ */
+#define _GNU_SOURCE 1
+#include <assert.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include <libubox/utils.h>
+#include <libubox/blobmsg.h>
+#include <libubox/blobmsg_json.h>
+
+#include "log.h"
+#include "seccomp-bpf.h"
+#include "seccomp-oci.h"
+#include "../syscall-names.h"
+#include "seccomp-syscalls-helpers.h"
+
+/*
+ * Translate an OCI seccomp action name (SCMP_ACT_*) into the matching
+ * SECCOMP_RET_* filter return value.
+ *
+ * "SCMP_ACT_ERRNO" and "SCMP_ACT_ERROR" are both accepted and map to
+ * SECCOMP_RET_ERRNO; the errno value itself is merged in by the caller.
+ *
+ * A missing (NULL) or unrecognised name is reported and mapped to
+ * SECCOMP_RET_KILL, so a malformed rule fails closed instead of crashing
+ * in strcmp().
+ */
+static uint32_t resolve_action(char *actname)
+{
+       if (!actname) {
+               ERROR("missing seccomp action\n");
+               return SECCOMP_RET_KILL;
+       }
+
+       if (!strcmp(actname, "SCMP_ACT_KILL"))
+               return SECCOMP_RET_KILL;
+       else if (!strcmp(actname, "SCMP_ACT_KILL_PROCESS"))
+               return SECCOMP_RET_KILLPROCESS;
+       else if (!strcmp(actname, "SCMP_ACT_TRAP"))
+               return SECCOMP_RET_TRAP;
+       else if (!strcmp(actname, "SCMP_ACT_ERRNO"))
+               return SECCOMP_RET_ERRNO;
+       else if (!strcmp(actname, "SCMP_ACT_ERROR"))
+               return SECCOMP_RET_ERRNO;
+       else if (!strcmp(actname, "SCMP_ACT_TRACE"))
+               return SECCOMP_RET_TRACE;
+       else if (!strcmp(actname, "SCMP_ACT_ALLOW"))
+               return SECCOMP_RET_ALLOW;
+       else if (!strcmp(actname, "SCMP_ACT_LOG"))
+               return SECCOMP_RET_LOGALLOW;
+       else {
+               ERROR("unknown seccomp action %s\n", actname);
+               return SECCOMP_RET_KILL;
+       }
+}
+
+/*
+ * Pick the BPF jump instruction implementing an OCI comparison operator.
+ *
+ * The negated operators (NE, LT, LE) share the instruction of the comparison
+ * they negate (EQ, GE, GT); resolve_op_inv() tells the caller to swap the
+ * jump targets for them.  SCMP_CMP_MASKED_EQ is an equality test as well.
+ *
+ * Returns 0 after logging an error when the operator is not known.
+ */
+static uint8_t resolve_op_ins(const char *op)
+{
+       /* equality family: EQ, its negation NE, and the masked variant */
+       if (strcmp(op, "SCMP_CMP_EQ") == 0 ||
+           strcmp(op, "SCMP_CMP_NE") == 0 ||
+           strcmp(op, "SCMP_CMP_MASKED_EQ") == 0)
+               return BPF_JEQ;
+
+       /* greater-or-equal, and LT as its negation */
+       if (strcmp(op, "SCMP_CMP_GE") == 0 ||
+           strcmp(op, "SCMP_CMP_LT") == 0)
+               return BPF_JGE;
+
+       /* greater-than, and LE as its negation */
+       if (strcmp(op, "SCMP_CMP_GT") == 0 ||
+           strcmp(op, "SCMP_CMP_LE") == 0)
+               return BPF_JGT;
+
+       ERROR("unknown seccomp op %s\n", op);
+       return 0;
+}
+
+/* Report whether the OCI operator is the masked equality comparison. */
+static bool resolve_op_is_masked(const char *op)
+{
+       return strcmp(op, "SCMP_CMP_MASKED_EQ") == 0;
+}
+
+/*
+ * Report whether the OCI operator is implemented as the negation of another
+ * comparison (NE, LT and LE), i.e. whether the jump targets must be swapped.
+ */
+static bool resolve_op_inv(const char *op)
+{
+       static const char *const inverted_ops[] = {
+               "SCMP_CMP_NE",
+               "SCMP_CMP_LT",
+               "SCMP_CMP_LE",
+       };
+       size_t i;
+
+       for (i = 0; i < sizeof(inverted_ops) / sizeof(inverted_ops[0]); i++) {
+               if (strcmp(op, inverted_ops[i]) == 0)
+                       return true;
+       }
+
+       return false;
+}
+
+/*
+ * Map an OCI architecture name (SCMP_ARCH_*) to its AUDIT_ARCH_* constant.
+ *
+ * Returns 0 for a NULL name, for SCMP_ARCH_X32 (32-bit userland on a 64-bit
+ * kernel is not supported yet) and, after logging an error, for any name
+ * that is not listed.
+ */
+static uint32_t resolve_architecture(char *archname)
+{
+       static const struct {
+               const char *name;
+               uint32_t audit_arch;
+       } arch_map[] = {
+               { "SCMP_ARCH_X86", AUDIT_ARCH_I386 },
+               { "SCMP_ARCH_X86_64", AUDIT_ARCH_X86_64 },
+               /* would be AUDIT_ARCH_X86_64 once x32 is supported */
+               { "SCMP_ARCH_X32", 0 },
+               { "SCMP_ARCH_ARM", AUDIT_ARCH_ARM },
+               { "SCMP_ARCH_AARCH64", AUDIT_ARCH_AARCH64 },
+               { "SCMP_ARCH_MIPS", AUDIT_ARCH_MIPS },
+               { "SCMP_ARCH_MIPS64", AUDIT_ARCH_MIPS64 },
+               { "SCMP_ARCH_MIPS64N32", AUDIT_ARCH_MIPS64N32 },
+               { "SCMP_ARCH_MIPSEL", AUDIT_ARCH_MIPSEL },
+               { "SCMP_ARCH_MIPSEL64", AUDIT_ARCH_MIPSEL64 },
+               { "SCMP_ARCH_MIPSEL64N32", AUDIT_ARCH_MIPSEL64N32 },
+               { "SCMP_ARCH_PPC", AUDIT_ARCH_PPC },
+               { "SCMP_ARCH_PPC64", AUDIT_ARCH_PPC64 },
+               { "SCMP_ARCH_PPC64LE", AUDIT_ARCH_PPC64LE },
+               { "SCMP_ARCH_S390", AUDIT_ARCH_S390 },
+               { "SCMP_ARCH_S390X", AUDIT_ARCH_S390X },
+               { "SCMP_ARCH_PARISC", AUDIT_ARCH_PARISC },
+               { "SCMP_ARCH_PARISC64", AUDIT_ARCH_PARISC64 },
+       };
+       size_t i;
+
+       if (archname == NULL)
+               return 0;
+
+       for (i = 0; i < sizeof(arch_map) / sizeof(arch_map[0]); i++) {
+               if (strcmp(archname, arch_map[i].name) == 0)
+                       return arch_map[i].audit_arch;
+       }
+
+       ERROR("unknown seccomp architecture %s\n", archname);
+       return 0;
+}
+
+/*
+ * Indices into the attribute table filled by blobmsg_parse() for the
+ * top-level seccomp object; __OCI_LINUX_SECCOMP_MAX is the table size.
+ */
+enum {
+       OCI_LINUX_SECCOMP_DEFAULTACTION,
+       OCI_LINUX_SECCOMP_ARCHITECTURES,
+       OCI_LINUX_SECCOMP_FLAGS,
+       OCI_LINUX_SECCOMP_SYSCALLS,
+       __OCI_LINUX_SECCOMP_MAX,
+};
+
+/* Key name and expected blobmsg type for each top-level attribute. */
+static const struct blobmsg_policy oci_linux_seccomp_policy[] = {
+       [OCI_LINUX_SECCOMP_DEFAULTACTION] = { "defaultAction", BLOBMSG_TYPE_STRING },
+       [OCI_LINUX_SECCOMP_ARCHITECTURES] = { "architectures", BLOBMSG_TYPE_ARRAY },
+       [OCI_LINUX_SECCOMP_FLAGS] = { "flags", BLOBMSG_TYPE_ARRAY },
+       [OCI_LINUX_SECCOMP_SYSCALLS] = { "syscalls", BLOBMSG_TYPE_ARRAY },
+};
+
+/*
+ * Indices into the attribute table for one entry of the "syscalls" array;
+ * __OCI_LINUX_SECCOMP_SYSCALLS_MAX is the table size.
+ */
+enum {
+       OCI_LINUX_SECCOMP_SYSCALLS_NAMES,
+       OCI_LINUX_SECCOMP_SYSCALLS_ACTION,
+       OCI_LINUX_SECCOMP_SYSCALLS_ERRNORET,
+       OCI_LINUX_SECCOMP_SYSCALLS_ARGS,
+       __OCI_LINUX_SECCOMP_SYSCALLS_MAX
+};
+
+/*
+ * Key name and expected blobmsg type for each attribute of a syscall rule.
+ * The designated initializers bind each entry to its enum index, so the
+ * textual order here does not have to follow the enum order.
+ */
+static const struct blobmsg_policy oci_linux_seccomp_syscalls_policy[] = {
+       [OCI_LINUX_SECCOMP_SYSCALLS_NAMES] = { "names", BLOBMSG_TYPE_ARRAY },
+       [OCI_LINUX_SECCOMP_SYSCALLS_ERRNORET] = { "errnoRet", BLOBMSG_TYPE_INT32 },
+       [OCI_LINUX_SECCOMP_SYSCALLS_ARGS] = { "args", BLOBMSG_TYPE_ARRAY },
+       [OCI_LINUX_SECCOMP_SYSCALLS_ACTION] = { "action", BLOBMSG_TYPE_STRING },
+};
+
+/*
+ * Indices into the attribute table for one entry of a rule's "args" array;
+ * __OCI_LINUX_SECCOMP_SYSCALLS_ARGS_MAX is the table size.
+ */
+enum {
+       OCI_LINUX_SECCOMP_SYSCALLS_ARGS_INDEX,
+       OCI_LINUX_SECCOMP_SYSCALLS_ARGS_VALUE,
+       OCI_LINUX_SECCOMP_SYSCALLS_ARGS_VALUETWO,
+       OCI_LINUX_SECCOMP_SYSCALLS_ARGS_OP,
+       __OCI_LINUX_SECCOMP_SYSCALLS_ARGS_MAX
+};
+
+/* Key name and expected blobmsg type for each attribute of an argument check. */
+static const struct blobmsg_policy oci_linux_seccomp_syscalls_args_policy[] = {
+       [OCI_LINUX_SECCOMP_SYSCALLS_ARGS_INDEX] = { "index", BLOBMSG_TYPE_INT32 },
+       [OCI_LINUX_SECCOMP_SYSCALLS_ARGS_VALUE] = { "value", BLOBMSG_CAST_INT64 },
+       [OCI_LINUX_SECCOMP_SYSCALLS_ARGS_VALUETWO] = { "valueTwo", BLOBMSG_CAST_INT64 },
+       [OCI_LINUX_SECCOMP_SYSCALLS_ARGS_OP] = { "op", BLOBMSG_TYPE_STRING },
+};
+
+/*
+ * Parse an OCI "linux.seccomp" object and compile it into a classic BPF
+ * program that can be handed to applyOCIlinuxseccomp().
+ *
+ * msg: blobmsg table carrying "defaultAction" and, optionally,
+ *      "architectures", "flags" and "syscalls" ("flags" is parsed but not
+ *      evaluated here).
+ *
+ * Returns a heap-allocated sock_fprog whose filter array is heap-allocated
+ * too, or NULL when the default action is missing, the listed architectures
+ * do not include the build architecture, a rule is malformed, no rule
+ * contributes any instruction, or memory allocation fails.
+ *
+ * The rules are walked twice: the first pass validates them and counts the
+ * instructions needed, the second pass emits them.  Both passes must skip
+ * exactly the same entries, which the final assert(idx == sz) verifies.
+ *
+ * NOTE(review): jump offsets are stored in 8-bit fields by set_filter(), so
+ * a rule spanning more than 255 instructions gets truncated offsets; argument
+ * checks load and compare 32 bits only (BPF_W load, 32-bit k).
+ */
+struct sock_fprog *parseOCIlinuxseccomp(struct blob_attr *msg)
+{
+       struct blob_attr *tb[__OCI_LINUX_SECCOMP_MAX];
+       struct blob_attr *tbn[__OCI_LINUX_SECCOMP_SYSCALLS_MAX];
+       struct blob_attr *tba[__OCI_LINUX_SECCOMP_SYSCALLS_ARGS_MAX];
+       struct blob_attr *cur, *curn, *curarg;
+       int rem, remn, remargs, sc;
+       struct sock_filter *filter;
+       struct sock_fprog *prog;
+       int sz = 4, idx = 0; /* 3 arch-check instructions + default return */
+       int num_names;
+       uint32_t default_policy = 0;
+       uint32_t seccomp_arch;
+       bool arch_matched;
+       char *op_str;
+
+       blobmsg_parse(oci_linux_seccomp_policy, __OCI_LINUX_SECCOMP_MAX,
+                     tb, blobmsg_data(msg), blobmsg_len(msg));
+
+       if (!tb[OCI_LINUX_SECCOMP_DEFAULTACTION]) {
+               ERROR("seccomp: no default action set\n");
+               return NULL;
+       }
+
+       default_policy = resolve_action(blobmsg_get_string(tb[OCI_LINUX_SECCOMP_DEFAULTACTION]));
+
+       /* verify architecture while ignoring the x86_64 anomaly for now */
+       if (tb[OCI_LINUX_SECCOMP_ARCHITECTURES]) {
+               arch_matched = false;
+               blobmsg_for_each_attr(cur, tb[OCI_LINUX_SECCOMP_ARCHITECTURES], rem) {
+                       seccomp_arch = resolve_architecture(blobmsg_get_string(cur));
+                       if (ARCH_NR == seccomp_arch) {
+                               arch_matched = true;
+                               break;
+                       }
+               }
+               if (!arch_matched) {
+                       ERROR("seccomp architecture doesn't match system\n");
+                       return NULL;
+               }
+       }
+
+       /* first pass: validate the rules and count the instructions needed */
+       blobmsg_for_each_attr(cur, tb[OCI_LINUX_SECCOMP_SYSCALLS], rem) {
+               blobmsg_parse(oci_linux_seccomp_syscalls_policy,
+                             __OCI_LINUX_SECCOMP_SYSCALLS_MAX,
+                             tbn, blobmsg_data(cur), blobmsg_len(cur));
+
+               /* the second pass resolves this attribute unconditionally */
+               if (!tbn[OCI_LINUX_SECCOMP_SYSCALLS_ACTION]) {
+                       ERROR("seccomp: syscall rule without action\n");
+                       return NULL;
+               }
+
+               num_names = 0;
+               blobmsg_for_each_attr(curn, tbn[OCI_LINUX_SECCOMP_SYSCALLS_NAMES], remn) {
+                       sc = find_syscall(blobmsg_get_string(curn));
+                       if (sc == -1) {
+                               DEBUG("unknown syscall '%s'\n", blobmsg_get_string(curn));
+                               /* TODO: support run.oci.seccomp_fail_unknown_syscall=1 annotation */
+                               continue;
+                       }
+                       ++num_names;
+               }
+
+               /*
+                * A rule that resolves to no syscall at all must not emit
+                * anything: without a syscall compare in front of it, its
+                * action would be applied to every syscall.
+                */
+               if (!num_names) {
+                       DEBUG("seccomp: skipping rule without known syscalls\n");
+                       continue;
+               }
+
+               sz += 2 + num_names; /* load, one compare per name, return */
+
+               if (tbn[OCI_LINUX_SECCOMP_SYSCALLS_ARGS]) {
+                       blobmsg_for_each_attr(curarg, tbn[OCI_LINUX_SECCOMP_SYSCALLS_ARGS], remargs) {
+                               sz += 2; /* load and compare */
+
+                               blobmsg_parse(oci_linux_seccomp_syscalls_args_policy,
+                                             __OCI_LINUX_SECCOMP_SYSCALLS_ARGS_MAX,
+                                             tba, blobmsg_data(curarg), blobmsg_len(curarg));
+                               if (!tba[OCI_LINUX_SECCOMP_SYSCALLS_ARGS_INDEX] ||
+                                   !tba[OCI_LINUX_SECCOMP_SYSCALLS_ARGS_VALUE] ||
+                                   !tba[OCI_LINUX_SECCOMP_SYSCALLS_ARGS_OP])
+                                       return NULL;
+
+                               /* seccomp_data carries six syscall arguments */
+                               if (blobmsg_get_u32(tba[OCI_LINUX_SECCOMP_SYSCALLS_ARGS_INDEX]) > 5)
+                                       return NULL;
+
+                               op_str = blobmsg_get_string(tba[OCI_LINUX_SECCOMP_SYSCALLS_ARGS_OP]);
+                               if (!resolve_op_ins(op_str))
+                                       return NULL;
+
+                               if (resolve_op_is_masked(op_str))
+                                       ++sz; /* SCMP_CMP_MASKED_EQ needs an extra BPF_AND op */
+                       }
+               }
+       }
+
+       /* nothing but the arch check and the default return: no usable rule */
+       if (sz < 6)
+               return NULL;
+
+       prog = malloc(sizeof(struct sock_fprog));
+       if (!prog)
+               return NULL;
+
+       filter = calloc(sz, sizeof(struct sock_filter));
+       if (!filter) {
+               ERROR("failed to allocate memory for seccomp filter\n");
+               goto errout2;
+       }
+
+       /* validate arch */
+       set_filter(&filter[idx++], BPF_LD + BPF_W + BPF_ABS, 0, 0, arch_nr);
+       set_filter(&filter[idx++], BPF_JMP + BPF_JEQ + BPF_K, 1, 0, ARCH_NR);
+       set_filter(&filter[idx++], BPF_RET + BPF_K, 0, 0, SECCOMP_RET_KILL);
+
+       /* second pass: emit the instructions counted above */
+       blobmsg_for_each_attr(cur, tb[OCI_LINUX_SECCOMP_SYSCALLS], rem) {
+               uint32_t action;
+               uint32_t op_idx;
+               uint8_t op_ins;
+               bool op_inv, op_masked;
+               uint64_t op_val, op_val2;
+               int start_rule_idx;
+               int next_rule_idx;
+
+               blobmsg_parse(oci_linux_seccomp_syscalls_policy,
+                             __OCI_LINUX_SECCOMP_SYSCALLS_MAX,
+                             tbn, blobmsg_data(cur), blobmsg_len(cur));
+               action = resolve_action(blobmsg_get_string(
+                               tbn[OCI_LINUX_SECCOMP_SYSCALLS_ACTION]));
+               if (tbn[OCI_LINUX_SECCOMP_SYSCALLS_ERRNORET]) {
+                       /* errnoRet only makes sense together with an errno action */
+                       if (action != SECCOMP_RET_ERRNO)
+                               goto errout1;
+
+                       action = SECCOMP_RET_ERROR(blobmsg_get_u32(
+                                       tbn[OCI_LINUX_SECCOMP_SYSCALLS_ERRNORET]));
+               } else if (action == SECCOMP_RET_ERRNO)
+                       action = SECCOMP_RET_ERROR(EPERM);
+
+               /* get number of syscall names */
+               num_names = 0;
+               blobmsg_for_each_attr(curn, tbn[OCI_LINUX_SECCOMP_SYSCALLS_NAMES], remn) {
+                       if (find_syscall(blobmsg_get_string(curn)) == -1)
+                               continue;
+
+                       ++num_names;
+               }
+
+               /* mirror the first pass: rules without known syscalls emit nothing */
+               if (!num_names)
+                       continue;
+
+               /* load syscall */
+               set_filter(&filter[idx++], BPF_LD + BPF_W + BPF_ABS, 0, 0, syscall_nr);
+
+               /* first instruction after the syscall compares */
+               start_rule_idx = idx + num_names;
+               next_rule_idx = start_rule_idx;
+
+               /* calculate length of argument filter rules */
+               blobmsg_for_each_attr(curn, tbn[OCI_LINUX_SECCOMP_SYSCALLS_ARGS], remn) {
+                       blobmsg_parse(oci_linux_seccomp_syscalls_args_policy,
+                                     __OCI_LINUX_SECCOMP_SYSCALLS_ARGS_MAX,
+                                     tba, blobmsg_data(curn), blobmsg_len(curn));
+                       next_rule_idx += 2;
+                       op_str = blobmsg_get_string(tba[OCI_LINUX_SECCOMP_SYSCALLS_ARGS_OP]);
+                       if (resolve_op_is_masked(op_str))
+                               ++next_rule_idx;
+               }
+
+               ++next_rule_idx; /* account for return action */
+
+               blobmsg_for_each_attr(curn, tbn[OCI_LINUX_SECCOMP_SYSCALLS_NAMES], remn) {
+                       sc = find_syscall(blobmsg_get_string(curn));
+                       if (sc == -1)
+                               continue;
+                       /*
+                        * check syscall, skip other syscall checks if match is found.
+                        * if no match is found, jump to next section
+                        */
+                       set_filter(&filter[idx], BPF_JMP + BPF_JEQ + BPF_K,
+                                  start_rule_idx - (idx + 1),
+                                  ((idx + 1) == start_rule_idx)?(next_rule_idx - (idx + 1)):0,
+                                  sc);
+                       ++idx;
+               }
+
+               assert(idx == start_rule_idx);
+
+               /* generate argument filter rules */
+               blobmsg_for_each_attr(curn, tbn[OCI_LINUX_SECCOMP_SYSCALLS_ARGS], remn) {
+                       blobmsg_parse(oci_linux_seccomp_syscalls_args_policy,
+                                     __OCI_LINUX_SECCOMP_SYSCALLS_ARGS_MAX,
+                                     tba, blobmsg_data(curn), blobmsg_len(curn));
+
+                       op_str = blobmsg_get_string(tba[OCI_LINUX_SECCOMP_SYSCALLS_ARGS_OP]);
+                       op_ins = resolve_op_ins(op_str);
+                       op_inv = resolve_op_inv(op_str);
+                       op_masked = resolve_op_is_masked(op_str);
+                       op_idx = blobmsg_get_u32(tba[OCI_LINUX_SECCOMP_SYSCALLS_ARGS_INDEX]);
+                       op_val = blobmsg_cast_u64(tba[OCI_LINUX_SECCOMP_SYSCALLS_ARGS_VALUE]);
+                       if (tba[OCI_LINUX_SECCOMP_SYSCALLS_ARGS_VALUETWO])
+                               op_val2 = blobmsg_cast_u64(tba[OCI_LINUX_SECCOMP_SYSCALLS_ARGS_VALUETWO]);
+                       else
+                               op_val2 = 0;
+
+                       /* load argument */
+                       set_filter(&filter[idx++], BPF_LD + BPF_W + BPF_ABS, 0, 0, syscall_arg(op_idx));
+
+                       /* apply mask: "value" is the mask, "valueTwo" the expected result */
+                       if (op_masked)
+                               set_filter(&filter[idx++], BPF_ALU + BPF_K + BPF_AND, 0, 0, op_val);
+
+                       /* a failed condition leaves this rule and continues with the next one */
+                       set_filter(&filter[idx], BPF_JMP + op_ins + BPF_K,
+                                  op_inv?(next_rule_idx - (idx + 1)):0,
+                                  op_inv?0:(next_rule_idx - (idx + 1)),
+                                  op_masked?op_val2:op_val);
+                       ++idx;
+               }
+
+               /* if we have reached until here, all conditions were met and we can return */
+               set_filter(&filter[idx++], BPF_RET + BPF_K, 0, 0, action);
+
+               assert(idx == next_rule_idx);
+       }
+
+       set_filter(&filter[idx++], BPF_RET + BPF_K, 0, 0, default_policy);
+
+       assert(idx == sz);
+
+       prog->len = (unsigned short) idx;
+       prog->filter = filter;
+
+       DEBUG("generated seccomp-bpf program:\n");
+       if (debug) {
+               fprintf(stderr, " [idx]\tcode\t jt\t jf\tk\n");
+               for (idx=0; idx<sz; idx++)
+                       fprintf(stderr, " [%03d]\t%04hx\t%3hhu\t%3hhu\t%08x\n", idx,
+                               filter[idx].code,
+                               filter[idx].jt,
+                               filter[idx].jf,
+                               filter[idx].k);
+       }
+
+       return prog;
+
+errout1:
+       /* prog->filter has not been assigned yet, release the array itself */
+       free(filter);
+errout2:
+       free(prog);
+       return NULL;
+}
+
+
+/*
+ * Install a seccomp-BPF program for the calling process.
+ *
+ * Sets PR_SET_NO_NEW_PRIVS first, then loads the filter with PR_SET_SECCOMP.
+ * The program and its filter array are released in every case, so the caller
+ * must not touch prog afterwards.
+ *
+ * Returns 0 on success, EINVAL when prog is NULL, otherwise the errno value
+ * of the prctl() call that failed.
+ */
+int applyOCIlinuxseccomp(struct sock_fprog *prog)
+{
+       int err = 0;
+
+       if (!prog)
+               return EINVAL;
+
+       if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
+               /* capture errno before logging or free() can overwrite it */
+               err = errno;
+               ERROR("prctl(PR_SET_NO_NEW_PRIVS) failed: %m\n");
+               goto out;
+       }
+
+       if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, prog)) {
+               err = errno;
+               ERROR("prctl(PR_SET_SECCOMP) failed: %m\n");
+               goto out;
+       }
+
+out:
+       /* the user-space copy is not referenced after prctl() has returned */
+       free(prog->filter);
+       free(prog);
+       return err;
+}
diff --git a/jail/seccomp-oci.h b/jail/seccomp-oci.h
new file mode 100644 (file)
index 0000000..8cc8ae2
--- /dev/null
@@ -0,0 +1,31 @@
+/*
+ * Copyright (C) 2020 Daniel Golle <daniel@makrotopia.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License version 2.1
+ * as published by the Free Software Foundation
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+#ifndef _JAIL_SECCOMP_OCI_H_
+#define _JAIL_SECCOMP_OCI_H_
+
+#include <errno.h>
+#include <stddef.h>
+#include <linux/filter.h>
+
+/* only pointers to it are used here, so a forward declaration is enough */
+struct blob_attr;
+
+#ifdef SECCOMP_SUPPORT
+/*
+ * Compile an OCI "linux.seccomp" object into a BPF program.
+ * Returns NULL on failure.
+ */
+struct sock_fprog *parseOCIlinuxseccomp(struct blob_attr *msg);
+
+/*
+ * Install a program returned by parseOCIlinuxseccomp().
+ * Returns 0 on success or an errno value.
+ */
+int applyOCIlinuxseccomp(struct sock_fprog *prog);
+#else
+/*
+ * Stubs for builds without seccomp support.  They are static inline so the
+ * header can be included from several source files without producing
+ * duplicate symbol definitions.
+ */
+static inline struct sock_fprog *parseOCIlinuxseccomp(struct blob_attr *msg)
+{
+       return NULL;
+}
+
+static inline int applyOCIlinuxseccomp(struct sock_fprog *prog)
+{
+       return ENOTSUP;
+}
+#endif
+
+#endif
diff --git a/jail/seccomp-syscalls-helpers.h b/jail/seccomp-syscalls-helpers.h
new file mode 100644 (file)
index 0000000..f86e468
--- /dev/null
@@ -0,0 +1,37 @@
+/*
+ * Copyright (C) 2015 John Crispin <blogic@openwrt.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License version 2.1
+ * as published by the Free Software Foundation
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+#ifndef _JAIL_SECCOMP_HELPERS_H_
+#define _JAIL_SECCOMP_HELPERS_H_
+
+/*
+ * Look up a syscall number by name.
+ *
+ * Walks all SYSCALL_COUNT table indices, converts each to its syscall number
+ * and compares the name registered for that number with the requested one.
+ * Returns the syscall number, or -1 when no entry matches.
+ */
+static int find_syscall(const char *name)
+{
+       int idx = 0;
+
+       while (idx < SYSCALL_COUNT) {
+               int nr = syscall_index_to_number(idx++);
+               const char *candidate = syscall_name(nr);
+
+               /* numbers without a registered name cannot match */
+               if (!candidate)
+                       continue;
+
+               if (strcmp(candidate, name) == 0)
+                       return nr;
+       }
+
+       return -1;
+}
+
+/*
+ * Fill in one BPF instruction: opcode, jump offsets taken when the condition
+ * is true (jt) or false (jf), and the generic k operand.
+ */
+static void set_filter(struct sock_filter *filter, __u16 code, __u8 jt, __u8 jf, __u32 k)
+{
+       *filter = (struct sock_filter) {
+               .code = code,
+               .jt = jt,
+               .jf = jf,
+               .k = k,
+       };
+}
+
+#endif
index fae08f98ee2f4efa47171db7566a2a3eb257e008..3eeb61605af1e15853e7cade8771d4cd07ea73c3 100644 (file)
 #include <libubox/blobmsg.h>
 #include <libubox/blobmsg_json.h>
 
-#include "seccomp-bpf.h"
+#include "log.h"
 #include "seccomp.h"
-#include "../syscall-names.h"
-
-static int find_syscall(const char *name)
-{
-       int i;
-
-       for (i = 0; i < SYSCALL_COUNT; i++) {
-               int sc = syscall_index_to_number(i);
-               if (syscall_name(sc) && !strcmp(syscall_name(sc), name))
-                       return sc;
-       }
-
-       return -1;
-}
-
-static void set_filter(struct sock_filter *filter, __u16 code, __u8 jt, __u8 jf, __u32 k)
-{
-       filter->code = code;
-       filter->jt = jt;
-       filter->jf = jf;
-       filter->k = k;
-}
+#include "seccomp-oci.h"
 
 int install_syscall_filter(const char *argv, const char *file)
 {
-       enum {
-               SECCOMP_WHITELIST,
-               SECCOMP_POLICY,
-               __SECCOMP_MAX
-       };
-       static const struct blobmsg_policy policy[__SECCOMP_MAX] = {
-               [SECCOMP_WHITELIST] = { .name = "whitelist", .type = BLOBMSG_TYPE_ARRAY },
-               [SECCOMP_POLICY] = { .name = "policy", .type = BLOBMSG_TYPE_INT32 },
-       };
        struct blob_buf b = { 0 };
-       struct blob_attr *tb[__SECCOMP_MAX];
-       struct blob_attr *cur;
-       int rem;
+       struct sock_fprog *prog = NULL;
 
-       struct sock_filter *filter;
-       struct sock_fprog prog = { 0 };
-       int sz = 5, idx = 0, default_policy = 0;
-
-       INFO("%s: setting up syscall filter\n", argv);
+       DEBUG("%s: setting up syscall filter\n", argv);
 
        blob_buf_init(&b, 0);
        if (!blobmsg_add_json_from_file(&b, file)) {
@@ -71,70 +35,11 @@ int install_syscall_filter(const char *argv, const char *file)
                return -1;
        }
 
-       blobmsg_parse(policy, __SECCOMP_MAX, tb, blob_data(b.head), blob_len(b.head));
-       if (!tb[SECCOMP_WHITELIST]) {
-               ERROR("%s: %s is missing the syscall table\n", argv, file);
+       prog = parseOCIlinuxseccomp(b.head);
+       if (!prog) {
+               ERROR("%s: failed to parse seccomp filter rules %s\n", argv, file);
                return -1;
        }
 
-       if (tb[SECCOMP_POLICY])
-               default_policy = blobmsg_get_u32(tb[SECCOMP_POLICY]);
-
-       blobmsg_for_each_attr(cur, tb[SECCOMP_WHITELIST], rem)
-               sz += 2;
-
-       filter = calloc(sz, sizeof(struct sock_filter));
-       if (!filter) {
-               ERROR("failed to allocate filter memory\n");
-               return -1;
-       }
-
-       /* validate arch */
-       set_filter(&filter[idx++], BPF_LD + BPF_W + BPF_ABS, 0, 0, arch_nr);
-       set_filter(&filter[idx++], BPF_JMP + BPF_JEQ + BPF_K, 1, 0, ARCH_NR);
-       set_filter(&filter[idx++], BPF_RET + BPF_K, 0, 0, SECCOMP_RET_KILL);
-
-       /* get syscall */
-       set_filter(&filter[idx++], BPF_LD + BPF_W + BPF_ABS, 0, 0, syscall_nr);
-
-       blobmsg_for_each_attr(cur, tb[SECCOMP_WHITELIST], rem) {
-               char *name = blobmsg_get_string(cur);
-               int nr;
-
-               if (!name) {
-                       INFO("%s: invalid syscall name\n", argv);
-                       continue;
-               }
-
-               nr  = find_syscall(name);
-               if (nr == -1) {
-                       INFO("%s: unknown syscall %s\n", argv, name);
-                       continue;
-               }
-
-               /* add whitelist */
-               set_filter(&filter[idx++], BPF_JMP + BPF_JEQ + BPF_K, 0, 1, nr);
-               set_filter(&filter[idx++], BPF_RET + BPF_K, 0, 0, SECCOMP_RET_ALLOW);
-       }
-
-       if (default_policy)
-               /* notify tracer; without tracer return -1 and set errno to ENOSYS */
-               set_filter(&filter[idx], BPF_RET + BPF_K, 0, 0, SECCOMP_RET_TRACE);
-       else
-               /* kill the process */
-               set_filter(&filter[idx], BPF_RET + BPF_K, 0, 0, SECCOMP_RET_KILL);
-
-       if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
-               ERROR("%s: prctl(PR_SET_NO_NEW_PRIVS) failed: %m\n", argv);
-               return errno;
-       }
-
-       prog.len = (unsigned short) idx + 1;
-       prog.filter = filter;
-
-       if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog)) {
-               ERROR("%s: prctl(PR_SET_SECCOMP) failed: %m\n", argv);
-               return errno;
-       }
-       return 0;
+       return applyOCIlinuxseccomp(prog);
 }
index 24c1dd7d3363cc475a21815c0324bedcaf547c7a..b0c8d305694294f2468ebcb30542c7cb8eb94744 100644 (file)
 #include <stdio.h>
 #include <syslog.h>
 
-#define INFO(fmt, ...) do { \
-       syslog(LOG_INFO,"preload-seccomp: "fmt, ## __VA_ARGS__); \
-       fprintf(stderr,"preload-seccomp: "fmt, ## __VA_ARGS__); \
-       } while (0)
-#define ERROR(fmt, ...) do { \
-       syslog(LOG_ERR,"preload-seccomp: "fmt, ## __VA_ARGS__); \
-       fprintf(stderr,"preload-seccomp: "fmt, ## __VA_ARGS__); \
-       } while (0)
-
 int install_syscall_filter(const char *argv, const char *file);
 
 #endif
diff --git a/log.h b/log.h
index f0c4268f1be1760012b1a4bdd8aa4b931f99c6f9..77bb34b430b34a3cdc9ad1aa8d09166a70f37e24 100644 (file)
--- a/log.h
+++ b/log.h
                ulog(LOG_NOTICE, fmt, ## __VA_ARGS__); \
        } } while (0)
 
+/*
+ * Leveled debug output: when `debug` is at least `level` the message is
+ * passed to ulog() at LOG_NOTICE, otherwise it is handed to
+ * procd_udebug_printf().
+ */
+#define P_DEBUG(level, fmt, ...) do { \
+       if (debug >= level) { \
+               ulog(LOG_NOTICE, fmt, ## __VA_ARGS__); \
+       } else { \
+               procd_udebug_printf(fmt, ## __VA_ARGS__); \
+       } } while (0)
+
 #define LOG   ULOG_INFO
 #define ERROR ULOG_ERR
 
index 18d9131111ae3ec8579d19d300d4b019dfe78928..17e3bc472b8f9678bfcbfc0e04ac8cfddfe0e1e9 100755 (executable)
@@ -13,11 +13,11 @@ CC=$1
 
 echo "#include <asm/unistd.h>"
 echo "static const char *__syscall_names[] = {"
-echo "#include <sys/syscall.h>" | ${CC} -E -dM - | grep '^#define __NR_' | \
-       LC_ALL=C sed -r -n -e 's/^\#define[ \t]+__NR_([a-z0-9_]+)[ \t]+([ ()+0-9a-zNR_Linux]+)(.*)/ [\2] = "\1",/p'
+echo "#include <sys/syscall.h>" | ${CC} -E -dM - | grep '^#define __NR_[a-z0-9_]\+[ \t].*[0-9].*$' | \
+       LC_ALL=C sed -r -n -e 's/^\#define[ \t]+__NR_([a-z0-9_]+)[ \t]+([ ()+0-9a-zNR_LSYCABE]+)(.*)/ [\2] = "\1",/p'
 echo "};"
 
-extra_syscalls="$(echo "#include <sys/syscall.h>" | ${CC} -E -dM - | sed -n -e '/^#define __ARM_NR_/ s///p')"
+extra_syscalls="$(echo "#include <sys/syscall.h>" | ${CC} -E -dM - | sed -r -n -e 's/^#define __ARM_NR_([a-z0-9_]+)/\1/p')"
 
 cat <<EOF
 static inline const char *syscall_name(unsigned i) {
@@ -26,7 +26,7 @@ static inline const char *syscall_name(unsigned i) {
   switch (i) {
 EOF
 echo "$extra_syscalls" | \
-    LC_ALL=C sed -r -n -e 's/^([a-z0-9_]+)[ \t]+([ ()+0-9a-zNR_Linux]+)(.*)/    case \2: return "\1";/p'
+    LC_ALL=C sed -r -n -e 's/^([a-z0-9_]+)[ \t]+([ ()+0-9a-zNR_LAMBSE]+)(.*)/    case \2: return "\1";/p'
 cat <<EOF
   default: return (void*)0;
   }
@@ -40,7 +40,7 @@ static inline int syscall_index(unsigned i) {
   switch (i) {
 EOF
 echo "$extra_syscalls" | \
-    LC_ALL=C perl -ne 'print "  case $2: return ARRAY_SIZE(__syscall_names) + ", $. - 1, ";\n" if /^([a-z0-9_]+)[ \t]+([ ()+0-9a-zNR_Linux]+)(.*)/;'
+    LC_ALL=C perl -ne 'print "  case $2: return ARRAY_SIZE(__syscall_names) + ", $. - 1, ";\n" if /^([a-z0-9_]+)[ \t]+([ ()+0-9a-zNR_LAMBSE]+)(.*)/;'
 cat <<EOF
   default: return -1;
   }
@@ -54,7 +54,7 @@ static inline int syscall_index_to_number(unsigned i) {
   switch (i) {
 EOF
 echo "$extra_syscalls" | \
-    LC_ALL=C perl -ne 'print "  case ARRAY_SIZE(__syscall_names) + ", $. - 1, ": return $2;\n" if /^([a-z0-9_]+)[ \t]+([ ()+0-9a-zNR_Linux]+)(.*)/;'
+    LC_ALL=C perl -ne 'print "  case ARRAY_SIZE(__syscall_names) + ", $. - 1, ": return $2;\n" if /^([a-z0-9_]+)[ \t]+([ ()+0-9a-zNR_LAMBSE]+)(.*)/;'
 cat <<EOF
   default: return -1;
   }
index 12df421bb205c4074a4f08af4493bec3d9d37891..f84acef97e29aef03751c78f951158281b12164e 100644 (file)
@@ -16,6 +16,7 @@
 #include <sys/types.h>
 #include <sys/mount.h>
 
+#include <stdlib.h>
 #include <unistd.h>
 
 #include "../procd.h"
@@ -28,14 +29,14 @@ static struct uloop_process udevtrigger;
 
 static void coldplug_complete(struct uloop_timeout *t)
 {
-       DEBUG(4, "Coldplug complete\n");
+       P_DEBUG(4, "Coldplug complete\n");
        hotplug_last_event(NULL);
        procd_state_next();
 }
 
 static void udevtrigger_complete(struct uloop_process *proc, int ret)
 {
-       DEBUG(4, "Finished udevtrigger\n");
+       P_DEBUG(4, "Finished udevtrigger\n");
        hotplug_last_event(coldplug_complete);
 }
 
@@ -47,9 +48,9 @@ void procd_coldplug(void)
        if (!is_container()) {
                umount2("/dev/pts", MNT_DETACH);
                umount2("/dev/", MNT_DETACH);
-               mount("tmpfs", "/dev", "tmpfs", MS_NOSUID, "mode=0755,size=512K");
+               mount("tmpfs", "/dev", "tmpfs", MS_NOATIME | MS_NOEXEC | MS_NOSUID, "mode=0755,size=512K");
                mkdir("/dev/pts", 0755);
-               mount("devpts", "/dev/pts", "devpts", MS_NOEXEC | MS_NOSUID, 0);
+               mount("devpts", "/dev/pts", "devpts", MS_NOATIME | MS_NOEXEC | MS_NOSUID, 0);
        }
 
        ignore(symlink("/tmp/shm", "/dev/shm"));
@@ -59,7 +60,7 @@ void procd_coldplug(void)
        if (!udevtrigger.pid) {
                execvp(argv[0], argv);
                ERROR("Failed to start coldplug: %m\n");
-               exit(-1);
+               exit(EXIT_FAILURE);
        }
 
        if (udevtrigger.pid <= 0) {
@@ -69,5 +70,5 @@ void procd_coldplug(void)
 
        uloop_process_add(&udevtrigger);
 
-       DEBUG(4, "Launched coldplug instance, pid=%d\n", (int) udevtrigger.pid);
+       P_DEBUG(4, "Launched coldplug instance, pid=%d\n", (int) udevtrigger.pid);
 }
index 799123dcea195b562e7c71dee2197566f0bb6b3c..53f338339fdedf05ee82c21f795be21c93419458 100644 (file)
 #include <libubox/blobmsg_json.h>
 #include <libubox/json_script.h>
 #include <libubox/uloop.h>
+#include <libubox/utils.h>
 #include <json-c/json.h>
 
+#include <errno.h>
 #include <fcntl.h>
 #include <unistd.h>
 #include <stdlib.h>
@@ -107,16 +109,28 @@ static char *hotplug_msg_find_var(struct blob_attr *msg, const char *name)
        return NULL;
 }
 
-static void mkdir_p(char *dir)
+static void chgrp_error(const char *group, const char *target, const char *failed)
 {
-       char *l = strrchr(dir, '/');
+       ERROR("cannot set group %s for %s (%s: %d)\n",
+              group, target, failed, errno);
+}
 
-       if (l) {
-               *l = '\0';
-               mkdir_p(dir);
-               *l = '/';
-               mkdir(dir, 0755);
-       }
+static void chgrp_target(struct blob_attr *bgroup, struct blob_attr *btarget)
+{
+       int ret = 0;
+       struct group *g = NULL;
+       const char *group = blobmsg_get_string(bgroup);
+       const char *target = blobmsg_get_string(btarget);
+
+       errno = 0;
+
+       g = getgrnam(group);
+       if (!g)
+               return chgrp_error(group, target, "getgrnam");
+
+       ret = chown(target, 0, g->gr_gid);
+       if (ret < 0)
+               return chgrp_error(group, target, "chown");
 }
 
 static void handle_makedev(struct blob_attr *msg, struct blob_attr *data)
@@ -131,7 +145,6 @@ static void handle_makedev(struct blob_attr *msg, struct blob_attr *data)
        char *minor = hotplug_msg_find_var(msg, "MINOR");
        char *major = hotplug_msg_find_var(msg, "MAJOR");
        char *subsystem = hotplug_msg_find_var(msg, "SUBSYSTEM");
-       int ret = 0;
 
        blobmsg_parse_array(mkdev_policy, 3, tb, blobmsg_data(data), blobmsg_data_len(data));
        if (tb[0] && tb[1] && minor && major && subsystem) {
@@ -139,7 +152,7 @@ static void handle_makedev(struct blob_attr *msg, struct blob_attr *data)
                char *d = strdup(blobmsg_get_string(tb[0]));
 
                d = dirname(d);
-               mkdir_p(d);
+               mkdir_p(d, 0755);
                free(d);
 
                if (!strcmp(subsystem, "block"))
@@ -147,17 +160,8 @@ static void handle_makedev(struct blob_attr *msg, struct blob_attr *data)
                mknod(blobmsg_get_string(tb[0]),
                                m | strtoul(blobmsg_data(tb[1]), NULL, 8),
                                makedev(atoi(major), atoi(minor)));
-               if (tb[2]) {
-                       struct group *g = getgrnam(blobmsg_get_string(tb[2]));
-
-                       if (g)
-                               ret = chown(blobmsg_get_string(tb[0]), 0, g->gr_gid);
-
-                       if (!g || ret < 0)
-                               ERROR("cannot set group %s for %s\n",
-                                       blobmsg_get_string(tb[2]),
-                                       blobmsg_get_string(tb[0]));
-               }
+               if (tb[2])
+                       chgrp_target(tb[2], tb[0]);
        }
        umask(oldumask);
 }
@@ -206,7 +210,7 @@ static void handle_exec(struct blob_attr *msg, struct blob_attr *data)
                argv[i] = NULL;
                execvp(argv[0], &argv[0]);
        }
-       exit(-1);
+       exit(EXIT_FAILURE);
 }
 
 static void handle_button_start(struct blob_attr *msg, struct blob_attr *data)
@@ -227,11 +231,11 @@ static void handle_firmware(struct blob_attr *msg, struct blob_attr *data)
        int fw, src, load, len;
        static char buf[4096];
 
-       DEBUG(2, "Firmware request for %s/%s\n", dir, file);
+       P_DEBUG(2, "Firmware request for %s/%s\n", dir, file);
 
        if (!file || !dir || !dev) {
                ERROR("Request for unknown firmware %s/%s\n", dir, file);
-               exit(-1);
+               exit(EXIT_FAILURE);
        }
 
        path = alloca(strlen(dir) + strlen(file) + 2);
@@ -256,11 +260,11 @@ send_to_kernel:
        load = open(loadpath, O_WRONLY);
        if (!load) {
                ERROR("Failed to open %s: %m\n", loadpath);
-               exit(-1);
+               exit(EXIT_FAILURE);
        }
        if (write(load, "1", 1) == -1) {
                ERROR("Failed to write to %s: %m\n", loadpath);
-               exit(-1);
+               exit(EXIT_FAILURE);
        }
        close(load);
 
@@ -268,7 +272,7 @@ send_to_kernel:
        fw = open(syspath, O_WRONLY);
        if (fw < 0) {
                ERROR("Failed to open %s: %m\n", syspath);
-               exit(-1);
+               exit(EXIT_FAILURE);
        }
 
        len = s.st_size;
@@ -292,9 +296,23 @@ send_to_kernel:
                ERROR("failed to write to %s: %m\n", loadpath);
        close(load);
 
-       DEBUG(2, "Done loading %s\n", path);
+       P_DEBUG(2, "Done loading %s\n", path);
 
-       exit(-1);
+       exit(EXIT_FAILURE);
+}
+
+static void handle_start_console(struct blob_attr *msg, struct blob_attr *data)
+{
+       char *dev = blobmsg_get_string(blobmsg_data(data));
+
+       P_DEBUG(2, "Start console request for %s\n", dev);
+
+       procd_inittab_run("respawn");
+       procd_inittab_run("askfirst");
+
+       P_DEBUG(2, "Done starting console for %s\n", dev);
+
+       exit(EXIT_FAILURE);
 }
 
 enum {
@@ -303,6 +321,7 @@ enum {
        HANDLER_EXEC,
        HANDLER_BUTTON,
        HANDLER_FW,
+       HANDLER_START_CONSOLE,
 };
 
 static struct cmd_handler {
@@ -336,6 +355,10 @@ static struct cmd_handler {
                .name = "load-firmware",
                .handler = handle_firmware,
        },
+       [HANDLER_START_CONSOLE] = {
+               .name = "start-console",
+               .handler = handle_start_console,
+       },
 };
 
 static void queue_next(void)
@@ -367,12 +390,12 @@ static void queue_next(void)
 
        uloop_process_add(&queue_proc);
 
-       DEBUG(4, "Launched hotplug exec instance, pid=%d\n", (int) queue_proc.pid);
+       P_DEBUG(4, "Launched hotplug exec instance, pid=%d\n", (int) queue_proc.pid);
 }
 
 static void queue_proc_cb(struct uloop_process *c, int ret)
 {
-       DEBUG(4, "Finished hotplug exec instance, pid=%d\n", (int) c->pid);
+       P_DEBUG(4, "Finished hotplug exec instance, pid=%d\n", (int) c->pid);
 
        if (current) {
                current->complete(current->msg, current->data, ret);
@@ -490,13 +513,13 @@ static void rule_handle_command(struct json_script_ctx *ctx, const char *name,
        int rem, i;
 
        if (debug > 3) {
-               DEBUG(4, "Command: %s\n", name);
+               P_DEBUG(4, "Command: %s\n", name);
                blobmsg_for_each_attr(cur, data, rem)
-                       DEBUG(4, " %s\n", (char *) blobmsg_data(cur));
+                       P_DEBUG(4, " %s\n", (char *) blobmsg_data(cur));
 
-               DEBUG(4, "Message:\n");
+               P_DEBUG(4, "Message:\n");
                blobmsg_for_each_attr(cur, vars, rem)
-                       DEBUG(4, " %s=%s\n", blobmsg_name(cur), (char *) blobmsg_data(cur));
+                       P_DEBUG(4, " %s=%s\n", blobmsg_name(cur), (char *) blobmsg_data(cur));
        }
 
        for (i = 0; i < ARRAY_SIZE(handlers); i++)
@@ -537,7 +560,7 @@ static void hotplug_handler_debug(struct blob_attr *data)
                return;
 
        str = blobmsg_format_json(data, true);
-       DEBUG(3, "%s\n", str);
+       P_DEBUG(3, "%s\n", str);
        free(str);
 }
 
@@ -589,7 +612,7 @@ void hotplug(char *rules)
 
        rule_file = strdup(rules);
        nls.nl_family = AF_NETLINK;
-       nls.nl_pid = getpid();
+       nls.nl_pid = 0;
        nls.nl_groups = -1;
 
        if ((hotplug_fd.fd = socket(PF_NETLINK, SOCK_DGRAM | SOCK_CLOEXEC, NETLINK_KOBJECT_UEVENT)) == -1) {
diff --git a/procd.c b/procd.c
index 3de62082bc62e4d21936a0cc5df0d93034a9d659..1223283c3e96eecb6e77b5c1dc8fbf9a10812c04 100644 (file)
--- a/procd.c
+++ b/procd.c
 
 unsigned int debug;
 
+static struct udebug ud;
+static struct udebug_buf udb;
+static bool udebug_enabled;
+
+static void procd_udebug_vprintf(const char *format, va_list ap)
+{
+       if (!udebug_enabled)
+               return;
+
+       udebug_entry_init(&udb);
+       udebug_entry_vprintf(&udb, format, ap);
+       udebug_entry_add(&udb);
+}
+
+void procd_udebug_printf(const char *format, ...)
+{
+       va_list ap;
+
+       va_start(ap, format);
+       procd_udebug_vprintf(format, ap);
+       va_end(ap);
+}
+
+void procd_udebug_set_enabled(bool val)
+{
+       static const struct udebug_buf_meta meta = {
+               .name = "procd_log",
+               .format = UDEBUG_FORMAT_STRING,
+       };
+
+       if (udebug_enabled == val)
+               return;
+
+       udebug_enabled = val;
+       if (!val) {
+               ulog_udebug(NULL);
+               udebug_buf_free(&udb);
+               udebug_free(&ud);
+               return;
+       }
+
+       udebug_init(&ud);
+       udebug_auto_connect(&ud, NULL);
+       udebug_buf_init(&udb, 1024, 64 * 1024);
+       udebug_buf_add(&ud, &udb, &meta);
+       ulog_udebug(&udb);
+}
+
+
 static int usage(const char *prog)
 {
        fprintf(stderr, "Usage: %s [options]\n"
@@ -74,6 +123,7 @@ int main(int argc, char **argv)
        setsid();
        uloop_init();
        procd_signal();
+       procd_udebug_set_enabled(true);
        if (getpid() != 1)
                procd_connect_ubus();
        else
diff --git a/procd.h b/procd.h
index 5aa3aea068aa019965fa76b85525e710085c17b4..bca3c42dfaeca614891243b3d8b5bcfa88968633 100644 (file)
--- a/procd.h
+++ b/procd.h
@@ -18,6 +18,7 @@
 #include <libubox/uloop.h>
 #include <libubox/utils.h>
 #include <libubus.h>
+#include <udebug.h>
 
 #include <stdio.h>
 #include <syslog.h>
@@ -30,6 +31,7 @@ extern char *ubus_socket;
 
 void procd_connect_ubus(void);
 void procd_reconnect_ubus(int reconnect);
+void ubus_init_hotplug(struct ubus_context *ctx);
 void ubus_init_service(struct ubus_context *ctx);
 void ubus_init_system(struct ubus_context *ctx);
 
@@ -42,6 +44,7 @@ void procd_signal(void);
 void procd_signal_preinit(void);
 void procd_inittab(void);
 void procd_inittab_run(const char *action);
+void procd_inittab_kill(void);
 void procd_bcast_event(char *event, struct blob_attr *msg);
 
 struct trigger;
@@ -53,4 +56,7 @@ void watch_add(const char *_name, void *id);
 void watch_del(void *id);
 void watch_ubus(struct ubus_context *ctx);
 
+void procd_udebug_printf(const char *format, ...);
+void procd_udebug_set_enabled(bool val);
+
 #endif
diff --git a/rcS.c b/rcS.c
index c2e1abb5d44a433cf541f307aad5bc0cb772e947..2851fae99bef13085cd047912d804c8c45f40114 100644 (file)
--- a/rcS.c
+++ b/rcS.c
@@ -18,6 +18,7 @@
 
 #include <libubox/uloop.h>
 #include <libubox/runqueue.h>
+#include <inttypes.h>
 #include <stdlib.h>
 #include <stdio.h>
 #include <unistd.h>
@@ -118,7 +119,7 @@ static void q_initd_complete(struct runqueue *q, struct runqueue_task *p)
                ts_res.tv_nsec += 1000000000;
        }
 
-       DEBUG(2, "stop %s %s - took %lu.%09lus\n", s->file, s->param, ts_res.tv_sec, ts_res.tv_nsec);
+       DEBUG(2, "stop %s %s - took %" PRId64 ".%09" PRId64 "s\n", s->file, s->param, (int64_t)ts_res.tv_sec, (int64_t)ts_res.tv_nsec);
        ustream_free(&s->fd.stream);
        close(s->fd.fd.fd);
        free(s);
index b4284e7560545938875e2774df2380b65f380d29..ed5d0a490532e915fdc83976f190fd6078432320 100644 (file)
@@ -36,6 +36,8 @@
 #include "service.h"
 #include "instance.h"
 
+#define UJAIL_BIN_PATH "/sbin/ujail"
+#define CGROUP_BASEDIR "/sys/fs/cgroup/services"
 
 enum {
        INSTANCE_ATTR_COMMAND,
@@ -57,10 +59,16 @@ enum {
        INSTANCE_ATTR_JAIL,
        INSTANCE_ATTR_TRACE,
        INSTANCE_ATTR_SECCOMP,
+       INSTANCE_ATTR_CAPABILITIES,
        INSTANCE_ATTR_PIDFILE,
        INSTANCE_ATTR_RELOADSIG,
        INSTANCE_ATTR_TERMTIMEOUT,
        INSTANCE_ATTR_FACILITY,
+       INSTANCE_ATTR_EXTROOT,
+       INSTANCE_ATTR_OVERLAYDIR,
+       INSTANCE_ATTR_TMPOVERLAYSIZE,
+       INSTANCE_ATTR_BUNDLE,
+       INSTANCE_ATTR_WATCHDOG,
        __INSTANCE_ATTR_MAX
 };
 
@@ -84,10 +92,16 @@ static const struct blobmsg_policy instance_attr[__INSTANCE_ATTR_MAX] = {
        [INSTANCE_ATTR_JAIL] = { "jail", BLOBMSG_TYPE_TABLE },
        [INSTANCE_ATTR_TRACE] = { "trace", BLOBMSG_TYPE_BOOL },
        [INSTANCE_ATTR_SECCOMP] = { "seccomp", BLOBMSG_TYPE_STRING },
+       [INSTANCE_ATTR_CAPABILITIES] = { "capabilities", BLOBMSG_TYPE_STRING },
        [INSTANCE_ATTR_PIDFILE] = { "pidfile", BLOBMSG_TYPE_STRING },
        [INSTANCE_ATTR_RELOADSIG] = { "reload_signal", BLOBMSG_TYPE_INT32 },
        [INSTANCE_ATTR_TERMTIMEOUT] = { "term_timeout", BLOBMSG_TYPE_INT32 },
        [INSTANCE_ATTR_FACILITY] = { "facility", BLOBMSG_TYPE_STRING },
+       [INSTANCE_ATTR_EXTROOT] = { "extroot", BLOBMSG_TYPE_STRING },
+       [INSTANCE_ATTR_OVERLAYDIR] = { "overlaydir", BLOBMSG_TYPE_STRING },
+       [INSTANCE_ATTR_TMPOVERLAYSIZE] = { "tmpoverlaysize", BLOBMSG_TYPE_STRING },
+       [INSTANCE_ATTR_BUNDLE] = { "bundle", BLOBMSG_TYPE_STRING },
+       [INSTANCE_ATTR_WATCHDOG] = { "watchdog", BLOBMSG_TYPE_ARRAY },
 };
 
 enum {
@@ -99,6 +113,14 @@ enum {
        JAIL_ATTR_LOG,
        JAIL_ATTR_RONLY,
        JAIL_ATTR_MOUNT,
+       JAIL_ATTR_NETNS,
+       JAIL_ATTR_USERNS,
+       JAIL_ATTR_CGROUPSNS,
+       JAIL_ATTR_CONSOLE,
+       JAIL_ATTR_REQUIREJAIL,
+       JAIL_ATTR_IMMEDIATELY,
+       JAIL_ATTR_PIDFILE,
+       JAIL_ATTR_SETNS,
        __JAIL_ATTR_MAX,
 };
 
@@ -111,6 +133,25 @@ static const struct blobmsg_policy jail_attr[__JAIL_ATTR_MAX] = {
        [JAIL_ATTR_LOG] = { "log", BLOBMSG_TYPE_BOOL },
        [JAIL_ATTR_RONLY] = { "ronly", BLOBMSG_TYPE_BOOL },
        [JAIL_ATTR_MOUNT] = { "mount", BLOBMSG_TYPE_TABLE },
+       [JAIL_ATTR_NETNS] = { "netns", BLOBMSG_TYPE_BOOL },
+       [JAIL_ATTR_USERNS] = { "userns", BLOBMSG_TYPE_BOOL },
+       [JAIL_ATTR_CGROUPSNS] = { "cgroupsns", BLOBMSG_TYPE_BOOL },
+       [JAIL_ATTR_CONSOLE] = { "console", BLOBMSG_TYPE_BOOL },
+       [JAIL_ATTR_REQUIREJAIL] = { "requirejail", BLOBMSG_TYPE_BOOL },
+       [JAIL_ATTR_IMMEDIATELY] = { "immediately", BLOBMSG_TYPE_BOOL },
+       [JAIL_ATTR_PIDFILE] = { "pidfile", BLOBMSG_TYPE_STRING },
+       [JAIL_ATTR_SETNS] = { "setns", BLOBMSG_TYPE_ARRAY },
+};
+
+enum {
+       JAIL_SETNS_ATTR_PID,
+       JAIL_SETNS_ATTR_NS,
+       __JAIL_SETNS_ATTR_MAX,
+};
+
+static const struct blobmsg_policy jail_setns_attr[__JAIL_SETNS_ATTR_MAX] = {
+       [JAIL_SETNS_ATTR_PID] = { "pid", BLOBMSG_TYPE_INT32 },
+       [JAIL_SETNS_ATTR_NS] = { "namespaces", BLOBMSG_TYPE_ARRAY },
 };
 
 struct instance_netdev {
@@ -198,14 +239,67 @@ instance_limits(const char *limit, const char *value)
        }
 }
 
+static char *
+instance_gen_setns_argstr(struct blob_attr *attr)
+{
+       struct blob_attr *tb[__JAIL_SETNS_ATTR_MAX];
+       struct blob_attr *cur;
+       int rem, len, total;
+       char *ret;
+
+       blobmsg_parse(jail_setns_attr, __JAIL_SETNS_ATTR_MAX, tb,
+               blobmsg_data(attr), blobmsg_data_len(attr));
+
+       if (!tb[JAIL_SETNS_ATTR_PID] || !tb[JAIL_SETNS_ATTR_NS])
+               return NULL;
+
+       len = snprintf(NULL, 0, "%d:", blobmsg_get_u32(tb[JAIL_SETNS_ATTR_PID]));
+
+       blobmsg_for_each_attr(cur, tb[JAIL_SETNS_ATTR_NS], rem) {
+               char *tmp;
+
+               if (blobmsg_type(cur) != BLOBMSG_TYPE_STRING)
+                       return NULL;
+
+               tmp = blobmsg_get_string(cur);
+               if (!tmp)
+                       return NULL;
+
+               len += strlen(tmp) + 1;
+       }
+
+       total = len;
+       ret = malloc(total);
+       if (!ret)
+               return NULL;
+
+       len = snprintf(ret, total, "%d:", blobmsg_get_u32(tb[JAIL_SETNS_ATTR_PID]));
+
+       blobmsg_for_each_attr(cur, tb[JAIL_SETNS_ATTR_NS], rem) {
+               strncpy(&ret[len], blobmsg_get_string(cur), total - len);
+               len += strlen(blobmsg_get_string(cur));
+               ret[len++] = ',';
+       }
+       ret[total - 1] = '\0';
+
+       return ret;
+}
+
 static inline int
 jail_run(struct service_instance *in, char **argv)
 {
+       char *term_timeout_str;
        struct blobmsg_list_node *var;
        struct jail *jail = &in->jail;
        int argc = 0;
 
-       argv[argc++] = "/sbin/ujail";
+       argv[argc++] = UJAIL_BIN_PATH;
+
+       if (asprintf(&term_timeout_str, "%d", in->term_timeout) == -1)
+               exit(ENOMEM);
+
+       argv[argc++] = "-t";
+       argv[argc++] = term_timeout_str;
 
        if (jail->name) {
                argv[argc++] = "-n";
@@ -222,6 +316,21 @@ jail_run(struct service_instance *in, char **argv)
                argv[argc++] = in->seccomp;
        }
 
+       if (in->user) {
+               argv[argc++] = "-U";
+               argv[argc++] = in->user;
+       }
+
+       if (in->group) {
+               argv[argc++] = "-G";
+               argv[argc++] = in->group;
+       }
+
+       if (in->capabilities) {
+               argv[argc++] = "-C";
+               argv[argc++] = in->capabilities;
+       }
+
        if (in->no_new_privs)
                argv[argc++] = "-c";
 
@@ -240,6 +349,54 @@ jail_run(struct service_instance *in, char **argv)
        if (jail->ronly)
                argv[argc++] = "-o";
 
+       if (jail->netns)
+               argv[argc++] = "-N";
+
+       if (jail->userns)
+               argv[argc++] = "-f";
+
+       if (jail->cgroupsns)
+               argv[argc++] = "-F";
+
+       if (jail->console)
+               argv[argc++] = "-y";
+
+       if (in->extroot) {
+               argv[argc++] = "-R";
+               argv[argc++] = in->extroot;
+       }
+
+       if (in->overlaydir) {
+               argv[argc++] = "-O";
+               argv[argc++] = in->overlaydir;
+       }
+
+       if (in->tmpoverlaysize) {
+               argv[argc++] = "-T";
+               argv[argc++] = in->tmpoverlaysize;
+       }
+
+       if (in->immediately)
+               argv[argc++] = "-i";
+
+       if (jail->pidfile) {
+               argv[argc++] = "-P";
+               argv[argc++] = jail->pidfile;
+       }
+
+       if (in->bundle) {
+               argv[argc++] = "-J";
+               argv[argc++] = in->bundle;
+       }
+
+       if (in->require_jail)
+               argv[argc++] = "-E";
+
+       blobmsg_list_for_each(&in->env, var) {
+               argv[argc++] = "-e";
+               argv[argc++] = (char *) blobmsg_name(var->data);
+       }
+
        blobmsg_list_for_each(&jail->mount, var) {
                const char *type = blobmsg_data(var->data);
 
@@ -250,6 +407,15 @@ jail_run(struct service_instance *in, char **argv)
                argv[argc++] = (char *) blobmsg_name(var->data);
        }
 
+       blobmsg_list_for_each(&jail->setns, var) {
+               char *setns_arg = instance_gen_setns_argstr(var->data);
+
+               if (setns_arg) {
+                       argv[argc++] = "-j";
+                       argv[argc++] = setns_arg;
+               }
+       }
+
        argv[argc++] = "--";
 
        return argc;
@@ -260,7 +426,7 @@ instance_removepid(struct service_instance *in) {
        if (!in->pidfile)
                return 0;
        if (unlink(in->pidfile)) {
-               ERROR("Failed to removed pidfile: %s: %m\n", in->pidfile);
+               ERROR("Failed to remove pidfile: %s: %m\n", in->pidfile);
                return 1;
        }
        return 0;
@@ -337,8 +503,12 @@ instance_run(struct service_instance *in, int _stdout, int _stderr)
                ULOG_WARN("Seccomp support for %s::%s not available\n", in->srv->name, in->name);
 #endif
 
-       if (in->has_jail)
+       if (in->has_jail) {
                argc = jail_run(in, argv);
+               if (argc != in->jail.argc)
+                       ULOG_WARN("expected %i jail params, used %i for %s::%s\n",
+                               in->jail.argc, argc, in->srv->name, in->name);
+       }
 
        blobmsg_for_each_attr(cur, in->command, rem)
                argv[argc++] = blobmsg_data(cur);
@@ -366,15 +536,15 @@ instance_run(struct service_instance *in, int _stdout, int _stderr)
                closefd(_stderr);
        }
 
-       if (in->user && in->pw_gid && initgroups(in->user, in->pw_gid)) {
+       if (!in->has_jail && in->user && in->pw_gid && initgroups(in->user, in->pw_gid)) {
                ERROR("failed to initgroups() for user %s: %m\n", in->user);
                exit(127);
        }
-       if (in->gr_gid && setgid(in->gr_gid)) {
+       if (!in->has_jail && in->gr_gid && setgid(in->gr_gid)) {
                ERROR("failed to set group id %d: %m\n", in->gr_gid);
                exit(127);
        }
-       if (in->uid && setuid(in->uid)) {
+       if (!in->has_jail && in->uid && setuid(in->uid)) {
                ERROR("failed to set user id %d: %m\n", in->uid);
                exit(127);
        }
@@ -383,6 +553,32 @@ instance_run(struct service_instance *in, int _stdout, int _stderr)
        exit(127);
 }
 
+static void
+instance_add_cgroup(const char *service, const char *instance)
+{
+       struct stat sb;
+       char cgnamebuf[256];
+       int fd;
+
+       if (stat("/sys/fs/cgroup/cgroup.subtree_control", &sb))
+               return;
+
+       mkdir(CGROUP_BASEDIR, 0700);
+
+       snprintf(cgnamebuf, sizeof(cgnamebuf), "%s/%s", CGROUP_BASEDIR, service);
+       mkdir(cgnamebuf, 0700);
+       snprintf(cgnamebuf, sizeof(cgnamebuf), "%s/%s/%s", CGROUP_BASEDIR, service, instance);
+       mkdir(cgnamebuf, 0700);
+       strcat(cgnamebuf, "/cgroup.procs");
+
+       fd = open(cgnamebuf, O_WRONLY);
+       if (fd == -1)
+               return;
+
+       dprintf(fd, "%d", getpid());
+       close(fd);
+}
+
 static void
 instance_free_stdio(struct service_instance *in)
 {
@@ -397,6 +593,18 @@ instance_free_stdio(struct service_instance *in)
                close(in->_stderr.fd.fd);
                in->_stderr.fd.fd = -1;
        }
+
+       if (in->console.fd.fd > -1) {
+               ustream_free(&in->console.stream);
+               close(in->console.fd.fd);
+               in->console.fd.fd = -1;
+       }
+
+       if (in->console_client.fd.fd > -1) {
+               ustream_free(&in->console_client.stream);
+               close(in->console_client.fd.fd);
+               in->console_client.fd.fd = -1;
+       }
 }
 
 void
@@ -411,7 +619,7 @@ instance_start(struct service_instance *in)
                return;
        }
 
-       if (!in->command) {
+       if (!in->bundle && !in->command) {
                LOG("Not starting instance %s::%s, command not set\n", in->srv->name, in->name);
                return;
        }
@@ -451,11 +659,12 @@ instance_start(struct service_instance *in)
                uloop_done();
                closefd(opipe[0]);
                closefd(epipe[0]);
+               instance_add_cgroup(in->srv->name, in->name);
                instance_run(in, opipe[1], epipe[1]);
                return;
        }
 
-       DEBUG(2, "Started instance %s::%s[%d]\n", in->srv->name, in->name, pid);
+       P_DEBUG(2, "Started instance %s::%s[%d]\n", in->srv->name, in->name, pid);
        in->proc.pid = pid;
        instance_writepid(in);
        clock_gettime(CLOCK_MONOTONIC, &in->start);
@@ -473,6 +682,11 @@ instance_start(struct service_instance *in)
                fcntl(epipe[0], F_SETFD, FD_CLOEXEC);
        }
 
+       if (in->watchdog.mode != INSTANCE_WATCHDOG_MODE_DISABLED) {
+               uloop_timeout_set(&in->watchdog.timeout, in->watchdog.freq * 1000);
+               P_DEBUG(2, "Started instance %s::%s watchdog timer : timeout = %d\n", in->srv->name, in->name, in->watchdog.freq);
+       }
+
        service_event("instance.start", in->srv->name, in->name);
 }
 
@@ -514,6 +728,46 @@ instance_stdout(struct ustream *s, int bytes)
                       container_of(s, struct service_instance, _stdout.stream));
 }
 
+static void
+instance_console(struct ustream *s, int bytes)
+{
+       struct service_instance *in = container_of(s, struct service_instance, console.stream);
+       char *buf;
+       int len;
+
+       do {
+               buf = ustream_get_read_buf(s, &len);
+               if (!buf)
+                       break;
+
+               ulog(LOG_INFO, "out: %s\n", buf);
+
+               /* test if console client is attached */
+               if (in->console_client.fd.fd > -1)
+                       ustream_write(&in->console_client.stream, buf, len, false);
+
+               ustream_consume(s, len);
+       } while (1);
+}
+
+static void
+instance_console_client(struct ustream *s, int bytes)
+{
+       struct service_instance *in = container_of(s, struct service_instance, console_client.stream);
+       char *buf;
+       int len;
+
+       do {
+               buf = ustream_get_read_buf(s, &len);
+               if (!buf)
+                       break;
+
+               ulog(LOG_INFO, "in: %s\n", buf);
+               ustream_write(&in->console.stream, buf, len, false);
+               ustream_consume(s, len);
+       } while (1);
+}
+
 static void
 instance_stderr(struct ustream *s, int bytes)
 {
@@ -546,6 +800,24 @@ instance_delete(struct service_instance *in)
        service_stopped(s);
 }
 
+static int
+instance_exit_code(int ret)
+{
+       if (WIFEXITED(ret)) {
+               return WEXITSTATUS(ret);
+       }
+
+       if (WIFSIGNALED(ret)) {
+               return SIGNALLED_OFFSET + WTERMSIG(ret);
+       }
+
+       if (WIFSTOPPED(ret)) {
+               return WSTOPSIG(ret);
+       }
+
+       return 1;
+}
+
 static void
 instance_exit(struct uloop_process *p, int ret)
 {
@@ -558,9 +830,11 @@ instance_exit(struct uloop_process *p, int ret)
        clock_gettime(CLOCK_MONOTONIC, &tp);
        runtime = tp.tv_sec - in->start.tv_sec;
 
-       DEBUG(2, "Instance %s::%s exit with error code %d after %ld seconds\n", in->srv->name, in->name, ret, runtime);
+       P_DEBUG(2, "Instance %s::%s exit with error code %d after %ld seconds\n", in->srv->name, in->name, ret, runtime);
 
+       in->exit_code = instance_exit_code(ret);
        uloop_timeout_cancel(&in->timeout);
+       uloop_timeout_cancel(&in->watchdog.timeout);
        service_event("instance.stop", in->srv->name, in->name);
 
        if (in->halt) {
@@ -600,7 +874,8 @@ instance_stop(struct service_instance *in, bool halt)
        in->halt = halt;
        in->restart = in->respawn = false;
        kill(in->proc.pid, SIGTERM);
-       uloop_timeout_set(&in->timeout, in->term_timeout * 1000);
+       if (!in->has_jail)
+               uloop_timeout_set(&in->timeout, in->term_timeout * 1000);
 }
 
 static void
@@ -617,7 +892,21 @@ instance_restart(struct service_instance *in)
        in->halt = true;
        in->restart = true;
        kill(in->proc.pid, SIGTERM);
-       uloop_timeout_set(&in->timeout, in->term_timeout * 1000);
+       if (!in->has_jail)
+               uloop_timeout_set(&in->timeout, in->term_timeout * 1000);
+}
+
+static void
+instance_watchdog(struct uloop_timeout *t)
+{
+       struct service_instance *in = container_of(t, struct service_instance, watchdog.timeout);
+
+       P_DEBUG(3, "instance %s::%s watchdog timer expired\n", in->srv->name, in->name);
+
+       if (in->respawn)
+               instance_restart(in);
+       else
+               instance_stop(in, true);
 }
 
 static bool string_changed(const char *a, const char *b)
@@ -634,6 +923,18 @@ instance_config_changed(struct service_instance *in, struct service_instance *in
        if (!blob_attr_equal(in->command, in_new->command))
                return true;
 
+       if (string_changed(in->bundle, in_new->bundle))
+               return true;
+
+       if (string_changed(in->extroot, in_new->extroot))
+               return true;
+
+       if (string_changed(in->overlaydir, in_new->overlaydir))
+               return true;
+
+       if (string_changed(in->tmpoverlaysize, in_new->tmpoverlaysize))
+               return true;
+
        if (!blobmsg_list_equal(&in->env, &in_new->env))
                return true;
 
@@ -661,6 +962,9 @@ instance_config_changed(struct service_instance *in, struct service_instance *in
        if (in->pw_gid != in_new->pw_gid)
                return true;
 
+       if (in->gr_gid != in_new->gr_gid)
+               return true;
+
        if (string_changed(in->pidfile, in_new->pidfile))
                return true;
 
@@ -671,9 +975,16 @@ instance_config_changed(struct service_instance *in, struct service_instance *in
        if (in->respawn_timeout != in_new->respawn_timeout)
                return true;
 
-       if ((!in->seccomp && in_new->seccomp) ||
-           (in->seccomp && !in_new->seccomp) ||
-           (in->seccomp && in_new->seccomp && strcmp(in->seccomp, in_new->seccomp)))
+       if (in->reload_signal != in_new->reload_signal)
+               return true;
+
+       if (in->term_timeout != in_new->term_timeout)
+               return true;
+
+       if (string_changed(in->seccomp, in_new->seccomp))
+               return true;
+
+       if (string_changed(in->capabilities, in_new->capabilities))
                return true;
 
        if (!blobmsg_list_equal(&in->limits, &in_new->limits))
@@ -682,9 +993,69 @@ instance_config_changed(struct service_instance *in, struct service_instance *in
        if (!blobmsg_list_equal(&in->jail.mount, &in_new->jail.mount))
                return true;
 
+       if (!blobmsg_list_equal(&in->jail.setns, &in_new->jail.setns))
+               return true;
+
        if (!blobmsg_list_equal(&in->errors, &in_new->errors))
                return true;
 
+       if (in->has_jail != in_new->has_jail)
+               return true;
+
+       if (in->trace != in_new->trace)
+               return true;
+
+       if (in->require_jail != in_new->require_jail)
+               return true;
+
+       if (in->immediately != in_new->immediately)
+               return true;
+
+       if (in->no_new_privs != in_new->no_new_privs)
+               return true;
+
+       if (string_changed(in->jail.name, in_new->jail.name))
+               return true;
+
+       if (string_changed(in->jail.hostname, in_new->jail.hostname))
+               return true;
+
+       if (string_changed(in->jail.pidfile, in_new->jail.pidfile))
+               return true;
+
+       if (in->jail.procfs != in_new->jail.procfs)
+               return true;
+
+       if (in->jail.sysfs != in_new->jail.sysfs)
+               return true;
+
+       if (in->jail.ubus != in_new->jail.ubus)
+               return true;
+
+       if (in->jail.log != in_new->jail.log)
+               return true;
+
+       if (in->jail.ronly != in_new->jail.ronly)
+               return true;
+
+       if (in->jail.netns != in_new->jail.netns)
+               return true;
+
+       if (in->jail.userns != in_new->jail.userns)
+               return true;
+
+       if (in->jail.cgroupsns != in_new->jail.cgroupsns)
+               return true;
+
+       if (in->jail.console != in_new->jail.console)
+               return true;
+
+       if (in->watchdog.mode != in_new->watchdog.mode)
+               return true;
+
+       if (in->watchdog.freq != in_new->watchdog.freq)
+               return true;
+
        return false;
 }
 
@@ -780,44 +1151,80 @@ instance_jail_parse(struct service_instance *in, struct blob_attr *attr)
 {
        struct blob_attr *tb[__JAIL_ATTR_MAX];
        struct jail *jail = &in->jail;
-       struct stat s;
-
-       if (stat("/sbin/ujail", &s))
-               return 0;
+       struct blobmsg_list_node *var;
 
        blobmsg_parse(jail_attr, __JAIL_ATTR_MAX, tb,
                blobmsg_data(attr), blobmsg_data_len(attr));
 
-       jail->argc = 2;
+       jail->argc = 4;
 
+       if (tb[JAIL_ATTR_REQUIREJAIL] && blobmsg_get_bool(tb[JAIL_ATTR_REQUIREJAIL])) {
+               in->require_jail = true;
+               jail->argc++;
+       }
+       if (tb[JAIL_ATTR_IMMEDIATELY] && blobmsg_get_bool(tb[JAIL_ATTR_IMMEDIATELY])) {
+               in->immediately = true;
+               jail->argc++;
+       }
        if (tb[JAIL_ATTR_NAME]) {
-               jail->name = blobmsg_get_string(tb[JAIL_ATTR_NAME]);
+               jail->name = strdup(blobmsg_get_string(tb[JAIL_ATTR_NAME]));
                jail->argc += 2;
        }
        if (tb[JAIL_ATTR_HOSTNAME]) {
-               jail->hostname = blobmsg_get_string(tb[JAIL_ATTR_HOSTNAME]);
+               jail->hostname = strdup(blobmsg_get_string(tb[JAIL_ATTR_HOSTNAME]));
                jail->argc += 2;
        }
-       if (tb[JAIL_ATTR_PROCFS]) {
-               jail->procfs = blobmsg_get_bool(tb[JAIL_ATTR_PROCFS]);
+       if (tb[JAIL_ATTR_PROCFS] && blobmsg_get_bool(tb[JAIL_ATTR_PROCFS])) {
+               jail->procfs = true;
+               jail->argc++;
+       }
+       if (tb[JAIL_ATTR_SYSFS] && blobmsg_get_bool(tb[JAIL_ATTR_SYSFS])) {
+               jail->sysfs = true;
+               jail->argc++;
+       }
+       if (tb[JAIL_ATTR_UBUS] && blobmsg_get_bool(tb[JAIL_ATTR_UBUS])) {
+               jail->ubus = true;
+               jail->argc++;
+       }
+       if (tb[JAIL_ATTR_LOG] && blobmsg_get_bool(tb[JAIL_ATTR_LOG])) {
+               jail->log = true;
+               jail->argc++;
+       }
+       if (tb[JAIL_ATTR_RONLY] && blobmsg_get_bool(tb[JAIL_ATTR_RONLY])) {
+               jail->ronly = true;
                jail->argc++;
        }
-       if (tb[JAIL_ATTR_SYSFS]) {
-               jail->sysfs = blobmsg_get_bool(tb[JAIL_ATTR_SYSFS]);
+       if (tb[JAIL_ATTR_NETNS] && blobmsg_get_bool(tb[JAIL_ATTR_NETNS])) {
+               jail->netns = true;
                jail->argc++;
        }
-       if (tb[JAIL_ATTR_UBUS]) {
-               jail->ubus = blobmsg_get_bool(tb[JAIL_ATTR_UBUS]);
+       if (tb[JAIL_ATTR_USERNS] && blobmsg_get_bool(tb[JAIL_ATTR_USERNS])) {
+               jail->userns = true;
                jail->argc++;
        }
-       if (tb[JAIL_ATTR_LOG]) {
-               jail->log = blobmsg_get_bool(tb[JAIL_ATTR_LOG]);
+       if (tb[JAIL_ATTR_CGROUPSNS] && blobmsg_get_bool(tb[JAIL_ATTR_CGROUPSNS])) {
+               jail->cgroupsns = true;
                jail->argc++;
        }
-       if (tb[JAIL_ATTR_RONLY]) {
-               jail->ronly = blobmsg_get_bool(tb[JAIL_ATTR_RONLY]);
+       if (tb[JAIL_ATTR_CONSOLE] && blobmsg_get_bool(tb[JAIL_ATTR_CONSOLE])) {
+               jail->console = true;
                jail->argc++;
        }
+       if (tb[JAIL_ATTR_PIDFILE]) {
+               jail->pidfile = strdup(blobmsg_get_string(tb[JAIL_ATTR_PIDFILE]));
+               jail->argc += 2;
+       }
+
+       if (tb[JAIL_ATTR_SETNS]) {
+               struct blob_attr *cur;
+               int rem;
+
+               blobmsg_for_each_attr(cur, tb[JAIL_ATTR_SETNS], rem)
+                       jail->argc += 2;
+               blobmsg_list_fill(&jail->setns, blobmsg_data(tb[JAIL_ATTR_SETNS]),
+                                 blobmsg_data_len(tb[JAIL_ATTR_SETNS]), true);
+       }
+
        if (tb[JAIL_ATTR_MOUNT]) {
                struct blob_attr *cur;
                int rem;
@@ -826,10 +1233,38 @@ instance_jail_parse(struct service_instance *in, struct blob_attr *attr)
                        jail->argc += 2;
                instance_fill_array(&jail->mount, tb[JAIL_ATTR_MOUNT], NULL, false);
        }
+
+       blobmsg_list_for_each(&in->env, var)
+               jail->argc += 2;
+
        if (in->seccomp)
                jail->argc += 2;
 
-       return 1;
+       if (in->capabilities)
+               jail->argc += 2;
+
+       if (in->user)
+               jail->argc += 2;
+
+       if (in->group)
+               jail->argc += 2;
+
+       if (in->extroot)
+               jail->argc += 2;
+
+       if (in->overlaydir)
+               jail->argc += 2;
+
+       if (in->tmpoverlaysize)
+               jail->argc += 2;
+
+       if (in->no_new_privs)
+               jail->argc++;
+
+       if (in->bundle)
+               jail->argc += 2;
+
+       return true;
 }
 
 static bool
@@ -862,13 +1297,14 @@ instance_config_parse(struct service_instance *in)
 {
        struct blob_attr *tb[__INSTANCE_ATTR_MAX];
        struct blob_attr *cur, *cur2;
-       int rem;
+       struct stat s;
+       int rem, r;
 
        blobmsg_parse(instance_attr, __INSTANCE_ATTR_MAX, tb,
                blobmsg_data(in->config), blobmsg_data_len(in->config));
 
-       if (!instance_config_parse_command(in, tb))
-               return false;
+       if (!tb[INSTANCE_ATTR_BUNDLE] && !instance_config_parse_command(in, tb))
+                       return false;
 
        if (tb[INSTANCE_ATTR_TERMTIMEOUT])
                in->term_timeout = blobmsg_get_u32(tb[INSTANCE_ATTR_TERMTIMEOUT]);
@@ -897,7 +1333,7 @@ instance_config_parse(struct service_instance *in)
                blobmsg_for_each_attr(cur2, tb[INSTANCE_ATTR_WATCH], rem) {
                        if (blobmsg_type(cur2) != BLOBMSG_TYPE_STRING)
                                continue;
-                       DEBUG(3, "watch for %s\n", blobmsg_get_string(cur2));
+                       P_DEBUG(3, "watch for %s\n", blobmsg_get_string(cur2));
                        watch_add(blobmsg_get_string(cur2), in);
                }
        }
@@ -934,20 +1370,32 @@ instance_config_parse(struct service_instance *in)
                in->no_new_privs = blobmsg_get_bool(tb[INSTANCE_ATTR_NO_NEW_PRIVS]);
 
        if (!in->trace && tb[INSTANCE_ATTR_SECCOMP])
-               in->seccomp = blobmsg_get_string(tb[INSTANCE_ATTR_SECCOMP]);
+               in->seccomp = strdup(blobmsg_get_string(tb[INSTANCE_ATTR_SECCOMP]));
+
+       if (tb[INSTANCE_ATTR_CAPABILITIES])
+               in->capabilities = strdup(blobmsg_get_string(tb[INSTANCE_ATTR_CAPABILITIES]));
+
+       if (tb[INSTANCE_ATTR_EXTROOT])
+               in->extroot = strdup(blobmsg_get_string(tb[INSTANCE_ATTR_EXTROOT]));
+
+       if (tb[INSTANCE_ATTR_OVERLAYDIR])
+               in->overlaydir = strdup(blobmsg_get_string(tb[INSTANCE_ATTR_OVERLAYDIR]));
+
+       if (tb[INSTANCE_ATTR_TMPOVERLAYSIZE])
+               in->tmpoverlaysize = strdup(blobmsg_get_string(tb[INSTANCE_ATTR_TMPOVERLAYSIZE]));
+
+       if (tb[INSTANCE_ATTR_BUNDLE])
+               in->bundle = strdup(blobmsg_get_string(tb[INSTANCE_ATTR_BUNDLE]));
 
        if (tb[INSTANCE_ATTR_PIDFILE]) {
                char *pidfile = blobmsg_get_string(tb[INSTANCE_ATTR_PIDFILE]);
                if (pidfile)
-                       in->pidfile = pidfile;
+                       in->pidfile = strdup(pidfile);
        }
 
        if (tb[INSTANCE_ATTR_RELOADSIG])
                in->reload_signal = blobmsg_get_u32(tb[INSTANCE_ATTR_RELOADSIG]);
 
-       if (!in->trace && tb[INSTANCE_ATTR_JAIL])
-               in->has_jail = instance_jail_parse(in, tb[INSTANCE_ATTR_JAIL]);
-
        if (tb[INSTANCE_ATTR_STDOUT] && blobmsg_get_bool(tb[INSTANCE_ATTR_STDOUT]))
                in->_stdout.fd.fd = -1;
 
@@ -975,9 +1423,54 @@ instance_config_parse(struct service_instance *in)
                int facility = syslog_facility_str_to_int(blobmsg_get_string(tb[INSTANCE_ATTR_FACILITY]));
                if (facility != -1) {
                        in->syslog_facility = facility;
-                       DEBUG(3, "setting facility '%s'\n", blobmsg_get_string(tb[INSTANCE_ATTR_FACILITY]));
+                       P_DEBUG(3, "setting facility '%s'\n", blobmsg_get_string(tb[INSTANCE_ATTR_FACILITY]));
                } else
-                       DEBUG(3, "unknown syslog facility '%s' given, using default (LOG_DAEMON)\n", blobmsg_get_string(tb[INSTANCE_ATTR_FACILITY]));
+                       P_DEBUG(3, "unknown syslog facility '%s' given, using default (LOG_DAEMON)\n", blobmsg_get_string(tb[INSTANCE_ATTR_FACILITY]));
+       }
+
+       if (tb[INSTANCE_ATTR_WATCHDOG]) {
+               int i = 0;
+               uint32_t vals[2] = { 0, 30 }; /* [0] = mode, [1] = timeout; defaults cover missing entries */
+
+               blobmsg_for_each_attr(cur2, tb[INSTANCE_ATTR_WATCHDOG], rem) {
+                       if (i >= 2)
+                               break;
+
+                       vals[i] = atoi(blobmsg_get_string(cur2));
+                       i++;
+               }
+
+               if (vals[0] < __INSTANCE_WATCHDOG_MODE_MAX) { /* vals[] is unsigned, so only the upper bound can fail */
+                       in->watchdog.mode = vals[0];
+                       P_DEBUG(3, "setting watchdog mode (%d)\n", vals[0]);
+               } else {
+                       in->watchdog.mode = 0;
+                       P_DEBUG(3, "unknown watchdog mode (%d) given, using default (0)\n", vals[0]);
+               }
+
+               if (vals[1] > 0) {
+                       in->watchdog.freq = vals[1];
+                       P_DEBUG(3, "setting watchdog timeout (%d)\n", vals[1]); /* report the timeout, not the mode */
+               } else {
+                       in->watchdog.freq = 30;
+                       P_DEBUG(3, "invalid watchdog timeout (%d) given, using default (30)\n", vals[1]);
+               }
+       }
+
+       if (!in->trace && tb[INSTANCE_ATTR_JAIL])
+               in->has_jail = instance_jail_parse(in, tb[INSTANCE_ATTR_JAIL]);
+
+       if (in->has_jail) {
+               r = stat(UJAIL_BIN_PATH, &s);
+               if (r < 0) {
+                       if (in->require_jail) {
+                               ERROR("Cannot jail service %s::%s. %s: %m (%d)\n",
+                                               in->srv->name, in->name, UJAIL_BIN_PATH, r);
+                               return false;
+                       }
+                       P_DEBUG(2, "unable to find %s: %m (%d)\n", UJAIL_BIN_PATH, r);
+                       in->has_jail = false;
+               }
        }
 
        return true;
@@ -993,6 +1486,21 @@ instance_config_cleanup(struct service_instance *in)
        blobmsg_list_free(&in->limits);
        blobmsg_list_free(&in->errors);
        blobmsg_list_free(&in->jail.mount);
+       blobmsg_list_free(&in->jail.setns);
+}
+
+static void
+instance_config_move_strdup(char **dst, char *src) /* replace *dst with a private copy of src */
+{
+       if (*dst) {
+               free(*dst); /* release the string the destination held so far */
+               *dst = NULL;
+       }
+
+       if (!src)
+               return; /* no source string: destination stays NULL */
+
+       *dst = strdup(src); /* src is left untouched; NOTE(review): strdup() result is not checked */
 }
 
 static void
@@ -1006,18 +1514,53 @@ instance_config_move(struct service_instance *in, struct service_instance *in_sr
        blobmsg_list_move(&in->limits, &in_src->limits);
        blobmsg_list_move(&in->errors, &in_src->errors);
        blobmsg_list_move(&in->jail.mount, &in_src->jail.mount);
+       blobmsg_list_move(&in->jail.setns, &in_src->jail.setns);
        in->trigger = in_src->trigger;
        in->command = in_src->command;
-       in->pidfile = in_src->pidfile;
        in->respawn = in_src->respawn;
        in->respawn_retry = in_src->respawn_retry;
        in->respawn_threshold = in_src->respawn_threshold;
        in->respawn_timeout = in_src->respawn_timeout;
+       in->reload_signal = in_src->reload_signal;
+       in->term_timeout = in_src->term_timeout;
+       in->watchdog.mode = in_src->watchdog.mode;
+       in->watchdog.freq = in_src->watchdog.freq;
+       in->watchdog.timeout = in_src->watchdog.timeout;
        in->name = in_src->name;
+       in->nice = in_src->nice;
        in->trace = in_src->trace;
-       in->seccomp = in_src->seccomp;
        in->node.avl.key = in_src->node.avl.key;
        in->syslog_facility = in_src->syslog_facility;
+       in->require_jail = in_src->require_jail;
+       in->no_new_privs = in_src->no_new_privs;
+       in->immediately = in_src->immediately;
+       in->uid = in_src->uid;
+       in->pw_gid = in_src->pw_gid;
+       in->gr_gid = in_src->gr_gid;
+
+       in->has_jail = in_src->has_jail;
+       in->jail.procfs = in_src->jail.procfs;
+       in->jail.sysfs = in_src->jail.sysfs;
+       in->jail.ubus = in_src->jail.ubus;
+       in->jail.log = in_src->jail.log;
+       in->jail.ronly = in_src->jail.ronly;
+       in->jail.netns = in_src->jail.netns;
+       in->jail.cgroupsns = in_src->jail.cgroupsns;
+       in->jail.console = in_src->jail.console;
+       in->jail.argc = in_src->jail.argc;
+
+       instance_config_move_strdup(&in->pidfile, in_src->pidfile);
+       instance_config_move_strdup(&in->seccomp, in_src->seccomp);
+       instance_config_move_strdup(&in->capabilities, in_src->capabilities);
+       instance_config_move_strdup(&in->bundle, in_src->bundle);
+       instance_config_move_strdup(&in->extroot, in_src->extroot);
+       instance_config_move_strdup(&in->overlaydir, in_src->overlaydir);
+       instance_config_move_strdup(&in->tmpoverlaysize, in_src->tmpoverlaysize);
+       instance_config_move_strdup(&in->user, in_src->user);
+       instance_config_move_strdup(&in->group, in_src->group);
+       instance_config_move_strdup(&in->jail.name, in_src->jail.name);
+       instance_config_move_strdup(&in->jail.hostname, in_src->jail.hostname);
+       instance_config_move_strdup(&in->jail.pidfile, in_src->jail.pidfile);
 
        free(in->config);
        in->config = in_src->config;
@@ -1048,12 +1591,23 @@ instance_free(struct service_instance *in)
        instance_free_stdio(in);
        uloop_process_delete(&in->proc);
        uloop_timeout_cancel(&in->timeout);
+       uloop_timeout_cancel(&in->watchdog.timeout);
        trigger_del(in);
        watch_del(in);
        instance_config_cleanup(in);
        free(in->config);
        free(in->user);
        free(in->group);
+       free(in->extroot);
+       free(in->overlaydir);
+       free(in->tmpoverlaysize);
+       free(in->bundle);
+       free(in->jail.name);
+       free(in->jail.hostname);
+       free(in->jail.pidfile);
+       free(in->seccomp);
+       free(in->capabilities);
+       free(in->pidfile);
        free(in);
 }
 
@@ -1068,6 +1622,9 @@ instance_init(struct service_instance *in, struct service *s, struct blob_attr *
        in->proc.cb = instance_exit;
        in->term_timeout = 5;
        in->syslog_facility = LOG_DAEMON;
+       in->exit_code = 0;
+       in->require_jail = false;
+       in->immediately = false;
 
        in->_stdout.fd.fd = -2;
        in->_stdout.stream.string_data = true;
@@ -1077,6 +1634,14 @@ instance_init(struct service_instance *in, struct service *s, struct blob_attr *
        in->_stderr.stream.string_data = true;
        in->_stderr.stream.notify_read = instance_stderr;
 
+       in->console.fd.fd = -2;
+       in->console.stream.string_data = true;
+       in->console.stream.notify_read = instance_console;
+
+       in->console_client.fd.fd = -2;
+       in->console_client.stream.string_data = true;
+       in->console_client.stream.notify_read = instance_console_client;
+
        blobmsg_list_init(&in->netdev, struct instance_netdev, node, instance_netdev_cmp);
        blobmsg_list_init(&in->file, struct instance_file, node, instance_file_cmp);
        blobmsg_list_simple_init(&in->env);
@@ -1084,6 +1649,10 @@ instance_init(struct service_instance *in, struct service *s, struct blob_attr *
        blobmsg_list_simple_init(&in->limits);
        blobmsg_list_simple_init(&in->errors);
        blobmsg_list_simple_init(&in->jail.mount);
+       blobmsg_list_simple_init(&in->jail.setns);
+
+       in->watchdog.timeout.cb = instance_watchdog;
+
        in->valid = instance_config_parse(in);
 }
 
@@ -1100,7 +1669,11 @@ void instance_dump(struct blob_buf *b, struct service_instance *in, int verbose)
                blobmsg_add_u32(b, "pid", in->proc.pid);
        if (in->command)
                blobmsg_add_blob(b, in->command);
+       if (in->bundle)
+               blobmsg_add_string(b, "bundle", in->bundle);
        blobmsg_add_u32(b, "term_timeout", in->term_timeout);
+       if (!in->proc.pending)
+               blobmsg_add_u32(b, "exit_code", in->exit_code);
 
        if (!avl_is_empty(&in->errors.avl)) {
                struct blobmsg_list_node *var;
@@ -1134,6 +1707,15 @@ void instance_dump(struct blob_buf *b, struct service_instance *in, int verbose)
                blobmsg_close_table(b, e);
        }
 
+       if (!avl_is_empty(&in->netdev.avl)) {
+               struct blobmsg_list_node *var;
+               void *n = blobmsg_open_array(b, "netdev");
+
+               blobmsg_list_for_each(&in->netdev, var)
+                       blobmsg_add_string(b, NULL, blobmsg_data(var->data));
+               blobmsg_close_array(b, n);
+       }
+
        if (in->reload_signal)
                blobmsg_add_u32(b, "reload_signal", in->reload_signal);
 
@@ -1154,6 +1736,9 @@ void instance_dump(struct blob_buf *b, struct service_instance *in, int verbose)
        if (in->seccomp)
                blobmsg_add_string(b, "seccomp", in->seccomp);
 
+       if (in->capabilities)
+               blobmsg_add_string(b, "capabilities", in->capabilities);
+
        if (in->pidfile)
                blobmsg_add_string(b, "pidfile", in->pidfile);
 
@@ -1167,13 +1752,25 @@ void instance_dump(struct blob_buf *b, struct service_instance *in, int verbose)
                void *r = blobmsg_open_table(b, "jail");
                if (in->jail.name)
                        blobmsg_add_string(b, "name", in->jail.name);
-               if (in->jail.hostname)
-                       blobmsg_add_string(b, "hostname", in->jail.hostname);
-               blobmsg_add_u8(b, "procfs", in->jail.procfs);
-               blobmsg_add_u8(b, "sysfs", in->jail.sysfs);
-               blobmsg_add_u8(b, "ubus", in->jail.ubus);
-               blobmsg_add_u8(b, "log", in->jail.log);
-               blobmsg_add_u8(b, "ronly", in->jail.ronly);
+               if (!in->bundle) {
+                       if (in->jail.hostname)
+                               blobmsg_add_string(b, "hostname", in->jail.hostname);
+
+                       blobmsg_add_u8(b, "procfs", in->jail.procfs);
+                       blobmsg_add_u8(b, "sysfs", in->jail.sysfs);
+                       blobmsg_add_u8(b, "ubus", in->jail.ubus);
+                       blobmsg_add_u8(b, "log", in->jail.log);
+                       blobmsg_add_u8(b, "ronly", in->jail.ronly);
+                       blobmsg_add_u8(b, "netns", in->jail.netns);
+                       blobmsg_add_u8(b, "userns", in->jail.userns);
+                       blobmsg_add_u8(b, "cgroupsns", in->jail.cgroupsns);
+               } else {
+                       if (in->jail.pidfile)
+                               blobmsg_add_string(b, "pidfile", in->jail.pidfile);
+
+                       blobmsg_add_u8(b, "immediately", in->immediately);
+               }
+               blobmsg_add_u8(b, "console", (in->console.fd.fd > -1));
                blobmsg_close_table(b, r);
                if (!avl_is_empty(&in->jail.mount.avl)) {
                        struct blobmsg_list_node *var;
@@ -1182,10 +1779,32 @@ void instance_dump(struct blob_buf *b, struct service_instance *in, int verbose)
                                blobmsg_add_string(b, blobmsg_name(var->data), blobmsg_data(var->data));
                        blobmsg_close_table(b, e);
                }
+
+               if (!avl_is_empty(&in->jail.setns.avl)) {
+                       struct blobmsg_list_node *var;
+                       void *s = blobmsg_open_array(b, "setns");
+                       blobmsg_list_for_each(&in->jail.setns, var)
+                               blobmsg_add_blob(b, var->data);
+                       blobmsg_close_array(b, s);
+               }
        }
 
+       if (in->extroot)
+               blobmsg_add_string(b, "extroot", in->extroot);
+       if (in->overlaydir)
+               blobmsg_add_string(b, "overlaydir", in->overlaydir);
+       if (in->tmpoverlaysize)
+               blobmsg_add_string(b, "tmpoverlaysize", in->tmpoverlaysize);
+
        if (verbose && in->trigger)
                blobmsg_add_blob(b, in->trigger);
 
+       if (in->watchdog.mode != INSTANCE_WATCHDOG_MODE_DISABLED) {
+               void *r = blobmsg_open_table(b, "watchdog");
+               blobmsg_add_u32(b, "mode", in->watchdog.mode);
+               blobmsg_add_u32(b, "timeout", in->watchdog.freq);
+               blobmsg_close_table(b, r);
+       }
+
        blobmsg_close_table(b, i);
 }
index 42cc4be1899383ee01a6aa8e470cef1f38c860d5..15eb99714c65bffd51d46d046b0bd19d0ada503a 100644 (file)
@@ -21,6 +21,7 @@
 #include "../utils/utils.h"
 
 #define RESPAWN_ERROR  (5 * 60)
+#define SIGNALLED_OFFSET 128
 
 struct jail {
        bool procfs;
@@ -28,12 +29,31 @@ struct jail {
        bool ubus;
        bool log;
        bool ronly;
+       bool netns;
+       bool userns;
+       bool cgroupsns;
+       bool console;
        char *name;
        char *hostname;
+       char *pidfile;
        struct blobmsg_list mount;
+       struct blobmsg_list setns;
        int argc;
 };
 
+typedef enum instance_watchdog {
+       INSTANCE_WATCHDOG_MODE_DISABLED, /* value 0; no "watchdog" table is dumped in this mode */
+       INSTANCE_WATCHDOG_MODE_PASSIVE,
+       INSTANCE_WATCHDOG_MODE_ACTIVE,
+       __INSTANCE_WATCHDOG_MODE_MAX, /* count of modes; exclusive upper bound when validating config */
+} instance_watchdog_mode_t;
+
+struct watchdog {
+       instance_watchdog_mode_t mode;
+       uint32_t freq; /* dumped under the key "timeout"; config parsing falls back to 30 */
+       struct uloop_timeout timeout; /* uloop timer; instance_init() sets its cb to instance_watchdog */
+};
+
 struct service_instance {
        struct vlist_node node;
        struct service *srv;
@@ -57,11 +77,19 @@ struct service_instance {
 
        bool trace;
        bool has_jail;
+       bool require_jail;
+       bool immediately;
        bool no_new_privs;
        struct jail jail;
        char *seccomp;
+       char *capabilities;
        char *pidfile;
+       char *extroot;
+       char *overlaydir;
+       char *tmpoverlaysize;
+       char *bundle;
        int syslog_facility;
+       int exit_code;
 
        uint32_t term_timeout;
        uint32_t respawn_timeout;
@@ -73,6 +101,8 @@ struct service_instance {
        struct uloop_timeout timeout;
        struct ustream_fd _stdout;
        struct ustream_fd _stderr;
+       struct ustream_fd console;
+       struct ustream_fd console_client;
 
        struct blob_attr *command;
        struct blob_attr *trigger;
@@ -82,6 +112,8 @@ struct service_instance {
        struct blobmsg_list file;
        struct blobmsg_list limits;
        struct blobmsg_list errors;
+
+       struct watchdog watchdog;
 };
 
 void instance_start(struct service_instance *in);
index 755147c45b59a39a6e8327864e086c123f421ea8..bd0e2903a439a8a8748ddc2507b213e3c8260b8a 100644 (file)
  * GNU General Public License for more details.
  */
 
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/utsname.h>
+#include <sys/types.h>
+#include <fcntl.h>
+
+#include <unistd.h>
+#include <sched.h>
+
 #include <libubox/blobmsg_json.h>
 #include <libubox/avl-cmp.h>
 
@@ -23,6 +32,7 @@
 #include "../rcS.h"
 
 AVL_TREE(services, avl_strcmp, false, NULL);
+AVL_TREE(containers, avl_strcmp, false, NULL);
 static struct blob_buf b;
 static struct ubus_context *ctx;
 static struct ubus_object main_object;
@@ -56,14 +66,14 @@ service_instance_update(struct vlist_tree *tree, struct vlist_node *node_new,
                in_n = container_of(node_new, struct service_instance, node);
 
        if (in_o && in_n) {
-               DEBUG(2, "Update instance %s::%s\n", in_o->srv->name, in_o->name);
+               P_DEBUG(2, "Update instance %s::%s\n", in_o->srv->name, in_o->name);
                instance_update(in_o, in_n);
                instance_free(in_n);
        } else if (in_o) {
-               DEBUG(2, "Stop instance %s::%s\n", in_o->srv->name, in_o->name);
+               P_DEBUG(2, "Stop instance %s::%s\n", in_o->srv->name, in_o->name);
                instance_stop(in_o, true);
        } else if (in_n && in_n->srv->autostart) {
-               DEBUG(2, "Start instance %s::%s\n", in_n->srv->name, in_n->name);
+               P_DEBUG(2, "Start instance %s::%s\n", in_n->srv->name, in_n->name);
                instance_start(in_n);
        }
        blob_buf_init(&b, 0);
@@ -173,7 +183,7 @@ service_update(struct service *s, struct blob_attr **tb, bool add)
 }
 
 static void
-service_delete(struct service *s)
+service_delete(struct service *s, bool container)
 {
        blobmsg_list_free(&s->data_blob);
        free(s->data);
@@ -274,6 +284,116 @@ static const struct blobmsg_policy get_data_policy[] = {
        [DATA_TYPE] = { "type", BLOBMSG_TYPE_STRING },
 };
 
+enum {
+       CONTAINER_CONSOLE_NAME,
+       CONTAINER_CONSOLE_INSTANCE,
+       __CONTAINER_CONSOLE_MAX,
+};
+
+static const struct blobmsg_policy container_console_policy[__CONTAINER_CONSOLE_MAX] = {
+       [CONTAINER_CONSOLE_NAME] = { "name", BLOBMSG_TYPE_STRING },
+       [CONTAINER_CONSOLE_INSTANCE] = { "instance", BLOBMSG_TYPE_STRING },
+};
+
+static inline bool is_container_obj(struct ubus_object *obj) /* true when obj is non-NULL and named exactly "container" */
+{
+       return (obj && (strcmp(obj->name, "container") == 0));
+}
+
+static inline void put_namespace(struct blob_buf *b, char *name) /* append name to b if /proc/self/ns/<name> exists */
+{
+       char nsfname[32]; /* "/proc/self/ns/" plus name; snprintf() truncates anything longer */
+       struct stat statbuf;
+
+       snprintf(nsfname, sizeof(nsfname), "/proc/self/ns/%s", name);
+
+       if (!stat(nsfname, &statbuf)) /* stat() returns 0 when the path can be stat'ed */
+               blobmsg_add_string(b, NULL, name);
+}
+
+static void put_cgroups(struct blob_buf *b) /* add one string per entry of cgroup.controllers to b */
+{
+       int fd, ret;
+       static char buf[512] = "";
+       char *t, *z;
+
+       fd = open("/sys/fs/cgroup/cgroup.controllers", O_RDONLY);
+       if (fd == -1)
+               return;
+
+       ret = read(fd, buf, sizeof(buf) - 1); /* leave one byte free for the terminator */
+       /* terminate right after the bytes read so stale data from an earlier call is never parsed */
+       buf[ret > 0 ? ret : 0] = '\0';
+
+       close(fd);
+
+       if (ret < 2)
+               return;
+
+       t = buf;
+       while(t) {
+               z = t;
+               /* replace space with \0 and direct next entry */
+               t = strchr(z, ' ');
+               if (t) {
+                       *(t++) = '\0';
+               } else { /* replace trailing new-line with \0 */
+                       t = strchr(z, '\n');
+                       if (!t) /* shouldn't happen, but don't segfault if it does */
+                               break;
+
+                       *t = '\0';
+                       t = NULL;
+               }
+               blobmsg_add_string(b, NULL, z);
+       }
+}
+
+static int
+container_handle_features(struct ubus_context *ctx, struct ubus_object *obj,
+                   struct ubus_request_data *req, const char *method,
+                   struct blob_attr *msg) /* ubus handler: reply with machine, seccomp, cgroup and namespace info */
+{
+       struct utsname utsbuf;
+       struct stat statbuf;
+       void *nsarray, *cgarray;
+
+       if (stat("/sbin/ujail", &statbuf)) /* NOTE(review): hard-coded path; instance_config_parse() checks UJAIL_BIN_PATH - confirm they agree */
+               return UBUS_STATUS_NOT_SUPPORTED;
+
+       if (uname(&utsbuf) < 0)
+               return UBUS_STATUS_UNKNOWN_ERROR;
+
+       blob_buf_init(&b, 0); /* b is the file-scope reply buffer */
+       blobmsg_add_string(&b, "machine", utsbuf.machine);
+
+#ifdef SECCOMP_SUPPORT
+       blobmsg_add_u8(&b, "seccomp", true);
+#else
+       blobmsg_add_u8(&b, "seccomp", false);
+#endif
+
+       cgarray = blobmsg_open_array(&b, "cgroup");
+       put_cgroups(&b); /* array stays empty when cgroup.controllers cannot be read */
+       blobmsg_close_array(&b, cgarray);
+
+       nsarray = blobmsg_open_array(&b, "namespaces");
+       put_namespace(&b, "cgroup"); /* each call adds its name only if /proc/self/ns/<name> exists */
+       put_namespace(&b, "ipc");
+       put_namespace(&b, "mnt");
+       put_namespace(&b, "net");
+       put_namespace(&b, "pid");
+#ifdef CLONE_NEWTIME
+       put_namespace(&b, "time");
+#endif
+       put_namespace(&b, "user");
+       put_namespace(&b, "uts");
+       blobmsg_close_array(&b, nsarray);
+       ubus_send_reply(ctx, req, b.head);
+
+       return UBUS_STATUS_OK;
+}
+
 static int
 service_handle_set(struct ubus_context *ctx, struct ubus_object *obj,
                   struct ubus_request_data *req, const char *method,
@@ -282,6 +402,7 @@ service_handle_set(struct ubus_context *ctx, struct ubus_object *obj,
        struct blob_attr *tb[__SERVICE_SET_MAX], *cur;
        struct service *s = NULL;
        const char *name;
+       bool container = is_container_obj(obj);
        bool add = !strcmp(method, "add");
        int ret;
 
@@ -292,25 +413,36 @@ service_handle_set(struct ubus_context *ctx, struct ubus_object *obj,
 
        name = blobmsg_data(cur);
 
-       s = avl_find_element(&services, name, s, avl);
+       if (container)
+               s = avl_find_element(&containers, name, s, avl);
+       else
+               s = avl_find_element(&services, name, s, avl);
+
        if (s) {
-               DEBUG(2, "Update service %s\n", name);
+               P_DEBUG(2, "Update service %s\n", name);
                return service_update(s, tb, add);
        }
 
-       DEBUG(2, "Create service %s\n", name);
+       P_DEBUG(2, "Create service %s\n", name);
        s = service_alloc(name);
        if (!s)
                return UBUS_STATUS_UNKNOWN_ERROR;
 
+       s->container = container;
+
        ret = service_update(s, tb, add);
        if (ret)
                return ret;
 
-       avl_insert(&services, &s->avl);
+       if (container) {
+               avl_insert(&containers, &s->avl);
 
-       service_event("service.start", s->name, NULL);
+               service_event("container.start", s->name, NULL);
+       } else {
+               avl_insert(&services, &s->avl);
 
+               service_event("service.start", s->name, NULL);
+       }
        return 0;
 }
 
@@ -355,6 +487,8 @@ service_handle_list(struct ubus_context *ctx, struct ubus_object *obj,
        struct service *s;
        const char *name = NULL;
        bool verbose = false;
+       bool container = is_container_obj(obj);
+       const struct avl_tree *tree = container?&containers:&services;
 
        blobmsg_parse(service_list_attrs, __SERVICE_LIST_ATTR_MAX, tb, blobmsg_data(msg), blobmsg_data_len(msg));
 
@@ -364,7 +498,7 @@ service_handle_list(struct ubus_context *ctx, struct ubus_object *obj,
                name = blobmsg_get_string(tb[SERVICE_LIST_ATTR_NAME]);
 
        blob_buf_init(&b, 0);
-       avl_for_each_element(&services, s, avl) {
+       avl_for_each_element(tree, s, avl) {
                if (name && strcmp(s->name, name) != 0)
                        continue;
 
@@ -384,6 +518,7 @@ service_handle_delete(struct ubus_context *ctx, struct ubus_object *obj,
        struct blob_attr *tb[__SERVICE_DEL_ATTR_MAX], *cur;
        struct service *s;
        struct service_instance *in;
+       bool container = is_container_obj(obj);
 
        blobmsg_parse(service_del_attrs, __SERVICE_DEL_ATTR_MAX, tb, blobmsg_data(msg), blobmsg_data_len(msg));
 
@@ -391,13 +526,17 @@ service_handle_delete(struct ubus_context *ctx, struct ubus_object *obj,
        if (!cur)
                return UBUS_STATUS_NOT_FOUND;
 
-       s = avl_find_element(&services, blobmsg_data(cur), s, avl);
+       if (container)
+               s = avl_find_element(&containers, blobmsg_data(cur), s, avl);
+       else
+               s = avl_find_element(&services, blobmsg_data(cur), s, avl);
+
        if (!s)
                return UBUS_STATUS_NOT_FOUND;
 
        cur = tb[SERVICE_DEL_ATTR_INSTANCE];
        if (!cur) {
-               service_delete(s);
+               service_delete(s, container);
                return 0;
        }
 
@@ -435,6 +574,7 @@ service_handle_signal(struct ubus_context *ctx, struct ubus_object *obj,
        struct blob_attr *tb[__SERVICE_SIGNAL_ATTR_MAX], *cur;
        struct service *s;
        struct service_instance *in;
+       bool container = is_container_obj(obj);
        int sig = SIGHUP;
        int rv = 0;
 
@@ -448,7 +588,11 @@ service_handle_signal(struct ubus_context *ctx, struct ubus_object *obj,
        if (!cur)
                return UBUS_STATUS_NOT_FOUND;
 
-       s = avl_find_element(&services, blobmsg_data(cur), s, avl);
+       if (container)
+               s = avl_find_element(&containers, blobmsg_data(cur), s, avl);
+       else
+               s = avl_find_element(&services, blobmsg_data(cur), s, avl);
+
        if (!s)
                return UBUS_STATUS_NOT_FOUND;
 
@@ -477,6 +621,7 @@ service_handle_state(struct ubus_context *ctx, struct ubus_object *obj,
        struct blob_attr *tb[__SERVICE_STATE_ATTR_MAX];
        struct service *s;
        struct service_instance *in;
+       bool container = is_container_obj(obj);
        int spawn;
 
        blobmsg_parse(service_state_attrs, __SERVICE_STATE_ATTR_MAX, tb, blobmsg_data(msg), blobmsg_data_len(msg));
@@ -487,7 +632,11 @@ service_handle_state(struct ubus_context *ctx, struct ubus_object *obj,
        if (!tb[SERVICE_STATE_ATTR_NAME])
                return UBUS_STATUS_NOT_FOUND;
 
-       s = avl_find_element(&services, blobmsg_data(tb[SERVICE_STATE_ATTR_NAME]), s, avl);
+       if (container)
+               s = avl_find_element(&containers, blobmsg_data(tb[SERVICE_STATE_ATTR_NAME]), s, avl);
+       else
+               s = avl_find_element(&services, blobmsg_data(tb[SERVICE_STATE_ATTR_NAME]), s, avl);
+
        if (!s)
                return UBUS_STATUS_NOT_FOUND;
 
@@ -504,6 +653,34 @@ service_handle_state(struct ubus_context *ctx, struct ubus_object *obj,
        return UBUS_STATUS_OK;
 }
 
+static void
+service_avl_stop_all(struct avl_tree *sctree, unsigned int *term_timeout) /* stop every instance in sctree, tracking the largest term_timeout */
+{
+       struct service *s;
+
+       avl_for_each_element(sctree, s, avl) {
+               struct service_instance *in, *ptr;
+
+               vlist_for_each_element_safe(&s->instances, in, node, ptr) {
+                       if (in->term_timeout > *term_timeout) /* *term_timeout is only ever raised, never lowered */
+                               *term_timeout = in->term_timeout;
+                       instance_stop(in, true);
+               }
+       }
+}
+
+void
+service_stop_all(void) /* stop all container instances, then all service instances, then call procd_inittab_kill() */
+{
+       unsigned int term_timeout = 0; /* ends up as the largest term_timeout among the stopped instances */
+
+       service_avl_stop_all(&containers, &term_timeout);
+       service_avl_stop_all(&services, &term_timeout);
+       procd_inittab_kill();
+
+       sleep(term_timeout); /* blocks the caller for that many seconds */
+}
+
 static int
 service_handle_update(struct ubus_context *ctx, struct ubus_object *obj,
                      struct ubus_request_data *req, const char *method,
@@ -511,6 +688,7 @@ service_handle_update(struct ubus_context *ctx, struct ubus_object *obj,
 {
        struct blob_attr *tb[__SERVICE_ATTR_MAX], *cur;
        struct service *s;
+       bool container = is_container_obj(obj);
 
        blobmsg_parse(service_attrs, __SERVICE_ATTR_MAX, tb, blobmsg_data(msg), blobmsg_data_len(msg));
 
@@ -518,7 +696,11 @@ service_handle_update(struct ubus_context *ctx, struct ubus_object *obj,
        if (!cur)
                return UBUS_STATUS_INVALID_ARGUMENT;
 
-       s = avl_find_element(&services, blobmsg_data(cur), s, avl);
+       if (container)
+               s = avl_find_element(&containers, blobmsg_data(cur), s, avl);
+       else
+               s = avl_find_element(&services, blobmsg_data(cur), s, avl);
+
        if (!s)
                return UBUS_STATUS_NOT_FOUND;
 
@@ -672,6 +854,129 @@ service_get_data(struct ubus_context *ctx, struct ubus_object *obj,
        return 0;
 }
 
+/*
+ * ubus handler shared by the "console_set" and "console_attach" methods of
+ * the container object.
+ *
+ * The request must carry a file descriptor (fetched with
+ * ubus_request_get_caller_fd()) and a container name; the instance name is
+ * optional, the first instance of the container is used when it is absent.
+ *
+ * "console_attach" installs the fd as the instance's console client,
+ * closing a previously attached client fd first. Any other method name
+ * installs the fd as the instance's console.
+ *
+ * Returns UBUS_STATUS_OK on success, UBUS_STATUS_NOT_SUPPORTED when
+ * attaching to an instance that has no console fd, and
+ * UBUS_STATUS_INVALID_ARGUMENT for every other failure. The caller's fd is
+ * closed on all failure paths that obtained one.
+ */
+static int
+container_handle_console(struct ubus_context *ctx, struct ubus_object *obj,
+                        struct ubus_request_data *req, const char *method,
+                        struct blob_attr *msg)
+{
+       bool attach = !strcmp(method, "console_attach");
+       struct blob_attr *tb[__CONTAINER_CONSOLE_MAX];
+       struct service *s;
+       struct service_instance *in = NULL;
+       int console_fd;
+
+       console_fd = ubus_request_get_caller_fd(req);
+       if (console_fd < 0)
+               return UBUS_STATUS_INVALID_ARGUMENT;
+
+       if (!msg)
+               goto err_console_fd;
+
+       blobmsg_parse(container_console_policy, __CONTAINER_CONSOLE_MAX, tb, blobmsg_data(msg), blobmsg_data_len(msg));
+       if (!tb[CONTAINER_CONSOLE_NAME])
+               goto err_console_fd;
+
+       s = avl_find_element(&containers, blobmsg_data(tb[CONTAINER_CONSOLE_NAME]), s, avl);
+       if (!s)
+               goto err_console_fd;
+
+       if (tb[CONTAINER_CONSOLE_INSTANCE]) {
+               in = vlist_find(&s->instances, blobmsg_data(tb[CONTAINER_CONSOLE_INSTANCE]), in, node);
+       } else if (!avl_is_empty(&s->instances.avl)) {
+               /*
+                * Use first element in instances list. The emptiness check
+                * above matters: on an empty list the iteration macro leaves
+                * its cursor pointing at a pseudo-element computed from the
+                * list head instead of NULL, so 'in' must stay NULL here.
+                */
+               vlist_for_each_element(&s->instances, in, node)
+                       break;
+       }
+       if (!in)
+               goto err_console_fd;
+
+       if (attach) {
+               if (in->console.fd.fd < 0) {
+                       close(console_fd);
+                       return UBUS_STATUS_NOT_SUPPORTED;
+               }
+
+               /* close and replace existing attached console */
+               if (in->console_client.fd.fd > -1)
+                       close(in->console_client.fd.fd);
+
+               ustream_fd_init(&in->console_client, console_fd);
+       } else {
+               ustream_fd_init(&in->console, console_fd);
+       }
+
+       return UBUS_STATUS_OK;
+err_console_fd:
+       close(console_fd);
+       return UBUS_STATUS_INVALID_ARGUMENT;
+}
+
+/* Indices into the parsed attribute table of the "watchdog" method */
+enum {
+       SERVICE_WATCHDOG_MODE,
+       SERVICE_WATCHDOG_TIMEOUT,
+       SERVICE_WATCHDOG_NAME,
+       SERVICE_WATCHDOG_INSTANCE,
+       __SERVICE_WATCHDOG_MAX,
+};
+
+/* blobmsg parsing policy for the "watchdog" method, listed in enum order */
+static const struct blobmsg_policy service_watchdog_policy[__SERVICE_WATCHDOG_MAX] = {
+       [SERVICE_WATCHDOG_MODE] = { .name = "mode", .type = BLOBMSG_TYPE_INT32 },
+       [SERVICE_WATCHDOG_TIMEOUT] = { .name = "timeout", .type = BLOBMSG_TYPE_INT32 },
+       [SERVICE_WATCHDOG_NAME] = { .name = "name", .type = BLOBMSG_TYPE_STRING },
+       [SERVICE_WATCHDOG_INSTANCE] = { .name = "instance", .type = BLOBMSG_TYPE_STRING },
+};
+
+/*
+ * ubus handler for the "watchdog" method of the service object.
+ *
+ * Looks up the instance named by the "name" (service) and "instance"
+ * attributes, optionally updates its watchdog mode and/or timeout, then
+ * cancels the instance's watchdog timer when the resulting mode is
+ * INSTANCE_WATCHDOG_MODE_DISABLED or (re)arms it with freq * 1000
+ * otherwise. The reply echoes name and instance together with the
+ * resulting mode and timeout.
+ *
+ * Returns UBUS_STATUS_INVALID_ARGUMENT when the request has no message,
+ * UBUS_STATUS_NOT_FOUND when the name or instance attribute is missing or
+ * does not match a known service/instance, UBUS_STATUS_OK otherwise.
+ */
+static int
+service_handle_watchdog(struct ubus_context *ctx, struct ubus_object *obj,
+                   struct ubus_request_data *req, const char *method,
+                   struct blob_attr *msg)
+{
+       struct blob_attr *tb[__SERVICE_WATCHDOG_MAX] = {0};
+       struct service *s;
+       struct blob_attr *cur;
+       struct service_instance *in;
+
+       /* same guard as the other handlers: never parse a missing message */
+       if (!msg)
+               return UBUS_STATUS_INVALID_ARGUMENT;
+
+       blobmsg_parse(service_watchdog_policy, __SERVICE_WATCHDOG_MAX, tb, blobmsg_data(msg), blobmsg_data_len(msg));
+       cur = tb[SERVICE_WATCHDOG_NAME];
+       if (!cur)
+               return UBUS_STATUS_NOT_FOUND;
+
+       s = avl_find_element(&services, blobmsg_data(cur), s, avl);
+       if (!s)
+               return UBUS_STATUS_NOT_FOUND;
+
+       cur = tb[SERVICE_WATCHDOG_INSTANCE];
+       if (!cur)
+               return UBUS_STATUS_NOT_FOUND;
+
+       in = vlist_find(&s->instances, blobmsg_data(cur), in, node);
+       if (!in) {
+               ERROR("instance %s not found\n", blobmsg_get_string(cur));
+               return UBUS_STATUS_NOT_FOUND;
+       }
+
+       if (tb[SERVICE_WATCHDOG_MODE])
+               in->watchdog.mode = blobmsg_get_u32(tb[SERVICE_WATCHDOG_MODE]);
+
+       if (tb[SERVICE_WATCHDOG_TIMEOUT])
+               in->watchdog.freq = blobmsg_get_u32(tb[SERVICE_WATCHDOG_TIMEOUT]);
+
+       /* apply the (possibly unchanged) settings to the timer */
+       if (in->watchdog.mode == INSTANCE_WATCHDOG_MODE_DISABLED)
+               uloop_timeout_cancel(&in->watchdog.timeout);
+       else
+               uloop_timeout_set(&in->watchdog.timeout, in->watchdog.freq * 1000);
+
+       blob_buf_init(&b, 0);
+       blobmsg_add_string(&b, "name", blobmsg_get_string(tb[SERVICE_WATCHDOG_NAME]));
+       blobmsg_add_string(&b, "instance", blobmsg_get_string(tb[SERVICE_WATCHDOG_INSTANCE]));
+       blobmsg_add_u32(&b, "mode", in->watchdog.mode);
+       blobmsg_add_u32(&b, "timeout", in->watchdog.freq);
+
+       ubus_send_reply(ctx, req, b.head);
+
+       return UBUS_STATUS_OK;
+}
+
 static struct ubus_method main_object_methods[] = {
        UBUS_METHOD("set", service_handle_set, service_set_attrs),
        UBUS_METHOD("add", service_handle_set, service_set_attrs),
@@ -684,6 +989,7 @@ static struct ubus_method main_object_methods[] = {
        UBUS_METHOD("validate", service_handle_validate, validate_policy),
        UBUS_METHOD("get_data", service_get_data, get_data_policy),
        UBUS_METHOD("state", service_handle_state, service_state_attrs),
+       UBUS_METHOD("watchdog", service_handle_watchdog, service_watchdog_policy),
 };
 
 static struct ubus_object_type main_object_type =
@@ -697,7 +1003,7 @@ static struct ubus_object main_object = {
 };
 
 int
-service_start_early(char *name, char *cmdline)
+service_start_early(char *name, char *cmdline, char *user, char *group)
 {
        void *instances, *instance, *command, *respawn;
        char *t;
@@ -718,6 +1024,11 @@ service_start_early(char *name, char *cmdline)
        blobmsg_add_string(&b, NULL, "1");
        blobmsg_add_string(&b, NULL, "0");
        blobmsg_close_array(&b, respawn);
+       if (user)
+               blobmsg_add_string(&b, "user", user);
+       if (group)
+               blobmsg_add_string(&b, "group", group);
+
        blobmsg_close_table(&b, instance);
        blobmsg_close_table(&b, instances);
 
@@ -727,8 +1038,13 @@ service_start_early(char *name, char *cmdline)
 void service_stopped(struct service *s)
 {
        if (s->deleted && avl_is_empty(&s->instances.avl)) {
-               service_event("service.stop", s->name, NULL);
-               avl_delete(&services, &s->avl);
+               if (s->container) {
+                       service_event("container.stop", s->name, NULL);
+                       avl_delete(&containers, &s->avl);
+               } else {
+                       service_event("service.stop", s->name, NULL);
+                       avl_delete(&services, &s->avl);
+               }
                trigger_del(s);
                service_validate_del(s);
                free(s->trigger);
@@ -741,8 +1057,34 @@ void service_event(const char *type, const char *service, const char *instance)
        ubus_event_bcast(type, "service", service, "instance", instance);
 }
 
+/*
+ * Methods offered by the "container" ubus object. set/add/list/delete/state
+ * reuse the service_handle_* handlers; get_features and the two console
+ * methods have container-specific handlers. Both console methods are served
+ * by container_handle_console().
+ */
+static struct ubus_method container_object_methods[] = {
+       UBUS_METHOD("set", service_handle_set, service_set_attrs),
+       UBUS_METHOD("add", service_handle_set, service_set_attrs),
+       UBUS_METHOD("list", service_handle_list, service_list_attrs),
+       UBUS_METHOD("delete", service_handle_delete, service_del_attrs),
+       UBUS_METHOD("state", service_handle_state, service_state_attrs),
+       UBUS_METHOD_NOARG("get_features", container_handle_features),
+       UBUS_METHOD("console_set", container_handle_console, container_console_policy),
+       UBUS_METHOD("console_attach", container_handle_console, container_console_policy),
+};
+
+static struct ubus_object_type container_object_type =
+       UBUS_OBJECT_TYPE("container", container_object_methods);
+
+/* The "container" ubus object itself, built from the method table above */
+static struct ubus_object container_object = {
+       .name = "container",
+       .type = &container_object_type,
+       .methods = container_object_methods,
+       .n_methods = ARRAY_SIZE(container_object_methods),
+};
+
 void ubus_init_service(struct ubus_context *_ctx)
 {
+       struct stat statbuf;
+
        ctx = _ctx;
        ubus_add_object(ctx, &main_object);
+
+       /* expose the "container" object only if stat() finds /sbin/ujail */
+       if (!stat("/sbin/ujail", &statbuf))
+               ubus_add_object(ctx, &container_object);
 }
index 8f94ecdaa317c6e87a5731dbbd4480d00a8010d3..6ddc04ee3aa60e19957e68e6d493705f64caccd5 100644 (file)
@@ -21,6 +21,7 @@
 #include "../utils/utils.h"
 
 extern struct avl_tree services;
+extern struct avl_tree containers;
 
 struct vrule {
        struct avl_node avl;
@@ -43,6 +44,7 @@ struct service {
        const char *name;
        bool deleted;
        bool autostart;
+       bool container;
 
        struct blob_attr *trigger;
        struct vlist_tree instances;
@@ -54,10 +56,11 @@ struct service {
 void service_validate_add(struct service *s, struct blob_attr *attr);
 void service_validate_dump(struct blob_buf *b, struct service *s);
 void service_validate_dump_all(struct blob_buf *b, char *p, char *s);
-int service_start_early(char *name, char *cmdline);
+int service_start_early(char *name, char *cmdline, char *user, char *group);
 void service_stopped(struct service *s);
 void service_validate_del(struct service *s);
 void service_event(const char *type, const char *service, const char *instance);
+void service_stop_all(void);
 
 
 
index 440830b26b72ff9afeac701cdd1fb89c6234ef58..86cbc0f257678c2c31a5d36c2af28f624bce0273 100644 (file)
@@ -105,6 +105,7 @@ static void trigger_command_run(struct runqueue *q, struct runqueue_task *t)
        pid_t pid;
        int n = 0;
        int rem;
+       int fd;
 
        pid = fork();
        if (pid < 0) {
@@ -117,10 +118,12 @@ static void trigger_command_run(struct runqueue *q, struct runqueue_task *t)
                return;
        }
 
-       if (debug < 3) {
-               close(STDIN_FILENO);
-               close(STDOUT_FILENO);
-               close(STDERR_FILENO);
+       if (debug < 3 && (fd = open("/dev/null", O_RDWR)) >= 0) {
+               dup2(fd, STDIN_FILENO);
+               dup2(fd, STDOUT_FILENO);
+               dup2(fd, STDERR_FILENO);
+               if (fd > STDERR_FILENO)
+                       close(fd);
        }
 
        blobmsg_for_each_attr(cur, cmd->data, rem)
@@ -155,7 +158,7 @@ static void trigger_command_start(struct uloop_timeout *timeout)
 static void trigger_command_add(struct trigger *t, struct blob_attr *data)
 {
        struct trigger_command *cmd;
-       int remaining;
+       int64_t remaining;
 
        cmd = avl_find_element(&trigger_pending, data, cmd, avl);
        if (cmd) {
@@ -166,7 +169,7 @@ static void trigger_command_add(struct trigger *t, struct blob_attr *data)
                }
 
                /* Extend timer if trigger timeout is bigger than remaining time */
-               remaining = uloop_timeout_remaining(&cmd->delay);
+               remaining = uloop_timeout_remaining64(&cmd->delay);
                if (remaining < t->timeout)
                        uloop_timeout_set(&cmd->delay, t->timeout);
 
index 349b484a788edb8080835b294d8132a67f6f5f04..e6f8da0d9c7d6001e25f36906c5b1b8432e6be96 100644 (file)
@@ -40,7 +40,7 @@ static void watch_subscribe_cb(struct ubus_context *ctx, struct ubus_event_handl
        struct blob_attr *attr;
        const char *path;
 
-       DEBUG(3, "ubus event %s\n", type);
+       P_DEBUG(3, "ubus event %s\n", type);
        if (strcmp(type, "ubus.object.add") != 0)
                return;
 
@@ -49,7 +49,7 @@ static void watch_subscribe_cb(struct ubus_context *ctx, struct ubus_event_handl
                return;
 
        path = blobmsg_data(attr);
-       DEBUG(3, "ubus path %s\n", path);
+       P_DEBUG(3, "ubus path %s\n", path);
 
        list_for_each_entry(o, &watch_objects, list) {
                unsigned int id;
@@ -99,7 +99,7 @@ watch_notify_cb(struct ubus_context *ctx, struct ubus_object *obj,
                char *str;
 
                str = blobmsg_format_json(msg, true);
-               DEBUG(3, "Received ubus notify '%s': %s\n", method, str);
+               P_DEBUG(3, "Received ubus notify '%s': %s\n", method, str);
                free(str);
        }
 
diff --git a/state.c b/state.c
index ccf410454711880c1f3639f4a84bb8ea42e9a05c..fb81248fd7e7bc7b588a82a5e667424af188cbab 100644 (file)
--- a/state.c
+++ b/state.c
@@ -13,6 +13,7 @@
  */
 
 #include <fcntl.h>
+#include <pwd.h>
 #include <sys/reboot.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -20,6 +21,7 @@
 #include <sys/types.h>
 #include <signal.h>
 
+#include "container.h"
 #include "procd.h"
 #include "syslog.h"
 #include "plug/hotplug.h"
@@ -93,9 +95,38 @@ static void set_console(void)
                set_stdio(tty);
 }
 
+/*
+ * Log the pending action ("power down" when reboot_event is RB_POWER_OFF,
+ * "reboot" otherwise), wait a second and pass reboot_event to reboot().
+ *
+ * When is_container() is true, reboot() is called directly and the process
+ * then exits. Otherwise reboot() is issued from a vfork()ed child and the
+ * calling process sleeps forever, so this function does not return.
+ */
+static void perform_halt(void)
+{
+       if (reboot_event == RB_POWER_OFF)
+               LOG("- power down -\n");
+       else
+               LOG("- reboot -\n");
+
+       /* Allow time for last message to reach serial console, etc */
+       sleep(1);
+
+       if (is_container()) {
+               reboot(reboot_event);
+               exit(EXIT_SUCCESS);
+       }
+
+       /* We have to fork here, since the kernel calls do_exit(EXIT_SUCCESS)
+        * in linux/kernel/sys.c, which can cause the machine to panic when
+        * the init process exits... */
+       if (!vfork()) { /* child */
+               reboot(reboot_event);
+               _exit(EXIT_SUCCESS);
+       }
+
+       while (1)
+               sleep(1);
+}
+
 static void state_enter(void)
 {
        char ubus_cmd[] = "/sbin/ubusd";
+       struct passwd *p;
 
        switch (state) {
        case STATE_EARLY:
@@ -109,9 +140,20 @@ static void state_enter(void)
                // try to reopen incase the wdt was not available before coldplug
                watchdog_init(0);
                set_stdio("console");
-               LOG("- ubus -\n");
+               p = getpwnam("ubus");
+               if (p) {
+                       int ret;
+                       LOG("- ubus -\n");
+                       mkdir(p->pw_dir, 0755);
+                       ret = chown(p->pw_dir, p->pw_uid, p->pw_gid);
+                       if (ret)
+                               LOG("- ubus - failed to chown(%s)\n", p->pw_dir);
+               } else {
+                       LOG("- ubus (running as root!) -\n");
+               }
+
                procd_connect_ubus();
-               service_start_early("ubus", ubus_cmd);
+               service_start_early("ubus", ubus_cmd, p?"ubus":NULL, p?"ubus":NULL);
                break;
 
        case STATE_INIT:
@@ -152,25 +194,9 @@ static void state_enter(void)
                sync();
                sleep(1);
 #ifndef DISABLE_INIT
-               if (reboot_event == RB_POWER_OFF)
-                       LOG("- power down -\n");
-               else
-                       LOG("- reboot -\n");
-
-               /* Allow time for last message to reach serial console, etc */
-               sleep(1);
-
-               /* We have to fork here, since the kernel calls do_exit(EXIT_SUCCESS)
-                * in linux/kernel/sys.c, which can cause the machine to panic when
-                * the init process exits... */
-               if (!vfork( )) { /* child */
-                       reboot(reboot_event);
-                       _exit(EXIT_SUCCESS);
-               }
-               while (1)
-                       sleep(1);
+               perform_halt();
 #else
-               exit(0);
+               exit(EXIT_SUCCESS);
 #endif
                break;
 
index 8ed3f93af481a544b48e5173158f446360499ab6..21ec3cdc07a2e1b1b17a292bb4a6b5ab0d478069 100644 (file)
--- a/system.c
+++ b/system.c
 #include <sys/types.h>
 #include <sys/reboot.h>
 #include <sys/stat.h>
+#include <sys/statvfs.h>
 #include <fcntl.h>
 #include <signal.h>
 #include <unistd.h>
 #include <stdlib.h>
 
+#include <json-c/json_tokener.h>
+#include <libubox/blobmsg_json.h>
 #include <libubox/uloop.h>
 
 #include "procd.h"
 #include "sysupgrade.h"
 #include "watchdog.h"
+#include "service/service.h"
 
 static struct blob_buf b;
 static int notify;
 static struct ubus_context *_ctx;
+static int initramfs;
+
+/* Result of feeding validation-script output to the JSON tokener */
+enum vjson_state {
+       VJSON_ERROR,    /* failure; an error message has been stored */
+       VJSON_CONTINUE, /* tokener needs more input to finish the document */
+       VJSON_SUCCESS,  /* a JSON object was parsed and added to the blob buffer */
+};
+
+/*
+ * Determine the filesystem type of the root filesystem.
+ *
+ * Returns "initramfs" when the initramfs flag is set, otherwise the type
+ * found in /proc/self/mounts for "/" (or for "/rom" when "/" is an overlay
+ * mount). The result is cached in a static buffer, so later calls return it
+ * without re-reading the file. Returns NULL when the mounts file cannot be
+ * opened or no matching entry is found.
+ */
+static const char *system_rootfs_type(void) {
+       const char proc_mounts[] = "/proc/self/mounts";
+       static char fstype[16] = { 0 };
+       char *mountstr = NULL, *mp = "/", *pos, *tmp;
+       FILE *mounts;
+       size_t len = 0;
+       bool found = false;
+
+       if (initramfs)
+               return "initramfs";
+
+       /* cached result of an earlier successful lookup */
+       if (fstype[0])
+               return fstype;
+
+       mounts = fopen(proc_mounts, "r");
+       if (!mounts)
+               return NULL;
+
+       while (getline(&mountstr, &len, mounts) != -1) {
+               /* skip the first space-separated field */
+               pos = strchr(mountstr, ' ');
+               if (!pos)
+                       continue;
+
+               /* second field: compared against the wanted mount point */
+               tmp = pos + 1;
+               pos = strchr(tmp, ' ');
+               if (!pos)
+                       continue;
+
+               *pos = '\0';
+               if (strcmp(tmp, mp))
+                       continue;
+
+               /* third field: the filesystem type, left in tmp */
+               tmp = pos + 1;
+               pos = strchr(tmp, ' ');
+               if (!pos)
+                       continue;
+
+               *pos = '\0';
+
+               if (!strcmp(tmp, "overlay")) {
+                       /*
+                        * there is no point in parsing overlay option string for
+                        * lowerdir, as that can point to "/" being a previous
+                        * overlay mount (after firstboot or sysupgrade config
+                        * restore). Hence just assume the lowerdir is "/rom" and
+                        * restart searching for that instead if that's not
+                        * already the case.
+                        */
+                       if (!strcmp(mp, "/rom"))
+                               break;
+
+                       mp = "/rom";
+                       fseek(mounts, 0, SEEK_SET);
+                       continue;
+               }
+
+               found = true;
+               break;
+       }
+
+       /* tmp still points into mountstr here, so copy before freeing it */
+       if (found)
+               strncpy(fstype, tmp, sizeof(fstype) - 1);
+
+       fstype[sizeof(fstype) - 1]= '\0';
+       free(mountstr);
+       fclose(mounts);
+
+       if (found)
+               return fstype;
+       else
+               return NULL;
+}
 
 static int system_board(struct ubus_context *ctx, struct ubus_object *obj,
                  struct ubus_request_data *req, const char *method,
@@ -42,6 +126,7 @@ static int system_board(struct ubus_context *ctx, struct ubus_object *obj,
        void *c;
        char line[256];
        char *key, *val, *next;
+       const char *rootfs_type = system_rootfs_type();
        struct utsname utsname;
        FILE *f;
 
@@ -63,6 +148,19 @@ static int system_board(struct ubus_context *ctx, struct ubus_object *obj,
                        if (!key || !val)
                                continue;
 
+#ifdef __aarch64__
+                       if (!strcasecmp(key, "CPU revision")) {
+                               snprintf(line, sizeof(line), "ARMv8 Processor rev %lu", strtoul(val + 2, NULL, 16));
+                               blobmsg_add_string(&b, "system", line);
+                               break;
+                       }
+#elif __riscv
+                       if (!strcasecmp(key, "isa")) {
+                               snprintf(line, sizeof(line), "RISC-V (%s)", val + 2);
+                               blobmsg_add_string(&b, "system", line);
+                               break;
+                       }
+#else
                        if (!strcasecmp(key, "system type") ||
                            !strcasecmp(key, "processor") ||
                            !strcasecmp(key, "cpu") ||
@@ -76,6 +174,7 @@ static int system_board(struct ubus_context *ctx, struct ubus_object *obj,
                                        break;
                                }
                        }
+#endif
                }
 
                fclose(f);
@@ -149,6 +248,9 @@ static int system_board(struct ubus_context *ctx, struct ubus_object *obj,
                fclose(f);
        }
 
+       if (rootfs_type)
+               blobmsg_add_string(&b, "rootfs_type", rootfs_type);
+
        if ((f = fopen("/etc/openwrt_release", "r")) != NULL)
        {
                c = blobmsg_open_table(&b, "release");
@@ -219,6 +321,12 @@ static int system_board(struct ubus_context *ctx, struct ubus_object *obj,
        return UBUS_STATUS_OK;
 }
 
+/*
+ * Convert a count of @b blocks of @bs bytes each into KiB, rounding to the
+ * nearest KiB. The product is computed in unsigned long long.
+ */
+static unsigned long
+kscale(unsigned long b, unsigned long bs)
+{
+       const unsigned long long bytes = (unsigned long long) b * bs;
+
+       return (bytes + 512) / 1024;
+}
+
 static int system_info(struct ubus_context *ctx, struct ubus_object *obj,
                 struct ubus_request_data *req, const char *method,
                 struct blob_attr *msg)
@@ -228,9 +336,42 @@ static int system_info(struct ubus_context *ctx, struct ubus_object *obj,
 #ifdef linux
        struct sysinfo info;
        void *c;
+       char line[256];
+       char *key, *val;
+       unsigned long long available, cached;
+       FILE *f;
+       int i;
+       struct statvfs s;
+       const char *fslist[] = {
+               "/",    "root",
+               "/tmp", "tmp",
+       };
 
        if (sysinfo(&info))
                return UBUS_STATUS_UNKNOWN_ERROR;
+
+       if ((f = fopen("/proc/meminfo", "r")) == NULL)
+               return UBUS_STATUS_UNKNOWN_ERROR;
+
+       /* if linux < 3.14 MemAvailable is not in meminfo */
+       available = 0;
+       cached = 0;
+
+       while (fgets(line, sizeof(line), f))
+       {
+               key = strtok(line, " :");
+               val = strtok(NULL, " ");
+
+               if (!key || !val)
+                       continue;
+
+               if (!strcasecmp(key, "MemAvailable"))
+                       available = 1024 * atoll(val);
+               else if (!strcasecmp(key, "Cached"))
+                       cached = 1024 * atoll(val);
+       }
+
+       fclose(f);
 #endif
 
        now = time(NULL);
@@ -252,15 +393,40 @@ static int system_info(struct ubus_context *ctx, struct ubus_object *obj,
        blobmsg_close_array(&b, c);
 
        c = blobmsg_open_table(&b, "memory");
-       blobmsg_add_u64(&b, "total",    info.mem_unit * info.totalram);
-       blobmsg_add_u64(&b, "free",     info.mem_unit * info.freeram);
-       blobmsg_add_u64(&b, "shared",   info.mem_unit * info.sharedram);
-       blobmsg_add_u64(&b, "buffered", info.mem_unit * info.bufferram);
+       blobmsg_add_u64(&b, "total",
+                       (uint64_t)info.mem_unit * (uint64_t)info.totalram);
+       blobmsg_add_u64(&b, "free",
+                       (uint64_t)info.mem_unit * (uint64_t)info.freeram);
+       blobmsg_add_u64(&b, "shared",
+                       (uint64_t)info.mem_unit * (uint64_t)info.sharedram);
+       blobmsg_add_u64(&b, "buffered",
+                       (uint64_t)info.mem_unit * (uint64_t)info.bufferram);
+       blobmsg_add_u64(&b, "available", available);
+       blobmsg_add_u64(&b, "cached", cached);
        blobmsg_close_table(&b, c);
 
+       for (i = 0; i < sizeof(fslist) / sizeof(fslist[0]); i += 2) {
+               if (statvfs(fslist[i], &s))
+                       continue;
+
+               c = blobmsg_open_table(&b, fslist[i+1]);
+
+               if (!s.f_frsize)
+                       s.f_frsize = s.f_bsize;
+
+               blobmsg_add_u64(&b, "total", kscale(s.f_blocks, s.f_frsize));
+               blobmsg_add_u64(&b, "free",  kscale(s.f_bfree, s.f_frsize));
+               blobmsg_add_u64(&b, "used", kscale(s.f_blocks - s.f_bfree, s.f_frsize));
+               blobmsg_add_u64(&b, "avail", kscale(s.f_bavail, s.f_frsize));
+
+               blobmsg_close_table(&b, c);
+       }
+
        c = blobmsg_open_table(&b, "swap");
-       blobmsg_add_u64(&b, "total",    info.mem_unit * info.totalswap);
-       blobmsg_add_u64(&b, "free",     info.mem_unit * info.freeswap);
+       blobmsg_add_u64(&b, "total",
+                       (uint64_t)info.mem_unit * (uint64_t)info.totalswap);
+       blobmsg_add_u64(&b, "free",
+                       (uint64_t)info.mem_unit * (uint64_t)info.freeswap);
        blobmsg_close_table(&b, c);
 #endif
 
@@ -321,7 +487,7 @@ static int watchdog_set(struct ubus_context *ctx, struct ubus_object *obj,
 
                if (timeout <= frequency)
                        timeout = frequency * 2;
-                watchdog_timeout(timeout);
+               watchdog_timeout(timeout);
        }
 
        if (tb[WDT_MAGICCLOSE])
@@ -376,24 +542,243 @@ static int proc_signal(struct ubus_context *ctx, struct ubus_object *obj,
        return 0;
 }
 
+/*
+ * Format an error message into a static buffer, prefixed with
+ * "Firmware image couldn't be validated: ", and store a pointer to it in *@b.
+ *
+ * If formatting itself fails, *@b is pointed at a fixed fallback string
+ * instead. Always returns VJSON_ERROR so callers can "return vjson_error()".
+ * The buffer is static, so each call overwrites the previous message.
+ */
+__attribute__((format (printf, 2, 3)))
+static enum vjson_state vjson_error(char **b, const char *fmt, ...)
+{
+       static char buf[256] = { 0 };
+       const char *pfx = "Firmware image couldn't be validated: ";
+       va_list va;
+       int r;
+
+       r = snprintf(buf, sizeof(buf), "%s", pfx);
+       if (r < 0) {
+               *b = "vjson_error() snprintf failed";
+               return VJSON_ERROR;
+       }
+
+       va_start(va, fmt);
+       r = vsnprintf(buf+r, sizeof(buf)-r, fmt, va);
+       /* every va_start() needs its va_end(), also on the failure path */
+       va_end(va);
+       if (r < 0) {
+               *b = "vjson_error() vsnprintf failed";
+               return VJSON_ERROR;
+       }
+
+       *b = buf;
+       return VJSON_ERROR;
+}
+
+/*
+ * Feed @len bytes from @buf to the tokener @tok.
+ *
+ * Returns VJSON_CONTINUE when the tokener wants more input. When parsing
+ * finished and produced a JSON object, the object is added to the global
+ * blob buffer and VJSON_SUCCESS is returned. Anything else stores a message
+ * in *@err and yields VJSON_ERROR.
+ */
+static enum vjson_state vjson_parse_token(json_tokener *tok, char *buf, ssize_t len, char **err)
+{
+       json_object *parsed = json_tokener_parse_ex(tok, buf, len);
+       enum json_tokener_error status = json_tokener_get_error(tok);
+
+       switch (status) {
+       case json_tokener_continue:
+               return VJSON_CONTINUE;
+       case json_tokener_success:
+               break;
+       default:
+               return vjson_error(err, "failed to parse JSON: %s (%d)",
+                                  json_tokener_error_desc(status),
+                                  status);
+       }
+
+       /* only a top-level JSON object is an acceptable result */
+       if (json_object_get_type(parsed) != json_type_object) {
+               json_object_put(parsed);
+               return vjson_error(err, "result is not an JSON object");
+       }
+
+       blobmsg_add_object(&b, parsed);
+       json_object_put(parsed);
+       return VJSON_SUCCESS;
+}
+
+/*
+ * Read @fd in 64-byte chunks until end of file and run the data through a
+ * JSON tokener, stopping early as soon as a chunk yields something other
+ * than VJSON_CONTINUE.
+ *
+ * Returns the state of the last parsed chunk, or VJSON_ERROR when nothing
+ * was read, the tokener could not be created or read() failed. *@err is
+ * preset to "incomplete JSON input" and replaced by a more specific message
+ * where one is available.
+ */
+static enum vjson_state vjson_parse(int fd, char **err)
+{
+       enum vjson_state state = VJSON_ERROR;
+       size_t total = 0;
+       char chunk[64] = { 0 };
+       json_tokener *tok;
+       ssize_t len;
+       int _errno;
+
+       tok = json_tokener_new();
+       if (!tok)
+               return vjson_error(err, "json_tokener_new() failed");
+
+       vjson_error(err, "incomplete JSON input");
+
+       for (;;) {
+               len = read(fd, chunk, sizeof(chunk));
+               if (len == 0)
+                       break;
+
+               if (len < 0) {
+                       /* interrupted by a signal: simply retry */
+                       if (errno == EINTR)
+                               continue;
+
+                       _errno = errno;
+                       json_tokener_free(tok);
+                       return vjson_error(err, "read() failed: %s (%d)",
+                                          strerror(_errno), _errno);
+               }
+
+               total += len;
+               state = vjson_parse_token(tok, chunk, len, err);
+               if (state != VJSON_CONTINUE)
+                       break;
+
+               memset(chunk, 0, sizeof(chunk));
+       }
+
+       if (total == 0)
+               vjson_error(err, "no JSON input");
+
+       json_tokener_free(tok);
+       return state;
+}
+
+/**
+ * validate_firmware_image_call - perform validation & store result in global b
+ *
+ * @file: firmware image path
+ * @err: set to an error message whenever the result is not VJSON_SUCCESS
+ *
+ * Runs /usr/libexec/validate_firmware_image with @file as its only argument
+ * in a forked child whose stdout is a pipe, and parses what the child
+ * writes there as JSON.
+ */
+static enum vjson_state validate_firmware_image_call(const char *file, char **err)
+{
+       const char *path = "/usr/libexec/validate_firmware_image";
+       enum vjson_state ret = VJSON_ERROR;
+       int _errno;
+       int fds[2];
+       int fd;
+
+       blob_buf_init(&b, 0);
+       vjson_error(err, "unhandled error");
+
+       if (pipe(fds)) {
+               _errno = errno;
+               return vjson_error(err, "pipe() failed: %s (%d)",
+                                  strerror(_errno), _errno);
+       }
+
+       switch (fork()) {
+       case -1:
+               _errno = errno;
+
+               close(fds[0]);
+               close(fds[1]);
+
+               return vjson_error(err, "fork() failed: %s (%d)",
+                                  strerror(_errno), _errno);
+       case 0:
+               /* Set stdin & stderr to /dev/null */
+               fd = open("/dev/null", O_RDWR);
+               if (fd >= 0) {
+                       dup2(fd, 0);
+                       dup2(fd, 2);
+                       close(fd);
+               }
+
+               /* Set stdout to the shared pipe */
+               dup2(fds[1], 1);
+               close(fds[0]);
+               close(fds[1]);
+
+               execl(path, path, file, NULL);
+               /*
+                * Only reached when execl() failed. Leave with _exit() so the
+                * child neither runs the parent's atexit handlers nor flushes
+                * inherited stdio buffers into the pipe.
+                */
+               _exit(errno);
+       }
+
+       /* Parent process */
+       close(fds[1]);
+
+       ret = vjson_parse(fds[0], err);
+       close(fds[0]);
+
+       return ret;
+}
+
+/* Indices into the parsed attribute table of validate_firmware_image() */
+enum {
+       VALIDATE_FIRMWARE_IMAGE_PATH,
+       __VALIDATE_FIRMWARE_IMAGE_MAX,
+};
+
+/* The only accepted attribute: "path", a string */
+static const struct blobmsg_policy validate_firmware_image_policy[__VALIDATE_FIRMWARE_IMAGE_MAX] = {
+       [VALIDATE_FIRMWARE_IMAGE_PATH] = { .name = "path", .type = BLOBMSG_TYPE_STRING },
+};
+
+/*
+ * ubus handler: validate the firmware image named by the "path" attribute
+ * and reply with the parsed output of the validation call.
+ *
+ * Returns UBUS_STATUS_INVALID_ARGUMENT when the message or its "path"
+ * attribute is missing, UBUS_STATUS_UNKNOWN_ERROR when validation did not
+ * end in VJSON_SUCCESS, UBUS_STATUS_OK after the reply was sent.
+ */
+static int validate_firmware_image(struct ubus_context *ctx,
+                                  struct ubus_object *obj,
+                                  struct ubus_request_data *req,
+                                  const char *method, struct blob_attr *msg)
+{
+       struct blob_attr *tb[__VALIDATE_FIRMWARE_IMAGE_MAX];
+       const char *image;
+       char *err;
+
+       if (!msg)
+               return UBUS_STATUS_INVALID_ARGUMENT;
+
+       blobmsg_parse(validate_firmware_image_policy, __VALIDATE_FIRMWARE_IMAGE_MAX, tb, blob_data(msg), blob_len(msg));
+       if (!tb[VALIDATE_FIRMWARE_IMAGE_PATH])
+               return UBUS_STATUS_INVALID_ARGUMENT;
+
+       image = blobmsg_get_string(tb[VALIDATE_FIRMWARE_IMAGE_PATH]);
+
+       /* the error text is collected but not part of this method's reply */
+       if (validate_firmware_image_call(image, &err) != VJSON_SUCCESS)
+               return UBUS_STATUS_UNKNOWN_ERROR;
+
+       ubus_send_reply(ctx, req, b.head);
+
+       return UBUS_STATUS_OK;
+}
+
 enum {
        SYSUPGRADE_PATH,
+       SYSUPGRADE_FORCE,
+       SYSUPGRADE_BACKUP,
        SYSUPGRADE_PREFIX,
        SYSUPGRADE_COMMAND,
+       SYSUPGRADE_OPTIONS,
        __SYSUPGRADE_MAX
 };
 
 static const struct blobmsg_policy sysupgrade_policy[__SYSUPGRADE_MAX] = {
        [SYSUPGRADE_PATH] = { .name = "path", .type = BLOBMSG_TYPE_STRING },
+       [SYSUPGRADE_FORCE] = { .name = "force", .type = BLOBMSG_TYPE_BOOL },
+       [SYSUPGRADE_BACKUP] = { .name = "backup", .type = BLOBMSG_TYPE_STRING },
        [SYSUPGRADE_PREFIX] = { .name = "prefix", .type = BLOBMSG_TYPE_STRING },
        [SYSUPGRADE_COMMAND] = { .name = "command", .type = BLOBMSG_TYPE_STRING },
+       [SYSUPGRADE_OPTIONS] = { .name = "options", .type = BLOBMSG_TYPE_TABLE },
 };
 
+/*
+ * Reply to @req with a table named "error" that holds @message under the
+ * key "message". The global blob buffer is reset and reused for the reply.
+ */
+static void sysupgrade_error(struct ubus_context *ctx,
+                            struct ubus_request_data *req,
+                            const char *message)
+{
+       void *error_tbl;
+
+       blob_buf_init(&b, 0);
+
+       error_tbl = blobmsg_open_table(&b, "error");
+       blobmsg_add_string(&b, "message", message);
+       blobmsg_close_table(&b, error_tbl);
+
+       ubus_send_reply(ctx, req, b.head);
+}
+
 static int sysupgrade(struct ubus_context *ctx, struct ubus_object *obj,
                      struct ubus_request_data *req, const char *method,
                      struct blob_attr *msg)
 {
+       enum {
+               VALIDATION_VALID,
+               VALIDATION_FORCEABLE,
+               VALIDATION_ALLOW_BACKUP,
+               __VALIDATION_MAX
+       };
+       static const struct blobmsg_policy validation_policy[__VALIDATION_MAX] = {
+               [VALIDATION_VALID] = { .name = "valid", .type = BLOBMSG_TYPE_BOOL },
+               [VALIDATION_FORCEABLE] = { .name = "forceable", .type = BLOBMSG_TYPE_BOOL },
+               [VALIDATION_ALLOW_BACKUP] = { .name = "allow_backup", .type = BLOBMSG_TYPE_BOOL },
+       };
+       struct blob_attr *validation[__VALIDATION_MAX];
        struct blob_attr *tb[__SYSUPGRADE_MAX];
+       bool valid, forceable, allow_backup;
+       enum vjson_state ret = VJSON_ERROR;
+       char *err;
 
        if (!msg)
                return UBUS_STATUS_INVALID_ARGUMENT;
@@ -402,9 +787,44 @@ static int sysupgrade(struct ubus_context *ctx, struct ubus_object *obj,
        if (!tb[SYSUPGRADE_PATH] || !tb[SYSUPGRADE_PREFIX])
                return UBUS_STATUS_INVALID_ARGUMENT;
 
+       ret = validate_firmware_image_call(blobmsg_get_string(tb[SYSUPGRADE_PATH]), &err);
+       if (ret != VJSON_SUCCESS) {
+               sysupgrade_error(ctx, req, err);
+               return UBUS_STATUS_UNKNOWN_ERROR;
+       }
+
+       blobmsg_parse(validation_policy, __VALIDATION_MAX, validation, blob_data(b.head), blob_len(b.head));
+
+       if (!validation[VALIDATION_VALID] || !validation[VALIDATION_FORCEABLE] ||
+           !validation[VALIDATION_ALLOW_BACKUP]) {
+               sysupgrade_error(ctx, req, "Validation script provided invalid input");
+               return UBUS_STATUS_INVALID_ARGUMENT;
+       }
+
+       valid = validation[VALIDATION_VALID] && blobmsg_get_bool(validation[VALIDATION_VALID]);
+       forceable = validation[VALIDATION_FORCEABLE] && blobmsg_get_bool(validation[VALIDATION_FORCEABLE]);
+       allow_backup = validation[VALIDATION_ALLOW_BACKUP] && blobmsg_get_bool(validation[VALIDATION_ALLOW_BACKUP]);
+
+       if (!valid) {
+               if (!forceable) {
+                       sysupgrade_error(ctx, req, "Firmware image is broken and cannot be installed");
+                       return UBUS_STATUS_NOT_SUPPORTED;
+               } else if (!tb[SYSUPGRADE_FORCE] || !blobmsg_get_bool(tb[SYSUPGRADE_FORCE])) {
+                       sysupgrade_error(ctx, req, "Firmware image is invalid");
+                       return UBUS_STATUS_NOT_SUPPORTED;
+               }
+       } else if (!allow_backup && tb[SYSUPGRADE_BACKUP]) {
+               sysupgrade_error(ctx, req, "Firmware image doesn't allow preserving a backup");
+               return UBUS_STATUS_NOT_SUPPORTED;
+       }
+
+       service_stop_all();
+
        sysupgrade_exec_upgraded(blobmsg_get_string(tb[SYSUPGRADE_PREFIX]),
                                 blobmsg_get_string(tb[SYSUPGRADE_PATH]),
-                                tb[SYSUPGRADE_COMMAND] ? blobmsg_get_string(tb[SYSUPGRADE_COMMAND]) : NULL);
+                                tb[SYSUPGRADE_BACKUP] ? blobmsg_get_string(tb[SYSUPGRADE_BACKUP]) : NULL,
+                                tb[SYSUPGRADE_COMMAND] ? blobmsg_get_string(tb[SYSUPGRADE_COMMAND]) : NULL,
+                                tb[SYSUPGRADE_OPTIONS]);
 
        /* sysupgrade_exec_upgraded() will never return unless something has gone wrong */
        return UBUS_STATUS_UNKNOWN_ERROR;
@@ -423,6 +843,7 @@ static const struct ubus_method system_methods[] = {
        UBUS_METHOD_NOARG("reboot", system_reboot),
        UBUS_METHOD("watchdog", watchdog_set, watchdog_policy),
        UBUS_METHOD("signal", proc_signal, signal_policy),
+       UBUS_METHOD("validate_firmware_image", validate_firmware_image, validate_firmware_image_policy),
        UBUS_METHOD("sysupgrade", sysupgrade, sysupgrade_policy),
 };
 
@@ -455,6 +876,11 @@ void ubus_init_system(struct ubus_context *ctx)
        int ret;
 
        _ctx = ctx;
+
+       initramfs = !!getenv("INITRAMFS");
+       if (initramfs)
+               unsetenv("INITRAMFS");
+
        ret = ubus_add_object(ctx, &system_object);
        if (ret)
                ERROR("Failed to add object: %s\n", ubus_strerror(ret));
index 07e33f752d0c3be57962771cdf42f02576d42e7f..fc588b0248353137d4b81fce130d2d35d8dfa710 100644 (file)
  */
 
 
+#define _GNU_SOURCE
 #include "watchdog.h"
 #include "sysupgrade.h"
 
+#include <ctype.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <unistd.h>
 
+#include <libubox/blobmsg.h>
 
-void sysupgrade_exec_upgraded(const char *prefix, char *path, char *command)
+void sysupgrade_exec_upgraded(const char *prefix, char *path,
+                             const char *backup, char *command,
+                             struct blob_attr *options)
 {
        char *wdt_fd = watchdog_fd();
        char *argv[] = { "/sbin/upgraded", NULL, NULL, NULL};
+       struct blob_attr *option;
+       int rem;
        int ret;
 
        ret = chroot(prefix);
@@ -41,6 +48,58 @@ void sysupgrade_exec_upgraded(const char *prefix, char *path, char *command)
                watchdog_set_cloexec(false);
                setenv("WDTFD", wdt_fd, 1);
        }
+
+       if (backup)
+               setenv("UPGRADE_BACKUP", backup, 1);
+
+       blobmsg_for_each_attr(option, options, rem) {
+               const char *prefix = "UPGRADE_OPT_";
+               char value[11];
+               char *name;
+               char *c;
+               int tmp;
+
+               if (asprintf(&name, "%s%s", prefix, blobmsg_name(option)) <= 0)
+                       continue;
+               for (c = name + strlen(prefix); *c; c++) {
+                       if (isalnum(*c) || *c == '_') {
+                               *c = toupper(*c);
+                       } else {
+                               c = NULL;
+                               break;
+                       }
+               }
+
+               if (!c) {
+                       fprintf(stderr, "Option \"%s\" contains invalid characters\n",
+                               blobmsg_name(option));
+                       free(name);
+                       continue;
+               }
+
+               switch (blobmsg_type(option)) {
+               case BLOBMSG_TYPE_INT32:
+                       tmp = blobmsg_get_u32(option);
+                       break;
+               case BLOBMSG_TYPE_INT16:
+                       tmp = blobmsg_get_u16(option);
+                       break;
+               case BLOBMSG_TYPE_INT8:
+                       tmp = blobmsg_get_u8(option);
+                       break;
+               default:
+                       fprintf(stderr, "Option \"%s\" has unsupported type: %d\n",
+                               blobmsg_name(option), blobmsg_type(option));
+                       free(name);
+                       continue;
+               }
+               snprintf(value, sizeof(value), "%u", tmp);
+
+               setenv(name, value, 1);
+
+               free(name);
+       }
+
        execvp(argv[0], argv);
 
        /* Cleanup on failure */
index 8c09fc99d191182a4f26333569a8adad2728070a..268e2fd94421ea19f9511a012a43b53430f6ccb2 100644 (file)
 #ifndef __PROCD_SYSUPGRADE_H
 #define __PROCD_SYSUPGRADE_H
 
+struct blob_attr;
 
-void sysupgrade_exec_upgraded(const char *prefix, char *path, char *command);
+void sysupgrade_exec_upgraded(const char *prefix, char *path,
+                             const char *backup, char *command,
+                             struct blob_attr *options);
 
 
 #endif
index ee974290a90412fb3529e5b924deff3f25190915..457bd826fd4f92beac87e483ffff4ef548a3a9e0 100644 (file)
@@ -49,9 +49,10 @@ int __libc_start_main(main_t main,
        start_main_t __start_main__;
 
        __start_main__ = dlsym(RTLD_NEXT, "__libc_start_main");
-       if (!__start_main__)
+       if (!__start_main__) {
                ERROR("failed to find __libc_start_main %s\n", dlerror());
-
+               return -1;
+       }
        __main__ = main;
 
        return (*__start_main__)(__preload_main__, argc, argv, auxvec,
@@ -69,8 +70,10 @@ void __uClibc_main(main_t main,
        uClibc_main __start_main__;
 
        __start_main__ = dlsym(RTLD_NEXT, "__uClibc_main");
-       if (!__start_main__)
+       if (!__start_main__) {
                ERROR("failed to find __uClibc_main %s\n", dlerror());
+               return;
+       }
 
        __main__ = main;
 
index 78b99dd053d7b91bd86ac9d7efc7a706596e0caa..e257d89c00c891b0ee8b9836bab4d1ae73dd8707 100644 (file)
@@ -23,7 +23,7 @@
 #include <stdlib.h>
 #include <stdio.h>
 #include <errno.h>
-#include <string.h>
+#include <libgen.h>
 #include <syslog.h>
 #include <limits.h>
 
 #define PTRACE_EVENT_STOP 128
 #endif
 
+#ifndef PTRACE_EVENT_SECCOMP
+/* undefined with uClibc-ng */
+#define PTRACE_EVENT_SECCOMP 7
+#endif
+
 #include <libubox/ulog.h>
 #include <libubox/uloop.h>
 #include <libubox/blobmsg.h>
 #define _offsetof(a, b) __builtin_offsetof(a,b)
 #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
 
-#ifdef __amd64__
+#if defined (__aarch64__)
+#include <linux/ptrace.h>
+#elif defined(__amd64__)
 #define reg_syscall_nr _offsetof(struct user, regs.orig_rax)
+#elif defined(__arm__)
+#include <asm/ptrace.h>                /* for PTRACE_SET_SYSCALL */
+#define reg_syscall_nr _offsetof(struct user, regs.uregs[7])
+# if defined(__ARM_EABI__)
+# define reg_retval_nr _offsetof(struct user, regs.uregs[0])
+# endif
 #elif defined(__i386__)
 #define reg_syscall_nr _offsetof(struct user, regs.orig_eax)
 #elif defined(__mips)
 # define EF_REG2       8
 # endif
 #define reg_syscall_nr (EF_REG2 / 4)
-#elif defined(__arm__)
-#include <asm/ptrace.h>                /* for PTRACE_SET_SYSCALL */
-#define reg_syscall_nr _offsetof(struct user, regs.uregs[7])
-# if defined(__ARM_EABI__)
-# define reg_retval_nr _offsetof(struct user, regs.uregs[0])
-# endif
 #elif defined(__PPC__)
 #define reg_syscall_nr _offsetof(struct user, regs.gpr[0])
 #define reg_retval_nr  _offsetof(struct user, regs.gpr[3])
@@ -108,8 +115,9 @@ static int cmp_count(const void *a, const void *b)
 
 static void print_syscalls(int policy, const char *json)
 {
-       void *c;
+       void *c, *d, *e;
        int i;
+       char *tmp;
 
        if (mode == UTRACE) {
                set_syscall("rt_sigaction", 1);
@@ -129,7 +137,10 @@ static void print_syscalls(int policy, const char *json)
        qsort(sorted, SYSCALL_COUNT, sizeof(sorted[0]), cmp_count);
 
        blob_buf_init(&b, 0);
-       c = blobmsg_open_array(&b, "whitelist");
+       blobmsg_add_string(&b, "defaultAction", "SCMP_ACT_KILL_PROCESS");
+       c = blobmsg_open_array(&b, "syscalls");
+       d = blobmsg_open_table(&b, "");
+       e = blobmsg_open_array(&b, "names");
 
        for (i = 0; i < SYSCALL_COUNT; i++) {
                int sc = sorted[i].syscall;
@@ -144,22 +155,34 @@ static void print_syscalls(int policy, const char *json)
                        ULOG_ERR("no name found for syscall(%d)\n", sc);
                }
        }
+       blobmsg_close_array(&b, e);
+       blobmsg_add_string(&b, "action", "SCMP_ACT_ALLOW");
+       blobmsg_close_table(&b, d);
        blobmsg_close_array(&b, c);
-       blobmsg_add_u32(&b, "policy", policy);
        if (json) {
                FILE *fp = fopen(json, "w");
                if (fp) {
-                       fprintf(fp, "%s", blobmsg_format_json_indent(b.head, true, 0));
+                       tmp = blobmsg_format_json_indent(b.head, true, 0);
+                       if (!tmp) {
+                               fclose(fp);
+                               return;
+                       }
+
+                       fprintf(fp, "%s\n", tmp);
+                       free(tmp);
                        fclose(fp);
                        ULOG_INFO("saving syscall trace to %s\n", json);
                } else {
                        ULOG_ERR("failed to open %s\n", json);
                }
        } else {
-               printf("%s\n",
-                       blobmsg_format_json_indent(b.head, true, 0));
-       }
+               tmp = blobmsg_format_json_indent(b.head, true, 0);
+               if (!tmp)
+                       return;
 
+               printf("%s\n", tmp);
+               free(tmp);
+       }
 }
 
 static void report_seccomp_vialation(pid_t pid, unsigned syscall)
@@ -167,7 +190,12 @@ static void report_seccomp_vialation(pid_t pid, unsigned syscall)
        char buf[200];
        snprintf(buf, sizeof(buf), "/proc/%d/cmdline", pid);
        int f = open(buf, O_RDONLY);
+       if (f < 0)
+               return;
+
        int r = read(f, buf, sizeof(buf) - 1);
+       buf[sizeof(buf) - 1] = '\0';
+
        if (r >= 0)
                buf[r] = 0;
        else
@@ -198,7 +226,14 @@ static void tracer_cb(struct uloop_process *c, int ret)
        if (WIFSTOPPED(ret) || (ret >> 16)) {
                if (WSTOPSIG(ret) & 0x80) {
                        if (!tracee->in_syscall) {
+#ifdef __aarch64__
+                               int syscall = -1;
+                               struct ptrace_syscall_info ptsi = {.op=PTRACE_SYSCALL_INFO_ENTRY};
+                               if (ptrace(PTRACE_GET_SYSCALL_INFO, c->pid, sizeof(ptsi), &ptsi) != -1)
+                                       syscall = ptsi.entry.nr;
+#else
                                int syscall = ptrace(PTRACE_PEEKUSER, c->pid, reg_syscall_nr);
+#endif
                                int i = syscall_index(syscall);
                                if (i >= 0) {
                                        syscall_count[i]++;
@@ -225,12 +260,19 @@ static void tracer_cb(struct uloop_process *c, int ret)
                } else if ((ret >> 16) == PTRACE_EVENT_STOP) {
                        /* Nothing special to do here */
                } else if ((ret >> 8) == (SIGTRAP | (PTRACE_EVENT_SECCOMP << 8))) {
+#ifdef __aarch64__
+                       int syscall = -1;
+                       struct ptrace_syscall_info ptsi = {.op=PTRACE_SYSCALL_INFO_SECCOMP};
+                       if (ptrace(PTRACE_GET_SYSCALL_INFO, c->pid, sizeof(ptsi), &ptsi) != -1)
+                               syscall = ptsi.entry.nr;
+#else
                        int syscall = ptrace(PTRACE_PEEKUSER, c->pid, reg_syscall_nr);
 #if defined(__arm__)
                        ptrace(PTRACE_SET_SYSCALL, c->pid, 0, -1);
                        ptrace(PTRACE_POKEUSER, c->pid, reg_retval_nr, -ENOSYS);
 #else
                        ptrace(PTRACE_POKEUSER, c->pid, reg_syscall_nr, -1);
+#endif
 #endif
                        report_seccomp_vialation(c->pid, syscall);
                } else {
@@ -339,6 +381,10 @@ int main(int argc, char **argv, char **envp)
                ULOG_ERR("failed to exec %s: %m\n", _argv[0]);
 
                free(_argv);
+               if (_envp[0])
+                       free(_envp[0]);
+               if (newenv == 2 && _envp[1])
+                       free(_envp[1]);
                free(_envp);
                return ret;
        }
diff --git a/ubus.c b/ubus.c
index 8d521acf11e3ef17c36e9865c6b0cc1056f97c7f..b0b7c9a487af5e1d751aa636cad509b1bc90fdf6 100644 (file)
--- a/ubus.c
+++ b/ubus.c
@@ -23,6 +23,13 @@ char *ubus_socket = NULL;
 static struct ubus_context *ctx;
 static struct uloop_timeout ubus_timer;
 static int timeout;
+static struct udebug_ubus udebug;
+
+/*
+ * Callback registered through udebug_ubus_init(): forwards the `enabled`
+ * flag to procd_udebug_set_enabled(). `ctx` and `data` are not used.
+ */
+static void
+procd_udebug_cb(struct udebug_ubus *ctx, struct blob_attr *data, bool enabled)
+{
+       procd_udebug_set_enabled(enabled);
+}
 
 static void reset_timeout(void)
 {
@@ -67,7 +74,9 @@ ubus_connect_cb(struct uloop_timeout *timeout)
                return;
        }
 
+       udebug_ubus_init(&udebug, ctx, "procd", procd_udebug_cb);
        ctx->connection_lost = ubus_disconnect_cb;
+       ubus_init_hotplug(ctx);
        ubus_init_service(ctx);
        ubus_init_system(ctx);
        watch_ubus(ctx);
index fd7d6bb58b784d39b5386301efe0c99b1642c072..09cf472e47ee304d29793851b1d1785f4ed8ca8e 100644 (file)
@@ -5,7 +5,7 @@ FIND_PATH(ubox_include_dir libubox/uloop.h)
 INCLUDE_DIRECTORIES(${ubox_include_dir})
 ADD_DEFINITIONS(-Os -ggdb -Wall -Werror --std=gnu99 -Wmissing-declarations)
 ADD_EXECUTABLE(upgraded upgraded.c ../watchdog.c)
-TARGET_LINK_LIBRARIES(upgraded ubox)
+TARGET_LINK_LIBRARIES(upgraded ${ubox})
 INSTALL(TARGETS upgraded
        RUNTIME DESTINATION sbin
 )
index 0b82f20241337921a8b8bccc81ff0917f4ac1577..db98701d8ab64da4c78f105a6182ae0db9edb062 100644 (file)
@@ -56,7 +56,7 @@ static void sysupgrade(char *path, char *command)
                /* Child */
                execvp(args[0], args);
                fprintf(stderr, "Failed to exec sysupgrade\n");
-               _exit(-1);
+               _exit(EXIT_FAILURE);
        }
 
        uloop_process_add(&upgrade_proc);
index c5b951356a7e0251e11f227b2693f6ecce108f50..e5b1297e6388831edeb00049311272a74b654507 100644 (file)
@@ -135,10 +135,38 @@ blobmsg_list_equal(struct blobmsg_list *l1, struct blobmsg_list *l2)
        return true;
 }
 
-char* get_cmdline_val(const char* name, char* out, int len)
+/*
+ * Copy the first line of /sys/class/tty/console/active into out.
+ *
+ * out: destination buffer; len: its size in bytes.
+ *
+ * Returns out, always NUL-terminated (and truncated if the line does not
+ * fit), or NULL when out is NULL, len is not positive, the file cannot be
+ * opened or read, or it contains no text before the newline.
+ */
+char *get_active_console(char *out, int len)
+{
+       char line[CMDLINE_SIZE + 1], *first, *sptr;
+       ssize_t r;
+       int fd;
+
+       if (!out || len <= 0)
+               return NULL;
+
+       fd = open("/sys/class/tty/console/active", O_RDONLY);
+       if (fd < 0)
+               return NULL;
+
+       r = read(fd, line, sizeof(line) - 1);
+       close(fd);
+
+       if (r <= 0)
+               return NULL;
+
+       /*
+        * Terminate right after the bytes actually read. Terminating only at
+        * the end of the buffer would let the tokenizer scan uninitialised
+        * stack memory when the file is shorter than the buffer.
+        */
+       line[r] = '\0';
+
+       /* The active file is terminated by a newline which we need to strip */
+       first = strtok_r(line, "\n", &sptr);
+       if (!first)
+               return NULL;
+
+       strncpy(out, first, len);
+       /* strncpy() leaves out unterminated when the source fills the buffer */
+       out[len - 1] = '\0';
+
+       return out;
+}
+
+char *get_cmdline_val_offset(const char *name, char *out, int len, int offset)
 {
        char line[CMDLINE_SIZE + 1], *c, *sptr;
-       int fd = open("/proc/cmdline", O_RDONLY);
+       int i, fd = open("/proc/cmdline", O_RDONLY);
        ssize_t r = read(fd, line, sizeof(line) - 1);
        close(fd);
 
@@ -147,13 +175,18 @@ char* get_cmdline_val(const char* name, char* out, int len)
 
        line[r] = 0;
 
-       for (c = strtok_r(line, " \t\n", &sptr); c;
+       for (i = 0, c = strtok_r(line, " \t\n", &sptr); c;
                        c = strtok_r(NULL, " \t\n", &sptr)) {
                char *sep = strchr(c, '=');
+               if (sep == NULL)
+                       continue;
+
                ssize_t klen = sep - c;
-               if (klen < 0 || strncmp(name, c, klen) || name[klen] != 0)
+               if (strncmp(name, c, klen) || name[klen] != 0)
                        continue;
 
+               if (i++ < offset)
+                       continue;
                strncpy(out, &sep[1], len);
                out[len-1] = 0;
                return out;
index 908c3145177e0e3195c9ac1652a370aa4fef15b7..d0c621e796e6c84182a9bc69be27a83f0aab3a11 100644 (file)
@@ -51,7 +51,11 @@ int blobmsg_list_fill(struct blobmsg_list *list, void *data, int len, bool array
 void blobmsg_list_free(struct blobmsg_list *list);
 bool blobmsg_list_equal(struct blobmsg_list *l1, struct blobmsg_list *l2);
 void blobmsg_list_move(struct blobmsg_list *list, struct blobmsg_list *src);
-char* get_cmdline_val(const char* name, char* out, int len);
+char *get_cmdline_val_offset(const char *name, char *out, int len, int offset);
+char *get_active_console(char *out, int len);
+
+#define get_cmdline_val(name, out, len) \
+       get_cmdline_val_offset(name, out, len, 0)
 
 int patch_fd(const char *device, int fd, int flags);
 int patch_stdio(const char *device);
diff --git a/uxc.c b/uxc.c
new file mode 100644 (file)
index 0000000..c15b4bc
--- /dev/null
+++ b/uxc.c
@@ -0,0 +1,1594 @@
+/*
+ * Copyright (C) 2020 Daniel Golle <daniel@makrotopia.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License version 2.1
+ * as published by the Free Software Foundation
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+
+#include <errno.h>
+#include <fcntl.h>
+#include <getopt.h>
+#include <glob.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <signal.h>
+#include <termios.h>
+#include <unistd.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+
+#include <libubus.h>
+#include <libubox/avl-cmp.h>
+#include <libubox/blobmsg.h>
+#include <libubox/blobmsg_json.h>
+#include <libubox/ustream.h>
+
+#include "log.h"
+
+#define UXC_VERSION "0.3"
+#define OCI_VERSION_STRING "1.0.2"
+/* Directories searched by conf_load() for "*.json" and "settings/*.json" */
+#define UXC_ETC_CONFDIR "/etc/uxc"
+#define UXC_VOL_CONFDIR "/tmp/run/uvol/.meta/uxc"
+
+static bool verbose = false;
+static bool json_output = false;
+/* Starts out as UXC_ETC_CONFDIR; where it is changed is not visible here */
+static char *confdir = UXC_ETC_CONFDIR;
+static struct ustream_fd cufd;
+static struct ustream_fd lufd;
+
+
+/* One jailed service instance reported by the "container" ubus object. */
+struct runtime_state {
+       struct avl_node avl;            /* node in the `runtime` tree, keyed by container_name */
+       char *container_name;           /* set up by runtime_alloc() */
+       char *instance_name;            /* strdup()ed in list_cb(), freed in runtime_free() */
+       char *jail_name;                /* strdup()ed in list_cb(), freed in runtime_free() */
+       bool running;                   /* false when the reply has no "running" field */
+       int runtime_pid;                /* -1 when the reply has no "pid" field */
+       int exitcode;                   /* -1 when the reply has no "exit_code" field */
+       struct blob_attr *ocistate;     /* copy made by ocistate_cb(), NULL if unavailable */
+};
+
+/* Per-container settings parsed from one settings JSON file by settings_add(). */
+struct settings {
+       struct avl_node avl;            /* node in the `settings` tree, keyed by container_name */
+       char *container_name;           /* set up by settings_alloc() */
+       const char *fname;              /* name of the blob entry, i.e. the path conf_load() read it from */
+       char *tmprwsize;                /* "temp-overlay-size" string inside settingsbuf, or NULL */
+       char *writepath;                /* "write-overlay-path" string inside settingsbuf, or NULL */
+       signed char autostart;          /* -1 when not specified, otherwise the "autostart" bool */
+       struct blob_attr *volumes;      /* "volumes" array inside settingsbuf, or NULL */
+};
+
+/*
+ * Sub-commands of the uxc tool. NOTE(review): the mapping from argv to these
+ * values is outside this view; usage() lists every command except "boot".
+ */
+enum uxc_cmd {
+       CMD_ATTACH,
+       CMD_LIST,
+       CMD_BOOT,
+       CMD_START,
+       CMD_STATE,
+       CMD_KILL,
+       CMD_ENABLE,
+       CMD_DISABLE,
+       CMD_DELETE,
+       CMD_CREATE,
+       CMD_UNKNOWN
+};
+
+/*
+ * Short option string and the matching long options.
+ * NOTE(review): "console" maps to 'c', but 'c' is absent from OPT_ARGS; if
+ * OPT_ARGS is what gets passed to getopt_long(), only the long spelling
+ * --console is accepted - confirm this is intended.
+ */
+#define OPT_ARGS "ab:fjm:p:t:vVw:"
+static struct option long_options[] = {
+       {"autostart",           no_argument,            0,      'a'     },
+       {"console",             no_argument,            0,      'c'     },
+       {"bundle",              required_argument,      0,      'b'     },
+       {"force",               no_argument,            0,      'f'     },
+       {"json",                no_argument,            0,      'j'     },
+       {"mounts",              required_argument,      0,      'm'     },
+       {"pid-file",            required_argument,      0,      'p'     },
+       {"temp-overlay-size",   required_argument,      0,      't'     },
+       {"write-overlay-path",  required_argument,      0,      'w'     },
+       {"verbose",             no_argument,            0,      'v'     },
+       {"version",             no_argument,            0,      'V'     },
+       {0,                     0,                      0,      0       }
+};
+
+/* Filled by list_cb() via runtime_load(); emptied by runtime_free() */
+AVL_TREE(runtime, avl_strcmp, false, NULL);
+/* Filled by settings_add(); emptied by settings_free() */
+AVL_TREE(settings, avl_strcmp, false, NULL);
+/* conf_load(false) writes `conf`, conf_load(true) writes `settingsbuf` */
+static struct blob_buf conf;
+static struct blob_buf settingsbuf;
+static struct blob_attr *blockinfo;
+static struct blob_attr *fstabinfo;
+/* ubus connection used for every lookup and invoke in this file */
+static struct ubus_context *ctx;
+
+/*
+ * Print the command-line synopsis to stdout.
+ * Always returns -EINVAL so callers can use it directly as an error result.
+ */
+static int usage(void) {
+       static const char *const help[] = {
+               "syntax: uxc <command> [parameters ...]\n",
+               "commands:\n",
+               "\tlist [--json]\t\t\t\tlist all configured containers\n",
+               "\tattach <conf>\t\t\t\tattach to container console\n",
+               "\tcreate <conf>\t\t\t\t(re-)create <conf>\n",
+               "\t\t[--bundle <path>]\t\t\tOCI bundle at <path>\n",
+               "\t\t[--autostart]\t\t\t\tstart on boot\n",
+               "\t\t[--temp-overlay-size <size>]\t\tuse tmpfs overlay with {size}\n",
+               "\t\t[--write-overlay-path <path>]\t\tuse overlay on {path}\n",
+               "\t\t[--mounts <v1>,<v2>,...,<vN>]\t\trequire filesystems to be available\n",
+               "\tstart [--console] <conf>\t\tstart container <conf>\n",
+               "\tstate <conf>\t\t\t\tget state of container <conf>\n",
+               "\tkill <conf> [<signal>]\t\t\tsend signal to container <conf>\n",
+               "\tenable <conf>\t\t\t\tstart container <conf> on boot\n",
+               "\tdisable <conf>\t\t\t\tdon't start container <conf> on boot\n",
+               "\tdelete <conf> [--force]\t\t\tdelete <conf>\n",
+       };
+       size_t i;
+
+       /* None of the lines contains a conversion, so fputs() prints them verbatim */
+       for (i = 0; i < sizeof(help) / sizeof(help[0]); i++)
+               fputs(help[i], stdout);
+
+       return -EINVAL;
+}
+
+/* Indices into conf_policy and into the table filled by blobmsg_parse() */
+enum {
+       CONF_NAME,
+       CONF_PATH,
+       CONF_JAIL,
+       CONF_AUTOSTART,
+       CONF_PIDFILE,
+       CONF_TEMP_OVERLAY_SIZE,
+       CONF_WRITE_OVERLAY_PATH,
+       CONF_VOLUMES,
+       __CONF_MAX,
+};
+
+/*
+ * Fields recognised in a container or settings JSON file. settings_add()
+ * skips entries without "name" and rejects entries that set both
+ * "temp-overlay-size" and "write-overlay-path".
+ */
+static const struct blobmsg_policy conf_policy[__CONF_MAX] = {
+       [CONF_NAME] = { .name = "name", .type = BLOBMSG_TYPE_STRING },
+       [CONF_PATH] = { .name = "path", .type = BLOBMSG_TYPE_STRING },
+       [CONF_JAIL] = { .name = "jail", .type = BLOBMSG_TYPE_STRING },
+       [CONF_AUTOSTART] = { .name = "autostart", .type = BLOBMSG_TYPE_BOOL },
+       [CONF_PIDFILE] = { .name = "pidfile", .type = BLOBMSG_TYPE_STRING },
+       [CONF_TEMP_OVERLAY_SIZE] = { .name = "temp-overlay-size", .type = BLOBMSG_TYPE_STRING },
+       [CONF_WRITE_OVERLAY_PATH] = { .name = "write-overlay-path", .type = BLOBMSG_TYPE_STRING },
+       [CONF_VOLUMES] = { .name = "volumes", .type = BLOBMSG_TYPE_ARRAY },
+};
+
+/*
+ * Load every container JSON file (or, with load_settings, every settings
+ * JSON file) into the matching blob buffer.
+ *
+ * Files are collected from UXC_ETC_CONFDIR and, when it is a directory, from
+ * UXC_VOL_CONFDIR. The result is an unnamed outer table in `settingsbuf`
+ * (load_settings) or `conf` holding one table per file, named after the
+ * file's path. A file that cannot be loaded is reported and left as an empty
+ * table.
+ *
+ * Returns 0, or -ENOMEM when a glob pattern cannot be allocated.
+ */
+static int conf_load(bool load_settings)
+{
+       const char *subdir = load_settings ? "settings/" : "";
+       int gl_flags = GLOB_NOESCAPE | GLOB_MARK;
+       struct blob_buf *target;
+       struct stat sb;
+       char *globstr;
+       void *c, *o;
+       glob_t gl;
+       size_t j;
+
+       if (asprintf(&globstr, "%s/%s*.json", UXC_ETC_CONFDIR, subdir) == -1)
+               return -ENOMEM;
+
+       /* Only append to gl when the first glob() left a valid result in it */
+       if (glob(globstr, gl_flags, NULL, &gl) == 0)
+               gl_flags |= GLOB_APPEND;
+
+       free(globstr);
+
+       /*
+        * S_ISDIR() compares the whole file-type field; testing the S_IFDIR
+        * bit alone is also true for block devices and sockets.
+        */
+       if (!stat(UXC_VOL_CONFDIR, &sb) && S_ISDIR(sb.st_mode)) {
+               if (asprintf(&globstr, "%s/%s*.json", UXC_VOL_CONFDIR, subdir) == -1) {
+                       globfree(&gl);
+                       return -ENOMEM;
+               }
+
+               glob(globstr, gl_flags, NULL, &gl);
+               free(globstr);
+       }
+
+       target = load_settings ? &settingsbuf : &conf;
+       blob_buf_init(target, 0);
+       c = blobmsg_open_table(target, NULL);
+
+       /*
+        * glob() reports failures with positive codes, so there is no negative
+        * result to bail out on; when nothing matched, gl.gl_pathc is 0 and the
+        * loop below simply does not run.
+        */
+       for (j = 0; j < gl.gl_pathc; j++) {
+               /* blobmsg copies the table name, so the path needs no strdup() */
+               o = blobmsg_open_table(target, gl.gl_pathv[j]);
+               if (!blobmsg_add_json_from_file(target, gl.gl_pathv[j]))
+                       ERROR("uxc: failed to load %s\n", gl.gl_pathv[j]);
+
+               /*
+                * Close the per-file table even on failure; leaving it open
+                * would nest every following file inside the broken one.
+                */
+               blobmsg_close_table(target, o);
+       }
+       blobmsg_close_table(target, c);
+       globfree(&gl);
+
+       return 0;
+}
+
+/*
+ * Create a settings record for container_name.
+ *
+ * The name is copied into storage obtained together with the record from
+ * calloc_a() and doubles as the AVL key. autostart starts out as -1 ("not
+ * specified"); the overlay and volume fields start out as NULL.
+ */
+static struct settings *
+settings_alloc(const char *container_name)
+{
+       struct settings *entry;
+       char *key;
+
+       entry = calloc_a(sizeof(*entry), &key, strlen(container_name) + 1);
+       entry->container_name = strcpy(key, container_name);
+       entry->avl.key = entry->container_name;
+
+       entry->autostart = -1;
+       entry->tmprwsize = NULL;
+       entry->writepath = NULL;
+       entry->volumes = NULL;
+
+       return entry;
+}
+
+/*
+ * Rebuild the `settings` tree from the entries held in settingsbuf.
+ *
+ * Entries without a "name" are skipped. An entry that sets both
+ * "temp-overlay-size" and "write-overlay-path" aborts the walk with -EINVAL,
+ * leaving the records added so far in the tree. A record that cannot be
+ * inserted is reported on stderr and freed.
+ *
+ * Returns 0 on success, -EINVAL on a conflicting entry.
+ */
+static int settings_add(void)
+{
+       struct blob_attr *entry, *fields[__CONF_MAX];
+       struct settings *item;
+       const char *name;
+       int remaining;
+
+       avl_init(&settings, avl_strcmp, false, NULL);
+
+       blobmsg_for_each_attr(entry, blob_data(settingsbuf.head), remaining) {
+               blobmsg_parse(conf_policy, __CONF_MAX, fields, blobmsg_data(entry), blobmsg_len(entry));
+               if (!fields[CONF_NAME])
+                       continue;
+
+               if (fields[CONF_TEMP_OVERLAY_SIZE] && fields[CONF_WRITE_OVERLAY_PATH])
+                       return -EINVAL;
+
+               name = blobmsg_get_string(fields[CONF_NAME]);
+               item = settings_alloc(name);
+
+               /* String fields point into settingsbuf rather than being copied */
+               item->fname = blobmsg_name(entry);
+               item->volumes = fields[CONF_VOLUMES];
+
+               if (fields[CONF_AUTOSTART])
+                       item->autostart = blobmsg_get_bool(fields[CONF_AUTOSTART]);
+
+               if (fields[CONF_TEMP_OVERLAY_SIZE])
+                       item->tmprwsize = blobmsg_get_string(fields[CONF_TEMP_OVERLAY_SIZE]);
+
+               if (fields[CONF_WRITE_OVERLAY_PATH])
+                       item->writepath = blobmsg_get_string(fields[CONF_WRITE_OVERLAY_PATH]);
+
+               if (avl_insert(&settings, &item->avl)) {
+                       fprintf(stderr, "error adding settings for %s\n", name);
+                       free(item);
+               }
+       }
+
+       return 0;
+}
+
+/* Unlink and free every record in the `settings` tree. */
+static void settings_free(void)
+{
+       struct settings *entry, *next;
+
+       /* The _safe iterator keeps `next` valid while `entry` is being removed */
+       avl_for_each_element_safe(&settings, entry, avl, next) {
+               avl_delete(&settings, &entry->avl);
+               free(entry);
+       }
+}
+
+/* Policies used by list_cb() to walk the reply of the container "list" call */
+enum {
+       LIST_INSTANCES,
+       __LIST_MAX,
+};
+
+/* Per-container level: only the "instances" table is read */
+static const struct blobmsg_policy list_policy[__LIST_MAX] = {
+       [LIST_INSTANCES] = { .name = "instances", .type = BLOBMSG_TYPE_TABLE },
+};
+
+enum {
+       INSTANCE_RUNNING,
+       INSTANCE_PID,
+       INSTANCE_EXITCODE,
+       INSTANCE_JAIL,
+       __INSTANCE_MAX,
+};
+
+/* Per-instance level: list_cb() ignores instances without a "jail" table */
+static const struct blobmsg_policy instance_policy[__INSTANCE_MAX] = {
+       [INSTANCE_RUNNING] = { .name = "running", .type = BLOBMSG_TYPE_BOOL },
+       [INSTANCE_PID] = { .name = "pid", .type = BLOBMSG_TYPE_INT32 },
+       [INSTANCE_EXITCODE] = { .name = "exit_code", .type = BLOBMSG_TYPE_INT32 },
+       [INSTANCE_JAIL] = { .name = "jail", .type = BLOBMSG_TYPE_TABLE },
+};
+
+enum {
+       JAIL_NAME,
+       __JAIL_MAX,
+};
+
+/* Inside "jail": list_cb() ignores jails without a "name" */
+static const struct blobmsg_policy jail_policy[__JAIL_MAX] = {
+       [JAIL_NAME] = { .name = "name", .type = BLOBMSG_TYPE_STRING },
+};
+
+/*
+ * Create a zero-initialised runtime_state for container_name.
+ *
+ * The name is copied into storage obtained together with the record from
+ * calloc_a() and doubles as the AVL key.
+ */
+static struct runtime_state *
+runtime_alloc(const char *container_name)
+{
+       struct runtime_state *state;
+       char *key;
+
+       state = calloc_a(sizeof(*state), &key, strlen(container_name) + 1);
+       state->container_name = strcpy(key, container_name);
+       state->avl.key = state->container_name;
+
+       return state;
+}
+
+/* Indices into state_policy and into the table filled by blobmsg_parse() */
+enum {
+       STATE_OCIVERSION,
+       STATE_ID,
+       STATE_STATUS,
+       STATE_PID,
+       STATE_BUNDLE,
+       STATE_ANNOTATIONS,
+       __STATE_MAX,
+};
+
+/*
+ * Fields of the reply to a container's "state" call. ocistate_cb() accepts a
+ * reply only when "ociVersion", "id", "status" and "bundle" are all present.
+ */
+static const struct blobmsg_policy state_policy[__STATE_MAX] = {
+       [STATE_OCIVERSION] = { .name = "ociVersion", .type = BLOBMSG_TYPE_STRING },
+       [STATE_ID] = { .name = "id", .type = BLOBMSG_TYPE_STRING },
+       [STATE_STATUS] = { .name = "status", .type = BLOBMSG_TYPE_STRING },
+       [STATE_PID] = { .name = "pid", .type = BLOBMSG_TYPE_INT32 },
+       [STATE_BUNDLE] = { .name = "bundle", .type = BLOBMSG_TYPE_STRING },
+       [STATE_ANNOTATIONS] = { .name = "annotations", .type = BLOBMSG_TYPE_TABLE },
+};
+
+
+/*
+ * ubus data callback for a container's "state" call.
+ *
+ * req->priv points at the caller's `struct blob_attr *`. It receives a
+ * blob_memdup() copy of msg when the reply carries "ociVersion", "id",
+ * "status" and "bundle", and is left untouched otherwise.
+ */
+static void ocistate_cb(struct ubus_request *req, int type, struct blob_attr *msg)
+{
+       static const int required[] = {
+               STATE_OCIVERSION, STATE_ID, STATE_STATUS, STATE_BUNDLE,
+       };
+       struct blob_attr **result = (struct blob_attr **)req->priv;
+       struct blob_attr *fields[__STATE_MAX];
+       size_t i;
+
+       blobmsg_parse(state_policy, __STATE_MAX, fields, blobmsg_data(msg), blobmsg_len(msg));
+
+       for (i = 0; i < sizeof(required) / sizeof(required[0]); i++) {
+               if (!fields[required[i]])
+                       return;
+       }
+
+       *result = blob_memdup(msg);
+}
+
+/*
+ * Fetch the OCI state of the ubus object "container.<name>".
+ *
+ * *ocistate is reset to NULL first and is filled by ocistate_cb() only when
+ * the object exists and returns a complete reply. The call exits the process
+ * with ENOMEM if the object name cannot be allocated.
+ */
+static void get_ocistate(struct blob_attr **ocistate, const char *name)
+{
+       char *object_path;
+       unsigned int object_id;
+       int lookup_err;
+
+       *ocistate = NULL;
+
+       if (asprintf(&object_path, "container.%s", name) == -1)
+               exit(ENOMEM);
+
+       lookup_err = ubus_lookup_id(ctx, object_path, &object_id);
+       free(object_path);
+
+       /* 3000 is the timeout handed to ubus_invoke() */
+       if (!lookup_err)
+               ubus_invoke(ctx, object_id, "state", NULL, ocistate_cb, ocistate, 3000);
+}
+
+/*
+ * ubus data callback for the "list" call on the "container" object.
+ *
+ * Every instance that carries a "jail" table with a "name" becomes a
+ * runtime_state keyed by its container name in the `runtime` tree. "pid" and
+ * "exit_code" default to -1 and "running" to false when the reply omits them.
+ *
+ * The tree is created without duplicate keys, so avl_insert() fails for a
+ * second jailed instance of the same container. Such records, and records
+ * whose name copies could not be allocated, are freed here instead of being
+ * leaked or inserted half-initialised.
+ */
+static void list_cb(struct ubus_request *req, int type, struct blob_attr *msg)
+{
+       struct blob_attr *cur, *curi, *tl[__LIST_MAX], *ti[__INSTANCE_MAX], *tj[__JAIL_MAX];
+       struct runtime_state *rs;
+       int rem, remi;
+
+       blobmsg_for_each_attr(cur, msg, rem) {
+               blobmsg_parse(list_policy, __LIST_MAX, tl, blobmsg_data(cur), blobmsg_len(cur));
+               if (!tl[LIST_INSTANCES])
+                       continue;
+
+               blobmsg_for_each_attr(curi, tl[LIST_INSTANCES], remi) {
+                       blobmsg_parse(instance_policy, __INSTANCE_MAX, ti, blobmsg_data(curi), blobmsg_len(curi));
+
+                       if (!ti[INSTANCE_JAIL])
+                               continue;
+
+                       blobmsg_parse(jail_policy, __JAIL_MAX, tj, blobmsg_data(ti[INSTANCE_JAIL]), blobmsg_len(ti[INSTANCE_JAIL]));
+                       if (!tj[JAIL_NAME])
+                               continue;
+
+                       rs = runtime_alloc(blobmsg_name(cur));
+                       rs->instance_name = strdup(blobmsg_name(curi));
+                       rs->jail_name = strdup(blobmsg_get_string(tj[JAIL_NAME]));
+                       rs->running = ti[INSTANCE_RUNNING] && blobmsg_get_bool(ti[INSTANCE_RUNNING]);
+                       rs->runtime_pid = ti[INSTANCE_PID] ? (int)blobmsg_get_u32(ti[INSTANCE_PID]) : -1;
+                       rs->exitcode = ti[INSTANCE_EXITCODE] ? (int)blobmsg_get_u32(ti[INSTANCE_EXITCODE]) : -1;
+
+                       /* Nothing else references rs yet, so release it on any failure */
+                       if (!rs->instance_name || !rs->jail_name ||
+                           avl_insert(&runtime, &rs->avl)) {
+                               free(rs->instance_name);
+                               free(rs->jail_name);
+                               free(rs);
+                       }
+               }
+       }
+}
+
+/*
+ * Initialise the global 'runtime' tree and populate it from the "list"
+ * method of the "container" ubus object, then fetch the OCI state of
+ * every entry found.
+ *
+ * Returns 0 on success, -EIO if the object lookup or the call fails.
+ */
+static int runtime_load(void)
+{
+       struct runtime_state *entry, *next;
+       uint32_t container_id;
+
+       avl_init(&runtime, avl_strcmp, false, NULL);
+
+       if (ubus_lookup_id(ctx, "container", &container_id))
+               return -EIO;
+
+       if (ubus_invoke(ctx, container_id, "list", NULL, list_cb, &runtime, 3000))
+               return -EIO;
+
+       avl_for_each_element_safe(&runtime, entry, avl, next)
+               get_ocistate(&entry->ocistate, entry->jail_name);
+
+       return 0;
+}
+
+/*
+ * Remove every entry from the global 'runtime' tree and release the
+ * entry together with the strings and OCI state blob it owns.
+ */
+static void runtime_free(void)
+{
+       struct runtime_state *entry, *next;
+
+       avl_for_each_element_safe(&runtime, entry, avl, next) {
+               avl_delete(&runtime, &entry->avl);
+
+               free(entry->ocistate);
+               free(entry->jail_name);
+               free(entry->instance_name);
+               free(entry);
+       }
+}
+
+/*
+ * Switch the terminal behind 'fd' to the raw-ish mode used for console
+ * forwarding and store the previous settings in *oldtios.
+ *
+ * Returns 0 on success, -EIO if 'fd' is not a terminal, or -errno if
+ * reading or writing the terminal attributes fails.
+ */
+static inline int setup_tios(int fd, struct termios *oldtios)
+{
+       /* We use the same settings that ssh does. */
+       const tcflag_t input_off = ISTRIP | INLCR | IGNCR | ICRNL | IXON | IXANY | IXOFF;
+       const tcflag_t local_off = TOSTOP | ISIG | ICANON | ECHO | ECHOE | ECHOK | ECHONL;
+       struct termios raw;
+
+       if (!isatty(fd))
+               return -EIO;
+
+       /* remember the current settings for the caller */
+       if (tcgetattr(fd, oldtios) < 0)
+               return -errno;
+
+       raw = *oldtios;
+       raw.c_iflag = (raw.c_iflag | IGNPAR) & ~input_off;
+       raw.c_lflag &= ~local_off;
+       raw.c_oflag = (raw.c_oflag & ~ONLCR) | OPOST;
+       raw.c_cc[VMIN] = 1;
+       raw.c_cc[VTIME] = 0;
+
+       if (tcsetattr(fd, TCSAFLUSH, &raw) < 0)
+               return -errno;
+
+       return 0;
+}
+
+
+/*
+ * Read notification for the container-side stream: forward everything
+ * that is pending on 's' to the local terminal stream (lufd).
+ *
+ * The loop keeps draining read buffers as long as each one is written
+ * completely; it stops when nothing is left to read or when a write is
+ * short or fails, leaving the unconsumed data queued on 's'.
+ */
+static void client_cb(struct ustream *s, int bytes)
+{
+       char *buf;
+       int len, rv;
+
+       do {
+               buf = ustream_get_read_buf(s, &len);
+               if (!buf)
+                       break;
+
+               rv = ustream_write(&lufd.stream, buf, len, false);
+
+               if (rv > 0)
+                       ustream_consume(s, rv);
+
+               /*
+                * rv can never exceed len, so the former "rv <= len" test
+                * always ended the loop after one buffer. Only stop when the
+                * write was short or failed.
+                */
+               if (rv < len)
+                       break;
+       } while(1);
+}
+
+/*
+ * Read notification for the local terminal stream: forward everything
+ * that is pending on 's' to the container-side stream (cufd).
+ *
+ * A read buffer starting with byte 0x02 (CTRL+B) requests the end of the
+ * uloop main loop; the data is still forwarded as before.
+ *
+ * The loop keeps draining read buffers as long as each one is written
+ * completely; it stops when nothing is left to read or when a write is
+ * short or fails, leaving the unconsumed data queued on 's'.
+ */
+static void local_cb(struct ustream *s, int bytes)
+{
+       char *buf;
+       int len, rv;
+
+       do {
+               buf = ustream_get_read_buf(s, &len);
+               if (!buf)
+                       break;
+
+               /* detach request: CTRL+B as first byte of the buffer */
+               if ((len > 0) && (buf[0] == 2))
+                       uloop_end();
+
+               rv = ustream_write(&cufd.stream, buf, len, false);
+
+               if (rv > 0)
+                       ustream_consume(s, rv);
+
+               /*
+                * rv can never exceed len, so the former "rv <= len" test
+                * always ended the loop after one buffer. Only stop when the
+                * write was short or failed.
+                */
+               if (rv < len)
+                       break;
+       } while(1);
+}
+
+/*
+ * Attach the local terminal to the console of a container.
+ *
+ * A pseudo-terminal pair is created; the slave end is handed to the
+ * "console_attach" method of the "container" ubus object and the master
+ * end is bridged with /dev/tty inside a uloop main loop until local_cb
+ * ends the loop. A private ubus connection is used, not the global one.
+ *
+ * Returns 0 after a finished session, -ECONNREFUSED if ubus cannot be
+ * reached, -EIO if one of the terminals cannot be opened and -ENXIO if
+ * the ubus request fails.
+ */
+static int uxc_attach(const char *container_name)
+{
+       struct ubus_context *ctx;
+       uint32_t id;
+       static struct blob_buf req;
+       int client_fd, server_fd, tty_fd;
+       /*
+        * Only the settings of the local tty need restoring, so the pty
+        * ends save into a scratch copy and cannot clobber oldtermios.
+        */
+       struct termios oldtermios, ptytermios;
+       bool tty_changed;
+
+       ctx = ubus_connect(NULL);
+       if (!ctx) {
+               fprintf(stderr, "can't connect to ubus!\n");
+               return -ECONNREFUSED;
+       }
+
+       /* open pseudo-terminal pair */
+       client_fd = posix_openpt(O_RDWR | O_NOCTTY);
+       if (client_fd < 0) {
+               fprintf(stderr, "can't create virtual console!\n");
+               ubus_free(ctx);
+               return -EIO;
+       }
+       setup_tios(client_fd, &ptytermios);
+       grantpt(client_fd);
+       unlockpt(client_fd);
+       server_fd = open(ptsname(client_fd), O_RDWR | O_NOCTTY);
+       if (server_fd < 0) {
+               fprintf(stderr, "can't open virtual console!\n");
+               close(client_fd);
+               ubus_free(ctx);
+               return -EIO;
+       }
+       setup_tios(server_fd, &ptytermios);
+
+       tty_fd = open("/dev/tty", O_RDWR);
+       if (tty_fd < 0) {
+               fprintf(stderr, "can't open local console!\n");
+               close(server_fd);
+               close(client_fd);
+               ubus_free(ctx);
+               return -EIO;
+       }
+       /* oldtermios is valid only if the switch to raw mode succeeded */
+       tty_changed = !setup_tios(tty_fd, &oldtermios);
+
+       /* register server-side with procd */
+       blob_buf_init(&req, 0);
+       blobmsg_add_string(&req, "name", container_name);
+       blobmsg_add_string(&req, "instance", container_name);
+
+       if (ubus_lookup_id(ctx, "container", &id) ||
+           ubus_invoke_fd(ctx, id, "console_attach", req.head, NULL, NULL, 3000, server_fd)) {
+               fprintf(stderr, "ubus request failed\n");
+               if (tty_changed)
+                       tcsetattr(tty_fd, TCSAFLUSH, &oldtermios);
+               close(tty_fd);
+               close(server_fd);
+               close(client_fd);
+               blob_buf_free(&req);
+               ubus_free(ctx);
+               return -ENXIO;
+       }
+
+       close(server_fd);
+       blob_buf_free(&req);
+       ubus_free(ctx);
+
+       uloop_init();
+
+       /* forward between stdio and client_fd until detach is requested */
+       lufd.stream.notify_read = local_cb;
+       ustream_fd_init(&lufd, tty_fd);
+
+       cufd.stream.notify_read = client_cb;
+/* ToDo: handle remote close and other events */
+//     cufd.stream.notify_state = client_state_cb;
+       ustream_fd_init(&cufd, client_fd);
+
+       fprintf(stderr, "attaching to jail console. press [CTRL]+[B] to exit.\n");
+       close(0);
+       close(1);
+       close(2);
+       uloop_run();
+
+       /* put the user's terminal back the way it was found */
+       if (tty_changed)
+               tcsetattr(tty_fd, TCSAFLUSH, &oldtermios);
+       ustream_free(&lufd.stream);
+       ustream_free(&cufd.stream);
+       close(client_fd);
+       close(tty_fd);
+
+       return 0;
+}
+
+/*
+ * Print the state of container 'name' as JSON on stdout.
+ *
+ * If an OCI state blob was recorded for the container it is printed
+ * as-is. Otherwise a minimal state object is built from the matching
+ * configuration entry, with status "stopped" when a runtime entry exists
+ * and "uninitialized" when it does not.
+ *
+ * Returns 0 on success, -ENOENT if no configuration entry matches and
+ * -ENOMEM if the JSON text cannot be produced.
+ */
+static int uxc_state(char *name)
+{
+       struct runtime_state *rt = avl_find_element(&runtime, name, rt, avl);
+       struct blob_attr *entry, *tb[__CONF_MAX];
+       static struct blob_buf buf;
+       const char *status;
+       char *bundle_path = NULL;
+       char *jail_id = NULL;
+       char *json;
+       int rem;
+
+       /* fast path: state reported for the container */
+       if (rt && rt->ocistate) {
+               json = blobmsg_format_json_indent(rt->ocistate, true, 0);
+               if (!json)
+                       return -ENOMEM;
+
+               printf("%s\n", json);
+               free(json);
+               return 0;
+       }
+
+       blobmsg_for_each_attr(entry, blob_data(conf.head), rem) {
+               blobmsg_parse(conf_policy, __CONF_MAX, tb, blobmsg_data(entry), blobmsg_len(entry));
+               if (!tb[CONF_NAME] || !tb[CONF_PATH])
+                       continue;
+
+               if (strcmp(name, blobmsg_get_string(tb[CONF_NAME])))
+                       continue;
+
+               /* the jail is named after the container unless configured */
+               jail_id = tb[CONF_JAIL] ? blobmsg_get_string(tb[CONF_JAIL]) : name;
+               bundle_path = blobmsg_get_string(tb[CONF_PATH]);
+               break;
+       }
+
+       if (!bundle_path)
+               return -ENOENT;
+
+       status = rt ? "stopped" : "uninitialized";
+
+       blob_buf_init(&buf, 0);
+       blobmsg_add_string(&buf, "ociVersion", OCI_VERSION_STRING);
+       blobmsg_add_string(&buf, "id", jail_id);
+       blobmsg_add_string(&buf, "status", status);
+       blobmsg_add_string(&buf, "bundle", bundle_path);
+
+       json = blobmsg_format_json_indent(buf.head, true, 0);
+       blob_buf_free(&buf);
+       if (!json)
+               return -ENOMEM;
+
+       printf("%s\n", json);
+       free(json);
+
+       return 0;
+}
+
+/*
+ * Print one line (or one JSON object when json_output is set) for every
+ * configured container: name, status, autostart flag and, where known,
+ * exit code, runtime pid and container pid.
+ *
+ * The status is taken from the recorded OCI state if there is one;
+ * otherwise it is "creating" for a running runtime entry and "stopped"
+ * for everything else. A per-container settings entry overrides the
+ * autostart flag from the configuration.
+ *
+ * Returns 0 on success or -ENOMEM if the JSON text cannot be produced.
+ */
+static int uxc_list(void)
+{
+       struct blob_attr *cur, *tb[__CONF_MAX], *ts[__STATE_MAX];
+       int rem;
+       struct runtime_state *rsstate = NULL;
+       struct settings *usettings = NULL;
+       char *name, *ocistatus, *status, *tmp;
+       int container_pid;
+       bool autostart;
+       static struct blob_buf buf;
+       void *arr = NULL, *obj = NULL;
+
+       if (json_output) {
+               blob_buf_init(&buf, 0);
+               arr = blobmsg_open_array(&buf, "");
+       }
+
+       blobmsg_for_each_attr(cur, blob_data(conf.head), rem) {
+               blobmsg_parse(conf_policy, __CONF_MAX, tb, blobmsg_data(cur), blobmsg_len(cur));
+               if (!tb[CONF_NAME] || !tb[CONF_PATH])
+                       continue;
+
+               autostart = tb[CONF_AUTOSTART] && blobmsg_get_bool(tb[CONF_AUTOSTART]);
+
+               ocistatus = NULL;
+               /* -1 means "unknown" so that no bogus pid 0 is reported */
+               container_pid = -1;
+               name = blobmsg_get_string(tb[CONF_NAME]);
+               rsstate = avl_find_element(&runtime, name, rsstate, avl);
+
+               if (rsstate && rsstate->ocistate) {
+                       blobmsg_parse(state_policy, __STATE_MAX, ts, blobmsg_data(rsstate->ocistate), blobmsg_len(rsstate->ocistate));
+                       /* either attribute may be absent from the state blob */
+                       if (ts[STATE_STATUS])
+                               ocistatus = blobmsg_get_string(ts[STATE_STATUS]);
+
+                       if (ts[STATE_PID])
+                               container_pid = blobmsg_get_u32(ts[STATE_PID]);
+               }
+
+               if (ocistatus)
+                       status = ocistatus;
+               else if (rsstate && rsstate->running)
+                       status = "creating";
+               else
+                       status = "stopped";
+
+               usettings = avl_find_element(&settings, name, usettings, avl);
+
+               if (usettings && (usettings->autostart >= 0))
+                       autostart = !!(usettings->autostart);
+
+               if (json_output) {
+                       obj = blobmsg_open_table(&buf, "");
+                       blobmsg_add_string(&buf, "name", name);
+                       blobmsg_add_string(&buf, "status", status);
+                       blobmsg_add_u8(&buf, "autostart", autostart);
+               } else {
+                       printf("[%c] %s %s", autostart?'*':' ', name, status);
+               }
+
+               if (rsstate && !rsstate->running && (rsstate->exitcode >= 0)) {
+                       if (json_output)
+                               blobmsg_add_u32(&buf, "exitcode", rsstate->exitcode);
+                       else
+                               printf(" exitcode: %d (%s)", rsstate->exitcode, strerror(rsstate->exitcode));
+               }
+
+               if (rsstate && rsstate->running && (rsstate->runtime_pid >= 0)) {
+                       if (json_output)
+                               blobmsg_add_u32(&buf, "runtime_pid", rsstate->runtime_pid);
+                       else
+                               printf(" runtime pid: %d", rsstate->runtime_pid);
+               }
+
+               if (rsstate && rsstate->running && (container_pid >= 0)) {
+                       if (json_output)
+                               blobmsg_add_u32(&buf, "container_pid", container_pid);
+                       else
+                               printf(" container pid: %d", container_pid);
+               }
+
+               if (!json_output)
+                       printf("\n");
+               else
+                       blobmsg_close_table(&buf, obj);
+       }
+
+       if (json_output) {
+               blobmsg_close_array(&buf, arr);
+               tmp = blobmsg_format_json_indent(buf.head, true, 0);
+               blob_buf_free(&buf);
+               if (!tmp)
+                       return -ENOMEM;
+
+               printf("%s\n", tmp);
+               free(tmp);
+       }
+
+       return 0;
+}
+
+/*
+ * Returns -EEXIST if the runtime tree has an entry for 'name' that is
+ * marked as running, 0 otherwise.
+ */
+static int uxc_exists(char *name)
+{
+       struct runtime_state *rt = avl_find_element(&runtime, name, rt, avl);
+
+       return (rt && rt->running) ? -EEXIST : 0;
+}
+
+/*
+ * Register container 'name' with procd through the "add" method of the
+ * "container" ubus object.
+ *
+ * The request is built from the matching configuration entry (bundle
+ * path, jail name, pidfile, overlay options). A per-container settings
+ * entry overrides the overlay options; a write path and a temporary
+ * overlay size are mutually exclusive there. 'immediately' is passed
+ * through in the jail table of the request.
+ *
+ * Returns 0 on success, -ENOENT if the container is not configured,
+ * -ENOMEM if the verbose dump cannot be produced and -EIO if the ubus
+ * lookup or call fails. The request buffer is released on every path.
+ */
+static int uxc_create(char *name, bool immediately)
+{
+       static struct blob_buf req;
+       struct blob_attr *cur, *tb[__CONF_MAX];
+       int rem, ret = 0;
+       uint32_t id;
+       struct settings *usettings = NULL;
+       char *path = NULL, *jailname = NULL, *pidfile = NULL, *tmprwsize = NULL, *writepath = NULL;
+
+       void *in, *ins, *j;
+       bool found = false;
+
+       blobmsg_for_each_attr(cur, blob_data(conf.head), rem) {
+               blobmsg_parse(conf_policy, __CONF_MAX, tb, blobmsg_data(cur), blobmsg_len(cur));
+               if (!tb[CONF_NAME] || !tb[CONF_PATH])
+                       continue;
+
+               if (strcmp(name, blobmsg_get_string(tb[CONF_NAME])))
+                       continue;
+
+               found = true;
+               break;
+       }
+
+       if (!found)
+               return -ENOENT;
+
+       /* tb still holds the attributes of the matching entry */
+       path = blobmsg_get_string(tb[CONF_PATH]);
+
+       if (tb[CONF_PIDFILE])
+               pidfile = blobmsg_get_string(tb[CONF_PIDFILE]);
+
+       if (tb[CONF_TEMP_OVERLAY_SIZE])
+               tmprwsize = blobmsg_get_string(tb[CONF_TEMP_OVERLAY_SIZE]);
+
+       if (tb[CONF_WRITE_OVERLAY_PATH])
+               writepath = blobmsg_get_string(tb[CONF_WRITE_OVERLAY_PATH]);
+
+       if (tb[CONF_JAIL])
+               jailname = blobmsg_get_string(tb[CONF_JAIL]);
+
+       usettings = avl_find_element(&settings, blobmsg_get_string(tb[CONF_NAME]), usettings, avl);
+       if (usettings) {
+               if (usettings->writepath) {
+                       writepath = usettings->writepath;
+                       tmprwsize = NULL;
+               }
+               if (usettings->tmprwsize) {
+                       tmprwsize = usettings->tmprwsize;
+                       writepath = NULL;
+               }
+       }
+
+       blob_buf_init(&req, 0);
+       blobmsg_add_string(&req, "name", name);
+       ins = blobmsg_open_table(&req, "instances");
+       in = blobmsg_open_table(&req, name);
+       blobmsg_add_string(&req, "bundle", path);
+       j = blobmsg_open_table(&req, "jail");
+       blobmsg_add_string(&req, "name", jailname?:name);
+       blobmsg_add_u8(&req, "immediately", immediately);
+
+       if (pidfile)
+               blobmsg_add_string(&req, "pidfile", pidfile);
+
+       blobmsg_close_table(&req, j);
+
+       if (writepath)
+               blobmsg_add_string(&req, "overlaydir", writepath);
+
+       if (tmprwsize)
+               blobmsg_add_string(&req, "tmpoverlaysize", tmprwsize);
+
+       blobmsg_close_table(&req, in);
+       blobmsg_close_table(&req, ins);
+
+       if (verbose) {
+               char *tmp;
+               tmp = blobmsg_format_json_indent(req.head, true, 1);
+               if (!tmp) {
+                       ret = -ENOMEM;
+                       goto out;
+               }
+
+               fprintf(stderr, "adding container to procd:\n\t%s\n", tmp);
+               free(tmp);
+       }
+
+       if (ubus_lookup_id(ctx, "container", &id) ||
+               ubus_invoke(ctx, id, "add", req.head, NULL, NULL, 3000))
+               ret = -EIO;
+
+out:
+       /* the request is no longer needed, whether or not the call worked */
+       blob_buf_free(&req);
+       return ret;
+}
+
+/*
+ * Invoke the "start" method of the ubus object "container.<name>".
+ *
+ * With 'console' set the process forks first: the parent attaches to the
+ * container console and exits with the result of uxc_attach(), while the
+ * child carries on and issues the start request.
+ *
+ * Returns the result of the ubus call, -errno if fork() fails, -ENOMEM
+ * if the object name cannot be allocated and -ENOENT if the object does
+ * not exist.
+ */
+static int uxc_start(const char *name, bool console)
+{
+       char *objname;
+       unsigned int id;
+       pid_t pid;
+       int ret;
+
+       if (console) {
+               pid = fork();
+               /* without a second process nobody could serve the console */
+               if (pid < 0)
+                       return -errno;
+
+               if (pid > 0)
+                       exit(uxc_attach(name));
+       }
+
+       if (asprintf(&objname, "container.%s", name) == -1)
+               return -ENOMEM;
+
+       /* release the name before acting on the result to avoid a leak */
+       ret = ubus_lookup_id(ctx, objname, &id);
+       free(objname);
+       if (ret)
+               return -ENOENT;
+
+       return ubus_invoke(ctx, id, "start", NULL, NULL, NULL, 3000);
+}
+
+/*
+ * Send 'signal' to the running container 'name' through the "kill"
+ * method of the ubus object "container.<name>".
+ *
+ * Returns 0 on success, -ENOENT if the container is not configured, not
+ * running or its ubus object is missing, -ENOMEM if the object name
+ * cannot be allocated and -EIO if the ubus call fails. The request
+ * buffer is released on every path.
+ */
+static int uxc_kill(char *name, int signal)
+{
+       static struct blob_buf req;
+       struct blob_attr *cur, *tb[__CONF_MAX];
+       int rem, ret;
+       char *objname;
+       unsigned int id;
+       struct runtime_state *rsstate = NULL;
+       bool found = false;
+
+       blobmsg_for_each_attr(cur, blob_data(conf.head), rem) {
+               blobmsg_parse(conf_policy, __CONF_MAX, tb, blobmsg_data(cur), blobmsg_len(cur));
+               if (!tb[CONF_NAME] || !tb[CONF_PATH])
+                       continue;
+
+               if (strcmp(name, blobmsg_get_string(tb[CONF_NAME])))
+                       continue;
+
+               found = true;
+               break;
+       }
+
+       if (!found)
+               return -ENOENT;
+
+       rsstate = avl_find_element(&runtime, name, rsstate, avl);
+
+       if (!rsstate || !(rsstate->running))
+               return -ENOENT;
+
+       /* resolve the object before allocating the request buffer */
+       if (asprintf(&objname, "container.%s", name) == -1)
+               return -ENOMEM;
+
+       ret = ubus_lookup_id(ctx, objname, &id);
+       free(objname);
+       if (ret)
+               return -ENOENT;
+
+       blob_buf_init(&req, 0);
+       blobmsg_add_u32(&req, "signal", signal);
+       blobmsg_add_string(&req, "name", name);
+
+       ret = ubus_invoke(ctx, id, "kill", req.head, NULL, NULL, 3000) ? -EIO : 0;
+       blob_buf_free(&req);
+
+       return ret;
+}
+
+
+/*
+ * Open (create/truncate) "<dir>/settings/<name>.json" for writing, where
+ * <dir> is the directory part of the configuration file 'cfname'. The
+ * settings directory is created if it does not exist yet.
+ *
+ * Returns the file descriptor, or a negative errno value on failure.
+ */
+static int uxc_set_open_settings_file(const char *cfname, const char *name)
+{
+       char *confbase, *slash, *path;
+       int fd, err;
+
+       confbase = strdup(cfname);
+       if (!confbase)
+               return -ENOMEM;
+
+       slash = strrchr(confbase, '/');
+       if (!slash) {
+               free(confbase);
+               return -EINVAL;
+       }
+       *slash = '\0';
+
+       if (asprintf(&path, "%s/settings", confbase) == -1) {
+               free(confbase);
+               return -ENOMEM;
+       }
+
+       /* mkdir() reports failure through errno, not its return value */
+       if (mkdir(path, 0755) && errno != EEXIST) {
+               err = errno;
+               free(path);
+               free(confbase);
+               return -err;
+       }
+       free(path);
+
+       if (asprintf(&path, "%s/settings/%s.json", confbase, name) == -1) {
+               free(confbase);
+               return -ENOMEM;
+       }
+       free(confbase);
+
+       fd = open(path, O_WRONLY | O_CREAT | O_TRUNC, 0644);
+       err = errno;
+       free(path);
+
+       return (fd < 0) ? -err : fd;
+}
+
+/*
+ * Create the configuration of a new container or update the settings of
+ * an existing one.
+ *
+ * With 'path' set, a new "<confdir>/<name>.json" is written; the bundle
+ * directory must exist and the name must not be configured yet. Without
+ * 'path', the settings file of the existing container 'name' is
+ * (re)written, keeping overlay, autostart and volume values of the
+ * current settings entry that are not overridden by the arguments.
+ * 'autostart' < 0 means "not given"; 'requiredmounts' is a list of
+ * volume names separated by ',' or ';' and is split in place.
+ *
+ * Returns 1 when a file was written, 0 when there was nothing to do and
+ * a negative errno value on failure.
+ */
+static int uxc_set(char *name, char *path, signed char autostart, char *pidfile, char *tmprwsize, char *writepath, char *requiredmounts)
+{
+       static struct blob_buf req;
+       struct settings *usettings = NULL;
+       struct blob_attr *cur, *tb[__CONF_MAX];
+       int rem, ret;
+       const char *cfname = NULL;
+       const char *sfname = NULL;
+       char *fname = NULL;
+       char *curvol, *tmp, *mnttok;
+       void *mntarr;
+       int f;
+       struct stat sb;
+
+       /* nothing to do */
+       if (!path && (autostart<0) && !pidfile && !tmprwsize && !writepath && !requiredmounts)
+               return 0;
+
+       blobmsg_for_each_attr(cur, blob_data(conf.head), rem) {
+               blobmsg_parse(conf_policy, __CONF_MAX, tb, blobmsg_data(cur), blobmsg_len(cur));
+               if (!tb[CONF_NAME] || !tb[CONF_PATH])
+                       continue;
+
+               if (strcmp(name, blobmsg_get_string(tb[CONF_NAME])))
+                       continue;
+
+               cfname = blobmsg_name(cur);
+               break;
+       }
+
+       if (cfname && path)
+               return -EEXIST;
+
+       if (!cfname && !path)
+               return -ENOENT;
+
+       if (path) {
+               if (stat(path, &sb) == -1)
+                       return -ENOENT;
+
+               if ((sb.st_mode & S_IFMT) != S_IFDIR)
+                       return -ENOTDIR;
+       }
+
+       /*
+        * Look the settings up by 'name': when no configuration entry
+        * matched, tb holds attributes of an unrelated entry (or nothing at
+        * all) and must not be used.
+        */
+       usettings = avl_find_element(&settings, name, usettings, avl);
+       if (path && usettings)
+               return -EIO;
+
+       if (usettings) {
+               sfname = usettings->fname;
+               if (!tmprwsize && !writepath) {
+                       if (usettings->tmprwsize) {
+                               tmprwsize = usettings->tmprwsize;
+                               writepath = NULL;
+                       }
+                       if (usettings->writepath) {
+                               writepath = usettings->writepath;
+                               tmprwsize = NULL;
+                       }
+               }
+               if (usettings->autostart >= 0 && autostart < 0)
+                       autostart = !!(usettings->autostart);
+       }
+
+       if (path) {
+               if (mkdir(confdir, 0755) && errno != EEXIST)
+                       return -errno;
+
+               if (asprintf(&fname, "%s/%s.json", confdir, name) == -1)
+                       return -ENOMEM;
+
+               f = open(fname, O_WRONLY | O_CREAT | O_TRUNC, 0644);
+               /* capture errno before free() gets a chance to change it */
+               ret = (f < 0) ? -errno : 0;
+               free(fname);
+               if (ret)
+                       return ret;
+       } else if (sfname) {
+               f = open(sfname, O_WRONLY | O_CREAT | O_TRUNC, 0644);
+               if (f < 0)
+                       return -errno;
+       } else {
+               f = uxc_set_open_settings_file(cfname, name);
+               if (f < 0)
+                       return f;
+       }
+
+       blob_buf_init(&req, 0);
+       blobmsg_add_string(&req, "name", name);
+       if (path)
+               blobmsg_add_string(&req, "path", path);
+
+       if (autostart >= 0)
+               blobmsg_add_u8(&req, "autostart", !!autostart);
+
+       if (pidfile)
+               blobmsg_add_string(&req, "pidfile", pidfile);
+
+       if (tmprwsize)
+               blobmsg_add_string(&req, "temp-overlay-size", tmprwsize);
+
+       if (writepath)
+               blobmsg_add_string(&req, "write-overlay-path", writepath);
+
+       if (!requiredmounts && usettings && usettings->volumes)
+               blobmsg_add_blob(&req, usettings->volumes);
+
+       if (requiredmounts) {
+               mntarr = blobmsg_open_array(&req, "volumes");
+               for (mnttok = requiredmounts; ; mnttok = NULL) {
+                       curvol = strtok_r(mnttok, ",;", &tmp);
+                       if (!curvol)
+                               break;
+
+                       blobmsg_add_string(&req, NULL, curvol);
+               }
+               blobmsg_close_array(&req, mntarr);
+       }
+
+       /* report a failed serialisation or write instead of claiming success */
+       tmp = blobmsg_format_json_indent(req.head, true, 0);
+       if (!tmp) {
+               ret = -ENOMEM;
+       } else {
+               ret = (dprintf(f, "%s\n", tmp) < 0) ? -errno : 1;
+               free(tmp);
+       }
+
+       blob_buf_free(&req);
+       close(f);
+
+       return ret;
+}
+
+/* indices into the attribute table parsed with block_info_policy */
+enum {
+       BLOCK_INFO_DEVICE,
+       BLOCK_INFO_UUID,
+       BLOCK_INFO_TARGET,
+       BLOCK_INFO_TYPE,
+       BLOCK_INFO_MOUNT,
+       __BLOCK_INFO_MAX,
+};
+
+/* string attributes looked up in each entry of 'blockinfo' */
+static const struct blobmsg_policy block_info_policy[__BLOCK_INFO_MAX] = {
+       [BLOCK_INFO_DEVICE] = {
+               .name = "device",
+               .type = BLOBMSG_TYPE_STRING
+       },
+       [BLOCK_INFO_UUID] = {
+               .name = "uuid",
+               .type = BLOBMSG_TYPE_STRING
+       },
+       [BLOCK_INFO_TARGET] = {
+               .name = "target",
+               .type = BLOBMSG_TYPE_STRING
+       },
+       [BLOCK_INFO_TYPE] = {
+               .name = "type",
+               .type = BLOBMSG_TYPE_STRING
+       },
+       [BLOCK_INFO_MOUNT] = {
+               .name = "mount",
+               .type = BLOBMSG_TYPE_STRING
+       },
+};
+
+
+/*
+ * Look for 'uuid' among the entries of 'blockinfo' that carry both a
+ * "uuid" and a "mount" attribute.
+ *
+ * Note the return convention: false means such an entry was found,
+ * true means it was NOT found.
+ */
+static bool checkblock(const char *uuid)
+{
+       struct blob_attr *tb[__BLOCK_INFO_MAX];
+       struct blob_attr *dev;
+       int rem;
+
+       blobmsg_for_each_attr(dev, blockinfo, rem) {
+               blobmsg_parse(block_info_policy, __BLOCK_INFO_MAX, tb, blobmsg_data(dev), blobmsg_len(dev));
+
+               if (tb[BLOCK_INFO_UUID] && tb[BLOCK_INFO_MOUNT] &&
+                   strcmp(uuid, blobmsg_get_string(tb[BLOCK_INFO_UUID])) == 0)
+                       return false;
+       }
+
+       return true;
+}
+
+/* indices into the attribute table parsed with uci_fstab_policy */
+enum {
+       UCI_FSTAB_UUID,
+       UCI_FSTAB_ANONYMOUS,
+       __UCI_FSTAB_MAX,
+};
+
+/* attributes looked up in each entry of 'fstabinfo' */
+static const struct blobmsg_policy uci_fstab_policy[__UCI_FSTAB_MAX] = {
+       [UCI_FSTAB_UUID] = {
+               .name = "uuid",
+               .type = BLOBMSG_TYPE_STRING
+       },
+       [UCI_FSTAB_ANONYMOUS] = {
+               .name = ".anonymous",
+               .type = BLOBMSG_TYPE_BOOL
+       },
+};
+
+/*
+ * Translate a volume name into the uuid of the fstab entry of that name.
+ *
+ * Every '-' in 'volname' is replaced by '_' before it is compared with
+ * the names of the entries in 'fstabinfo'; anonymous entries and entries
+ * without a uuid are skipped. If no entry matches (or the temporary copy
+ * cannot be allocated) 'volname' itself is returned, so the caller
+ * always gets a usable string.
+ */
+static const char *resolveuuid(const char *volname)
+{
+       struct blob_attr *tb[__UCI_FSTAB_MAX];
+       struct blob_attr *cur;
+       const char *mntname;
+       const char *uuid = volname;
+       char *tmpvolname, *replc;
+       int rem;
+
+       /* the normalised name does not depend on the entry: build it once */
+       tmpvolname = strdup(volname);
+       if (!tmpvolname)
+               return volname;
+
+       for (replc = tmpvolname; (replc = strchr(replc, '-')); ++replc)
+               *replc = '_';
+
+       blobmsg_for_each_attr(cur, fstabinfo, rem) {
+               blobmsg_parse(uci_fstab_policy, __UCI_FSTAB_MAX, tb, blobmsg_data(cur), blobmsg_len(cur));
+
+               if (!tb[UCI_FSTAB_UUID])
+                       continue;
+
+               if (tb[UCI_FSTAB_ANONYMOUS] && blobmsg_get_bool(tb[UCI_FSTAB_ANONYMOUS]))
+                       continue;
+
+               mntname = blobmsg_name(cur);
+               if (!mntname)
+                       continue;
+
+               if (!strcmp(tmpvolname, mntname)) {
+                       uuid = blobmsg_get_string(tb[UCI_FSTAB_UUID]);
+                       break;
+               }
+       }
+
+       free(tmpvolname);
+
+       return uuid;
+}
+
+/*
+ * Run checkblock() on the resolved uuid of every entry in 'volumes'.
+ * Returns true as soon as checkblock() returns true for one of them,
+ * false if it returns false for all (or the list is empty).
+ */
+static bool checkvolumes(struct blob_attr *volumes)
+{
+       struct blob_attr *vol;
+       const char *uuid;
+       int rem;
+
+       blobmsg_for_each_attr(vol, volumes, rem) {
+               uuid = resolveuuid(blobmsg_get_string(vol));
+               if (checkblock(uuid))
+                       return true;
+       }
+
+       return false;
+}
+
+/* ubus reply handler: keep a private copy of the reply payload in 'blockinfo' */
+static void block_cb(struct ubus_request *req, int type, struct blob_attr *msg)
+{
+       struct blob_attr *payload = blobmsg_data(msg);
+
+       blockinfo = blob_memdup(payload);
+}
+
+/* ubus reply handler: keep a private copy of the reply payload in 'fstabinfo' */
+static void fstab_cb(struct ubus_request *req, int type, struct blob_attr *msg)
+{
+       struct blob_attr *payload = blobmsg_data(msg);
+
+       fstabinfo = blob_memdup(payload);
+}
+
+/*
+ * Create every configured container that is marked for autostart, has
+ * no runtime entry yet and passes the volume checks.
+ *
+ * Block device information is fetched from the "block" ubus object and
+ * the fstab mount sections from the "uci" object first. The autostart
+ * flag of the configuration can be overridden by the per-container
+ * settings; a container is skipped when checkvolumes() returns true for
+ * its configured or its settings volumes.
+ *
+ * Returns the number of containers that failed to be created (0 if all
+ * went well), -ENOENT if a ubus object is missing, -ENXIO if the block
+ * info call fails, or the ubus status of a failed uci call.
+ */
+static int uxc_boot(void)
+{
+       struct blob_attr *cur, *tb[__CONF_MAX];
+       struct runtime_state *rsstate = NULL;
+       struct settings *usettings = NULL;
+       static struct blob_buf req;
+       int rem, ret = 0;
+       char *name;
+       unsigned int id;
+       bool autostart;
+
+       ret = ubus_lookup_id(ctx, "block", &id);
+       if (ret)
+               return -ENOENT;
+
+       ret = ubus_invoke(ctx, id, "info", NULL, block_cb, NULL, 3000);
+       if (ret)
+               return -ENXIO;
+
+       ret = ubus_lookup_id(ctx, "uci", &id);
+       if (ret)
+               return -ENOENT;
+
+       blob_buf_init(&req, 0);
+       blobmsg_add_string(&req, "config", "fstab");
+       blobmsg_add_string(&req, "type", "mount");
+
+       ret = ubus_invoke(ctx, id, "get", req.head, fstab_cb, NULL, 3000);
+       /* the request is not needed any more, whatever the outcome */
+       blob_buf_free(&req);
+       if (ret)
+               return ret;
+
+       blobmsg_for_each_attr(cur, blob_data(conf.head), rem) {
+               blobmsg_parse(conf_policy, __CONF_MAX, tb, blobmsg_data(cur), blobmsg_len(cur));
+               if (!tb[CONF_NAME] || !tb[CONF_PATH])
+                       continue;
+
+               name = blobmsg_get_string(tb[CONF_NAME]);
+
+               rsstate = avl_find_element(&runtime, name, rsstate, avl);
+               if (rsstate)
+                       continue;
+
+               /*
+                * Start from a defined value on every iteration: a missing
+                * attribute means "no autostart", not "whatever the previous
+                * container had".
+                */
+               autostart = tb[CONF_AUTOSTART] && blobmsg_get_bool(tb[CONF_AUTOSTART]);
+
+               usettings = avl_find_element(&settings, name, usettings, avl);
+               if (usettings && (usettings->autostart >= 0))
+                       autostart = !!(usettings->autostart);
+
+               if (!autostart)
+                       continue;
+
+               /* make sure all volumes are ready before starting */
+               if (tb[CONF_VOLUMES])
+                       if (checkvolumes(tb[CONF_VOLUMES]))
+                               continue;
+
+               if (usettings && usettings->volumes)
+                       if (checkvolumes(usettings->volumes))
+                               continue;
+
+               if (uxc_exists(name))
+                       continue;
+
+               if (uxc_create(name, true))
+                       ++ret;
+       }
+
+       return ret;
+}
+
+/*
+ * Remove container 'name': delete its instance from procd (if a runtime
+ * entry exists) and unlink its settings and configuration files.
+ *
+ * A running container is only removed when 'force' is set, in which case
+ * it is sent SIGKILL first.
+ *
+ * Returns 0 on success, -ENOENT if the container is not configured or
+ * one of its files is missing, -EWOULDBLOCK if it is running and 'force'
+ * is not set, -EIO if the ubus delete call fails, or the error of the
+ * kill request, the ubus lookup or unlink().
+ */
+static int uxc_delete(char *name, bool force)
+{
+       struct blob_attr *cur, *tb[__CONF_MAX];
+       struct runtime_state *rsstate = NULL;
+       struct settings *usettings = NULL;
+       static struct blob_buf req;
+       uint32_t id;
+       int rem, ret = 0;
+       const char *cfname = NULL;
+       const char *sfname = NULL;
+       struct stat sb;
+
+       blobmsg_for_each_attr(cur, blob_data(conf.head), rem) {
+               blobmsg_parse(conf_policy, __CONF_MAX, tb, blobmsg_data(cur), blobmsg_len(cur));
+               if (!tb[CONF_NAME] || !tb[CONF_PATH])
+                       continue;
+
+               if (strcmp(name, blobmsg_get_string(tb[CONF_NAME])))
+                       continue;
+
+               cfname = blobmsg_name(cur);
+               break;
+       }
+
+       if (!cfname)
+               return -ENOENT;
+
+       rsstate = avl_find_element(&runtime, name, rsstate, avl);
+
+       if (rsstate && rsstate->running) {
+               if (!force)
+                       return -EWOULDBLOCK;
+
+               ret = uxc_kill(name, SIGKILL);
+               if (ret)
+                       return ret;
+       }
+
+       if (rsstate) {
+               ret = ubus_lookup_id(ctx, "container", &id);
+               if (ret)
+                       return ret;
+
+               blob_buf_init(&req, 0);
+               blobmsg_add_string(&req, "name", rsstate->container_name);
+               blobmsg_add_string(&req, "instance", rsstate->instance_name);
+
+               ret = ubus_invoke(ctx, id, "delete", req.head, NULL, NULL, 3000);
+               /* release the request on success as well as on failure */
+               blob_buf_free(&req);
+               if (ret)
+                       return -EIO;
+       }
+
+       usettings = avl_find_element(&settings, name, usettings, avl);
+       if (usettings)
+               sfname = usettings->fname;
+
+       if (sfname) {
+               if (stat(sfname, &sb) == -1)
+                       return -ENOENT;
+
+               if (unlink(sfname) == -1)
+                       return -errno;
+       }
+
+       if (stat(cfname, &sb) == -1)
+               return -ENOENT;
+
+       if (unlink(cfname) == -1)
+               return -errno;
+
+       return 0;
+}
+
+/*
+ * Discard the loaded configuration and settings and load both again.
+ * Results of the load calls are not checked here.
+ */
+static void reload_conf(void)
+{
+       /* configuration: conf_load() called with false */
+       blob_buf_free(&conf);
+       conf_load(false);
+
+       /* settings: drop the tree and its backing buffer, then reload */
+       settings_free();
+       blob_buf_free(&settingsbuf);
+       conf_load(true);
+       settings_add();
+}
+
+/*
+ * uxc command-line entry point.
+ *
+ * Connects to ubus, loads the container configuration, the settings and the
+ * runtime state, parses the option switches and finally dispatches to the
+ * handler of the sub-command named by the first non-option argument.
+ *
+ * Returns the handler's result, or the result of usage() when the command
+ * line is malformed (too few or too many operands, unknown sub-command,
+ * unknown option, option lacking its argument, non-numeric signal for
+ * "kill"). A negative result is treated as a negated errno value and is
+ * reported on stderr before returning.
+ */
+int main(int argc, char **argv)
+{
+       /* sub-command names accepted as the first non-option argument */
+       static const struct {
+               const char *name;
+               enum uxc_cmd cmd;
+       } commands[] = {
+               { "list",    CMD_LIST },
+               { "attach",  CMD_ATTACH },
+               { "boot",    CMD_BOOT },
+               { "start",   CMD_START },
+               { "state",   CMD_STATE },
+               { "kill",    CMD_KILL },
+               { "enable",  CMD_ENABLE },
+               { "disable", CMD_DISABLE },
+               { "delete",  CMD_DELETE },
+               { "create",  CMD_CREATE },
+       };
+       enum uxc_cmd cmd = CMD_UNKNOWN;
+       int ret = -EINVAL;
+       char *bundle = NULL;
+       char *pidfile = NULL;
+       char *tmprwsize = NULL;
+       char *writepath = NULL;
+       char *requiredmounts = NULL;
+       signed char autostart = -1;
+       bool force = false;
+       bool console = false;
+       int signal = SIGTERM;
+       size_t i;
+       int c;
+
+       if (argc < 2)
+               return usage();
+
+       ctx = ubus_connect(NULL);
+       if (!ctx) {
+               /* nothing to release yet, but still tell the user why we failed */
+               ret = -ENODEV;
+               goto report;
+       }
+
+       ret = conf_load(false);
+       if (ret < 0)
+               goto out;
+
+       ret = conf_load(true);
+       if (ret < 0)
+               goto conf_out;
+
+       ret = settings_add();
+       if (ret < 0)
+               goto settings_out;
+
+       ret = runtime_load();
+       if (ret)
+               goto settings_avl_out;
+
+       while (true) {
+               int option_index = 0;
+               c = getopt_long(argc, argv, OPT_ARGS, long_options, &option_index);
+               if (c == -1)
+                       break;
+
+               switch (c) {
+                       case 'a':
+                               autostart = 1;
+                               break;
+
+                       case 'b':
+                               bundle = optarg;
+                               break;
+
+                       case 'c':
+                               console = true;
+                               break;
+
+                       case 'f':
+                               force = true;
+                               break;
+
+                       case 'j':
+                               json_output = true;
+                               break;
+
+                       case 'p':
+                               pidfile = optarg;
+                               break;
+
+                       case 't':
+                               tmprwsize = optarg;
+                               break;
+
+                       case 'v':
+                               verbose = true;
+                               break;
+
+                       case 'V':
+                               printf("uxc %s\n", UXC_VERSION);
+                               exit(0);
+
+                       case 'w':
+                               writepath = optarg;
+                               break;
+
+                       case 'm':
+                               requiredmounts = optarg;
+                               break;
+
+                       /*
+                        * getopt_long() reports an unknown option or a missing
+                        * option argument with '?' (or ':'); do not carry on
+                        * with a half-understood command line.
+                        */
+                       case '?':
+                       case ':':
+                               goto usage_out;
+               }
+       }
+
+       if (optind == argc)
+               goto usage_out;
+
+       for (i = 0; i < sizeof(commands) / sizeof(commands[0]); i++) {
+               if (!strcmp(commands[i].name, argv[optind])) {
+                       cmd = commands[i].cmd;
+                       break;
+               }
+       }
+
+       switch (cmd) {
+               case CMD_ATTACH:
+                       if (optind != argc - 2)
+                               goto usage_out;
+
+                       ret = uxc_attach(argv[optind + 1]);
+                       break;
+
+               case CMD_LIST:
+                       ret = uxc_list();
+                       break;
+
+               case CMD_BOOT:
+                       ret = uxc_boot();
+                       break;
+
+               case CMD_START:
+                       if (optind != argc - 2)
+                               goto usage_out;
+
+                       ret = uxc_start(argv[optind + 1], console);
+                       break;
+
+               case CMD_STATE:
+                       if (optind != argc - 2)
+                               goto usage_out;
+
+                       ret = uxc_state(argv[optind + 1]);
+                       break;
+
+               case CMD_KILL:
+                       /* exactly "kill <name>" or "kill <name> <signal>" */
+                       if (optind == argc - 3) {
+                               char *end;
+                               long value;
+
+                               /* reject empty, non-numeric or out-of-range signals */
+                               errno = 0;
+                               value = strtol(argv[optind + 2], &end, 10);
+                               if (errno || end == argv[optind + 2] || *end ||
+                                   value < 0 || value != (int)value)
+                                       goto usage_out;
+
+                               signal = (int)value;
+                       } else if (optind != argc - 2) {
+                               goto usage_out;
+                       }
+
+                       ret = uxc_kill(argv[optind + 1], signal);
+                       break;
+
+               case CMD_ENABLE:
+                       if (optind != argc - 2)
+                               goto usage_out;
+
+                       ret = uxc_set(argv[optind + 1], NULL, 1, NULL, NULL, NULL, NULL);
+                       break;
+
+               case CMD_DISABLE:
+                       if (optind != argc - 2)
+                               goto usage_out;
+
+                       ret = uxc_set(argv[optind + 1], NULL, 0, NULL, NULL, NULL, NULL);
+                       break;
+
+               case CMD_DELETE:
+                       if (optind != argc - 2)
+                               goto usage_out;
+
+                       ret = uxc_delete(argv[optind + 1], force);
+                       break;
+
+               case CMD_CREATE:
+                       if (optind != argc - 2)
+                               goto usage_out;
+
+                       /* any non-zero result from uxc_exists() aborts the creation */
+                       ret = uxc_exists(argv[optind + 1]);
+                       if (ret)
+                               goto runtime_out;
+
+                       ret = uxc_set(argv[optind + 1], bundle, autostart, pidfile, tmprwsize, writepath, requiredmounts);
+                       if (ret < 0)
+                               goto runtime_out;
+
+                       /* a positive result triggers a reload before creating */
+                       if (ret > 0)
+                               reload_conf();
+
+                       ret = uxc_create(argv[optind + 1], false);
+                       break;
+
+               default:
+                       goto usage_out;
+       }
+
+       goto runtime_out;
+
+       /* teardown in reverse order of setup; each label undoes one step */
+usage_out:
+       ret = usage();
+runtime_out:
+       runtime_free();
+settings_avl_out:
+       settings_free();
+settings_out:
+       blob_buf_free(&settingsbuf);
+conf_out:
+       blob_buf_free(&conf);
+out:
+       ubus_free(ctx);
+report:
+       if (ret < 0)
+               fprintf(stderr, "uxc error: %s\n", strerror(-ret));
+
+       return ret;
+}
index 9d770b4470a946ec37cc65d5769445ecd544997b..9b50d909d170ab6e91b1b1b9bf1e0e8d0b21b6a8 100644 (file)
@@ -71,13 +71,15 @@ static int watchdog_open(bool cloexec)
        return wdt_fd;
 }
 
-static void watchdog_close(void)
+static void watchdog_close(bool with_release)
 {
        if (wdt_fd < 0)
                return;
 
-       if (write(wdt_fd, "V", 1) < 0)
-               ERROR("WDT failed to write release: %m\n");
+       if (with_release) {
+               if (write(wdt_fd, "V", 1) < 0)
+                       ERROR("WDT failed to write release: %m\n");
+       }
 
        if (close(wdt_fd) == -1)
                ERROR("WDT failed to close watchdog: %m\n");
@@ -93,6 +95,35 @@ static int watchdog_set_drv_timeout(void)
        return ioctl(wdt_fd, WDIOC_SETTIMEOUT, &wdt_drv_timeout);
 }
 
+/*
+ * Log whether the watchdog reports having reset the system before this boot.
+ *
+ * Does nothing when no watchdog device is open (wdt_fd < 0). The boot status
+ * is only queried if the WDIOC_GETSUPPORT options advertise WDIOF_CARDRESET.
+ * A failing ioctl, missing CARDRESET support and a clear boot status are
+ * reported through DEBUG(2, ...); a set CARDRESET boot flag is reported
+ * through LOG().
+ */
+static void watchdog_print_status(void)
+{
+       struct watchdog_info info;
+       int status;
+
+       if (wdt_fd < 0)
+               return;
+
+       if (ioctl(wdt_fd, WDIOC_GETSUPPORT, &info) != 0) {
+               DEBUG(2, "Watchdog GETSUPPORT failed\n");
+               return;
+       }
+
+       /* without CARDRESET support the boot status carries no reset flag */
+       if ((info.options & WDIOF_CARDRESET) == 0) {
+               DEBUG(2, "Watchdog does not have CARDRESET support\n");
+               return;
+       }
+
+       if (ioctl(wdt_fd, WDIOC_GETBOOTSTATUS, &status) != 0) {
+               DEBUG(2, "Watchdog GETBOOTSTATUS failed\n");
+               return;
+       }
+
+       if ((status & WDIOF_CARDRESET) == 0) {
+               DEBUG(2, "Watchdog did not previously reset the system\n");
+               return;
+       }
+
+       LOG("Watchdog has previously reset the system\n");
+}
+
 void watchdog_set_magicclose(bool val)
 {
        wdt_magicclose = val;
@@ -108,8 +139,7 @@ void watchdog_set_stopped(bool val)
        if (val) {
                uloop_timeout_cancel(&wdt_timeout);
 
-               if (wdt_magicclose)
-                       watchdog_close();
+               watchdog_close(wdt_magicclose);
        }
        else {
                watchdog_open(true);
@@ -170,6 +200,8 @@ void watchdog_init(int preinit)
        watchdog_timeout_cb(&wdt_timeout);
 
        DEBUG(4, "Opened watchdog with timeout %ds\n", watchdog_timeout(0));
+
+       watchdog_print_status();
 }