0d13f5871b14b63cc5fce94c63f94437556f8ba9
[project/procd.git] / jail / jail.c
1 /*
2 * Copyright (C) 2015 John Crispin <blogic@openwrt.org>
3 * Copyright (C) 2020 Daniel Golle <daniel@makrotopia.org>
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU Lesser General Public License version 2.1
7 * as published by the Free Software Foundation
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 */
14
15 #define _GNU_SOURCE
16 #include <sys/mount.h>
17 #include <sys/prctl.h>
18 #include <sys/wait.h>
19 #include <sys/types.h>
20 #include <sys/time.h>
21 #include <sys/resource.h>
22 #include <sys/stat.h>
23 #include <sys/sysmacros.h>
24
25 /* musl only defined 15 limit types, make sure all 16 are supported */
26 #ifndef RLIMIT_RTTIME
27 #define RLIMIT_RTTIME 15
28 #undef RLIMIT_NLIMITS
29 #define RLIMIT_NLIMITS 16
30 #undef RLIM_NLIMITS
31 #define RLIM_NLIMITS 16
32 #endif
33
34 #include <stdlib.h>
35 #include <unistd.h>
36 #include <errno.h>
37 #include <pwd.h>
38 #include <grp.h>
39 #include <string.h>
40 #include <fcntl.h>
41 #include <sched.h>
42 #include <linux/filter.h>
43 #include <linux/limits.h>
44 #include <linux/nsfs.h>
45 #include <signal.h>
46 #include <inttypes.h>
47
48 #include "capabilities.h"
49 #include "elf.h"
50 #include "fs.h"
51 #include "jail.h"
52 #include "log.h"
53 #include "seccomp-oci.h"
54
55 #include <libubox/utils.h>
56 #include <libubox/blobmsg.h>
57 #include <libubox/blobmsg_json.h>
58 #include <libubox/list.h>
59 #include <libubox/vlist.h>
60 #include <libubox/uloop.h>
61 #include <libubus.h>
62
63 #ifndef CLONE_NEWCGROUP
64 #define CLONE_NEWCGROUP 0x02000000
65 #endif
66
67 #define STACK_SIZE (1024 * 1024)
68 #define OPT_ARGS "S:C:n:h:r:w:d:psulocU:G:NR:fFO:T:EyJ:"
69
/* One OCI lifecycle hook: a program plus the argv/envp it is exec'ed with. */
70 struct hook_execvpe {
71 char *file; /* executable path; stat()'ed and passed to execvpe() by run_hook() */
72 char **argv; /* argument vector handed to execvpe() */
73 char **envp; /* environment vector handed to execvpe() */
74 int timeout; /* if > 0, run_hook() arms a timer of 1000 * timeout ms that SIGKILLs the hook */
75 };
76
/* One sysctl setting applied by apply_sysctl(). */
77 struct sysctl_val {
78 char *entry; /* path below <procfs>/sys/ of the file to write */
79 char *value; /* string written to that file */
80 };
81
/* Description of a device node created by create_devices(). */
82 struct mknod_args {
83 char *path; /* node path, passed to mknod() and chown() */
84 mode_t mode; /* file type and permission bits for mknod() */
85 dev_t dev; /* device number for mknod() */
86 uid_t uid; /* owner; chown() is only called when uid or gid is non-zero */
87 gid_t gid; /* group; see uid */
88 };
89
/*
 * Global jail configuration. The code that fills it in is not part of the
 * section visible here; the comments below describe how the visible code
 * consumes each field.
 */
90 static struct {
91 char *name; /* jail name sent in ubus requests (pass_console, netns_updown) */
92 char *hostname; /* passed to sethostname() in exec_jail() when namespaces are in use */
93 char **jail_argv; /* argv of the jailed program; jail_argv[0] is exec'ed */
94 char *cwd; /* if set, exec_jail() chdir()s here before exec */
95 char *seccomp; /* exported to the jailed process as SECCOMP_FILE= by build_envp() */
96 struct sock_fprog *ociseccomp; /* applied with applyOCIlinuxseccomp() before exec */
97 char *capabilities; /* passed to drop_capabilities() */
98 struct jail_capset capset; /* passed to applyOCIcapabilities() */
99 char *user; /* user name resolved with getpwnam() */
100 char *group; /* group name resolved with getgrnam() */
101 char *extroot; /* if set, bind-mounted as jail root instead of a fresh tmpfs */
102 char *overlaydir; /* directory holding upper/work dirs for the overlay mount */
103 char *tmpoverlaysize; /* size= option of a tmpfs used as overlay directory */
104 char **envp; /* extra environment appended by build_envp() */
105 char *uidmap; /* released in free_opts(); consumer not visible here */
106 char *gidmap; /* released in free_opts(); consumer not visible here */
107 struct sysctl_val **sysctl; /* NULL-terminated list walked by apply_sysctl() */
108 int no_new_privs; /* non-zero: prctl(PR_SET_NO_NEW_PRIVS) before exec */
109 int namespace; /* tested against CLONE_NEW* bits */
110 struct {
/* namespace fds to join with setns(); -1 means "none" */
111 int pid;
112 int net;
113 int ns;
114 int ipc;
115 int uts;
116 int user;
117 int cgroup;
118 #ifdef CLONE_NEWTIME
119 int time;
120 #endif
121 } setns;
122 int procfs;
123 int ronly; /* non-zero: build_jail_fs() remounts / read-only */
124 int sysfs;
125 int console; /* non-zero: build_jail_fs() sets up a PTY as /dev/console */
126 int pw_uid; /* if non-zero, used by exec_jail() instead of the uid looked up from user */
127 int pw_gid; /* if non-zero, used instead of the looked-up primary gid */
128 int gr_gid; /* if non-zero, used instead of the gid looked up from group */
129 gid_t *additional_gids; /* if set, passed to setgroups() in exec_jail() */
130 size_t num_additional_gids; /* element count of additional_gids */
131 mode_t umask; /* applied with umask() when set_umask is true */
132 bool set_umask;
133 int require_jail;
134 struct {
/* hook lists; each is a NULL-terminated array as iterated by run_hooks() */
135 struct hook_execvpe **createRuntime;
136 struct hook_execvpe **createContainer; /* run by build_jail_fs() before pivot_root() */
137 struct hook_execvpe **startContainer; /* run by exec_jail() after the jail fs is built */
138 struct hook_execvpe **poststart;
139 struct hook_execvpe **poststop;
140 } hooks;
141 struct rlimit *rlimits[RLIM_NLIMITS]; /* indexed by resource; NULL entries are skipped */
142 int oom_score_adj;
143 bool set_oom_score_adj;
144 struct mknod_args **devices; /* NULL-terminated list of extra device nodes */
145 } opts;
146
147 static inline bool has_namespaces(void)
148 {
149 return ((opts.setns.pid != -1) ||
150 (opts.setns.net != -1) ||
151 (opts.setns.ns != -1) ||
152 (opts.setns.ipc != -1) ||
153 (opts.setns.uts != -1) ||
154 (opts.setns.user != -1) ||
155 (opts.setns.cgroup != -1) ||
156 #ifdef CLONE_NEWTIME
157 (opts.setns.time != -1) ||
158 #endif
159 opts.namespace);
160 }
161
162 static void free_hooklist(struct hook_execvpe **hooklist)
163 {
164 struct hook_execvpe *cur;
165 char **tmp;
166
167 if (!hooklist)
168 return;
169
170 cur = *hooklist;
171 while (cur) {
172 free(cur->file);
173 tmp = cur->argv;
174 while (tmp)
175 free(*(tmp++));
176
177 free(cur->argv);
178
179 tmp = cur->envp;
180 while (tmp)
181 free(*(tmp++));
182
183 free(cur->envp);
184 free(cur++);
185 }
186 free(hooklist);
187 }
188
189 static void free_sysctl(void) {
190 struct sysctl_val *cur;
191 cur = *opts.sysctl;
192
193 while (cur) {
194 free(cur->entry);
195 free(cur->value);
196 free(cur++);
197 }
198 free(opts.sysctl);
199 }
200
201 static void free_devices(void) {
202 struct mknod_args **cur;
203
204 if (!opts.devices)
205 return;
206
207 cur = opts.devices;
208
209 while (*cur) {
210 free((*cur)->path);
211 free(*(cur++));
212 }
213 free(opts.devices);
214 }
215
216 static void free_rlimits(void) {
217 int type;
218
219 for (type = 0; type < RLIM_NLIMITS; ++type)
220 free(opts.rlimits[type]);
221 }
222
223 static void free_opts(bool child) {
224 char **tmp;
225
226 /* we need to keep argv, envp and seccomp filter in child */
227 if (child) {
228 if (opts.ociseccomp) {
229 free(opts.ociseccomp->filter);
230 free(opts.ociseccomp);
231 }
232
233 tmp = opts.jail_argv;
234 while(tmp)
235 free(*(tmp++));
236
237 free(opts.jail_argv);
238
239 tmp = opts.envp;
240 while (tmp)
241 free(*(tmp++));
242
243 free(opts.envp);
244 };
245
246 free_rlimits();
247 free_sysctl();
248 free_devices();
249 free(opts.hostname);
250 free(opts.cwd);
251 free(opts.extroot);
252 free(opts.uidmap);
253 free(opts.gidmap);
254 free_hooklist(opts.hooks.createRuntime);
255 free_hooklist(opts.hooks.createContainer);
256 free_hooklist(opts.hooks.startContainer);
257 free_hooklist(opts.hooks.poststart);
258 free_hooklist(opts.hooks.poststop);
259 }
260
/* blob buffer; not referenced in the section visible here — presumably holds the parsed OCI JSON (TODO confirm) */
261 static struct blob_buf ocibuf;
262
/* declared by hand; build_jail_fs() calls it to switch to the jail root */
263 extern int pivot_root(const char *new_root, const char *put_old);
264
/* verbosity level; build_envp() adds LD_DEBUG=all to the jail environment when it is > 1 */
265 int debug = 0;
266
/* STACK_SIZE bytes; not used in the section visible here — presumably the stack of the clone()d child (TODO confirm) */
267 static char child_stack[STACK_SIZE];
268
/* PTY master opened by create_dev_console() */
269 int console_fd;
270
/*
 * Mount an overlayfs on top of jail_root, using <overlaydir>/upper and
 * <overlaydir>/work as upper and work directories (created if missing).
 * An empty etc/resolv.conf is created in the upper directory first.
 *
 * Returns 0 on success and -1 on any failure. All temporary strings are
 * released on every path.
 */
static int mount_overlay(char *jail_root, char *overlaydir) {
	char *upperdir, *workdir, *optsstr, *upperetc, *upperresolvconf;
	const char mountoptsformat[] = "lowerdir=%s,upperdir=%s,workdir=%s";
	int ret = -1, fd;

	if (asprintf(&upperdir, "%s%s", overlaydir, "/upper") < 0)
		goto out;

	if (asprintf(&workdir, "%s%s", overlaydir, "/work") < 0)
		goto upper_printf;

	if (asprintf(&optsstr, mountoptsformat, jail_root, upperdir, workdir) < 0)
		goto work_printf;

	if (mkdir_p(upperdir, 0755) || mkdir_p(workdir, 0755))
		goto opts_printf;

	/*
	 * make sure /etc/resolv.conf exists in overlay and is owned by jail userns root
	 * this is to work-around a bug in overlayfs described in the overlayfs-userns
	 * patch:
	 * 3. modification of a file 'hithere' which is in l but not yet
	 * in u, and which is not owned by T, is not allowed, even if
	 * writes to u are allowed. This may be a bug in overlayfs,
	 * but it is safe behavior.
	 */
	if (asprintf(&upperetc, "%s/etc", upperdir) < 0)
		goto opts_printf;

	if (mkdir_p(upperetc, 0755))
		goto upper_etc_printf;

	if (asprintf(&upperresolvconf, "%s/resolv.conf", upperetc) < 0)
		goto upper_etc_printf;

	fd = creat(upperresolvconf, 0644);
	if (fd == -1) {
		ERROR("creat(%s) failed: %m\n", upperresolvconf);
		goto upper_resolvconf_printf;
	}
	close(fd);

	DEBUG("mount -t overlay %s %s (%s)\n", jail_root, jail_root, optsstr);

	/* unwind from the top of the cleanup chain so nothing allocated above leaks */
	if (mount(jail_root, jail_root, "overlay", MS_NOATIME, optsstr))
		goto upper_resolvconf_printf;

	ret = 0;

upper_resolvconf_printf:
	free(upperresolvconf);
upper_etc_printf:
	free(upperetc);
opts_printf:
	free(optsstr);
work_printf:
	free(workdir);
upper_printf:
	free(upperdir);
out:
	return ret;
}
333
334 static void pass_console(int console_fd)
335 {
336 struct ubus_context *ctx = ubus_connect(NULL);
337 static struct blob_buf req;
338 uint32_t id;
339
340 if (!ctx)
341 return;
342
343 blob_buf_init(&req, 0);
344 blobmsg_add_string(&req, "name", opts.name);
345
346 if (ubus_lookup_id(ctx, "container", &id) ||
347 ubus_invoke_fd(ctx, id, "console_set", req.head, NULL, NULL, 3000, console_fd))
348 INFO("ubus request failed\n");
349 else
350 close(console_fd);
351
352 blob_buf_free(&req);
353 ubus_free(ctx);
354 }
355
356 static int create_dev_console(const char *jail_root)
357 {
358 char *console_fname;
359 char dev_console_path[PATH_MAX];
360 int slave_console_fd;
361
362 /* Open UNIX/98 virtual console */
363 console_fd = posix_openpt(O_RDWR | O_NOCTTY);
364 if (console_fd == -1)
365 return -1;
366
367 console_fname = ptsname(console_fd);
368 DEBUG("got console fd %d and PTS client name %s\n", console_fd, console_fname);
369 if (!console_fname)
370 goto no_console;
371
372 grantpt(console_fd);
373 unlockpt(console_fd);
374
375 /* pass PTY master to procd */
376 pass_console(console_fd);
377
378 /* mount-bind PTY slave to /dev/console in jail */
379 snprintf(dev_console_path, sizeof(dev_console_path), "%s/dev/console", jail_root);
380 close(creat(dev_console_path, 0620));
381
382 if (mount(console_fname, dev_console_path, NULL, MS_BIND, NULL))
383 goto no_console;
384
385 /* use PTY slave for stdio */
386 slave_console_fd = open(console_fname, O_RDWR); /* | O_NOCTTY */
387 dup2(slave_console_fd, 0);
388 dup2(slave_console_fd, 1);
389 dup2(slave_console_fd, 2);
390 close(slave_console_fd);
391
392 INFO("using guest console %s\n", console_fname);
393
394 return 0;
395
396 no_console:
397 close(console_fd);
398 return 1;
399 }
400
/* set to 1 by run_hook() before forking; cleared again once the hook has been reaped */
401 static int hook_running = 0;
/* exit status, or terminating signal number, of the last hook (see hook_process_handler) */
402 static int hook_return_code = 0;
403
404 static void hook_process_timeout_cb(struct uloop_timeout *t);
/* timer armed by run_hook(); its callback SIGKILLs the hook process */
405 static struct uloop_timeout hook_process_timeout = {
406 .cb = hook_process_timeout_cb,
407 };
408
409 static void hook_process_handler(struct uloop_process *c, int ret)
410 {
411 uloop_timeout_cancel(&hook_process_timeout);
412 if (WIFEXITED(ret)) {
413 hook_return_code = WEXITSTATUS(ret);
414 DEBUG("hook (%d) exited with exit: %d\n", c->pid, hook_return_code);
415 } else {
416 hook_return_code = WTERMSIG(ret);
417 DEBUG("hook (%d) exited with signal: %d\n", c->pid, hook_return_code);
418 }
419 hook_running = 0;
420 uloop_end();
421 }
422
/* uloop process record for the running hook; run_hook() stores the forked pid in .pid */
423 static struct uloop_process hook_process = {
424 .cb = hook_process_handler,
425 };
426
427 static void hook_process_timeout_cb(struct uloop_timeout *t)
428 {
429 DEBUG("hook process failed to stop, sending SIGKILL\n");
430 kill(hook_process.pid, SIGKILL);
431 }
432
433 static int run_hook(struct hook_execvpe *hook)
434 {
435 struct stat s;
436
437 DEBUG("executing hook %s\n", hook->file);
438
439 if (stat(hook->file, &s))
440 return ENOENT;
441
442 if (!((unsigned long)s.st_mode & (S_IXUSR | S_IXGRP | S_IXOTH)))
443 return EPERM;
444
445 if (!((unsigned long)s.st_mode & (S_IRUSR | S_IRGRP | S_IROTH)))
446 return EPERM;
447
448 uloop_init();
449
450 hook_running = 1;
451 hook_process.pid = fork();
452 if (hook_process.pid > 0) {
453 /* parent */
454 uloop_process_add(&hook_process);
455
456 if (hook->timeout > 0)
457 uloop_timeout_set(&hook_process_timeout, 1000 * hook->timeout);
458
459 uloop_run();
460 if (hook_running) {
461 DEBUG("uloop interrupted, killing hook process\n");
462 kill(hook_process.pid, SIGTERM);
463 uloop_timeout_set(&hook_process_timeout, 1000);
464 uloop_run();
465 }
466 uloop_done();
467
468 waitpid(hook_process.pid, NULL, WCONTINUED);
469
470 return hook_return_code;
471 } else if (hook_process.pid == 0) {
472 /* child */
473 execvpe(hook->file, hook->argv, hook->envp);
474 hook_running = 0;
475 _exit(errno);
476 } else {
477 /* fork error */
478 hook_running = 0;
479 return errno;
480 }
481 }
482
483 static int run_hooks(struct hook_execvpe **hooklist)
484 {
485 struct hook_execvpe **cur;
486 int res;
487
488 if (!hooklist)
489 return 0; /* Nothing to do */
490
491 cur = hooklist;
492
493 while (*cur) {
494 res = run_hook(*cur);
495 if (res)
496 DEBUG(" error running hook %s\n", (*cur)->file);
497 else
498 DEBUG(" success running hook %s\n", (*cur)->file);
499
500 ++cur;
501 }
502
503 return 0;
504 }
505
506 static int apply_sysctl(const char *jail_root)
507 {
508 struct sysctl_val **cur;
509 char *procdir, *fname;
510 int f;
511
512 if (!opts.sysctl)
513 return 0;
514
515 asprintf(&procdir, "%s/proc", jail_root);
516 if (!procdir)
517 return ENOMEM;
518
519 mkdir(procdir, 0700);
520 if (mount("proc", procdir, "proc", MS_NOATIME | MS_NODEV | MS_NOEXEC | MS_NOSUID, 0))
521 return EPERM;
522
523 cur = opts.sysctl;
524
525 while (*cur) {
526 asprintf(&fname, "%s/sys/%s", procdir, (*cur)->entry);
527 if (!fname)
528 return ENOMEM;
529
530 DEBUG("sysctl: writing '%s' to %s\n", (*cur)->value, fname);
531
532 f = open(fname, O_WRONLY);
533 if (f == -1) {
534 ERROR("sysctl: can't open %s\n", fname);
535 return errno;
536 }
537 write(f, (*cur)->value, strlen((*cur)->value));
538
539 free(fname);
540 close(f);
541 ++cur;
542 }
543 umount(procdir);
544 rmdir(procdir);
545 free(procdir);
546
547 return 0;
548 }
549
/*
 * Character devices create_devices() always tries to create. The list is
 * terminated by an all-zero entry (path == NULL). All nodes are
 * read/write for user, group and others, except /dev/tty which is
 * read/write for user and group only and is assigned gid 5.
 */
550 static struct mknod_args default_devices[] = {
551 { .path = "/dev/null", .mode = (S_IFCHR|S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH|S_IWOTH), .dev = makedev(1, 3) },
552 { .path = "/dev/zero", .mode = (S_IFCHR|S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH|S_IWOTH), .dev = makedev(1, 5) },
553 { .path = "/dev/full", .mode = (S_IFCHR|S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH|S_IWOTH), .dev = makedev(1, 7) },
554 { .path = "/dev/random", .mode = (S_IFCHR|S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH|S_IWOTH), .dev = makedev(1, 8) },
555 { .path = "/dev/urandom", .mode = (S_IFCHR|S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH|S_IWOTH), .dev = makedev(1, 9) },
556 { .path = "/dev/tty", .mode = (S_IFCHR|S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP), .dev = makedev(5, 0), .gid = 5 },
557 { 0 },
558 };
559
560 static int create_devices(void)
561 {
562 struct mknod_args **cur, *curdef;
563
564 if (!opts.devices)
565 goto only_default_devices;
566
567 cur = opts.devices;
568
569 while (*cur) {
570 DEBUG("creating %s (mode=%08o)\n", (*cur)->path, (*cur)->mode);
571 if (mknod((*cur)->path, (*cur)->mode, (*cur)->dev))
572 return errno;
573
574 if (((*cur)->uid || (*cur)->gid) &&
575 chown((*cur)->path, (*cur)->uid, (*cur)->gid))
576 return errno;
577
578 ++cur;
579 }
580
581 only_default_devices:
582 curdef = default_devices;
583 while(curdef->path) {
584 DEBUG("creating %s (mode=%08o)\n", curdef->path, curdef->mode);
585 if (mknod(curdef->path, curdef->mode, curdef->dev)) {
586 ++curdef;
587 continue; /* may already exist, eg. due to a bind-mount */
588 }
589 if ((curdef->uid || curdef->gid) &&
590 chown(curdef->path, curdef->uid, curdef->gid))
591 return errno;
592
593 ++curdef;
594 }
595
596 /* Dev symbolic links as defined in OCI spec */
597 symlink("/dev/pts/ptmx", "/dev/ptmx");
598 symlink("/proc/self/fd", "/dev/fd");
599 symlink("/proc/self/fd/0", "/dev/stdin");
600 symlink("/proc/self/fd/1", "/dev/stdout");
601 symlink("/proc/self/fd/2", "/dev/stderr");
602
603 return 0;
604 }
605
606 static int build_jail_fs(void)
607 {
608 char jail_root[] = "/tmp/ujail-XXXXXX";
609 char tmpovdir[] = "/tmp/ujail-overlay-XXXXXX";
610 char *overlaydir = NULL;
611 mode_t old_umask;
612
613 old_umask = umask(0);
614
615 if (mkdtemp(jail_root) == NULL) {
616 ERROR("mkdtemp(%s) failed: %m\n", jail_root);
617 return -1;
618 }
619
620 if (apply_sysctl(jail_root)) {
621 ERROR("failed to apply sysctl values\n");
622 return -1;
623 }
624
625 /* oldroot can't be MS_SHARED else pivot_root() fails */
626 if (mount("none", "/", NULL, MS_REC|MS_PRIVATE, NULL)) {
627 ERROR("private mount failed %m\n");
628 return -1;
629 }
630
631 if (opts.extroot) {
632 if (mount(opts.extroot, jail_root, NULL, MS_BIND, NULL)) {
633 ERROR("extroot mount failed %m\n");
634 return -1;
635 }
636 } else {
637 if (mount("tmpfs", jail_root, "tmpfs", MS_NOATIME, "mode=0755")) {
638 ERROR("tmpfs mount failed %m\n");
639 return -1;
640 }
641 }
642
643 if (opts.tmpoverlaysize) {
644 char mountoptsstr[] = "mode=0755,size=XXXXXXXX";
645
646 snprintf(mountoptsstr, sizeof(mountoptsstr),
647 "mode=0755,size=%s", opts.tmpoverlaysize);
648 if (mkdtemp(tmpovdir) == NULL) {
649 ERROR("mkdtemp(%s) failed: %m\n", jail_root);
650 return -1;
651 }
652 if (mount("tmpfs", tmpovdir, "tmpfs", MS_NOATIME,
653 mountoptsstr)) {
654 ERROR("failed to mount tmpfs for overlay (size=%s)\n", opts.tmpoverlaysize);
655 return -1;
656 }
657 overlaydir = tmpovdir;
658 }
659
660 if (opts.overlaydir)
661 overlaydir = opts.overlaydir;
662
663 if (overlaydir)
664 mount_overlay(jail_root, overlaydir);
665
666 if (chdir(jail_root)) {
667 ERROR("chdir(%s) (jail_root) failed: %m\n", jail_root);
668 return -1;
669 }
670
671 if (mount_all(jail_root)) {
672 ERROR("mount_all() failed\n");
673 return -1;
674 }
675
676 if (opts.console)
677 create_dev_console(jail_root);
678
679 /* make sure /etc/resolv.conf exists if in new network namespace */
680 if (opts.namespace & CLONE_NEWNET) {
681 char jailetc[PATH_MAX], jaillink[PATH_MAX];
682
683 snprintf(jailetc, PATH_MAX, "%s/etc", jail_root);
684 mkdir_p(jailetc, 0755);
685 snprintf(jaillink, PATH_MAX, "%s/etc/resolv.conf", jail_root);
686 if (overlaydir)
687 unlink(jaillink);
688
689 symlink("../dev/resolv.conf.d/resolv.conf.auto", jaillink);
690 }
691
692 run_hooks(opts.hooks.createContainer);
693
694 char dirbuf[sizeof(jail_root) + 4];
695 snprintf(dirbuf, sizeof(dirbuf), "%s/old", jail_root);
696 mkdir(dirbuf, 0755);
697
698 if (pivot_root(jail_root, dirbuf) == -1) {
699 ERROR("pivot_root(%s, %s) failed: %m\n", jail_root, dirbuf);
700 return -1;
701 }
702 if (chdir("/")) {
703 ERROR("chdir(/) (after pivot_root) failed: %m\n");
704 return -1;
705 }
706
707 snprintf(dirbuf, sizeof(dirbuf), "/old%s", jail_root);
708 umount2(dirbuf, MNT_DETACH);
709 rmdir(dirbuf);
710 if (opts.tmpoverlaysize) {
711 char tmpdirbuf[sizeof(tmpovdir) + 4];
712 snprintf(tmpdirbuf, sizeof(tmpdirbuf), "/old%s", tmpovdir);
713 umount2(tmpdirbuf, MNT_DETACH);
714 rmdir(tmpdirbuf);
715 }
716
717 umount2("/old", MNT_DETACH);
718 rmdir("/old");
719
720 if (create_devices()) {
721 ERROR("create_devices() failed\n");
722 return -1;
723 }
724 if (opts.ronly)
725 mount(NULL, "/", NULL, MS_REMOUNT | MS_BIND | MS_RDONLY, 0);
726
727 umask(old_umask);
728
729 return 0;
730 }
731
/*
 * Write a complete id mapping to /proc/<child_pid>/uid_map or gid_map.
 *
 * child_pid: process whose map is written
 * gidmap:    true selects gid_map, false uid_map
 * mapstr:    heap-allocated map text; it is freed here on success only
 *
 * Returns 0 on success, -1 on failure.
 */
static int write_uid_gid_map(pid_t child_pid, bool gidmap, char *mapstr)
{
	int map_file;
	char map_path[64];

	if (snprintf(map_path, sizeof(map_path), "/proc/%d/%s",
		     child_pid, gidmap?"gid_map":"uid_map") < 0)
		return -1;

	if ((map_file = open(map_path, O_WRONLY)) == -1)
		return -1;

	/* dprintf() returns the number of bytes written; only < 0 is an error */
	if (dprintf(map_file, "%s", mapstr) < 0) {
		close(map_file);
		return -1;
	}

	close(map_file);
	free(mapstr);
	return 0;
}
753
/*
 * Write a one-line mapping "0 <id> 1" to /proc/<child_pid>/uid_map, or
 * gid_map when gidmap is true.
 *
 * Returns 0 on success, -1 on failure.
 */
static int write_single_uid_gid_map(pid_t child_pid, bool gidmap, int id)
{
	const char *which = gidmap ? "gid_map" : "uid_map";
	char map_path[64];
	int map_file;
	int written;

	if (snprintf(map_path, sizeof(map_path), "/proc/%d/%s", child_pid, which) < 0)
		return -1;

	map_file = open(map_path, O_WRONLY);
	if (map_file == -1)
		return -1;

	written = dprintf(map_file, "%d %d %d\n", 0, id, 1);
	close(map_file);

	return (written == -1) ? -1 : 0;
}
774
/*
 * Write "allow" or "deny" to /proc/<child_pid>/setgroups.
 *
 * Returns 0 on success, -1 on failure.
 */
static int write_setgroups(pid_t child_pid, bool allow)
{
	const char *verdict = allow ? "allow" : "deny";
	char setgroups_path[64];
	int setgroups_file;
	int written;

	if (snprintf(setgroups_path, sizeof(setgroups_path), "/proc/%d/setgroups",
		     child_pid) < 0)
		return -1;

	setgroups_file = open(setgroups_path, O_WRONLY);
	if (setgroups_file == -1)
		return -1;

	written = dprintf(setgroups_file, "%s", verdict);
	close(setgroups_file);

	return (written == -1) ? -1 : 0;
}
797
798 static void get_jail_user(int *user, int *user_gid, int *gr_gid)
799 {
800 struct passwd *p = NULL;
801 struct group *g = NULL;
802
803 if (opts.user) {
804 p = getpwnam(opts.user);
805 if (!p) {
806 ERROR("failed to get uid/gid for user %s: %d (%s)\n",
807 opts.user, errno, strerror(errno));
808 exit(EXIT_FAILURE);
809 }
810 *user = p->pw_uid;
811 *user_gid = p->pw_gid;
812 } else {
813 *user = -1;
814 *user_gid = -1;
815 }
816
817 if (opts.group) {
818 g = getgrnam(opts.group);
819 if (!g) {
820 ERROR("failed to get gid for group %s: %m\n", opts.group);
821 exit(EXIT_FAILURE);
822 }
823 *gr_gid = g->gr_gid;
824 } else {
825 *gr_gid = -1;
826 }
827 };
828
829 static void set_jail_user(int pw_uid, int user_gid, int gr_gid)
830 {
831 if (opts.user && (user_gid != -1) && initgroups(opts.user, user_gid)) {
832 ERROR("failed to initgroups() for user %s: %m\n", opts.user);
833 exit(EXIT_FAILURE);
834 }
835
836 if ((gr_gid != -1) && setregid(gr_gid, gr_gid)) {
837 ERROR("failed to set group id %d: %m\n", gr_gid);
838 exit(EXIT_FAILURE);
839 }
840
841 if ((pw_uid != -1) && setreuid(pw_uid, pw_uid)) {
842 ERROR("failed to set user id %d: %m\n", pw_uid);
843 exit(EXIT_FAILURE);
844 }
845 }
846
847 static int apply_rlimits(void)
848 {
849 int resource;
850
851 for (resource = 0; resource < RLIM_NLIMITS; ++resource) {
852 if (opts.rlimits[resource])
853 DEBUG("applying limits to resource %u\n", resource);
854
855 if (opts.rlimits[resource] &&
856 setrlimit(resource, opts.rlimits[resource]))
857 return errno;
858 }
859
860 return 0;
861 }
862
863 #define MAX_ENVP 8
864 static char** build_envp(const char *seccomp, char **ocienvp)
865 {
866 static char *envp[MAX_ENVP];
867 static char preload_var[PATH_MAX];
868 static char seccomp_var[PATH_MAX];
869 static char debug_var[] = "LD_DEBUG=all";
870 static char container_var[] = "container=ujail";
871 const char *preload_lib = find_lib("libpreload-seccomp.so");
872 char **addenv;
873
874 int count = 0;
875
876 if (seccomp && !preload_lib) {
877 ERROR("failed to add preload-lib to env\n");
878 return NULL;
879 }
880 if (seccomp) {
881 snprintf(seccomp_var, sizeof(seccomp_var), "SECCOMP_FILE=%s", seccomp);
882 envp[count++] = seccomp_var;
883 snprintf(preload_var, sizeof(preload_var), "LD_PRELOAD=%s", preload_lib);
884 envp[count++] = preload_var;
885 }
886
887 envp[count++] = container_var;
888
889 if (debug > 1)
890 envp[count++] = debug_var;
891
892 addenv = ocienvp;
893 while (addenv && *addenv) {
894 envp[count++] = *(addenv++);
895 if (count >= MAX_ENVP) {
896 ERROR("environment limited to %d extra records, truncating\n", MAX_ENVP);
897 break;
898 }
899 }
900 return envp;
901 }
902
/* Print the command line help to stderr. */
static void usage(void)
{
	fprintf(stderr, "ujail <options> -- <binary> <params ...>\n");
	fprintf(stderr, " -d <num>\tshow debug log (increase num to increase verbosity)\n");
	fprintf(stderr, " -S <file>\tseccomp filter config\n");
	fprintf(stderr, " -C <file>\tcapabilities drop config\n");
	fprintf(stderr, " -c\t\tset PR_SET_NO_NEW_PRIVS\n");
	fprintf(stderr, " -n <name>\tthe name of the jail\n");
	fprintf(stderr, "namespace jail options:\n");
	fprintf(stderr, " -h <hostname>\tchange the hostname of the jail\n");
	fprintf(stderr, " -N\t\tjail has network namespace\n");
	fprintf(stderr, " -f\t\tjail has user namespace\n");
	fprintf(stderr, " -F\t\tjail has cgroups namespace\n");
	fprintf(stderr, " -r <file>\treadonly files that should be staged\n");
	fprintf(stderr, " -w <file>\twriteable files that should be staged\n");
	fprintf(stderr, " -p\t\tjail has /proc\n");
	fprintf(stderr, " -s\t\tjail has /sys\n");
	fprintf(stderr, " -l\t\tjail has /dev/log\n");
	fprintf(stderr, " -u\t\tjail has a ubus socket\n");
	fprintf(stderr, " -U <name>\tuser to run jailed process\n");
	fprintf(stderr, " -G <name>\tgroup to run jailed process\n");
	fprintf(stderr, " -o\t\tremount jail root (/) read only\n");
	fprintf(stderr, " -R <dir>\texternal jail rootfs (system container)\n");
	fprintf(stderr, " -O <dir>\tdirectory for r/w overlayfs\n");
	fprintf(stderr, " -T <size>\tuse tmpfs r/w overlayfs with <size>\n");
	fprintf(stderr, " -E\t\tfail if jail cannot be setup\n");
	fprintf(stderr, " -y\t\tprovide jail console\n");
	fprintf(stderr, " -J <dir>\tstart OCI bundle\n");
	fprintf(stderr,
		"\nWarning: by default root inside the jail is the same\n"
		"and he has the same powers as root outside the jail,\n"
		"thus he can escape the jail and/or break stuff.\n"
		"Please use seccomp/capabilities (-S/-C) to restrict his powers\n\n"
		"If you use none of the namespace jail options,\n"
		"ujail will not use namespace/build a jail,\n"
		"and will only drop capabilities/apply seccomp filter.\n\n");
}
939
940 static int* get_namespace_fd(const unsigned int nstype)
941 {
942 switch (nstype) {
943 case CLONE_NEWPID:
944 return &opts.setns.pid;
945 case CLONE_NEWNET:
946 return &opts.setns.net;
947 case CLONE_NEWNS:
948 return &opts.setns.ns;
949 case CLONE_NEWIPC:
950 return &opts.setns.ipc;
951 case CLONE_NEWUTS:
952 return &opts.setns.uts;
953 case CLONE_NEWUSER:
954 return &opts.setns.user;
955 case CLONE_NEWCGROUP:
956 return &opts.setns.cgroup;
957 #ifdef CLONE_NEWTIME
958 case CLONE_NEWTIME:
959 return &opts.setns.time;
960 #endif
961 default:
962 return NULL;
963 }
964 }
965
/*
 * Join the namespace whose fd is stored in opts.setns for nstype, then
 * close that fd.
 *
 * nstype: a single CLONE_NEW* flag
 *
 * Returns 0 on success or when no fd is configured (-1), EFAULT for an
 * unsupported nstype, or the errno of a failed setns(). The stored fd
 * value is left untouched because callers compare it against -1 later.
 */
static int setns_open(unsigned long nstype)
{
	int *fd = get_namespace_fd(nstype);
	int err = 0;

	/* unknown namespace type: there is no slot to look at */
	if (!fd)
		return EFAULT;

	if (*fd == -1)
		return 0;

	/* remember errno before close() gets a chance to overwrite it */
	if (setns(*fd, nstype) == -1)
		err = errno;

	close(*fd);
	return err;
}
984
985 static int exec_jail(void *pipes_ptr)
986 {
987 int *pipes = (int*)pipes_ptr;
988 char buf[1];
989 int pw_uid, pw_gid, gr_gid;
990
991 close(pipes[0]);
992 close(pipes[3]);
993
994 setns_open(CLONE_NEWUSER);
995 setns_open(CLONE_NEWNET);
996 setns_open(CLONE_NEWNS);
997 setns_open(CLONE_NEWIPC);
998 setns_open(CLONE_NEWUTS);
999 #ifdef CLONE_NEWTIME
1000 setns_open(CLONE_NEWTIME);
1001 #endif
1002
1003 buf[0] = 'i';
1004 if (write(pipes[1], buf, 1) < 1) {
1005 ERROR("can't write to parent\n");
1006 exit(EXIT_FAILURE);
1007 }
1008 if (read(pipes[2], buf, 1) < 1) {
1009 ERROR("can't read from parent\n");
1010 exit(EXIT_FAILURE);
1011 }
1012 if (buf[0] != 'O') {
1013 ERROR("parent had an error, child exiting\n");
1014 exit(EXIT_FAILURE);
1015 }
1016
1017 close(pipes[1]);
1018 close(pipes[2]);
1019
1020 if ((opts.namespace & CLONE_NEWUSER) || (opts.setns.user != -1)) {
1021 if (setregid(0, 0) < 0) {
1022 ERROR("setgid\n");
1023 exit(EXIT_FAILURE);
1024 }
1025 if (setreuid(0, 0) < 0) {
1026 ERROR("setuid\n");
1027 exit(EXIT_FAILURE);
1028 }
1029 if (setgroups(0, NULL) < 0) {
1030 ERROR("setgroups\n");
1031 exit(EXIT_FAILURE);
1032 }
1033 }
1034
1035 if (opts.namespace && opts.hostname && strlen(opts.hostname) > 0
1036 && sethostname(opts.hostname, strlen(opts.hostname))) {
1037 ERROR("sethostname(%s) failed: %m\n", opts.hostname);
1038 exit(EXIT_FAILURE);
1039 }
1040
1041 if ((opts.namespace & CLONE_NEWNS) && build_jail_fs()) {
1042 ERROR("failed to build jail fs\n");
1043 exit(EXIT_FAILURE);
1044 }
1045 run_hooks(opts.hooks.startContainer);
1046
1047 if (!(opts.namespace & CLONE_NEWUSER) && (opts.setns.user == -1)) {
1048 get_jail_user(&pw_uid, &pw_gid, &gr_gid);
1049
1050 set_jail_user(opts.pw_uid?:pw_uid, opts.pw_gid?:pw_gid, opts.gr_gid?:gr_gid);
1051 }
1052
1053 if (opts.additional_gids &&
1054 (setgroups(opts.num_additional_gids, opts.additional_gids) < 0)) {
1055 ERROR("setgroups failed: %m\n");
1056 exit(EXIT_FAILURE);
1057 }
1058
1059 if (opts.set_umask)
1060 umask(opts.umask);
1061
1062 if (applyOCIcapabilities(opts.capset))
1063 exit(EXIT_FAILURE);
1064
1065 if (opts.capabilities && drop_capabilities(opts.capabilities))
1066 exit(EXIT_FAILURE);
1067
1068 if (opts.no_new_privs && prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
1069 ERROR("prctl(PR_SET_NO_NEW_PRIVS) failed: %m\n");
1070 exit(EXIT_FAILURE);
1071 }
1072
1073 char **envp = build_envp(opts.seccomp, opts.envp);
1074 if (!envp)
1075 exit(EXIT_FAILURE);
1076
1077 if (opts.cwd && chdir(opts.cwd))
1078 exit(EXIT_FAILURE);
1079
1080 if (opts.ociseccomp && applyOCIlinuxseccomp(opts.ociseccomp))
1081 exit(EXIT_FAILURE);
1082
1083 uloop_end();
1084 free_opts(false);
1085 INFO("exec-ing %s\n", *opts.jail_argv);
1086 if (opts.envp) /* respect PATH if potentially set in ENV */
1087 execvpe(*opts.jail_argv, opts.jail_argv, envp);
1088 else
1089 execve(*opts.jail_argv, opts.jail_argv, envp);
1090
1091 /* we get there only if execve fails */
1092 ERROR("failed to execve %s: %m\n", *opts.jail_argv);
1093 exit(EXIT_FAILURE);
1094 }
1095
/* non-zero while the jailed process is alive; cleared by jail_process_handler() (set outside the section visible here) */
1096 static int jail_running = 0;
/* exit status, or terminating signal number, of the jailed process */
1097 static int jail_return_code = 0;
1098
1099 static void jail_process_timeout_cb(struct uloop_timeout *t);
/* timer whose callback SIGKILLs the jailed process; armed outside the section visible here */
1100 static struct uloop_timeout jail_process_timeout = {
1101 .cb = jail_process_timeout_cb,
1102 };
1103
1104 static void jail_process_handler(struct uloop_process *c, int ret)
1105 {
1106 uloop_timeout_cancel(&jail_process_timeout);
1107 if (WIFEXITED(ret)) {
1108 jail_return_code = WEXITSTATUS(ret);
1109 INFO("jail (%d) exited with exit: %d\n", c->pid, jail_return_code);
1110 } else {
1111 jail_return_code = WTERMSIG(ret);
1112 INFO("jail (%d) exited with signal: %d\n", c->pid, jail_return_code);
1113 }
1114 jail_running = 0;
1115 uloop_end();
1116 }
1117
/* uloop process record for the jailed process; .pid is the target of forwarded signals and of the SIGKILL timer */
1118 static struct uloop_process jail_process = {
1119 .cb = jail_process_handler,
1120 };
1121
1122 static void jail_process_timeout_cb(struct uloop_timeout *t)
1123 {
1124 DEBUG("jail process failed to stop, sending SIGKILL\n");
1125 kill(jail_process.pid, SIGKILL);
1126 }
1127
1128 static void jail_handle_signal(int signo)
1129 {
1130 if (hook_running) {
1131 DEBUG("forwarding signal %d to the hook process\n", signo);
1132 kill(hook_process.pid, signo);
1133 }
1134
1135 if (jail_running) {
1136 DEBUG("forwarding signal %d to the jailed process\n", signo);
1137 kill(jail_process.pid, signo);
1138 }
1139 }
1140
1141 static int netns_open_pid(const pid_t target_ns)
1142 {
1143 char pid_net_path[PATH_MAX];
1144
1145 snprintf(pid_net_path, sizeof(pid_net_path), "/proc/%u/ns/net", target_ns);
1146
1147 return open(pid_net_path, O_RDONLY);
1148 }
1149
1150 static int pidns_open_pid(const pid_t target_ns)
1151 {
1152 char pid_pid_path[PATH_MAX];
1153
1154 snprintf(pid_pid_path, sizeof(pid_pid_path), "/proc/%u/ns/pid", target_ns);
1155
1156 return open(pid_pid_path, O_RDONLY);
1157 }
1158
1159 static void netns_updown(pid_t pid, bool start)
1160 {
1161 struct ubus_context *ctx = ubus_connect(NULL);
1162 static struct blob_buf req;
1163 uint32_t id;
1164
1165 if (!ctx)
1166 return;
1167
1168 blob_buf_init(&req, 0);
1169 blobmsg_add_string(&req, "jail", opts.name);
1170 blobmsg_add_u32(&req, "pid", pid);
1171 blobmsg_add_u8(&req, "start", start);
1172
1173 if (ubus_lookup_id(ctx, "network", &id) ||
1174 ubus_invoke(ctx, id, "netns_updown", req.head, NULL, NULL, 3000))
1175 INFO("ubus request failed\n");
1176
1177 blob_buf_free(&req);
1178 ubus_free(ctx);
1179 }
1180
1181 static int parseOCIenvarray(struct blob_attr *msg, char ***envp)
1182 {
1183 struct blob_attr *cur;
1184 int sz = 0, rem;
1185
1186 blobmsg_for_each_attr(cur, msg, rem)
1187 ++sz;
1188
1189 if (sz > 0) {
1190 *envp = calloc(1 + sz, sizeof(char*));
1191 if (!(*envp))
1192 return ENOMEM;
1193 } else {
1194 *envp = NULL;
1195 return 0;
1196 }
1197
1198 sz = 0;
1199 blobmsg_for_each_attr(cur, msg, rem)
1200 (*envp)[sz++] = strdup(blobmsg_get_string(cur));
1201
1202 if (sz)
1203 (*envp)[sz] = NULL;
1204
1205 return 0;
1206 }
1207
/* Indices into the attribute table filled by blobmsg_parse() in parseOCIroot(). */
1208 enum {
1209 OCI_ROOT_PATH,
1210 OCI_ROOT_READONLY,
1211 __OCI_ROOT_MAX,
1212 };
1213
/* Attribute names and expected types parsed by parseOCIroot(). */
1214 static const struct blobmsg_policy oci_root_policy[] = {
1215 [OCI_ROOT_PATH] = { "path", BLOBMSG_TYPE_STRING },
1216 [OCI_ROOT_READONLY] = { "readonly", BLOBMSG_TYPE_BOOL },
1217 };
1218
1219 static int parseOCIroot(const char *jsonfile, struct blob_attr *msg)
1220 {
1221 static char rootpath[PATH_MAX] = { 0 };
1222 struct blob_attr *tb[__OCI_ROOT_MAX];
1223 char *cur;
1224
1225 blobmsg_parse(oci_root_policy, __OCI_ROOT_MAX, tb, blobmsg_data(msg), blobmsg_len(msg));
1226
1227 if (!tb[OCI_ROOT_PATH])
1228 return ENODATA;
1229
1230 strncpy(rootpath, jsonfile, PATH_MAX);
1231 cur = strrchr(rootpath, '/');
1232
1233 if (!cur)
1234 return ENOTDIR;
1235
1236 *(++cur) = '\0';
1237 strncat(rootpath, blobmsg_get_string(tb[OCI_ROOT_PATH]), PATH_MAX - (strlen(rootpath) + 1));
1238
1239 opts.extroot = rootpath;
1240
1241 opts.ronly = blobmsg_get_bool(tb[OCI_ROOT_READONLY]);
1242
1243 return 0;
1244 }
1245
1246
1247 enum {
1248 OCI_HOOK_PATH,
1249 OCI_HOOK_ARGS,
1250 OCI_HOOK_ENV,
1251 OCI_HOOK_TIMEOUT,
1252 __OCI_HOOK_MAX,
1253 };
1254
1255 static const struct blobmsg_policy oci_hook_policy[] = {
1256 [OCI_HOOK_PATH] = { "path", BLOBMSG_TYPE_STRING },
1257 [OCI_HOOK_ARGS] = { "args", BLOBMSG_TYPE_ARRAY },
1258 [OCI_HOOK_ENV] = { "env", BLOBMSG_TYPE_ARRAY },
1259 [OCI_HOOK_TIMEOUT] = { "timeout", BLOBMSG_TYPE_INT32 },
1260 };
1261
1262
1263 static int parseOCIhook(struct hook_execvpe ***hooklist, struct blob_attr *msg)
1264 {
1265 struct blob_attr *tb[__OCI_HOOK_MAX];
1266 struct blob_attr *cur;
1267 int rem, ret = 0;
1268 int idx = 0;
1269
1270 blobmsg_for_each_attr(cur, msg, rem)
1271 ++idx;
1272
1273 if (!idx)
1274 return 0;
1275
1276 *hooklist = calloc(idx + 1, sizeof(struct hook_execvpe *));
1277 idx = 0;
1278
1279 if (!(*hooklist))
1280 return ENOMEM;
1281
1282 blobmsg_for_each_attr(cur, msg, rem) {
1283 blobmsg_parse(oci_hook_policy, __OCI_HOOK_MAX, tb, blobmsg_data(cur), blobmsg_len(cur));
1284
1285 if (!tb[OCI_HOOK_PATH]) {
1286 ret = EINVAL;
1287 goto errout;
1288 }
1289
1290 (*hooklist)[idx] = malloc(sizeof(struct hook_execvpe));
1291 if (tb[OCI_HOOK_ARGS]) {
1292 ret = parseOCIenvarray(tb[OCI_HOOK_ARGS], &((*hooklist)[idx]->argv));
1293 if (ret)
1294 goto errout;
1295 } else {
1296 (*hooklist)[idx]->argv = calloc(2, sizeof(char *));
1297 ((*hooklist)[idx]->argv)[0] = strdup(blobmsg_get_string(tb[OCI_HOOK_PATH]));
1298 ((*hooklist)[idx]->argv)[1] = NULL;
1299 };
1300
1301
1302 if (tb[OCI_HOOK_ENV]) {
1303 ret = parseOCIenvarray(tb[OCI_HOOK_ENV], &((*hooklist)[idx]->envp));
1304 if (ret)
1305 goto errout;
1306 }
1307
1308 if (tb[OCI_HOOK_TIMEOUT])
1309 (*hooklist)[idx]->timeout = blobmsg_get_u32(tb[OCI_HOOK_TIMEOUT]);
1310
1311 (*hooklist)[idx]->file = strdup(blobmsg_get_string(tb[OCI_HOOK_PATH]));
1312
1313 ++idx;
1314 }
1315
1316 (*hooklist)[idx] = NULL;
1317
1318 DEBUG("added %d hooks\n", idx);
1319
1320 return 0;
1321
1322 errout:
1323 free_hooklist(*hooklist);
1324 *hooklist = NULL;
1325
1326 return ret;
1327 };
1328
1329
1330 enum {
1331 OCI_HOOKS_PRESTART,
1332 OCI_HOOKS_CREATERUNTIME,
1333 OCI_HOOKS_CREATECONTAINER,
1334 OCI_HOOKS_STARTCONTAINER,
1335 OCI_HOOKS_POSTSTART,
1336 OCI_HOOKS_POSTSTOP,
1337 __OCI_HOOKS_MAX,
1338 };
1339
1340 static const struct blobmsg_policy oci_hooks_policy[] = {
1341 [OCI_HOOKS_PRESTART] = { "prestart", BLOBMSG_TYPE_ARRAY },
1342 [OCI_HOOKS_CREATERUNTIME] = { "createRuntime", BLOBMSG_TYPE_ARRAY },
1343 [OCI_HOOKS_CREATECONTAINER] = { "createContainer", BLOBMSG_TYPE_ARRAY },
1344 [OCI_HOOKS_STARTCONTAINER] = { "startContainer", BLOBMSG_TYPE_ARRAY },
1345 [OCI_HOOKS_POSTSTART] = { "poststart", BLOBMSG_TYPE_ARRAY },
1346 [OCI_HOOKS_POSTSTOP] = { "poststop", BLOBMSG_TYPE_ARRAY },
1347 };
1348
1349 static int parseOCIhooks(struct blob_attr *msg)
1350 {
1351 struct blob_attr *tb[__OCI_HOOKS_MAX];
1352 int ret;
1353
1354 blobmsg_parse(oci_hooks_policy, __OCI_HOOKS_MAX, tb, blobmsg_data(msg), blobmsg_len(msg));
1355
1356 if (tb[OCI_HOOKS_PRESTART])
1357 INFO("warning: ignoring deprecated prestart hook\n");
1358
1359 if (tb[OCI_HOOKS_CREATERUNTIME]) {
1360 ret = parseOCIhook(&opts.hooks.createRuntime, tb[OCI_HOOKS_CREATERUNTIME]);
1361 if (ret)
1362 return ret;
1363 }
1364
1365 if (tb[OCI_HOOKS_CREATECONTAINER]) {
1366 ret = parseOCIhook(&opts.hooks.createContainer, tb[OCI_HOOKS_CREATECONTAINER]);
1367 if (ret)
1368 goto out_createruntime;
1369 }
1370
1371 if (tb[OCI_HOOKS_STARTCONTAINER]) {
1372 ret = parseOCIhook(&opts.hooks.startContainer, tb[OCI_HOOKS_STARTCONTAINER]);
1373 if (ret)
1374 goto out_createcontainer;
1375 }
1376
1377 if (tb[OCI_HOOKS_POSTSTART]) {
1378 ret = parseOCIhook(&opts.hooks.poststart, tb[OCI_HOOKS_POSTSTART]);
1379 if (ret)
1380 goto out_startcontainer;
1381 }
1382
1383 if (tb[OCI_HOOKS_POSTSTOP]) {
1384 ret = parseOCIhook(&opts.hooks.poststop, tb[OCI_HOOKS_POSTSTOP]);
1385 if (ret)
1386 goto out_poststart;
1387 }
1388
1389 return 0;
1390
1391 out_poststart:
1392 free_hooklist(opts.hooks.poststart);
1393 out_startcontainer:
1394 free_hooklist(opts.hooks.startContainer);
1395 out_createcontainer:
1396 free_hooklist(opts.hooks.createContainer);
1397 out_createruntime:
1398 free_hooklist(opts.hooks.createRuntime);
1399
1400 return ret;
1401 };
1402
1403
1404 enum {
1405 OCI_PROCESS_USER_UID,
1406 OCI_PROCESS_USER_GID,
1407 OCI_PROCESS_USER_UMASK,
1408 OCI_PROCESS_USER_ADDITIONALGIDS,
1409 __OCI_PROCESS_USER_MAX,
1410 };
1411
1412 static const struct blobmsg_policy oci_process_user_policy[] = {
1413 [OCI_PROCESS_USER_UID] = { "uid", BLOBMSG_TYPE_INT32 },
1414 [OCI_PROCESS_USER_GID] = { "gid", BLOBMSG_TYPE_INT32 },
1415 [OCI_PROCESS_USER_UMASK] = { "umask", BLOBMSG_TYPE_INT32 },
1416 [OCI_PROCESS_USER_ADDITIONALGIDS] = { "additionalGids", BLOBMSG_TYPE_ARRAY },
1417 };
1418
1419 static int parseOCIprocessuser(struct blob_attr *msg) {
1420 struct blob_attr *tb[__OCI_PROCESS_USER_MAX];
1421 struct blob_attr *cur;
1422 int rem;
1423 int has_gid = 0;
1424
1425 blobmsg_parse(oci_process_user_policy, __OCI_PROCESS_USER_MAX, tb, blobmsg_data(msg), blobmsg_len(msg));
1426
1427 if (tb[OCI_PROCESS_USER_UID])
1428 opts.pw_uid = blobmsg_get_u32(tb[OCI_PROCESS_USER_UID]);
1429
1430 if (tb[OCI_PROCESS_USER_GID]) {
1431 opts.pw_gid = blobmsg_get_u32(tb[OCI_PROCESS_USER_GID]);
1432 opts.gr_gid = blobmsg_get_u32(tb[OCI_PROCESS_USER_GID]);
1433 has_gid = 1;
1434 }
1435
1436 if (tb[OCI_PROCESS_USER_ADDITIONALGIDS]) {
1437 size_t gidcnt = 0;
1438
1439 blobmsg_for_each_attr(cur, tb[OCI_PROCESS_USER_ADDITIONALGIDS], rem) {
1440 ++gidcnt;
1441 if (has_gid && (blobmsg_get_u32(cur) == opts.gr_gid))
1442 continue;
1443 }
1444
1445 if (gidcnt) {
1446 opts.additional_gids = calloc(gidcnt + has_gid, sizeof(gid_t));
1447 gidcnt = 0;
1448
1449 /* always add primary GID to set of GIDs if set */
1450 if (has_gid)
1451 opts.additional_gids[gidcnt++] = opts.gr_gid;
1452
1453 blobmsg_for_each_attr(cur, tb[OCI_PROCESS_USER_ADDITIONALGIDS], rem) {
1454 if (has_gid && (blobmsg_get_u32(cur) == opts.gr_gid))
1455 continue;
1456 opts.additional_gids[gidcnt++] = blobmsg_get_u32(cur);
1457 }
1458 opts.num_additional_gids = gidcnt;
1459 }
1460 DEBUG("read %zu additional groups\n", gidcnt);
1461 }
1462
1463 if (tb[OCI_PROCESS_USER_UMASK]) {
1464 opts.umask = blobmsg_get_u32(tb[OCI_PROCESS_USER_UMASK]);
1465 opts.set_umask = true;
1466 }
1467
1468 return 0;
1469 }
1470
/* limit names from manpage GETRLIMIT(2), stored without the "RLIMIT_" prefix */
static const char* const rlimit_names[RLIM_NLIMITS] = {
	[RLIMIT_AS] = "AS",
	[RLIMIT_CORE] = "CORE",
	[RLIMIT_CPU] = "CPU",
	[RLIMIT_DATA] = "DATA",
	[RLIMIT_FSIZE] = "FSIZE",
	[RLIMIT_LOCKS] = "LOCKS",
	[RLIMIT_MEMLOCK] = "MEMLOCK",
	[RLIMIT_MSGQUEUE] = "MSGQUEUE",
	[RLIMIT_NICE] = "NICE",
	[RLIMIT_NOFILE] = "NOFILE",
	[RLIMIT_NPROC] = "NPROC",
	[RLIMIT_RSS] = "RSS",
	[RLIMIT_RTPRIO] = "RTPRIO",
	[RLIMIT_RTTIME] = "RTTIME",
	[RLIMIT_SIGPENDING] = "SIGPENDING",
	[RLIMIT_STACK] = "STACK",
};

/*
 * Map a limit name such as "RLIMIT_NOFILE" to its index in
 * rlimit_names, which is the matching RLIMIT_* resource number.
 *
 * Returns -1 when the "RLIMIT_" prefix is missing or the rest of the
 * string is not one of the known names.
 */
static int resolve_rlimit(char *type) {
	static const char prefix[] = "RLIMIT_";
	const size_t prefixlen = sizeof(prefix) - 1;
	const char *suffix;
	unsigned int idx;

	/* the prefix test does not depend on the table entry */
	if (strncmp(prefix, type, prefixlen) != 0)
		return -1;

	suffix = type + prefixlen;
	for (idx = 0; idx < RLIM_NLIMITS; idx++) {
		if (rlimit_names[idx] == NULL)
			continue;

		if (strcmp(rlimit_names[idx], suffix) == 0)
			return idx;
	}

	return -1;
}
1502
1503
1504 static int parseOCIrlimits(struct blob_attr *msg)
1505 {
1506 struct blob_attr *cur, *cure;
1507 int rem, reme;
1508 int limtype = -1;
1509 struct rlimit *curlim;
1510 rlim_t soft, hard;
1511 bool sethard = false, setsoft = false;
1512
1513 blobmsg_for_each_attr(cur, msg, rem) {
1514 blobmsg_for_each_attr(cure, cur, reme) {
1515 if (!strcmp(blobmsg_name(cure), "type") && (blobmsg_type(cure) == BLOBMSG_TYPE_STRING)) {
1516 limtype = resolve_rlimit(blobmsg_get_string(cure));
1517 } else if (!strcmp(blobmsg_name(cure), "soft")) {
1518 switch (blobmsg_type(cure)) {
1519 case BLOBMSG_TYPE_INT32:
1520 soft = blobmsg_get_u32(cure);
1521 break;
1522 case BLOBMSG_TYPE_INT64:
1523 soft = blobmsg_get_u64(cure);
1524 break;
1525 default:
1526 return EINVAL;
1527 }
1528 setsoft = true;
1529 } else if (!strcmp(blobmsg_name(cure), "hard")) {
1530 switch (blobmsg_type(cure)) {
1531 case BLOBMSG_TYPE_INT32:
1532 hard = blobmsg_get_u32(cure);
1533 break;
1534 case BLOBMSG_TYPE_INT64:
1535 hard = blobmsg_get_u64(cure);
1536 break;
1537 default:
1538 return EINVAL;
1539 }
1540 sethard = true;
1541 } else {
1542 return EINVAL;
1543 }
1544 }
1545
1546 if (limtype < 0)
1547 return EINVAL;
1548
1549 if (opts.rlimits[limtype])
1550 return ENOTUNIQ;
1551
1552 if (!sethard || !setsoft)
1553 return ENODATA;
1554
1555 curlim = malloc(sizeof(struct rlimit));
1556 curlim->rlim_cur = soft;
1557 curlim->rlim_max = hard;
1558
1559 opts.rlimits[limtype] = curlim;
1560 }
1561
1562 return 0;
1563 };
1564
1565 enum {
1566 OCI_PROCESS_ARGS,
1567 OCI_PROCESS_CAPABILITIES,
1568 OCI_PROCESS_CWD,
1569 OCI_PROCESS_ENV,
1570 OCI_PROCESS_OOMSCOREADJ,
1571 OCI_PROCESS_NONEWPRIVILEGES,
1572 OCI_PROCESS_RLIMITS,
1573 OCI_PROCESS_TERMINAL,
1574 OCI_PROCESS_USER,
1575 __OCI_PROCESS_MAX,
1576 };
1577
1578 static const struct blobmsg_policy oci_process_policy[] = {
1579 [OCI_PROCESS_ARGS] = { "args", BLOBMSG_TYPE_ARRAY },
1580 [OCI_PROCESS_CAPABILITIES] = { "capabilities", BLOBMSG_TYPE_TABLE },
1581 [OCI_PROCESS_CWD] = { "cwd", BLOBMSG_TYPE_STRING },
1582 [OCI_PROCESS_ENV] = { "env", BLOBMSG_TYPE_ARRAY },
1583 [OCI_PROCESS_OOMSCOREADJ] = { "oomScoreAdj", BLOBMSG_TYPE_INT32 },
1584 [OCI_PROCESS_NONEWPRIVILEGES] = { "noNewPrivileges", BLOBMSG_TYPE_BOOL },
1585 [OCI_PROCESS_RLIMITS] = { "rlimits", BLOBMSG_TYPE_ARRAY },
1586 [OCI_PROCESS_TERMINAL] = { "terminal", BLOBMSG_TYPE_BOOL },
1587 [OCI_PROCESS_USER] = { "user", BLOBMSG_TYPE_TABLE },
1588 };
1589
1590
1591 static int parseOCIprocess(struct blob_attr *msg)
1592 {
1593 struct blob_attr *tb[__OCI_PROCESS_MAX];
1594 int res;
1595
1596 blobmsg_parse(oci_process_policy, __OCI_PROCESS_MAX, tb, blobmsg_data(msg), blobmsg_len(msg));
1597
1598 if (!tb[OCI_PROCESS_ARGS])
1599 return ENOENT;
1600
1601 res = parseOCIenvarray(tb[OCI_PROCESS_ARGS], &opts.jail_argv);
1602 if (res)
1603 return res;
1604
1605 opts.console = blobmsg_get_bool(tb[OCI_PROCESS_TERMINAL]);
1606 opts.no_new_privs = blobmsg_get_bool(tb[OCI_PROCESS_NONEWPRIVILEGES]);
1607
1608 if (tb[OCI_PROCESS_CWD])
1609 opts.cwd = strdup(blobmsg_get_string(tb[OCI_PROCESS_CWD]));
1610
1611 if (tb[OCI_PROCESS_ENV]) {
1612 res = parseOCIenvarray(tb[OCI_PROCESS_ENV], &opts.envp);
1613 if (res)
1614 return res;
1615 }
1616
1617 if (tb[OCI_PROCESS_USER] && (res = parseOCIprocessuser(tb[OCI_PROCESS_USER])))
1618 return res;
1619
1620 if (tb[OCI_PROCESS_CAPABILITIES] &&
1621 (res = parseOCIcapabilities(&opts.capset, tb[OCI_PROCESS_CAPABILITIES])))
1622 return res;
1623
1624 if (tb[OCI_PROCESS_RLIMITS] &&
1625 (res = parseOCIrlimits(tb[OCI_PROCESS_RLIMITS])))
1626 return res;
1627
1628 if (tb[OCI_PROCESS_OOMSCOREADJ]) {
1629 opts.oom_score_adj = blobmsg_get_u32(tb[OCI_PROCESS_OOMSCOREADJ]);
1630 opts.set_oom_score_adj = true;
1631 }
1632
1633 return 0;
1634 }
1635
1636 enum {
1637 OCI_LINUX_NAMESPACE_TYPE,
1638 OCI_LINUX_NAMESPACE_PATH,
1639 __OCI_LINUX_NAMESPACE_MAX,
1640 };
1641
1642 static const struct blobmsg_policy oci_linux_namespace_policy[] = {
1643 [OCI_LINUX_NAMESPACE_TYPE] = { "type", BLOBMSG_TYPE_STRING },
1644 [OCI_LINUX_NAMESPACE_PATH] = { "path", BLOBMSG_TYPE_STRING },
1645 };
1646
/*
 * Translate an OCI namespace type name into the matching CLONE_NEW*
 * flag. Returns 0 for any name that is not in the table below; "time"
 * is only known when CLONE_NEWTIME is defined at build time.
 */
static int resolve_nstype(char *type) {
	static const struct {
		const char *name;
		int flag;
	} nstypes[] = {
		{ "pid", CLONE_NEWPID },
		{ "network", CLONE_NEWNET },
		{ "mount", CLONE_NEWNS },
		{ "ipc", CLONE_NEWIPC },
		{ "uts", CLONE_NEWUTS },
		{ "user", CLONE_NEWUSER },
		{ "cgroup", CLONE_NEWCGROUP },
#ifdef CLONE_NEWTIME
		{ "time", CLONE_NEWTIME },
#endif
	};
	size_t i;

	for (i = 0; i < sizeof(nstypes) / sizeof(nstypes[0]); i++) {
		if (strcmp(nstypes[i].name, type) == 0)
			return nstypes[i].flag;
	}

	return 0;
}
1669
1670 static int parseOCIlinuxns(struct blob_attr *msg)
1671 {
1672 struct blob_attr *tb[__OCI_LINUX_NAMESPACE_MAX];
1673 int nstype;
1674 int *setns;
1675 int fd;
1676
1677 blobmsg_parse(oci_linux_namespace_policy, __OCI_LINUX_NAMESPACE_MAX, tb, blobmsg_data(msg), blobmsg_len(msg));
1678
1679 if (!tb[OCI_LINUX_NAMESPACE_TYPE])
1680 return EINVAL;
1681
1682 nstype = resolve_nstype(blobmsg_get_string(tb[OCI_LINUX_NAMESPACE_TYPE]));
1683 if (!nstype)
1684 return EINVAL;
1685
1686 if (opts.namespace & nstype)
1687 return ENOTUNIQ;
1688
1689 setns = get_namespace_fd(nstype);
1690
1691 if (!setns)
1692 return EFAULT;
1693
1694 if (*setns != -1)
1695 return ENOTUNIQ;
1696
1697 if (tb[OCI_LINUX_NAMESPACE_PATH]) {
1698 DEBUG("opening existing %s namespace from path %s\n",
1699 blobmsg_get_string(tb[OCI_LINUX_NAMESPACE_TYPE]),
1700 blobmsg_get_string(tb[OCI_LINUX_NAMESPACE_PATH]));
1701
1702 fd = open(blobmsg_get_string(tb[OCI_LINUX_NAMESPACE_PATH]), O_RDONLY);
1703 if (fd == -1)
1704 return errno?:ESTALE;
1705
1706 if (ioctl(fd, NS_GET_NSTYPE) != nstype)
1707 return EINVAL;
1708
1709 DEBUG("opened existing %s namespace got filehandler %u\n",
1710 blobmsg_get_string(tb[OCI_LINUX_NAMESPACE_TYPE]),
1711 fd);
1712
1713 *setns = fd;
1714 } else {
1715 opts.namespace |= nstype;
1716 }
1717
1718 return 0;
1719 };
1720
1721
1722 enum {
1723 OCI_LINUX_UIDGIDMAP_CONTAINERID,
1724 OCI_LINUX_UIDGIDMAP_HOSTID,
1725 OCI_LINUX_UIDGIDMAP_SIZE,
1726 __OCI_LINUX_UIDGIDMAP_MAX,
1727 };
1728
1729 static const struct blobmsg_policy oci_linux_uidgidmap_policy[] = {
1730 [OCI_LINUX_UIDGIDMAP_CONTAINERID] = { "containerID", BLOBMSG_TYPE_INT32 },
1731 [OCI_LINUX_UIDGIDMAP_HOSTID] = { "hostID", BLOBMSG_TYPE_INT32 },
1732 [OCI_LINUX_UIDGIDMAP_SIZE] = { "size", BLOBMSG_TYPE_INT32 },
1733 };
1734
1735 static int parseOCIuidgidmappings(struct blob_attr *msg, bool is_gidmap)
1736 {
1737 const char *map_format = "%d %d %d\n";
1738 struct blob_attr *tb[__OCI_LINUX_UIDGIDMAP_MAX];
1739 struct blob_attr *cur;
1740 int rem, len;
1741 char **mappings;
1742 char *map, *curstr;
1743 unsigned int cnt = 0;
1744 size_t totallen = 0;
1745
1746 /* count number of mappings */
1747 blobmsg_for_each_attr(cur, msg, rem)
1748 cnt++;
1749
1750 if (!cnt)
1751 return 0;
1752
1753 /* allocate array for mappings */
1754 mappings = calloc(1 + cnt, sizeof(char*));
1755 if (!mappings)
1756 return ENOMEM;
1757
1758 mappings[cnt] = NULL;
1759
1760 cnt = 0;
1761 blobmsg_for_each_attr(cur, msg, rem) {
1762 blobmsg_parse(oci_linux_uidgidmap_policy, __OCI_LINUX_UIDGIDMAP_MAX, tb, blobmsg_data(cur), blobmsg_len(cur));
1763
1764 if (!tb[OCI_LINUX_UIDGIDMAP_CONTAINERID] ||
1765 !tb[OCI_LINUX_UIDGIDMAP_HOSTID] ||
1766 !tb[OCI_LINUX_UIDGIDMAP_SIZE])
1767 return EINVAL;
1768
1769 /* write mapping line into allocated string */
1770 len = asprintf(&mappings[cnt++], map_format,
1771 blobmsg_get_u32(tb[OCI_LINUX_UIDGIDMAP_CONTAINERID]),
1772 blobmsg_get_u32(tb[OCI_LINUX_UIDGIDMAP_HOSTID]),
1773 blobmsg_get_u32(tb[OCI_LINUX_UIDGIDMAP_SIZE]));
1774
1775 if (len < 0)
1776 return ENOMEM;
1777
1778 totallen += len;
1779 }
1780
1781 /* allocate combined mapping string */
1782 map = calloc(1 + totallen, sizeof(char));
1783 if (!map)
1784 return ENOMEM;
1785
1786 map[0] = '\0';
1787
1788 /* concatenate mapping strings into combined string */
1789 curstr = mappings[0];
1790 while (curstr) {
1791 strcat(map, curstr);
1792 free(curstr++);
1793 }
1794 free(mappings);
1795
1796 if (is_gidmap)
1797 opts.gidmap = map;
1798 else
1799 opts.uidmap = map;
1800
1801 return 0;
1802 }
1803
1804 enum {
1805 OCI_DEVICES_TYPE,
1806 OCI_DEVICES_PATH,
1807 OCI_DEVICES_MAJOR,
1808 OCI_DEVICES_MINOR,
1809 OCI_DEVICES_FILEMODE,
1810 OCI_DEVICES_UID,
1811 OCI_DEVICES_GID,
1812 __OCI_DEVICES_MAX,
1813 };
1814
1815 static const struct blobmsg_policy oci_devices_policy[] = {
1816 [OCI_DEVICES_TYPE] = { "type", BLOBMSG_TYPE_STRING },
1817 [OCI_DEVICES_PATH] = { "path", BLOBMSG_TYPE_STRING },
1818 [OCI_DEVICES_MAJOR] = { "major", BLOBMSG_TYPE_INT32 },
1819 [OCI_DEVICES_MINOR] = { "minor", BLOBMSG_TYPE_INT32 },
1820 [OCI_DEVICES_FILEMODE] = { "fileMode", BLOBMSG_TYPE_INT32 },
1821 [OCI_DEVICES_UID] = { "uid", BLOBMSG_TYPE_INT32 },
1822 [OCI_DEVICES_GID] = { "uid", BLOBMSG_TYPE_INT32 },
1823 };
1824
/*
 * Translate an OCI device type string into the file type bits used for
 * the device node: "c" and "u" give S_IFCHR, "b" gives S_IFBLK and "p"
 * gives S_IFIFO. Any other string yields 0.
 */
static mode_t resolve_devtype(char *tstr)
{
	/* every accepted type is exactly one character long */
	if (tstr[0] == '\0' || tstr[1] != '\0')
		return 0;

	switch (tstr[0]) {
	case 'c':
	case 'u':
		return S_IFCHR;
	case 'b':
		return S_IFBLK;
	case 'p':
		return S_IFIFO;
	default:
		return 0;
	}
}
1837
1838 static int parseOCIdevices(struct blob_attr *msg)
1839 {
1840 struct blob_attr *tb[__OCI_DEVICES_MAX];
1841 struct blob_attr *cur;
1842 int rem;
1843 size_t cnt = 0;
1844 struct mknod_args *tmp;
1845
1846 blobmsg_for_each_attr(cur, msg, rem)
1847 ++cnt;
1848
1849 opts.devices = calloc(cnt + 1, sizeof(struct mknod_args *));
1850
1851 cnt = 0;
1852 blobmsg_for_each_attr(cur, msg, rem) {
1853 blobmsg_parse(oci_devices_policy, __OCI_DEVICES_MAX, tb, blobmsg_data(cur), blobmsg_len(cur));
1854 if (!tb[OCI_DEVICES_TYPE] ||
1855 !tb[OCI_DEVICES_PATH])
1856 return ENODATA;
1857
1858 tmp = calloc(1, sizeof(struct mknod_args));
1859 if (!tmp)
1860 return ENOMEM;
1861
1862 tmp->mode = resolve_devtype(blobmsg_get_string(tb[OCI_DEVICES_TYPE]));
1863 if (!tmp->mode)
1864 return EINVAL;
1865
1866 if (tmp->mode != S_IFIFO) {
1867 if (!tb[OCI_DEVICES_MAJOR] || !tb[OCI_DEVICES_MINOR])
1868 return ENODATA;
1869
1870 tmp->dev = makedev(blobmsg_get_u32(tb[OCI_DEVICES_MAJOR]),
1871 blobmsg_get_u32(tb[OCI_DEVICES_MINOR]));
1872 }
1873
1874 if (tb[OCI_DEVICES_FILEMODE]) {
1875 if (~(S_IRWXU|S_IRWXG|S_IRWXO) & blobmsg_get_u32(tb[OCI_DEVICES_FILEMODE]))
1876 return EINVAL;
1877
1878 tmp->mode |= blobmsg_get_u32(tb[OCI_DEVICES_FILEMODE]);
1879 } else {
1880 tmp->mode |= (S_IRUSR|S_IWUSR); /* 0600 */
1881 }
1882
1883 tmp->path = strdup(blobmsg_get_string(tb[OCI_DEVICES_PATH]));
1884
1885 if (tb[OCI_DEVICES_UID])
1886 tmp->uid = blobmsg_get_u32(tb[OCI_DEVICES_UID]);
1887 else
1888 tmp->uid = -1;
1889
1890 if (tb[OCI_DEVICES_GID])
1891 tmp->gid = blobmsg_get_u32(tb[OCI_DEVICES_GID]);
1892 else
1893 tmp->gid = -1;
1894
1895 DEBUG("read device %s (%s)\n", blobmsg_get_string(tb[OCI_DEVICES_PATH]), blobmsg_get_string(tb[OCI_DEVICES_TYPE]));
1896 opts.devices[cnt++] = tmp;
1897 }
1898
1899 opts.devices[cnt] = NULL;
1900
1901 return 0;
1902 }
1903
1904 enum {
1905 OCI_LINUX_RESOURCES,
1906 OCI_LINUX_SECCOMP,
1907 OCI_LINUX_SYSCTL,
1908 OCI_LINUX_NAMESPACES,
1909 OCI_LINUX_DEVICES,
1910 OCI_LINUX_UIDMAPPINGS,
1911 OCI_LINUX_GIDMAPPINGS,
1912 OCI_LINUX_MASKEDPATHS,
1913 OCI_LINUX_READONLYPATHS,
1914 OCI_LINUX_ROOTFSPROPAGATION,
1915 __OCI_LINUX_MAX,
1916 };
1917
1918 static const struct blobmsg_policy oci_linux_policy[] = {
1919 [OCI_LINUX_RESOURCES] = { "resources", BLOBMSG_TYPE_TABLE },
1920 [OCI_LINUX_SECCOMP] = { "seccomp", BLOBMSG_TYPE_TABLE },
1921 [OCI_LINUX_SYSCTL] = { "sysctl", BLOBMSG_TYPE_TABLE },
1922 [OCI_LINUX_NAMESPACES] = { "namespaces", BLOBMSG_TYPE_ARRAY },
1923 [OCI_LINUX_DEVICES] = { "devices", BLOBMSG_TYPE_ARRAY },
1924 [OCI_LINUX_UIDMAPPINGS] = { "uidMappings", BLOBMSG_TYPE_ARRAY },
1925 [OCI_LINUX_GIDMAPPINGS] = { "gidMappings", BLOBMSG_TYPE_ARRAY },
1926 [OCI_LINUX_MASKEDPATHS] = { "maskedPaths", BLOBMSG_TYPE_ARRAY },
1927 [OCI_LINUX_READONLYPATHS] = { "readonlyPaths", BLOBMSG_TYPE_ARRAY },
1928 [OCI_LINUX_ROOTFSPROPAGATION] = { "rootfsPropagation", BLOBMSG_TYPE_STRING },
1929 };
1930
1931 static int parseOCIsysctl(struct blob_attr *msg)
1932 {
1933 struct blob_attr *cur;
1934 int rem;
1935 char *tmp, *tc;
1936 size_t cnt = 0;
1937
1938 blobmsg_for_each_attr(cur, msg, rem) {
1939 if (!blobmsg_name(cur) || !blobmsg_get_string(cur))
1940 return EINVAL;
1941
1942 ++cnt;
1943 }
1944
1945 if (!cnt)
1946 return 0;
1947
1948 opts.sysctl = calloc(cnt + 1, sizeof(struct sysctl_val *));
1949 if (!opts.sysctl)
1950 return ENOMEM;
1951
1952 cnt = 0;
1953 blobmsg_for_each_attr(cur, msg, rem) {
1954 opts.sysctl[cnt] = malloc(sizeof(struct sysctl_val));
1955 if (!opts.sysctl[cnt])
1956 return ENOMEM;
1957
1958 /* replace '.' with '/' in entry name */
1959 tc = tmp = strdup(blobmsg_name(cur));
1960 while ((tc = strchr(tc, '.')))
1961 *tc = '/';
1962
1963 opts.sysctl[cnt]->value = strdup(blobmsg_get_string(cur));
1964 opts.sysctl[cnt]->entry = tmp;
1965
1966 ++cnt;
1967 }
1968
1969 opts.sysctl[cnt] = NULL;
1970
1971 return 0;
1972 }
1973
1974 static int parseOCIlinux(struct blob_attr *msg)
1975 {
1976 struct blob_attr *tb[__OCI_LINUX_MAX];
1977 struct blob_attr *cur;
1978 int rem;
1979 int res = 0;
1980
1981 blobmsg_parse(oci_linux_policy, __OCI_LINUX_MAX, tb, blobmsg_data(msg), blobmsg_len(msg));
1982
1983 if (tb[OCI_LINUX_NAMESPACES]) {
1984 blobmsg_for_each_attr(cur, tb[OCI_LINUX_NAMESPACES], rem) {
1985 res = parseOCIlinuxns(cur);
1986 if (res)
1987 return res;
1988 }
1989 }
1990
1991 if (tb[OCI_LINUX_UIDMAPPINGS]) {
1992 res = parseOCIuidgidmappings(tb[OCI_LINUX_GIDMAPPINGS], 0);
1993 if (res)
1994 return res;
1995 }
1996
1997 if (tb[OCI_LINUX_GIDMAPPINGS]) {
1998 res = parseOCIuidgidmappings(tb[OCI_LINUX_GIDMAPPINGS], 1);
1999 if (res)
2000 return res;
2001 }
2002
2003 if (tb[OCI_LINUX_READONLYPATHS]) {
2004 blobmsg_for_each_attr(cur, tb[OCI_LINUX_READONLYPATHS], rem) {
2005 res = add_mount(NULL, blobmsg_get_string(cur), NULL, MS_BIND | MS_REC | MS_RDONLY, NULL, 0);
2006 if (res)
2007 return res;
2008 }
2009 }
2010
2011 if (tb[OCI_LINUX_MASKEDPATHS]) {
2012 blobmsg_for_each_attr(cur, tb[OCI_LINUX_MASKEDPATHS], rem) {
2013 res = add_mount((void *)(-1), blobmsg_get_string(cur), NULL, 0, NULL, 1);
2014 if (res)
2015 return res;
2016 }
2017 }
2018
2019 if (tb[OCI_LINUX_SYSCTL]) {
2020 res = parseOCIsysctl(tb[OCI_LINUX_SYSCTL]);
2021 if (res)
2022 return res;
2023 }
2024
2025 if (tb[OCI_LINUX_SECCOMP]) {
2026 opts.ociseccomp = parseOCIlinuxseccomp(tb[OCI_LINUX_SECCOMP]);
2027 if (!opts.ociseccomp)
2028 return EINVAL;
2029 }
2030
2031 if (tb[OCI_LINUX_DEVICES]) {
2032 res = parseOCIdevices(tb[OCI_LINUX_DEVICES]);
2033 if (res)
2034 return res;
2035 }
2036
2037 return 0;
2038 }
2039
2040 enum {
2041 OCI_VERSION,
2042 OCI_HOSTNAME,
2043 OCI_PROCESS,
2044 OCI_ROOT,
2045 OCI_MOUNTS,
2046 OCI_HOOKS,
2047 OCI_LINUX,
2048 __OCI_MAX,
2049 };
2050
2051 static const struct blobmsg_policy oci_policy[] = {
2052 [OCI_VERSION] = { "ociVersion", BLOBMSG_TYPE_STRING },
2053 [OCI_HOSTNAME] = { "hostname", BLOBMSG_TYPE_STRING },
2054 [OCI_PROCESS] = { "process", BLOBMSG_TYPE_TABLE },
2055 [OCI_ROOT] = { "root", BLOBMSG_TYPE_TABLE },
2056 [OCI_MOUNTS] = { "mounts", BLOBMSG_TYPE_ARRAY },
2057 [OCI_HOOKS] = { "hooks", BLOBMSG_TYPE_TABLE },
2058 [OCI_LINUX] = { "linux", BLOBMSG_TYPE_TABLE },
2059 };
2060
2061 static int parseOCI(const char *jsonfile)
2062 {
2063 struct blob_attr *tb[__OCI_MAX];
2064 struct blob_attr *cur;
2065 int rem;
2066 int res;
2067
2068 blob_buf_init(&ocibuf, 0);
2069 if (!blobmsg_add_json_from_file(&ocibuf, jsonfile))
2070 return ENOENT;
2071
2072 blobmsg_parse(oci_policy, __OCI_MAX, tb, blob_data(ocibuf.head), blob_len(ocibuf.head));
2073
2074 if (!tb[OCI_VERSION])
2075 return ENOMSG;
2076
2077 if (strncmp("1.0", blobmsg_get_string(tb[OCI_VERSION]), 3)) {
2078 ERROR("unsupported ociVersion %s\n", blobmsg_get_string(tb[OCI_VERSION]));
2079 return ENOTSUP;
2080 }
2081
2082 if (tb[OCI_HOSTNAME])
2083 opts.hostname = strdup(blobmsg_get_string(tb[OCI_HOSTNAME]));
2084
2085 if (!tb[OCI_PROCESS])
2086 return ENODATA;
2087
2088 if ((res = parseOCIprocess(tb[OCI_PROCESS])))
2089 return res;
2090
2091 if (!tb[OCI_ROOT])
2092 return ENODATA;
2093
2094 if ((res = parseOCIroot(jsonfile, tb[OCI_ROOT])))
2095 return res;
2096
2097 if (!tb[OCI_MOUNTS])
2098 return ENODATA;
2099
2100 blobmsg_for_each_attr(cur, tb[OCI_MOUNTS], rem)
2101 if ((res = parseOCImount(cur)))
2102 return res;
2103
2104 if (tb[OCI_LINUX] && (res = parseOCIlinux(tb[OCI_LINUX])))
2105 return res;
2106
2107 if (tb[OCI_HOOKS] && (res = parseOCIhooks(tb[OCI_HOOKS])))
2108 return res;
2109
2110 blob_buf_free(&ocibuf);
2111
2112 return 0;
2113 }
2114
2115 static int set_oom_score_adj(void)
2116 {
2117 int f;
2118 char fname[32];
2119
2120 if (!opts.set_oom_score_adj)
2121 return 0;
2122
2123 snprintf(fname, sizeof(fname), "/proc/%u/oom_score_adj", jail_process.pid);
2124 f = open(fname, O_WRONLY | O_TRUNC);
2125 if (f == -1)
2126 return errno;
2127
2128 dprintf(f, "%d", opts.oom_score_adj);
2129 close(f);
2130
2131 return 0;
2132 }
2133
2134
2135 int main(int argc, char **argv)
2136 {
2137 sigset_t sigmask;
2138 uid_t uid = getuid();
2139 const char log[] = "/dev/log";
2140 const char ubus[] = "/var/run/ubus.sock";
2141 char *jsonfile = NULL;
2142 int i, ch;
2143 int pipes[4];
2144 char sig_buf[1];
2145 int netns_fd;
2146 int pidns_fd;
2147
2148 if (uid) {
2149 ERROR("not root, aborting: %m\n");
2150 return EXIT_FAILURE;
2151 }
2152
2153 umask(022);
2154 mount_list_init();
2155 init_library_search();
2156
2157 while ((ch = getopt(argc, argv, OPT_ARGS)) != -1) {
2158 switch (ch) {
2159 case 'd':
2160 debug = atoi(optarg);
2161 break;
2162 case 'p':
2163 opts.namespace |= CLONE_NEWNS;
2164 opts.procfs = 1;
2165 break;
2166 case 'o':
2167 opts.namespace |= CLONE_NEWNS;
2168 opts.ronly = 1;
2169 break;
2170 case 'f':
2171 opts.namespace |= CLONE_NEWUSER;
2172 break;
2173 case 'F':
2174 opts.namespace |= CLONE_NEWCGROUP;
2175 break;
2176 case 'R':
2177 opts.extroot = strdup(optarg);
2178 break;
2179 case 's':
2180 opts.namespace |= CLONE_NEWNS;
2181 opts.sysfs = 1;
2182 break;
2183 case 'S':
2184 opts.seccomp = optarg;
2185 add_mount_bind(optarg, 1, -1);
2186 break;
2187 case 'C':
2188 opts.capabilities = optarg;
2189 break;
2190 case 'c':
2191 opts.no_new_privs = 1;
2192 break;
2193 case 'n':
2194 opts.name = optarg;
2195 break;
2196 case 'N':
2197 opts.namespace |= CLONE_NEWNET;
2198 break;
2199 case 'h':
2200 opts.namespace |= CLONE_NEWUTS;
2201 opts.hostname = strdup(optarg);
2202 break;
2203 case 'r':
2204 opts.namespace |= CLONE_NEWNS;
2205 add_path_and_deps(optarg, 1, 0, 0);
2206 break;
2207 case 'w':
2208 opts.namespace |= CLONE_NEWNS;
2209 add_path_and_deps(optarg, 0, 0, 0);
2210 break;
2211 case 'u':
2212 opts.namespace |= CLONE_NEWNS;
2213 add_mount_bind(ubus, 0, -1);
2214 break;
2215 case 'l':
2216 opts.namespace |= CLONE_NEWNS;
2217 add_mount_bind(log, 0, -1);
2218 break;
2219 case 'U':
2220 opts.user = optarg;
2221 break;
2222 case 'G':
2223 opts.group = optarg;
2224 break;
2225 case 'O':
2226 opts.overlaydir = optarg;
2227 break;
2228 case 'T':
2229 opts.tmpoverlaysize = optarg;
2230 break;
2231 case 'E':
2232 opts.require_jail = 1;
2233 break;
2234 case 'y':
2235 opts.console = 1;
2236 break;
2237 case 'J':
2238 asprintf(&jsonfile, "%s/config.json", optarg);
2239 break;
2240 }
2241 }
2242
2243 if (opts.namespace && !jsonfile)
2244 opts.namespace |= CLONE_NEWIPC | CLONE_NEWPID;
2245
2246 /* those are filehandlers, so -1 indicates unused */
2247 opts.setns.pid = -1;
2248 opts.setns.net = -1;
2249 opts.setns.ns = -1;
2250 opts.setns.ipc = -1;
2251 opts.setns.uts = -1;
2252 opts.setns.user = -1;
2253 opts.setns.cgroup = -1;
2254 #ifdef CLONE_NEWTIME
2255 opts.setns.time = -1;
2256 #endif
2257
2258 if (jsonfile) {
2259 int ocires;
2260 ocires = parseOCI(jsonfile);
2261 free(jsonfile);
2262 if (ocires) {
2263 ERROR("parsing of OCI JSON spec has failed: %s (%d)\n", strerror(ocires), ocires);
2264 return ocires;
2265 }
2266 }
2267
2268 if (opts.tmpoverlaysize && strlen(opts.tmpoverlaysize) > 8) {
2269 ERROR("size parameter too long: \"%s\"\n", opts.tmpoverlaysize);
2270 return -1;
2271 }
2272
2273 /* no <binary> param found */
2274 if (!jsonfile && (argc - optind < 1)) {
2275 usage();
2276 return EXIT_FAILURE;
2277 }
2278 if (!(jsonfile||opts.namespace||opts.capabilities||opts.seccomp)) {
2279 ERROR("Not using namespaces, capabilities or seccomp !!!\n\n");
2280 usage();
2281 return EXIT_FAILURE;
2282 }
2283 DEBUG("Using namespaces(0x%08x), capabilities(%d), seccomp(%d)\n",
2284 opts.namespace,
2285 opts.capabilities != 0 || opts.capset.apply,
2286 opts.seccomp != 0 || opts.ociseccomp != 0);
2287
2288 if (!jsonfile) {
2289 /* allocate NULL-terminated array for argv */
2290 opts.jail_argv = calloc(1 + argc - optind, sizeof(char**));
2291 if (!opts.jail_argv)
2292 return EXIT_FAILURE;
2293
2294 for (size_t s = optind; s < argc; s++)
2295 opts.jail_argv[s - optind] = strdup(argv[s]);
2296
2297 if (opts.namespace & CLONE_NEWUSER)
2298 get_jail_user(&opts.pw_uid, &opts.pw_gid, &opts.gr_gid);
2299 }
2300
2301 if (!opts.extroot) {
2302 if (opts.namespace && add_path_and_deps(*opts.jail_argv, 1, -1, 0)) {
2303 ERROR("failed to load dependencies\n");
2304 return -1;
2305 }
2306 }
2307
2308 if (opts.namespace && opts.seccomp && add_path_and_deps("libpreload-seccomp.so", 1, -1, 1)) {
2309 ERROR("failed to load libpreload-seccomp.so\n");
2310 opts.seccomp = 0;
2311 if (opts.require_jail)
2312 return -1;
2313 }
2314
2315 if (apply_rlimits()) {
2316 ERROR("error applying resource limits\n");
2317 exit(EXIT_FAILURE);
2318 }
2319
2320 if (opts.name)
2321 prctl(PR_SET_NAME, opts.name, NULL, NULL, NULL);
2322
2323 sigfillset(&sigmask);
2324 for (i = 0; i < _NSIG; i++) {
2325 struct sigaction s = { 0 };
2326
2327 if (!sigismember(&sigmask, i))
2328 continue;
2329 if ((i == SIGCHLD) || (i == SIGPIPE) || (i == SIGSEGV))
2330 continue;
2331
2332 s.sa_handler = jail_handle_signal;
2333 sigaction(i, &s, NULL);
2334 }
2335
2336 if (pipe(&pipes[0]) < 0 || pipe(&pipes[2]) < 0)
2337 return -1;
2338
2339 if (has_namespaces()) {
2340 if (opts.namespace & CLONE_NEWNS) {
2341 if (!opts.extroot && (opts.user || opts.group)) {
2342 add_mount_bind("/etc/passwd", 0, -1);
2343 add_mount_bind("/etc/group", 0, -1);
2344 }
2345
2346 #if defined(__GLIBC__)
2347 if (!opts.extroot)
2348 add_mount_bind("/etc/nsswitch.conf", 0, -1);
2349 #endif
2350
2351 if (!(opts.namespace & CLONE_NEWNET)) {
2352 add_mount_bind("/etc/resolv.conf", 0, -1);
2353 } else if (opts.setns.net == -1) {
2354 char hostdir[PATH_MAX];
2355
2356 snprintf(hostdir, PATH_MAX, "/tmp/resolv.conf-%s.d", opts.name);
2357 mkdir_p(hostdir, 0755);
2358 add_mount(hostdir, "/dev/resolv.conf.d", NULL, MS_BIND | MS_NOEXEC | MS_NOATIME | MS_NOSUID | MS_NODEV | MS_RDONLY, NULL, -1);
2359 }
2360
2361 /* default mounts */
2362 add_mount(NULL, "/dev", "tmpfs", MS_NOATIME | MS_NOEXEC | MS_NOSUID, "size=1M", -1);
2363 add_mount(NULL, "/dev/pts", "devpts", MS_NOATIME | MS_NOEXEC | MS_NOSUID, "newinstance,ptmxmode=0666,mode=0620,gid=5", 0);
2364
2365 if (opts.procfs || jsonfile) {
2366 add_mount("proc", "/proc", "proc", MS_NOATIME | MS_NODEV | MS_NOEXEC | MS_NOSUID, NULL, -1);
2367
2368 /*
2369 * hack to make /proc/sys/net read-write while the rest of /proc/sys is read-only
2370 * which cannot be expressed with OCI spec, but happends to be very useful.
2371 * Only apply it if '/proc/sys' is not already listed as mount, maskedPath or
2372 * readonlyPath.
2373 * If not running in a new network namespace, only make /proc/sys read-only.
2374 * If running in a new network namespace, temporarily stash (ie. mount-bind)
2375 * /proc/sys/net into (totally unrelated, but surely existing) /proc/self/net.
2376 * Then we mount-bind /proc/sys read-only and then mount-move /proc/self/net into
2377 * /proc/sys/net.
2378 * This works because mounts are executed in incrementing strcmp() order and
2379 * /proc/self/net appears there before /proc/sys/net and hence the operation
2380 * succeeds as the bind-mount of /proc/self/net is performed first and then
2381 * move-mount of /proc/sys/net follows because 'e' preceeds 'y' in the ASCII
2382 * table (and in the alphabet).
2383 */
2384 if (!add_mount(NULL, "/proc/sys", NULL, MS_BIND | MS_RDONLY, NULL, -1))
2385 if (opts.namespace & CLONE_NEWNET)
2386 if (!add_mount_inner("/proc/self/net", "/proc/sys/net", NULL, MS_MOVE, NULL, -1))
2387 add_mount_inner("/proc/sys/net", "/proc/self/net", NULL, MS_BIND, NULL, -1);
2388
2389 }
2390 if (opts.sysfs || jsonfile)
2391 add_mount("sysfs", "/sys", "sysfs", MS_NOATIME | MS_NODEV | MS_NOEXEC | MS_NOSUID | MS_RDONLY, NULL, -1);
2392
2393 if (jsonfile)
2394 add_mount("shm", "/dev/shm", "tmpfs", MS_NOSUID | MS_NOEXEC | MS_NODEV, "mode=1777", -1);
2395
2396 }
2397
2398 if (opts.setns.pid != -1) {
2399 pidns_fd = pidns_open_pid(getpid());
2400 setns_open(CLONE_NEWPID);
2401 } else {
2402 pidns_fd = -1;
2403 }
2404
2405 jail_process.pid = clone(exec_jail, child_stack + STACK_SIZE, SIGCHLD | opts.namespace, &pipes);
2406 } else {
2407 jail_process.pid = fork();
2408 }
2409
2410 if (jail_process.pid > 0) {
2411 /* parent process */
2412 jail_running = 1;
2413 seteuid(0);
2414 if (pidns_fd != -1) {
2415 setns(pidns_fd, CLONE_NEWPID);
2416 close(pidns_fd);
2417 }
2418 if (opts.setns.net != -1)
2419 close(opts.setns.net);
2420 if (opts.setns.ns != -1)
2421 close(opts.setns.ns);
2422 if (opts.setns.ipc != -1)
2423 close(opts.setns.ipc);
2424 if (opts.setns.uts != -1)
2425 close(opts.setns.uts);
2426 if (opts.setns.user != -1)
2427 close(opts.setns.user);
2428 if (opts.setns.cgroup != -1)
2429 close(opts.setns.cgroup);
2430 #ifdef CLONE_NEWTIME
2431 if (opts.setns.time != -1)
2432 close(opts.setns.time);
2433 #endif
2434 close(pipes[1]);
2435 close(pipes[2]);
2436 run_hooks(opts.hooks.createRuntime);
2437 if (read(pipes[0], sig_buf, 1) < 1) {
2438 ERROR("can't read from child\n");
2439 return -1;
2440 }
2441 close(pipes[0]);
2442 set_oom_score_adj();
2443
2444 if (opts.namespace & CLONE_NEWUSER) {
2445 if (write_setgroups(jail_process.pid, true)) {
2446 ERROR("can't write setgroups\n");
2447 return -1;
2448 }
2449 if (!opts.uidmap) {
2450 bool has_gr = (opts.gr_gid != -1);
2451 if (opts.pw_uid != -1) {
2452 write_single_uid_gid_map(jail_process.pid, 0, opts.pw_uid);
2453 write_single_uid_gid_map(jail_process.pid, 1, has_gr?opts.gr_gid:opts.pw_gid);
2454 } else {
2455 write_single_uid_gid_map(jail_process.pid, 0, 65534);
2456 write_single_uid_gid_map(jail_process.pid, 1, has_gr?opts.gr_gid:65534);
2457 }
2458 } else {
2459 write_uid_gid_map(jail_process.pid, 0, opts.uidmap);
2460 if (opts.gidmap)
2461 write_uid_gid_map(jail_process.pid, 1, opts.gidmap);
2462 }
2463 }
2464
2465 if (opts.namespace & CLONE_NEWNET) {
2466 if (!opts.name) {
2467 ERROR("netns needs a named jail\n");
2468 return -1;
2469 }
2470 netns_fd = netns_open_pid(jail_process.pid);
2471 netns_updown(jail_process.pid, true);
2472 }
2473
2474 sig_buf[0] = 'O';
2475 if (write(pipes[3], sig_buf, 1) < 0) {
2476 ERROR("can't write to child\n");
2477 return -1;
2478 }
2479 close(pipes[3]);
2480 run_hooks(opts.hooks.poststart);
2481
2482 uloop_init();
2483 uloop_process_add(&jail_process);
2484 uloop_run();
2485 if (jail_running) {
2486 DEBUG("uloop interrupted, killing jail process\n");
2487 kill(jail_process.pid, SIGTERM);
2488 uloop_timeout_set(&jail_process_timeout, 1000);
2489 uloop_run();
2490 }
2491 uloop_done();
2492 if (opts.namespace & CLONE_NEWNET) {
2493 setns(netns_fd, CLONE_NEWNET);
2494 netns_updown(getpid(), false);
2495 close(netns_fd);
2496 }
2497 run_hooks(opts.hooks.poststop);
2498 free_opts(true);
2499 return jail_return_code;
2500 } else if (jail_process.pid == 0) {
2501 /* fork child process */
2502 return exec_jail(&pipes);
2503 } else {
2504 ERROR("failed to clone/fork: %m\n");
2505 return EXIT_FAILURE;
2506 }
2507 }