jail: fix build on glibc and uclibc
[project/procd.git] / jail / jail.c
1 /*
2 * Copyright (C) 2015 John Crispin <blogic@openwrt.org>
3 * Copyright (C) 2020 Daniel Golle <daniel@makrotopia.org>
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU Lesser General Public License version 2.1
7 * as published by the Free Software Foundation
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 */
14
15 #define _GNU_SOURCE
16 #include <sys/mount.h>
17 #include <sys/prctl.h>
18 #include <sys/wait.h>
19 #include <sys/types.h>
20 #include <sys/time.h>
21 #include <sys/resource.h>
22 #include <sys/stat.h>
23 #include <sys/sysmacros.h>
24
25 /* musl only defined 15 limit types, make sure all 16 are supported */
26 #ifndef RLIMIT_RTTIME
27 #define RLIMIT_RTTIME 15
28 #undef RLIMIT_NLIMITS
29 #define RLIMIT_NLIMITS 16
30 #undef RLIM_NLIMITS
31 #define RLIM_NLIMITS 16
32 #endif
33
34 #include <stdlib.h>
35 #include <unistd.h>
36 #include <errno.h>
37 #include <pwd.h>
38 #include <grp.h>
39 #include <string.h>
40 #include <fcntl.h>
41 #include <sched.h>
42 #include <linux/filter.h>
43 #include <linux/limits.h>
44 #include <linux/nsfs.h>
45 #include <signal.h>
46 #include <inttypes.h>
47
48 #include "capabilities.h"
49 #include "elf.h"
50 #include "fs.h"
51 #include "jail.h"
52 #include "log.h"
53 #include "seccomp-oci.h"
54
55 #include <libubox/utils.h>
56 #include <libubox/blobmsg.h>
57 #include <libubox/blobmsg_json.h>
58 #include <libubox/list.h>
59 #include <libubox/vlist.h>
60 #include <libubox/uloop.h>
61 #include <libubus.h>
62
63 #ifndef CLONE_NEWCGROUP
64 #define CLONE_NEWCGROUP 0x02000000
65 #endif
66
67 #define STACK_SIZE (1024 * 1024)
68 #define OPT_ARGS "S:C:n:h:r:w:d:psulocU:G:NR:fFO:T:EyJ:"
69
/* One lifecycle hook: the program to run plus its argument and environment vectors. */
struct hook_execvpe {
	char *file;	/* path checked with stat() and handed to execvpe() by run_hook() */
	char **argv;	/* argument vector passed to execvpe() */
	char **envp;	/* environment vector passed to execvpe() */
	int timeout;	/* seconds until run_hook() arms the SIGKILL timer; <= 0 means no timer */
};
76
/* One sysctl setting written by apply_sysctl(). */
struct sysctl_val {
	char *entry;	/* file name below <jail_root>/proc/sys/ */
	char *value;	/* string written to that file */
};
81
/* Arguments for one device node created by create_devices(). */
struct mknod_args {
	char *path;	/* node path handed to mknod() */
	mode_t mode;	/* file type and permission bits handed to mknod() */
	dev_t dev;	/* device number handed to mknod() */
	uid_t uid;	/* owner; chown() is only called when uid or gid is non-zero */
	gid_t gid;	/* group; see uid */
};
89
/*
 * Global jail configuration, read throughout this file.
 * free_opts() releases the heap-allocated members.
 */
static struct {
	char *name;		/* jail name sent in the ubus requests of pass_console()/netns_updown() */
	char *hostname;		/* if non-empty, handed to sethostname() by exec_jail() */
	char **jail_argv;	/* argv of the jailed program; *jail_argv is exec'd by exec_jail() */
	char *cwd;		/* exec_jail() chdir()s here before exec */
	char *seccomp;		/* exported as SECCOMP_FILE=... by build_envp() */
	struct sock_fprog *ociseccomp;	/* handed to applyOCIlinuxseccomp() */
	char *capabilities;	/* handed to drop_capabilities() */
	struct jail_capset capset;	/* handed to applyOCIcapabilities() */
	char *user;		/* user name resolved with getpwnam() in get_jail_user() */
	char *group;		/* group name resolved with getgrnam() in get_jail_user() */
	char *extroot;		/* bind-mounted as jail root by build_jail_fs(); a tmpfs is used when NULL */
	char *overlaydir;	/* parent of the overlay upper/work dirs; wins over tmpoverlaysize */
	char *tmpoverlaysize;	/* size= option of the tmpfs that backs a temporary overlay */
	char **envp;		/* extra environment entries appended by build_envp() */
	char *uidmap;
	char *gidmap;
	struct sysctl_val **sysctl;	/* NULL-terminated list walked by apply_sysctl() */
	int no_new_privs;	/* non-zero: exec_jail() sets PR_SET_NO_NEW_PRIVS */
	int namespace;		/* mask of CLONE_NEW* flags */
	struct {		/* fds of namespaces to join via setns(); -1 means "not set" */
		int pid;
		int net;
		int ns;
		int ipc;
		int uts;
		int user;
		int cgroup;
#ifdef CLONE_NEWTIME
		int time;
#endif
	} setns;
	int procfs;
	int ronly;		/* non-zero: build_jail_fs() remounts / read-only */
	int sysfs;
	int console;		/* non-zero: build_jail_fs() calls create_dev_console() */
	int pw_uid;		/* when non-zero these three override the ids looked up by name in exec_jail() */
	int pw_gid;
	int gr_gid;
	gid_t *additional_gids;	/* handed to setgroups() by exec_jail() */
	size_t num_additional_gids;
	mode_t umask;		/* applied by exec_jail() only when set_umask is true */
	bool set_umask;
	int require_jail;
	struct {		/* NULL-terminated hook lists, run via run_hooks() */
		struct hook_execvpe **createRuntime;
		struct hook_execvpe **createContainer;
		struct hook_execvpe **startContainer;
		struct hook_execvpe **poststart;
		struct hook_execvpe **poststop;
	} hooks;
	struct rlimit *rlimits[RLIM_NLIMITS];	/* indexed by resource number; NULL entries are skipped */
	int oom_score_adj;
	bool set_oom_score_adj;
	struct mknod_args **devices;	/* NULL-terminated list walked by create_devices() */
} opts;
146
147 static inline bool has_namespaces(void)
148 {
149 return ((opts.setns.pid != -1) ||
150 (opts.setns.net != -1) ||
151 (opts.setns.ns != -1) ||
152 (opts.setns.ipc != -1) ||
153 (opts.setns.uts != -1) ||
154 (opts.setns.user != -1) ||
155 (opts.setns.cgroup != -1) ||
156 #ifdef CLONE_NEWTIME
157 (opts.setns.time != -1) ||
158 #endif
159 opts.namespace);
160 }
161
162 static void free_hooklist(struct hook_execvpe **hooklist)
163 {
164 struct hook_execvpe *cur;
165 char **tmp;
166
167 if (!hooklist)
168 return;
169
170 cur = *hooklist;
171 while (cur) {
172 free(cur->file);
173 tmp = cur->argv;
174 while (tmp)
175 free(*(tmp++));
176
177 free(cur->argv);
178
179 tmp = cur->envp;
180 while (tmp)
181 free(*(tmp++));
182
183 free(cur->envp);
184 free(cur++);
185 }
186 free(hooklist);
187 }
188
189 static void free_sysctl(void) {
190 struct sysctl_val *cur;
191 cur = *opts.sysctl;
192
193 while (cur) {
194 free(cur->entry);
195 free(cur->value);
196 free(cur++);
197 }
198 free(opts.sysctl);
199 }
200
201 static void free_devices(void) {
202 struct mknod_args **cur;
203
204 if (!opts.devices)
205 return;
206
207 cur = opts.devices;
208
209 while (*cur) {
210 free((*cur)->path);
211 free(*(cur++));
212 }
213 free(opts.devices);
214 }
215
216 static void free_rlimits(void) {
217 int type;
218
219 for (type = 0; type < RLIM_NLIMITS; ++type)
220 free(opts.rlimits[type]);
221 }
222
223 static void free_opts(bool child) {
224 char **tmp;
225
226 /* we need to keep argv, envp and seccomp filter in child */
227 if (child) {
228 if (opts.ociseccomp) {
229 free(opts.ociseccomp->filter);
230 free(opts.ociseccomp);
231 }
232
233 tmp = opts.jail_argv;
234 while(tmp)
235 free(*(tmp++));
236
237 free(opts.jail_argv);
238
239 tmp = opts.envp;
240 while (tmp)
241 free(*(tmp++));
242
243 free(opts.envp);
244 };
245
246 free_rlimits();
247 free_sysctl();
248 free_devices();
249 free(opts.hostname);
250 free(opts.cwd);
251 free(opts.extroot);
252 free(opts.uidmap);
253 free(opts.gidmap);
254 free_hooklist(opts.hooks.createRuntime);
255 free_hooklist(opts.hooks.createContainer);
256 free_hooklist(opts.hooks.startContainer);
257 free_hooklist(opts.hooks.poststart);
258 free_hooklist(opts.hooks.poststop);
259 }
260
/* blob buffer; its users are outside the part of the file reviewed here */
static struct blob_buf ocibuf;

/* declared by hand; build_jail_fs() calls it to switch into the jail root */
extern int pivot_root(const char *new_root, const char *put_old);

/* debug verbosity; build_envp() adds LD_DEBUG=all when it is greater than 1 */
int debug = 0;

/* STACK_SIZE bytes — presumably the stack of the cloned jail child; TODO confirm at the clone() call */
static char child_stack[STACK_SIZE];

/* PTY master opened by create_dev_console() */
int console_fd;
270
/*
 * Mount an overlayfs on top of jail_root, using <overlaydir>/upper and
 * <overlaydir>/work as upper and work directories. Both directories and
 * an empty <upper>/etc/resolv.conf are created first.
 *
 * Returns 0 on success and -1 on any failure. All temporary strings are
 * released on every path.
 */
static int mount_overlay(char *jail_root, char *overlaydir) {
	char *upperdir, *workdir, *optsstr, *upperetc, *upperresolvconf;
	const char mountoptsformat[] = "lowerdir=%s,upperdir=%s,workdir=%s";
	int ret = -1, fd;

	if (asprintf(&upperdir, "%s%s", overlaydir, "/upper") < 0)
		goto out;

	if (asprintf(&workdir, "%s%s", overlaydir, "/work") < 0)
		goto upper_printf;

	if (asprintf(&optsstr, mountoptsformat, jail_root, upperdir, workdir) < 0)
		goto work_printf;

	if (mkdir_p(upperdir, 0755) || mkdir_p(workdir, 0755))
		goto opts_printf;

	/*
	 * make sure /etc/resolv.conf exists in overlay and is owned by jail userns root
	 * this is to work-around a bug in overlayfs described in the overlayfs-userns
	 * patch:
	 * 3. modification of a file 'hithere' which is in l but not yet
	 * in u, and which is not owned by T, is not allowed, even if
	 * writes to u are allowed. This may be a bug in overlayfs,
	 * but it is safe behavior.
	 */
	if (asprintf(&upperetc, "%s/etc", upperdir) < 0)
		goto opts_printf;

	if (mkdir_p(upperetc, 0755))
		goto upper_etc_printf;

	if (asprintf(&upperresolvconf, "%s/resolv.conf", upperetc) < 0)
		goto upper_etc_printf;

	fd = creat(upperresolvconf, 0644);
	if (fd == -1) {
		ERROR("creat(%s) failed: %m\n", upperresolvconf);
		goto upper_resolvconf_printf;
	}
	close(fd);

	DEBUG("mount -t overlay %s %s (%s)\n", jail_root, jail_root, optsstr);

	/* both path strings are live here, so unwind from the top label */
	if (mount(jail_root, jail_root, "overlay", MS_NOATIME, optsstr)) {
		ERROR("overlay mount on %s failed: %m\n", jail_root);
		goto upper_resolvconf_printf;
	}

	ret = 0;

upper_resolvconf_printf:
	free(upperresolvconf);
upper_etc_printf:
	free(upperetc);
opts_printf:
	free(optsstr);
work_printf:
	free(workdir);
upper_printf:
	free(upperdir);
out:
	return ret;
}
333
334 static void pass_console(int console_fd)
335 {
336 struct ubus_context *ctx = ubus_connect(NULL);
337 static struct blob_buf req;
338 uint32_t id;
339
340 if (!ctx)
341 return;
342
343 blob_buf_init(&req, 0);
344 blobmsg_add_string(&req, "name", opts.name);
345
346 if (ubus_lookup_id(ctx, "container", &id) ||
347 ubus_invoke_fd(ctx, id, "console_set", req.head, NULL, NULL, 3000, console_fd))
348 INFO("ubus request failed\n");
349 else
350 close(console_fd);
351
352 blob_buf_free(&req);
353 ubus_free(ctx);
354 }
355
356 static int create_dev_console(const char *jail_root)
357 {
358 char *console_fname;
359 char dev_console_path[PATH_MAX];
360 int slave_console_fd;
361
362 /* Open UNIX/98 virtual console */
363 console_fd = posix_openpt(O_RDWR | O_NOCTTY);
364 if (console_fd == -1)
365 return -1;
366
367 console_fname = ptsname(console_fd);
368 DEBUG("got console fd %d and PTS client name %s\n", console_fd, console_fname);
369 if (!console_fname)
370 goto no_console;
371
372 grantpt(console_fd);
373 unlockpt(console_fd);
374
375 /* pass PTY master to procd */
376 pass_console(console_fd);
377
378 /* mount-bind PTY slave to /dev/console in jail */
379 snprintf(dev_console_path, sizeof(dev_console_path), "%s/dev/console", jail_root);
380 close(creat(dev_console_path, 0620));
381
382 if (mount(console_fname, dev_console_path, NULL, MS_BIND, NULL))
383 goto no_console;
384
385 /* use PTY slave for stdio */
386 slave_console_fd = open(console_fname, O_RDWR); /* | O_NOCTTY */
387 dup2(slave_console_fd, 0);
388 dup2(slave_console_fd, 1);
389 dup2(slave_console_fd, 2);
390 close(slave_console_fd);
391
392 INFO("using guest console %s\n", console_fname);
393
394 return 0;
395
396 no_console:
397 close(console_fd);
398 return 1;
399 }
400
401 static int hook_running = 0;
402 static int hook_return_code = 0;
403
404 static void hook_process_timeout_cb(struct uloop_timeout *t);
405 static struct uloop_timeout hook_process_timeout = {
406 .cb = hook_process_timeout_cb,
407 };
408
409 static void hook_process_handler(struct uloop_process *c, int ret)
410 {
411 uloop_timeout_cancel(&hook_process_timeout);
412 if (WIFEXITED(ret)) {
413 hook_return_code = WEXITSTATUS(ret);
414 DEBUG("hook (%d) exited with exit: %d\n", c->pid, hook_return_code);
415 } else {
416 hook_return_code = WTERMSIG(ret);
417 DEBUG("hook (%d) exited with signal: %d\n", c->pid, hook_return_code);
418 }
419 hook_running = 0;
420 uloop_end();
421 }
422
423 static struct uloop_process hook_process = {
424 .cb = hook_process_handler,
425 };
426
427 static void hook_process_timeout_cb(struct uloop_timeout *t)
428 {
429 DEBUG("hook process failed to stop, sending SIGKILL\n");
430 kill(hook_process.pid, SIGKILL);
431 }
432
433 static int run_hook(struct hook_execvpe *hook)
434 {
435 struct stat s;
436
437 DEBUG("executing hook %s\n", hook->file);
438
439 if (stat(hook->file, &s))
440 return ENOENT;
441
442 if (!((unsigned long)s.st_mode & (S_IXUSR | S_IXGRP | S_IXOTH)))
443 return EPERM;
444
445 if (!((unsigned long)s.st_mode & (S_IRUSR | S_IRGRP | S_IROTH)))
446 return EPERM;
447
448 uloop_init();
449
450 hook_running = 1;
451 hook_process.pid = fork();
452 if (hook_process.pid > 0) {
453 /* parent */
454 uloop_process_add(&hook_process);
455
456 if (hook->timeout > 0)
457 uloop_timeout_set(&hook_process_timeout, 1000 * hook->timeout);
458
459 uloop_run();
460 if (hook_running) {
461 DEBUG("uloop interrupted, killing hook process\n");
462 kill(hook_process.pid, SIGTERM);
463 uloop_timeout_set(&hook_process_timeout, 1000);
464 uloop_run();
465 }
466 uloop_done();
467
468 waitpid(hook_process.pid, NULL, WCONTINUED);
469
470 return hook_return_code;
471 } else if (hook_process.pid == 0) {
472 /* child */
473 execvpe(hook->file, hook->argv, hook->envp);
474 hook_running = 0;
475 _exit(errno);
476 } else {
477 /* fork error */
478 hook_running = 0;
479 return errno;
480 }
481 }
482
483 static int run_hooks(struct hook_execvpe **hooklist)
484 {
485 struct hook_execvpe **cur;
486 int res;
487
488 if (!hooklist)
489 return 0; /* Nothing to do */
490
491 cur = hooklist;
492
493 while (*cur) {
494 res = run_hook(*cur);
495 if (res)
496 DEBUG(" error running hook %s\n", (*cur)->file);
497 else
498 DEBUG(" success running hook %s\n", (*cur)->file);
499
500 ++cur;
501 }
502
503 return 0;
504 }
505
506 static int apply_sysctl(const char *jail_root)
507 {
508 struct sysctl_val **cur;
509 char *procdir, *fname;
510 int f;
511
512 if (!opts.sysctl)
513 return 0;
514
515 asprintf(&procdir, "%s/proc", jail_root);
516 if (!procdir)
517 return ENOMEM;
518
519 mkdir(procdir, 0700);
520 if (mount("proc", procdir, "proc", MS_NOATIME | MS_NODEV | MS_NOEXEC | MS_NOSUID, 0))
521 return EPERM;
522
523 cur = opts.sysctl;
524
525 while (*cur) {
526 asprintf(&fname, "%s/sys/%s", procdir, (*cur)->entry);
527 if (!fname)
528 return ENOMEM;
529
530 DEBUG("sysctl: writing '%s' to %s\n", (*cur)->value, fname);
531
532 f = open(fname, O_WRONLY);
533 if (f == -1) {
534 ERROR("sysctl: can't open %s\n", fname);
535 return errno;
536 }
537 write(f, (*cur)->value, strlen((*cur)->value));
538
539 free(fname);
540 close(f);
541 ++cur;
542 }
543 umount(procdir);
544 rmdir(procdir);
545 free(procdir);
546
547 return 0;
548 }
549
550 /* glibc defines makedev calling a function. make sure it's a pure macro */
551 #if defined(__GLIBC__)
552 #undef makedev
553 /* from musl's sys/sysmacros.h */
554 #define makedev(x,y) ( \
555 (((x)&0xfffff000ULL) << 32) | \
556 (((x)&0x00000fffULL) << 8) | \
557 (((y)&0xffffff00ULL) << 12) | \
558 (((y)&0x000000ffULL)) )
559 #endif
560
561 static struct mknod_args default_devices[] = {
562 { .path = "/dev/null", .mode = (S_IFCHR|S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH|S_IWOTH), .dev = makedev(1, 3) },
563 { .path = "/dev/zero", .mode = (S_IFCHR|S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH|S_IWOTH), .dev = makedev(1, 5) },
564 { .path = "/dev/full", .mode = (S_IFCHR|S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH|S_IWOTH), .dev = makedev(1, 7) },
565 { .path = "/dev/random", .mode = (S_IFCHR|S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH|S_IWOTH), .dev = makedev(1, 8) },
566 { .path = "/dev/urandom", .mode = (S_IFCHR|S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|S_IROTH|S_IWOTH), .dev = makedev(1, 9) },
567 { .path = "/dev/tty", .mode = (S_IFCHR|S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP), .dev = makedev(5, 0), .gid = 5 },
568 { 0 },
569 };
570
571 static int create_devices(void)
572 {
573 struct mknod_args **cur, *curdef;
574
575 if (!opts.devices)
576 goto only_default_devices;
577
578 cur = opts.devices;
579
580 while (*cur) {
581 DEBUG("creating %s (mode=%08o)\n", (*cur)->path, (*cur)->mode);
582 if (mknod((*cur)->path, (*cur)->mode, (*cur)->dev))
583 return errno;
584
585 if (((*cur)->uid || (*cur)->gid) &&
586 chown((*cur)->path, (*cur)->uid, (*cur)->gid))
587 return errno;
588
589 ++cur;
590 }
591
592 only_default_devices:
593 curdef = default_devices;
594 while(curdef->path) {
595 DEBUG("creating %s (mode=%08o)\n", curdef->path, curdef->mode);
596 if (mknod(curdef->path, curdef->mode, curdef->dev)) {
597 ++curdef;
598 continue; /* may already exist, eg. due to a bind-mount */
599 }
600 if ((curdef->uid || curdef->gid) &&
601 chown(curdef->path, curdef->uid, curdef->gid))
602 return errno;
603
604 ++curdef;
605 }
606
607 /* Dev symbolic links as defined in OCI spec */
608 symlink("/dev/pts/ptmx", "/dev/ptmx");
609 symlink("/proc/self/fd", "/dev/fd");
610 symlink("/proc/self/fd/0", "/dev/stdin");
611 symlink("/proc/self/fd/1", "/dev/stdout");
612 symlink("/proc/self/fd/2", "/dev/stderr");
613
614 return 0;
615 }
616
617 static int build_jail_fs(void)
618 {
619 char jail_root[] = "/tmp/ujail-XXXXXX";
620 char tmpovdir[] = "/tmp/ujail-overlay-XXXXXX";
621 char *overlaydir = NULL;
622 mode_t old_umask;
623
624 old_umask = umask(0);
625
626 if (mkdtemp(jail_root) == NULL) {
627 ERROR("mkdtemp(%s) failed: %m\n", jail_root);
628 return -1;
629 }
630
631 if (apply_sysctl(jail_root)) {
632 ERROR("failed to apply sysctl values\n");
633 return -1;
634 }
635
636 /* oldroot can't be MS_SHARED else pivot_root() fails */
637 if (mount("none", "/", NULL, MS_REC|MS_PRIVATE, NULL)) {
638 ERROR("private mount failed %m\n");
639 return -1;
640 }
641
642 if (opts.extroot) {
643 if (mount(opts.extroot, jail_root, NULL, MS_BIND, NULL)) {
644 ERROR("extroot mount failed %m\n");
645 return -1;
646 }
647 } else {
648 if (mount("tmpfs", jail_root, "tmpfs", MS_NOATIME, "mode=0755")) {
649 ERROR("tmpfs mount failed %m\n");
650 return -1;
651 }
652 }
653
654 if (opts.tmpoverlaysize) {
655 char mountoptsstr[] = "mode=0755,size=XXXXXXXX";
656
657 snprintf(mountoptsstr, sizeof(mountoptsstr),
658 "mode=0755,size=%s", opts.tmpoverlaysize);
659 if (mkdtemp(tmpovdir) == NULL) {
660 ERROR("mkdtemp(%s) failed: %m\n", jail_root);
661 return -1;
662 }
663 if (mount("tmpfs", tmpovdir, "tmpfs", MS_NOATIME,
664 mountoptsstr)) {
665 ERROR("failed to mount tmpfs for overlay (size=%s)\n", opts.tmpoverlaysize);
666 return -1;
667 }
668 overlaydir = tmpovdir;
669 }
670
671 if (opts.overlaydir)
672 overlaydir = opts.overlaydir;
673
674 if (overlaydir)
675 mount_overlay(jail_root, overlaydir);
676
677 if (chdir(jail_root)) {
678 ERROR("chdir(%s) (jail_root) failed: %m\n", jail_root);
679 return -1;
680 }
681
682 if (mount_all(jail_root)) {
683 ERROR("mount_all() failed\n");
684 return -1;
685 }
686
687 if (opts.console)
688 create_dev_console(jail_root);
689
690 /* make sure /etc/resolv.conf exists if in new network namespace */
691 if (opts.namespace & CLONE_NEWNET) {
692 char jailetc[PATH_MAX], jaillink[PATH_MAX];
693
694 snprintf(jailetc, PATH_MAX, "%s/etc", jail_root);
695 mkdir_p(jailetc, 0755);
696 snprintf(jaillink, PATH_MAX, "%s/etc/resolv.conf", jail_root);
697 if (overlaydir)
698 unlink(jaillink);
699
700 symlink("../dev/resolv.conf.d/resolv.conf.auto", jaillink);
701 }
702
703 run_hooks(opts.hooks.createContainer);
704
705 char dirbuf[sizeof(jail_root) + 4];
706 snprintf(dirbuf, sizeof(dirbuf), "%s/old", jail_root);
707 mkdir(dirbuf, 0755);
708
709 if (pivot_root(jail_root, dirbuf) == -1) {
710 ERROR("pivot_root(%s, %s) failed: %m\n", jail_root, dirbuf);
711 return -1;
712 }
713 if (chdir("/")) {
714 ERROR("chdir(/) (after pivot_root) failed: %m\n");
715 return -1;
716 }
717
718 snprintf(dirbuf, sizeof(dirbuf), "/old%s", jail_root);
719 umount2(dirbuf, MNT_DETACH);
720 rmdir(dirbuf);
721 if (opts.tmpoverlaysize) {
722 char tmpdirbuf[sizeof(tmpovdir) + 4];
723 snprintf(tmpdirbuf, sizeof(tmpdirbuf), "/old%s", tmpovdir);
724 umount2(tmpdirbuf, MNT_DETACH);
725 rmdir(tmpdirbuf);
726 }
727
728 umount2("/old", MNT_DETACH);
729 rmdir("/old");
730
731 if (create_devices()) {
732 ERROR("create_devices() failed\n");
733 return -1;
734 }
735 if (opts.ronly)
736 mount(NULL, "/", NULL, MS_REMOUNT | MS_BIND | MS_RDONLY, 0);
737
738 umask(old_umask);
739
740 return 0;
741 }
742
/*
 * Write a prepared mapping string to /proc/<child_pid>/uid_map or, when
 * gidmap is true, /proc/<child_pid>/gid_map.
 *
 * On success mapstr is released with free() and 0 is returned; on
 * failure -1 is returned and mapstr is left untouched.
 * NOTE(review): free_opts() also frees opts.uidmap/opts.gidmap — verify
 * that callers do not hand the same pointer to both.
 */
static int write_uid_gid_map(pid_t child_pid, bool gidmap, char *mapstr)
{
	int map_file;
	char map_path[64];

	if (snprintf(map_path, sizeof(map_path), "/proc/%d/%s",
		     child_pid, gidmap?"gid_map":"uid_map") < 0)
		return -1;

	if ((map_file = open(map_path, O_WRONLY)) == -1)
		return -1;

	/* dprintf() returns the number of bytes written; only a negative value is an error */
	if (dprintf(map_file, "%s", mapstr) < 0) {
		close(map_file);
		return -1;
	}

	close(map_file);
	free(mapstr);
	return 0;
}
764
/*
 * Write the one-line mapping "0 <id> 1" to /proc/<child_pid>/uid_map or,
 * when gidmap is true, to /proc/<child_pid>/gid_map.
 *
 * Returns 0 on success, -1 on failure.
 */
static int write_single_uid_gid_map(pid_t child_pid, bool gidmap, int id)
{
	char path[64];
	int fd;
	int rc;

	rc = snprintf(path, sizeof(path), "/proc/%d/%s",
		      child_pid, gidmap ? "gid_map" : "uid_map");
	if (rc < 0)
		return -1;

	fd = open(path, O_WRONLY);
	if (fd == -1)
		return -1;

	rc = (dprintf(fd, "%d %d %d\n", 0, id, 1) == -1) ? -1 : 0;

	close(fd);
	return rc;
}
785
/*
 * Write "allow" or "deny" to /proc/<child_pid>/setgroups.
 *
 * Returns 0 on success, -1 on failure.
 */
static int write_setgroups(pid_t child_pid, bool allow)
{
	const char *word = allow ? "allow" : "deny";
	char path[64];
	int fd;
	int rc = 0;

	if (snprintf(path, sizeof(path), "/proc/%d/setgroups", child_pid) < 0)
		return -1;

	fd = open(path, O_WRONLY);
	if (fd == -1)
		return -1;

	if (dprintf(fd, "%s", word) == -1)
		rc = -1;

	close(fd);
	return rc;
}
808
809 static void get_jail_user(int *user, int *user_gid, int *gr_gid)
810 {
811 struct passwd *p = NULL;
812 struct group *g = NULL;
813
814 if (opts.user) {
815 p = getpwnam(opts.user);
816 if (!p) {
817 ERROR("failed to get uid/gid for user %s: %d (%s)\n",
818 opts.user, errno, strerror(errno));
819 exit(EXIT_FAILURE);
820 }
821 *user = p->pw_uid;
822 *user_gid = p->pw_gid;
823 } else {
824 *user = -1;
825 *user_gid = -1;
826 }
827
828 if (opts.group) {
829 g = getgrnam(opts.group);
830 if (!g) {
831 ERROR("failed to get gid for group %s: %m\n", opts.group);
832 exit(EXIT_FAILURE);
833 }
834 *gr_gid = g->gr_gid;
835 } else {
836 *gr_gid = -1;
837 }
838 };
839
840 static void set_jail_user(int pw_uid, int user_gid, int gr_gid)
841 {
842 if (opts.user && (user_gid != -1) && initgroups(opts.user, user_gid)) {
843 ERROR("failed to initgroups() for user %s: %m\n", opts.user);
844 exit(EXIT_FAILURE);
845 }
846
847 if ((gr_gid != -1) && setregid(gr_gid, gr_gid)) {
848 ERROR("failed to set group id %d: %m\n", gr_gid);
849 exit(EXIT_FAILURE);
850 }
851
852 if ((pw_uid != -1) && setreuid(pw_uid, pw_uid)) {
853 ERROR("failed to set user id %d: %m\n", pw_uid);
854 exit(EXIT_FAILURE);
855 }
856 }
857
858 static int apply_rlimits(void)
859 {
860 int resource;
861
862 for (resource = 0; resource < RLIM_NLIMITS; ++resource) {
863 if (opts.rlimits[resource])
864 DEBUG("applying limits to resource %u\n", resource);
865
866 if (opts.rlimits[resource] &&
867 setrlimit(resource, opts.rlimits[resource]))
868 return errno;
869 }
870
871 return 0;
872 }
873
874 #define MAX_ENVP 8
875 static char** build_envp(const char *seccomp, char **ocienvp)
876 {
877 static char *envp[MAX_ENVP];
878 static char preload_var[PATH_MAX];
879 static char seccomp_var[PATH_MAX];
880 static char debug_var[] = "LD_DEBUG=all";
881 static char container_var[] = "container=ujail";
882 const char *preload_lib = find_lib("libpreload-seccomp.so");
883 char **addenv;
884
885 int count = 0;
886
887 if (seccomp && !preload_lib) {
888 ERROR("failed to add preload-lib to env\n");
889 return NULL;
890 }
891 if (seccomp) {
892 snprintf(seccomp_var, sizeof(seccomp_var), "SECCOMP_FILE=%s", seccomp);
893 envp[count++] = seccomp_var;
894 snprintf(preload_var, sizeof(preload_var), "LD_PRELOAD=%s", preload_lib);
895 envp[count++] = preload_var;
896 }
897
898 envp[count++] = container_var;
899
900 if (debug > 1)
901 envp[count++] = debug_var;
902
903 addenv = ocienvp;
904 while (addenv && *addenv) {
905 envp[count++] = *(addenv++);
906 if (count >= MAX_ENVP) {
907 ERROR("environment limited to %d extra records, truncating\n", MAX_ENVP);
908 break;
909 }
910 }
911 return envp;
912 }
913
/* Print the command line help to stderr. */
static void usage(void)
{
	static const char *const help[] = {
		"ujail <options> -- <binary> <params ...>\n",
		" -d <num>\tshow debug log (increase num to increase verbosity)\n",
		" -S <file>\tseccomp filter config\n",
		" -C <file>\tcapabilities drop config\n",
		" -c\t\tset PR_SET_NO_NEW_PRIVS\n",
		" -n <name>\tthe name of the jail\n",
		"namespace jail options:\n",
		" -h <hostname>\tchange the hostname of the jail\n",
		" -N\t\tjail has network namespace\n",
		" -f\t\tjail has user namespace\n",
		" -F\t\tjail has cgroups namespace\n",
		" -r <file>\treadonly files that should be staged\n",
		" -w <file>\twriteable files that should be staged\n",
		" -p\t\tjail has /proc\n",
		" -s\t\tjail has /sys\n",
		" -l\t\tjail has /dev/log\n",
		" -u\t\tjail has a ubus socket\n",
		" -U <name>\tuser to run jailed process\n",
		" -G <name>\tgroup to run jailed process\n",
		/* was misspelled "remont" */
		" -o\t\tremount jail root (/) read only\n",
		" -R <dir>\texternal jail rootfs (system container)\n",
		" -O <dir>\tdirectory for r/w overlayfs\n",
		" -T <size>\tuse tmpfs r/w overlayfs with <size>\n",
		" -E\t\tfail if jail cannot be setup\n",
		" -y\t\tprovide jail console\n",
		" -J <dir>\tstart OCI bundle\n",
		"\nWarning: by default root inside the jail is the same\n"
		"and he has the same powers as root outside the jail,\n"
		"thus he can escape the jail and/or break stuff.\n"
		"Please use seccomp/capabilities (-S/-C) to restrict his powers\n\n"
		"If you use none of the namespace jail options,\n"
		"ujail will not use namespace/build a jail,\n"
		"and will only drop capabilities/apply seccomp filter.\n\n",
	};
	size_t i;

	for (i = 0; i < sizeof(help) / sizeof(help[0]); ++i)
		fputs(help[i], stderr);
}
950
951 static int* get_namespace_fd(const unsigned int nstype)
952 {
953 switch (nstype) {
954 case CLONE_NEWPID:
955 return &opts.setns.pid;
956 case CLONE_NEWNET:
957 return &opts.setns.net;
958 case CLONE_NEWNS:
959 return &opts.setns.ns;
960 case CLONE_NEWIPC:
961 return &opts.setns.ipc;
962 case CLONE_NEWUTS:
963 return &opts.setns.uts;
964 case CLONE_NEWUSER:
965 return &opts.setns.user;
966 case CLONE_NEWCGROUP:
967 return &opts.setns.cgroup;
968 #ifdef CLONE_NEWTIME
969 case CLONE_NEWTIME:
970 return &opts.setns.time;
971 #endif
972 default:
973 return NULL;
974 }
975 }
976
/*
 * Join the namespace of the given type through the fd stored in
 * opts.setns, then close that fd.
 *
 * Returns 0 on success or when no fd is configured for the type (-1),
 * EFAULT for an unknown namespace type, and the errno of setns() on
 * failure. The stored fd value is deliberately left as it is, because
 * exec_jail() later compares opts.setns.user against -1.
 */
static int setns_open(unsigned long nstype)
{
	int *fd = get_namespace_fd(nstype);
	int ret = 0;

	/* unknown namespace type: there is no slot to look at */
	if (!fd)
		return EFAULT;

	if (*fd == -1)
		return 0;

	/* capture errno before close() gets a chance to change it */
	if (setns(*fd, nstype) == -1)
		ret = errno;

	close(*fd);
	return ret;
}
995
996 static int exec_jail(void *pipes_ptr)
997 {
998 int *pipes = (int*)pipes_ptr;
999 char buf[1];
1000 int pw_uid, pw_gid, gr_gid;
1001
1002 close(pipes[0]);
1003 close(pipes[3]);
1004
1005 setns_open(CLONE_NEWUSER);
1006 setns_open(CLONE_NEWNET);
1007 setns_open(CLONE_NEWNS);
1008 setns_open(CLONE_NEWIPC);
1009 setns_open(CLONE_NEWUTS);
1010 #ifdef CLONE_NEWTIME
1011 setns_open(CLONE_NEWTIME);
1012 #endif
1013
1014 buf[0] = 'i';
1015 if (write(pipes[1], buf, 1) < 1) {
1016 ERROR("can't write to parent\n");
1017 exit(EXIT_FAILURE);
1018 }
1019 if (read(pipes[2], buf, 1) < 1) {
1020 ERROR("can't read from parent\n");
1021 exit(EXIT_FAILURE);
1022 }
1023 if (buf[0] != 'O') {
1024 ERROR("parent had an error, child exiting\n");
1025 exit(EXIT_FAILURE);
1026 }
1027
1028 close(pipes[1]);
1029 close(pipes[2]);
1030
1031 if ((opts.namespace & CLONE_NEWUSER) || (opts.setns.user != -1)) {
1032 if (setregid(0, 0) < 0) {
1033 ERROR("setgid\n");
1034 exit(EXIT_FAILURE);
1035 }
1036 if (setreuid(0, 0) < 0) {
1037 ERROR("setuid\n");
1038 exit(EXIT_FAILURE);
1039 }
1040 if (setgroups(0, NULL) < 0) {
1041 ERROR("setgroups\n");
1042 exit(EXIT_FAILURE);
1043 }
1044 }
1045
1046 if (opts.namespace && opts.hostname && strlen(opts.hostname) > 0
1047 && sethostname(opts.hostname, strlen(opts.hostname))) {
1048 ERROR("sethostname(%s) failed: %m\n", opts.hostname);
1049 exit(EXIT_FAILURE);
1050 }
1051
1052 if ((opts.namespace & CLONE_NEWNS) && build_jail_fs()) {
1053 ERROR("failed to build jail fs\n");
1054 exit(EXIT_FAILURE);
1055 }
1056 run_hooks(opts.hooks.startContainer);
1057
1058 if (!(opts.namespace & CLONE_NEWUSER) && (opts.setns.user == -1)) {
1059 get_jail_user(&pw_uid, &pw_gid, &gr_gid);
1060
1061 set_jail_user(opts.pw_uid?:pw_uid, opts.pw_gid?:pw_gid, opts.gr_gid?:gr_gid);
1062 }
1063
1064 if (opts.additional_gids &&
1065 (setgroups(opts.num_additional_gids, opts.additional_gids) < 0)) {
1066 ERROR("setgroups failed: %m\n");
1067 exit(EXIT_FAILURE);
1068 }
1069
1070 if (opts.set_umask)
1071 umask(opts.umask);
1072
1073 if (applyOCIcapabilities(opts.capset))
1074 exit(EXIT_FAILURE);
1075
1076 if (opts.capabilities && drop_capabilities(opts.capabilities))
1077 exit(EXIT_FAILURE);
1078
1079 if (opts.no_new_privs && prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
1080 ERROR("prctl(PR_SET_NO_NEW_PRIVS) failed: %m\n");
1081 exit(EXIT_FAILURE);
1082 }
1083
1084 char **envp = build_envp(opts.seccomp, opts.envp);
1085 if (!envp)
1086 exit(EXIT_FAILURE);
1087
1088 if (opts.cwd && chdir(opts.cwd))
1089 exit(EXIT_FAILURE);
1090
1091 if (opts.ociseccomp && applyOCIlinuxseccomp(opts.ociseccomp))
1092 exit(EXIT_FAILURE);
1093
1094 uloop_end();
1095 free_opts(false);
1096 INFO("exec-ing %s\n", *opts.jail_argv);
1097 if (opts.envp) /* respect PATH if potentially set in ENV */
1098 execvpe(*opts.jail_argv, opts.jail_argv, envp);
1099 else
1100 execve(*opts.jail_argv, opts.jail_argv, envp);
1101
1102 /* we get there only if execve fails */
1103 ERROR("failed to execve %s: %m\n", *opts.jail_argv);
1104 exit(EXIT_FAILURE);
1105 }
1106
1107 static int jail_running = 0;
1108 static int jail_return_code = 0;
1109
1110 static void jail_process_timeout_cb(struct uloop_timeout *t);
1111 static struct uloop_timeout jail_process_timeout = {
1112 .cb = jail_process_timeout_cb,
1113 };
1114
1115 static void jail_process_handler(struct uloop_process *c, int ret)
1116 {
1117 uloop_timeout_cancel(&jail_process_timeout);
1118 if (WIFEXITED(ret)) {
1119 jail_return_code = WEXITSTATUS(ret);
1120 INFO("jail (%d) exited with exit: %d\n", c->pid, jail_return_code);
1121 } else {
1122 jail_return_code = WTERMSIG(ret);
1123 INFO("jail (%d) exited with signal: %d\n", c->pid, jail_return_code);
1124 }
1125 jail_running = 0;
1126 uloop_end();
1127 }
1128
1129 static struct uloop_process jail_process = {
1130 .cb = jail_process_handler,
1131 };
1132
1133 static void jail_process_timeout_cb(struct uloop_timeout *t)
1134 {
1135 DEBUG("jail process failed to stop, sending SIGKILL\n");
1136 kill(jail_process.pid, SIGKILL);
1137 }
1138
1139 static void jail_handle_signal(int signo)
1140 {
1141 if (hook_running) {
1142 DEBUG("forwarding signal %d to the hook process\n", signo);
1143 kill(hook_process.pid, signo);
1144 }
1145
1146 if (jail_running) {
1147 DEBUG("forwarding signal %d to the jailed process\n", signo);
1148 kill(jail_process.pid, signo);
1149 }
1150 }
1151
/*
 * Open /proc/<target_ns>/ns/net read-only.
 *
 * Returns the descriptor from open(), i.e. -1 on failure.
 */
static int netns_open_pid(const pid_t target_ns)
{
	char path[PATH_MAX];

	snprintf(path, sizeof(path), "/proc/%u/ns/net", target_ns);

	return open(path, O_RDONLY);
}
1160
/*
 * Open /proc/<target_ns>/ns/pid read-only.
 *
 * Returns the descriptor from open(), i.e. -1 on failure.
 */
static int pidns_open_pid(const pid_t target_ns)
{
	char path[PATH_MAX];

	snprintf(path, sizeof(path), "/proc/%u/ns/pid", target_ns);

	return open(path, O_RDONLY);
}
1169
1170 static void netns_updown(pid_t pid, bool start)
1171 {
1172 struct ubus_context *ctx = ubus_connect(NULL);
1173 static struct blob_buf req;
1174 uint32_t id;
1175
1176 if (!ctx)
1177 return;
1178
1179 blob_buf_init(&req, 0);
1180 blobmsg_add_string(&req, "jail", opts.name);
1181 blobmsg_add_u32(&req, "pid", pid);
1182 blobmsg_add_u8(&req, "start", start);
1183
1184 if (ubus_lookup_id(ctx, "network", &id) ||
1185 ubus_invoke(ctx, id, "netns_updown", req.head, NULL, NULL, 3000))
1186 INFO("ubus request failed\n");
1187
1188 blob_buf_free(&req);
1189 ubus_free(ctx);
1190 }
1191
1192 static int parseOCIenvarray(struct blob_attr *msg, char ***envp)
1193 {
1194 struct blob_attr *cur;
1195 int sz = 0, rem;
1196
1197 blobmsg_for_each_attr(cur, msg, rem)
1198 ++sz;
1199
1200 if (sz > 0) {
1201 *envp = calloc(1 + sz, sizeof(char*));
1202 if (!(*envp))
1203 return ENOMEM;
1204 } else {
1205 *envp = NULL;
1206 return 0;
1207 }
1208
1209 sz = 0;
1210 blobmsg_for_each_attr(cur, msg, rem)
1211 (*envp)[sz++] = strdup(blobmsg_get_string(cur));
1212
1213 if (sz)
1214 (*envp)[sz] = NULL;
1215
1216 return 0;
1217 }
1218
/* Indices into the attribute table that parseOCIroot() fills from the OCI "root" object. */
enum {
	OCI_ROOT_PATH,
	OCI_ROOT_READONLY,
	__OCI_ROOT_MAX,
};

/* blobmsg policy for the "root" object: key name and expected type per index. */
static const struct blobmsg_policy oci_root_policy[] = {
	[OCI_ROOT_PATH] = { "path", BLOBMSG_TYPE_STRING },
	[OCI_ROOT_READONLY] = { "readonly", BLOBMSG_TYPE_BOOL },
};
1229
1230 static int parseOCIroot(const char *jsonfile, struct blob_attr *msg)
1231 {
1232 static char rootpath[PATH_MAX] = { 0 };
1233 struct blob_attr *tb[__OCI_ROOT_MAX];
1234 char *cur;
1235
1236 blobmsg_parse(oci_root_policy, __OCI_ROOT_MAX, tb, blobmsg_data(msg), blobmsg_len(msg));
1237
1238 if (!tb[OCI_ROOT_PATH])
1239 return ENODATA;
1240
1241 strncpy(rootpath, jsonfile, PATH_MAX);
1242 cur = strrchr(rootpath, '/');
1243
1244 if (!cur)
1245 return ENOTDIR;
1246
1247 *(++cur) = '\0';
1248 strncat(rootpath, blobmsg_get_string(tb[OCI_ROOT_PATH]), PATH_MAX - (strlen(rootpath) + 1));
1249
1250 opts.extroot = rootpath;
1251
1252 opts.ronly = blobmsg_get_bool(tb[OCI_ROOT_READONLY]);
1253
1254 return 0;
1255 }
1256
1257
1258 enum {
1259 OCI_HOOK_PATH,
1260 OCI_HOOK_ARGS,
1261 OCI_HOOK_ENV,
1262 OCI_HOOK_TIMEOUT,
1263 __OCI_HOOK_MAX,
1264 };
1265
1266 static const struct blobmsg_policy oci_hook_policy[] = {
1267 [OCI_HOOK_PATH] = { "path", BLOBMSG_TYPE_STRING },
1268 [OCI_HOOK_ARGS] = { "args", BLOBMSG_TYPE_ARRAY },
1269 [OCI_HOOK_ENV] = { "env", BLOBMSG_TYPE_ARRAY },
1270 [OCI_HOOK_TIMEOUT] = { "timeout", BLOBMSG_TYPE_INT32 },
1271 };
1272
1273
1274 static int parseOCIhook(struct hook_execvpe ***hooklist, struct blob_attr *msg)
1275 {
1276 struct blob_attr *tb[__OCI_HOOK_MAX];
1277 struct blob_attr *cur;
1278 int rem, ret = 0;
1279 int idx = 0;
1280
1281 blobmsg_for_each_attr(cur, msg, rem)
1282 ++idx;
1283
1284 if (!idx)
1285 return 0;
1286
1287 *hooklist = calloc(idx + 1, sizeof(struct hook_execvpe *));
1288 idx = 0;
1289
1290 if (!(*hooklist))
1291 return ENOMEM;
1292
1293 blobmsg_for_each_attr(cur, msg, rem) {
1294 blobmsg_parse(oci_hook_policy, __OCI_HOOK_MAX, tb, blobmsg_data(cur), blobmsg_len(cur));
1295
1296 if (!tb[OCI_HOOK_PATH]) {
1297 ret = EINVAL;
1298 goto errout;
1299 }
1300
1301 (*hooklist)[idx] = malloc(sizeof(struct hook_execvpe));
1302 if (tb[OCI_HOOK_ARGS]) {
1303 ret = parseOCIenvarray(tb[OCI_HOOK_ARGS], &((*hooklist)[idx]->argv));
1304 if (ret)
1305 goto errout;
1306 } else {
1307 (*hooklist)[idx]->argv = calloc(2, sizeof(char *));
1308 ((*hooklist)[idx]->argv)[0] = strdup(blobmsg_get_string(tb[OCI_HOOK_PATH]));
1309 ((*hooklist)[idx]->argv)[1] = NULL;
1310 };
1311
1312
1313 if (tb[OCI_HOOK_ENV]) {
1314 ret = parseOCIenvarray(tb[OCI_HOOK_ENV], &((*hooklist)[idx]->envp));
1315 if (ret)
1316 goto errout;
1317 }
1318
1319 if (tb[OCI_HOOK_TIMEOUT])
1320 (*hooklist)[idx]->timeout = blobmsg_get_u32(tb[OCI_HOOK_TIMEOUT]);
1321
1322 (*hooklist)[idx]->file = strdup(blobmsg_get_string(tb[OCI_HOOK_PATH]));
1323
1324 ++idx;
1325 }
1326
1327 (*hooklist)[idx] = NULL;
1328
1329 DEBUG("added %d hooks\n", idx);
1330
1331 return 0;
1332
1333 errout:
1334 free_hooklist(*hooklist);
1335 *hooklist = NULL;
1336
1337 return ret;
1338 };
1339
1340
1341 enum {
1342 OCI_HOOKS_PRESTART,
1343 OCI_HOOKS_CREATERUNTIME,
1344 OCI_HOOKS_CREATECONTAINER,
1345 OCI_HOOKS_STARTCONTAINER,
1346 OCI_HOOKS_POSTSTART,
1347 OCI_HOOKS_POSTSTOP,
1348 __OCI_HOOKS_MAX,
1349 };
1350
1351 static const struct blobmsg_policy oci_hooks_policy[] = {
1352 [OCI_HOOKS_PRESTART] = { "prestart", BLOBMSG_TYPE_ARRAY },
1353 [OCI_HOOKS_CREATERUNTIME] = { "createRuntime", BLOBMSG_TYPE_ARRAY },
1354 [OCI_HOOKS_CREATECONTAINER] = { "createContainer", BLOBMSG_TYPE_ARRAY },
1355 [OCI_HOOKS_STARTCONTAINER] = { "startContainer", BLOBMSG_TYPE_ARRAY },
1356 [OCI_HOOKS_POSTSTART] = { "poststart", BLOBMSG_TYPE_ARRAY },
1357 [OCI_HOOKS_POSTSTOP] = { "poststop", BLOBMSG_TYPE_ARRAY },
1358 };
1359
1360 static int parseOCIhooks(struct blob_attr *msg)
1361 {
1362 struct blob_attr *tb[__OCI_HOOKS_MAX];
1363 int ret;
1364
1365 blobmsg_parse(oci_hooks_policy, __OCI_HOOKS_MAX, tb, blobmsg_data(msg), blobmsg_len(msg));
1366
1367 if (tb[OCI_HOOKS_PRESTART])
1368 INFO("warning: ignoring deprecated prestart hook\n");
1369
1370 if (tb[OCI_HOOKS_CREATERUNTIME]) {
1371 ret = parseOCIhook(&opts.hooks.createRuntime, tb[OCI_HOOKS_CREATERUNTIME]);
1372 if (ret)
1373 return ret;
1374 }
1375
1376 if (tb[OCI_HOOKS_CREATECONTAINER]) {
1377 ret = parseOCIhook(&opts.hooks.createContainer, tb[OCI_HOOKS_CREATECONTAINER]);
1378 if (ret)
1379 goto out_createruntime;
1380 }
1381
1382 if (tb[OCI_HOOKS_STARTCONTAINER]) {
1383 ret = parseOCIhook(&opts.hooks.startContainer, tb[OCI_HOOKS_STARTCONTAINER]);
1384 if (ret)
1385 goto out_createcontainer;
1386 }
1387
1388 if (tb[OCI_HOOKS_POSTSTART]) {
1389 ret = parseOCIhook(&opts.hooks.poststart, tb[OCI_HOOKS_POSTSTART]);
1390 if (ret)
1391 goto out_startcontainer;
1392 }
1393
1394 if (tb[OCI_HOOKS_POSTSTOP]) {
1395 ret = parseOCIhook(&opts.hooks.poststop, tb[OCI_HOOKS_POSTSTOP]);
1396 if (ret)
1397 goto out_poststart;
1398 }
1399
1400 return 0;
1401
1402 out_poststart:
1403 free_hooklist(opts.hooks.poststart);
1404 out_startcontainer:
1405 free_hooklist(opts.hooks.startContainer);
1406 out_createcontainer:
1407 free_hooklist(opts.hooks.createContainer);
1408 out_createruntime:
1409 free_hooklist(opts.hooks.createRuntime);
1410
1411 return ret;
1412 };
1413
1414
1415 enum {
1416 OCI_PROCESS_USER_UID,
1417 OCI_PROCESS_USER_GID,
1418 OCI_PROCESS_USER_UMASK,
1419 OCI_PROCESS_USER_ADDITIONALGIDS,
1420 __OCI_PROCESS_USER_MAX,
1421 };
1422
1423 static const struct blobmsg_policy oci_process_user_policy[] = {
1424 [OCI_PROCESS_USER_UID] = { "uid", BLOBMSG_TYPE_INT32 },
1425 [OCI_PROCESS_USER_GID] = { "gid", BLOBMSG_TYPE_INT32 },
1426 [OCI_PROCESS_USER_UMASK] = { "umask", BLOBMSG_TYPE_INT32 },
1427 [OCI_PROCESS_USER_ADDITIONALGIDS] = { "additionalGids", BLOBMSG_TYPE_ARRAY },
1428 };
1429
1430 static int parseOCIprocessuser(struct blob_attr *msg) {
1431 struct blob_attr *tb[__OCI_PROCESS_USER_MAX];
1432 struct blob_attr *cur;
1433 int rem;
1434 int has_gid = 0;
1435
1436 blobmsg_parse(oci_process_user_policy, __OCI_PROCESS_USER_MAX, tb, blobmsg_data(msg), blobmsg_len(msg));
1437
1438 if (tb[OCI_PROCESS_USER_UID])
1439 opts.pw_uid = blobmsg_get_u32(tb[OCI_PROCESS_USER_UID]);
1440
1441 if (tb[OCI_PROCESS_USER_GID]) {
1442 opts.pw_gid = blobmsg_get_u32(tb[OCI_PROCESS_USER_GID]);
1443 opts.gr_gid = blobmsg_get_u32(tb[OCI_PROCESS_USER_GID]);
1444 has_gid = 1;
1445 }
1446
1447 if (tb[OCI_PROCESS_USER_ADDITIONALGIDS]) {
1448 size_t gidcnt = 0;
1449
1450 blobmsg_for_each_attr(cur, tb[OCI_PROCESS_USER_ADDITIONALGIDS], rem) {
1451 ++gidcnt;
1452 if (has_gid && (blobmsg_get_u32(cur) == opts.gr_gid))
1453 continue;
1454 }
1455
1456 if (gidcnt) {
1457 opts.additional_gids = calloc(gidcnt + has_gid, sizeof(gid_t));
1458 gidcnt = 0;
1459
1460 /* always add primary GID to set of GIDs if set */
1461 if (has_gid)
1462 opts.additional_gids[gidcnt++] = opts.gr_gid;
1463
1464 blobmsg_for_each_attr(cur, tb[OCI_PROCESS_USER_ADDITIONALGIDS], rem) {
1465 if (has_gid && (blobmsg_get_u32(cur) == opts.gr_gid))
1466 continue;
1467 opts.additional_gids[gidcnt++] = blobmsg_get_u32(cur);
1468 }
1469 opts.num_additional_gids = gidcnt;
1470 }
1471 DEBUG("read %zu additional groups\n", gidcnt);
1472 }
1473
1474 if (tb[OCI_PROCESS_USER_UMASK]) {
1475 opts.umask = blobmsg_get_u32(tb[OCI_PROCESS_USER_UMASK]);
1476 opts.set_umask = true;
1477 }
1478
1479 return 0;
1480 }
1481
/* resource names as listed in the manpage GETRLIMIT(2), indexed by RLIMIT_* value */
static const char* const rlimit_names[RLIM_NLIMITS] = {
	[RLIMIT_AS]         = "AS",
	[RLIMIT_CORE]       = "CORE",
	[RLIMIT_CPU]        = "CPU",
	[RLIMIT_DATA]       = "DATA",
	[RLIMIT_FSIZE]      = "FSIZE",
	[RLIMIT_LOCKS]      = "LOCKS",
	[RLIMIT_MEMLOCK]    = "MEMLOCK",
	[RLIMIT_MSGQUEUE]   = "MSGQUEUE",
	[RLIMIT_NICE]       = "NICE",
	[RLIMIT_NOFILE]     = "NOFILE",
	[RLIMIT_NPROC]      = "NPROC",
	[RLIMIT_RSS]        = "RSS",
	[RLIMIT_RTPRIO]     = "RTPRIO",
	[RLIMIT_RTTIME]     = "RTTIME",
	[RLIMIT_SIGPENDING] = "SIGPENDING",
	[RLIMIT_STACK]      = "STACK",
};

/*
 * Translate a name of the form "RLIMIT_<NAME>" into the matching index of
 * rlimit_names[].
 *
 * Returns the index, or -1 when the prefix is missing or the name is not
 * in the table.
 */
static int resolve_rlimit(char *type) {
	const char *suffix;
	unsigned int idx = 0;

	/* without the prefix no table entry can ever match */
	if (strncmp("RLIMIT_", type, 7))
		return -1;

	suffix = type + 7;

	while (idx < RLIM_NLIMITS) {
		/* slots without a name are skipped */
		if (rlimit_names[idx] && !strcmp(rlimit_names[idx], suffix))
			return idx;
		++idx;
	}

	return -1;
}
1513
1514
1515 static int parseOCIrlimits(struct blob_attr *msg)
1516 {
1517 struct blob_attr *cur, *cure;
1518 int rem, reme;
1519 int limtype = -1;
1520 struct rlimit *curlim;
1521 rlim_t soft, hard;
1522 bool sethard = false, setsoft = false;
1523
1524 blobmsg_for_each_attr(cur, msg, rem) {
1525 blobmsg_for_each_attr(cure, cur, reme) {
1526 if (!strcmp(blobmsg_name(cure), "type") && (blobmsg_type(cure) == BLOBMSG_TYPE_STRING)) {
1527 limtype = resolve_rlimit(blobmsg_get_string(cure));
1528 } else if (!strcmp(blobmsg_name(cure), "soft")) {
1529 switch (blobmsg_type(cure)) {
1530 case BLOBMSG_TYPE_INT32:
1531 soft = blobmsg_get_u32(cure);
1532 break;
1533 case BLOBMSG_TYPE_INT64:
1534 soft = blobmsg_get_u64(cure);
1535 break;
1536 default:
1537 return EINVAL;
1538 }
1539 setsoft = true;
1540 } else if (!strcmp(blobmsg_name(cure), "hard")) {
1541 switch (blobmsg_type(cure)) {
1542 case BLOBMSG_TYPE_INT32:
1543 hard = blobmsg_get_u32(cure);
1544 break;
1545 case BLOBMSG_TYPE_INT64:
1546 hard = blobmsg_get_u64(cure);
1547 break;
1548 default:
1549 return EINVAL;
1550 }
1551 sethard = true;
1552 } else {
1553 return EINVAL;
1554 }
1555 }
1556
1557 if (limtype < 0)
1558 return EINVAL;
1559
1560 if (opts.rlimits[limtype])
1561 return ENOTUNIQ;
1562
1563 if (!sethard || !setsoft)
1564 return ENODATA;
1565
1566 curlim = malloc(sizeof(struct rlimit));
1567 curlim->rlim_cur = soft;
1568 curlim->rlim_max = hard;
1569
1570 opts.rlimits[limtype] = curlim;
1571 }
1572
1573 return 0;
1574 };
1575
1576 enum {
1577 OCI_PROCESS_ARGS,
1578 OCI_PROCESS_CAPABILITIES,
1579 OCI_PROCESS_CWD,
1580 OCI_PROCESS_ENV,
1581 OCI_PROCESS_OOMSCOREADJ,
1582 OCI_PROCESS_NONEWPRIVILEGES,
1583 OCI_PROCESS_RLIMITS,
1584 OCI_PROCESS_TERMINAL,
1585 OCI_PROCESS_USER,
1586 __OCI_PROCESS_MAX,
1587 };
1588
1589 static const struct blobmsg_policy oci_process_policy[] = {
1590 [OCI_PROCESS_ARGS] = { "args", BLOBMSG_TYPE_ARRAY },
1591 [OCI_PROCESS_CAPABILITIES] = { "capabilities", BLOBMSG_TYPE_TABLE },
1592 [OCI_PROCESS_CWD] = { "cwd", BLOBMSG_TYPE_STRING },
1593 [OCI_PROCESS_ENV] = { "env", BLOBMSG_TYPE_ARRAY },
1594 [OCI_PROCESS_OOMSCOREADJ] = { "oomScoreAdj", BLOBMSG_TYPE_INT32 },
1595 [OCI_PROCESS_NONEWPRIVILEGES] = { "noNewPrivileges", BLOBMSG_TYPE_BOOL },
1596 [OCI_PROCESS_RLIMITS] = { "rlimits", BLOBMSG_TYPE_ARRAY },
1597 [OCI_PROCESS_TERMINAL] = { "terminal", BLOBMSG_TYPE_BOOL },
1598 [OCI_PROCESS_USER] = { "user", BLOBMSG_TYPE_TABLE },
1599 };
1600
1601
1602 static int parseOCIprocess(struct blob_attr *msg)
1603 {
1604 struct blob_attr *tb[__OCI_PROCESS_MAX];
1605 int res;
1606
1607 blobmsg_parse(oci_process_policy, __OCI_PROCESS_MAX, tb, blobmsg_data(msg), blobmsg_len(msg));
1608
1609 if (!tb[OCI_PROCESS_ARGS])
1610 return ENOENT;
1611
1612 res = parseOCIenvarray(tb[OCI_PROCESS_ARGS], &opts.jail_argv);
1613 if (res)
1614 return res;
1615
1616 opts.console = blobmsg_get_bool(tb[OCI_PROCESS_TERMINAL]);
1617 opts.no_new_privs = blobmsg_get_bool(tb[OCI_PROCESS_NONEWPRIVILEGES]);
1618
1619 if (tb[OCI_PROCESS_CWD])
1620 opts.cwd = strdup(blobmsg_get_string(tb[OCI_PROCESS_CWD]));
1621
1622 if (tb[OCI_PROCESS_ENV]) {
1623 res = parseOCIenvarray(tb[OCI_PROCESS_ENV], &opts.envp);
1624 if (res)
1625 return res;
1626 }
1627
1628 if (tb[OCI_PROCESS_USER] && (res = parseOCIprocessuser(tb[OCI_PROCESS_USER])))
1629 return res;
1630
1631 if (tb[OCI_PROCESS_CAPABILITIES] &&
1632 (res = parseOCIcapabilities(&opts.capset, tb[OCI_PROCESS_CAPABILITIES])))
1633 return res;
1634
1635 if (tb[OCI_PROCESS_RLIMITS] &&
1636 (res = parseOCIrlimits(tb[OCI_PROCESS_RLIMITS])))
1637 return res;
1638
1639 if (tb[OCI_PROCESS_OOMSCOREADJ]) {
1640 opts.oom_score_adj = blobmsg_get_u32(tb[OCI_PROCESS_OOMSCOREADJ]);
1641 opts.set_oom_score_adj = true;
1642 }
1643
1644 return 0;
1645 }
1646
1647 enum {
1648 OCI_LINUX_NAMESPACE_TYPE,
1649 OCI_LINUX_NAMESPACE_PATH,
1650 __OCI_LINUX_NAMESPACE_MAX,
1651 };
1652
1653 static const struct blobmsg_policy oci_linux_namespace_policy[] = {
1654 [OCI_LINUX_NAMESPACE_TYPE] = { "type", BLOBMSG_TYPE_STRING },
1655 [OCI_LINUX_NAMESPACE_PATH] = { "path", BLOBMSG_TYPE_STRING },
1656 };
1657
/*
 * Map a namespace type name ("pid", "network", "mount", "ipc", "uts",
 * "user", "cgroup" and, where CLONE_NEWTIME is defined, "time") to its
 * CLONE_NEW* flag.
 *
 * Returns the flag, or 0 for an unknown name.
 */
static int resolve_nstype(char *type) {
	static const struct {
		const char *name;
		int flag;
	} nstypes[] = {
		{ "pid",     CLONE_NEWPID },
		{ "network", CLONE_NEWNET },
		{ "mount",   CLONE_NEWNS },
		{ "ipc",     CLONE_NEWIPC },
		{ "uts",     CLONE_NEWUTS },
		{ "user",    CLONE_NEWUSER },
		{ "cgroup",  CLONE_NEWCGROUP },
#ifdef CLONE_NEWTIME
		{ "time",    CLONE_NEWTIME },
#endif
	};
	size_t i;

	for (i = 0; i < sizeof(nstypes) / sizeof(nstypes[0]); ++i)
		if (!strcmp(nstypes[i].name, type))
			return nstypes[i].flag;

	return 0;
}
1680
1681 static int parseOCIlinuxns(struct blob_attr *msg)
1682 {
1683 struct blob_attr *tb[__OCI_LINUX_NAMESPACE_MAX];
1684 int nstype;
1685 int *setns;
1686 int fd;
1687
1688 blobmsg_parse(oci_linux_namespace_policy, __OCI_LINUX_NAMESPACE_MAX, tb, blobmsg_data(msg), blobmsg_len(msg));
1689
1690 if (!tb[OCI_LINUX_NAMESPACE_TYPE])
1691 return EINVAL;
1692
1693 nstype = resolve_nstype(blobmsg_get_string(tb[OCI_LINUX_NAMESPACE_TYPE]));
1694 if (!nstype)
1695 return EINVAL;
1696
1697 if (opts.namespace & nstype)
1698 return ENOTUNIQ;
1699
1700 setns = get_namespace_fd(nstype);
1701
1702 if (!setns)
1703 return EFAULT;
1704
1705 if (*setns != -1)
1706 return ENOTUNIQ;
1707
1708 if (tb[OCI_LINUX_NAMESPACE_PATH]) {
1709 DEBUG("opening existing %s namespace from path %s\n",
1710 blobmsg_get_string(tb[OCI_LINUX_NAMESPACE_TYPE]),
1711 blobmsg_get_string(tb[OCI_LINUX_NAMESPACE_PATH]));
1712
1713 fd = open(blobmsg_get_string(tb[OCI_LINUX_NAMESPACE_PATH]), O_RDONLY);
1714 if (fd == -1)
1715 return errno?:ESTALE;
1716
1717 if (ioctl(fd, NS_GET_NSTYPE) != nstype)
1718 return EINVAL;
1719
1720 DEBUG("opened existing %s namespace got filehandler %u\n",
1721 blobmsg_get_string(tb[OCI_LINUX_NAMESPACE_TYPE]),
1722 fd);
1723
1724 *setns = fd;
1725 } else {
1726 opts.namespace |= nstype;
1727 }
1728
1729 return 0;
1730 };
1731
1732
1733 enum {
1734 OCI_LINUX_UIDGIDMAP_CONTAINERID,
1735 OCI_LINUX_UIDGIDMAP_HOSTID,
1736 OCI_LINUX_UIDGIDMAP_SIZE,
1737 __OCI_LINUX_UIDGIDMAP_MAX,
1738 };
1739
1740 static const struct blobmsg_policy oci_linux_uidgidmap_policy[] = {
1741 [OCI_LINUX_UIDGIDMAP_CONTAINERID] = { "containerID", BLOBMSG_TYPE_INT32 },
1742 [OCI_LINUX_UIDGIDMAP_HOSTID] = { "hostID", BLOBMSG_TYPE_INT32 },
1743 [OCI_LINUX_UIDGIDMAP_SIZE] = { "size", BLOBMSG_TYPE_INT32 },
1744 };
1745
1746 static int parseOCIuidgidmappings(struct blob_attr *msg, bool is_gidmap)
1747 {
1748 const char *map_format = "%d %d %d\n";
1749 struct blob_attr *tb[__OCI_LINUX_UIDGIDMAP_MAX];
1750 struct blob_attr *cur;
1751 int rem, len;
1752 char **mappings;
1753 char *map, *curstr;
1754 unsigned int cnt = 0;
1755 size_t totallen = 0;
1756
1757 /* count number of mappings */
1758 blobmsg_for_each_attr(cur, msg, rem)
1759 cnt++;
1760
1761 if (!cnt)
1762 return 0;
1763
1764 /* allocate array for mappings */
1765 mappings = calloc(1 + cnt, sizeof(char*));
1766 if (!mappings)
1767 return ENOMEM;
1768
1769 mappings[cnt] = NULL;
1770
1771 cnt = 0;
1772 blobmsg_for_each_attr(cur, msg, rem) {
1773 blobmsg_parse(oci_linux_uidgidmap_policy, __OCI_LINUX_UIDGIDMAP_MAX, tb, blobmsg_data(cur), blobmsg_len(cur));
1774
1775 if (!tb[OCI_LINUX_UIDGIDMAP_CONTAINERID] ||
1776 !tb[OCI_LINUX_UIDGIDMAP_HOSTID] ||
1777 !tb[OCI_LINUX_UIDGIDMAP_SIZE])
1778 return EINVAL;
1779
1780 /* write mapping line into allocated string */
1781 len = asprintf(&mappings[cnt++], map_format,
1782 blobmsg_get_u32(tb[OCI_LINUX_UIDGIDMAP_CONTAINERID]),
1783 blobmsg_get_u32(tb[OCI_LINUX_UIDGIDMAP_HOSTID]),
1784 blobmsg_get_u32(tb[OCI_LINUX_UIDGIDMAP_SIZE]));
1785
1786 if (len < 0)
1787 return ENOMEM;
1788
1789 totallen += len;
1790 }
1791
1792 /* allocate combined mapping string */
1793 map = calloc(1 + totallen, sizeof(char));
1794 if (!map)
1795 return ENOMEM;
1796
1797 map[0] = '\0';
1798
1799 /* concatenate mapping strings into combined string */
1800 curstr = mappings[0];
1801 while (curstr) {
1802 strcat(map, curstr);
1803 free(curstr++);
1804 }
1805 free(mappings);
1806
1807 if (is_gidmap)
1808 opts.gidmap = map;
1809 else
1810 opts.uidmap = map;
1811
1812 return 0;
1813 }
1814
1815 enum {
1816 OCI_DEVICES_TYPE,
1817 OCI_DEVICES_PATH,
1818 OCI_DEVICES_MAJOR,
1819 OCI_DEVICES_MINOR,
1820 OCI_DEVICES_FILEMODE,
1821 OCI_DEVICES_UID,
1822 OCI_DEVICES_GID,
1823 __OCI_DEVICES_MAX,
1824 };
1825
1826 static const struct blobmsg_policy oci_devices_policy[] = {
1827 [OCI_DEVICES_TYPE] = { "type", BLOBMSG_TYPE_STRING },
1828 [OCI_DEVICES_PATH] = { "path", BLOBMSG_TYPE_STRING },
1829 [OCI_DEVICES_MAJOR] = { "major", BLOBMSG_TYPE_INT32 },
1830 [OCI_DEVICES_MINOR] = { "minor", BLOBMSG_TYPE_INT32 },
1831 [OCI_DEVICES_FILEMODE] = { "fileMode", BLOBMSG_TYPE_INT32 },
1832 [OCI_DEVICES_UID] = { "uid", BLOBMSG_TYPE_INT32 },
1833 [OCI_DEVICES_GID] = { "uid", BLOBMSG_TYPE_INT32 },
1834 };
1835
/*
 * Map a one-letter device type to the matching file type bits:
 * "c" and "u" give S_IFCHR, "b" gives S_IFBLK, "p" gives S_IFIFO.
 *
 * Returns 0 for anything else.
 */
static mode_t resolve_devtype(char *tstr)
{
	/* only strings of exactly one character can match */
	if (tstr[0] == '\0' || tstr[1] != '\0')
		return 0;

	switch (tstr[0]) {
	case 'c':
	case 'u':
		return S_IFCHR;
	case 'b':
		return S_IFBLK;
	case 'p':
		return S_IFIFO;
	default:
		return 0;
	}
}
1848
1849 static int parseOCIdevices(struct blob_attr *msg)
1850 {
1851 struct blob_attr *tb[__OCI_DEVICES_MAX];
1852 struct blob_attr *cur;
1853 int rem;
1854 size_t cnt = 0;
1855 struct mknod_args *tmp;
1856
1857 blobmsg_for_each_attr(cur, msg, rem)
1858 ++cnt;
1859
1860 opts.devices = calloc(cnt + 1, sizeof(struct mknod_args *));
1861
1862 cnt = 0;
1863 blobmsg_for_each_attr(cur, msg, rem) {
1864 blobmsg_parse(oci_devices_policy, __OCI_DEVICES_MAX, tb, blobmsg_data(cur), blobmsg_len(cur));
1865 if (!tb[OCI_DEVICES_TYPE] ||
1866 !tb[OCI_DEVICES_PATH])
1867 return ENODATA;
1868
1869 tmp = calloc(1, sizeof(struct mknod_args));
1870 if (!tmp)
1871 return ENOMEM;
1872
1873 tmp->mode = resolve_devtype(blobmsg_get_string(tb[OCI_DEVICES_TYPE]));
1874 if (!tmp->mode)
1875 return EINVAL;
1876
1877 if (tmp->mode != S_IFIFO) {
1878 if (!tb[OCI_DEVICES_MAJOR] || !tb[OCI_DEVICES_MINOR])
1879 return ENODATA;
1880
1881 tmp->dev = makedev(blobmsg_get_u32(tb[OCI_DEVICES_MAJOR]),
1882 blobmsg_get_u32(tb[OCI_DEVICES_MINOR]));
1883 }
1884
1885 if (tb[OCI_DEVICES_FILEMODE]) {
1886 if (~(S_IRWXU|S_IRWXG|S_IRWXO) & blobmsg_get_u32(tb[OCI_DEVICES_FILEMODE]))
1887 return EINVAL;
1888
1889 tmp->mode |= blobmsg_get_u32(tb[OCI_DEVICES_FILEMODE]);
1890 } else {
1891 tmp->mode |= (S_IRUSR|S_IWUSR); /* 0600 */
1892 }
1893
1894 tmp->path = strdup(blobmsg_get_string(tb[OCI_DEVICES_PATH]));
1895
1896 if (tb[OCI_DEVICES_UID])
1897 tmp->uid = blobmsg_get_u32(tb[OCI_DEVICES_UID]);
1898 else
1899 tmp->uid = -1;
1900
1901 if (tb[OCI_DEVICES_GID])
1902 tmp->gid = blobmsg_get_u32(tb[OCI_DEVICES_GID]);
1903 else
1904 tmp->gid = -1;
1905
1906 DEBUG("read device %s (%s)\n", blobmsg_get_string(tb[OCI_DEVICES_PATH]), blobmsg_get_string(tb[OCI_DEVICES_TYPE]));
1907 opts.devices[cnt++] = tmp;
1908 }
1909
1910 opts.devices[cnt] = NULL;
1911
1912 return 0;
1913 }
1914
1915 enum {
1916 OCI_LINUX_RESOURCES,
1917 OCI_LINUX_SECCOMP,
1918 OCI_LINUX_SYSCTL,
1919 OCI_LINUX_NAMESPACES,
1920 OCI_LINUX_DEVICES,
1921 OCI_LINUX_UIDMAPPINGS,
1922 OCI_LINUX_GIDMAPPINGS,
1923 OCI_LINUX_MASKEDPATHS,
1924 OCI_LINUX_READONLYPATHS,
1925 OCI_LINUX_ROOTFSPROPAGATION,
1926 __OCI_LINUX_MAX,
1927 };
1928
1929 static const struct blobmsg_policy oci_linux_policy[] = {
1930 [OCI_LINUX_RESOURCES] = { "resources", BLOBMSG_TYPE_TABLE },
1931 [OCI_LINUX_SECCOMP] = { "seccomp", BLOBMSG_TYPE_TABLE },
1932 [OCI_LINUX_SYSCTL] = { "sysctl", BLOBMSG_TYPE_TABLE },
1933 [OCI_LINUX_NAMESPACES] = { "namespaces", BLOBMSG_TYPE_ARRAY },
1934 [OCI_LINUX_DEVICES] = { "devices", BLOBMSG_TYPE_ARRAY },
1935 [OCI_LINUX_UIDMAPPINGS] = { "uidMappings", BLOBMSG_TYPE_ARRAY },
1936 [OCI_LINUX_GIDMAPPINGS] = { "gidMappings", BLOBMSG_TYPE_ARRAY },
1937 [OCI_LINUX_MASKEDPATHS] = { "maskedPaths", BLOBMSG_TYPE_ARRAY },
1938 [OCI_LINUX_READONLYPATHS] = { "readonlyPaths", BLOBMSG_TYPE_ARRAY },
1939 [OCI_LINUX_ROOTFSPROPAGATION] = { "rootfsPropagation", BLOBMSG_TYPE_STRING },
1940 };
1941
1942 static int parseOCIsysctl(struct blob_attr *msg)
1943 {
1944 struct blob_attr *cur;
1945 int rem;
1946 char *tmp, *tc;
1947 size_t cnt = 0;
1948
1949 blobmsg_for_each_attr(cur, msg, rem) {
1950 if (!blobmsg_name(cur) || !blobmsg_get_string(cur))
1951 return EINVAL;
1952
1953 ++cnt;
1954 }
1955
1956 if (!cnt)
1957 return 0;
1958
1959 opts.sysctl = calloc(cnt + 1, sizeof(struct sysctl_val *));
1960 if (!opts.sysctl)
1961 return ENOMEM;
1962
1963 cnt = 0;
1964 blobmsg_for_each_attr(cur, msg, rem) {
1965 opts.sysctl[cnt] = malloc(sizeof(struct sysctl_val));
1966 if (!opts.sysctl[cnt])
1967 return ENOMEM;
1968
1969 /* replace '.' with '/' in entry name */
1970 tc = tmp = strdup(blobmsg_name(cur));
1971 while ((tc = strchr(tc, '.')))
1972 *tc = '/';
1973
1974 opts.sysctl[cnt]->value = strdup(blobmsg_get_string(cur));
1975 opts.sysctl[cnt]->entry = tmp;
1976
1977 ++cnt;
1978 }
1979
1980 opts.sysctl[cnt] = NULL;
1981
1982 return 0;
1983 }
1984
1985 static int parseOCIlinux(struct blob_attr *msg)
1986 {
1987 struct blob_attr *tb[__OCI_LINUX_MAX];
1988 struct blob_attr *cur;
1989 int rem;
1990 int res = 0;
1991
1992 blobmsg_parse(oci_linux_policy, __OCI_LINUX_MAX, tb, blobmsg_data(msg), blobmsg_len(msg));
1993
1994 if (tb[OCI_LINUX_NAMESPACES]) {
1995 blobmsg_for_each_attr(cur, tb[OCI_LINUX_NAMESPACES], rem) {
1996 res = parseOCIlinuxns(cur);
1997 if (res)
1998 return res;
1999 }
2000 }
2001
2002 if (tb[OCI_LINUX_UIDMAPPINGS]) {
2003 res = parseOCIuidgidmappings(tb[OCI_LINUX_GIDMAPPINGS], 0);
2004 if (res)
2005 return res;
2006 }
2007
2008 if (tb[OCI_LINUX_GIDMAPPINGS]) {
2009 res = parseOCIuidgidmappings(tb[OCI_LINUX_GIDMAPPINGS], 1);
2010 if (res)
2011 return res;
2012 }
2013
2014 if (tb[OCI_LINUX_READONLYPATHS]) {
2015 blobmsg_for_each_attr(cur, tb[OCI_LINUX_READONLYPATHS], rem) {
2016 res = add_mount(NULL, blobmsg_get_string(cur), NULL, MS_BIND | MS_REC | MS_RDONLY, NULL, 0);
2017 if (res)
2018 return res;
2019 }
2020 }
2021
2022 if (tb[OCI_LINUX_MASKEDPATHS]) {
2023 blobmsg_for_each_attr(cur, tb[OCI_LINUX_MASKEDPATHS], rem) {
2024 res = add_mount((void *)(-1), blobmsg_get_string(cur), NULL, 0, NULL, 1);
2025 if (res)
2026 return res;
2027 }
2028 }
2029
2030 if (tb[OCI_LINUX_SYSCTL]) {
2031 res = parseOCIsysctl(tb[OCI_LINUX_SYSCTL]);
2032 if (res)
2033 return res;
2034 }
2035
2036 if (tb[OCI_LINUX_SECCOMP]) {
2037 opts.ociseccomp = parseOCIlinuxseccomp(tb[OCI_LINUX_SECCOMP]);
2038 if (!opts.ociseccomp)
2039 return EINVAL;
2040 }
2041
2042 if (tb[OCI_LINUX_DEVICES]) {
2043 res = parseOCIdevices(tb[OCI_LINUX_DEVICES]);
2044 if (res)
2045 return res;
2046 }
2047
2048 return 0;
2049 }
2050
2051 enum {
2052 OCI_VERSION,
2053 OCI_HOSTNAME,
2054 OCI_PROCESS,
2055 OCI_ROOT,
2056 OCI_MOUNTS,
2057 OCI_HOOKS,
2058 OCI_LINUX,
2059 __OCI_MAX,
2060 };
2061
2062 static const struct blobmsg_policy oci_policy[] = {
2063 [OCI_VERSION] = { "ociVersion", BLOBMSG_TYPE_STRING },
2064 [OCI_HOSTNAME] = { "hostname", BLOBMSG_TYPE_STRING },
2065 [OCI_PROCESS] = { "process", BLOBMSG_TYPE_TABLE },
2066 [OCI_ROOT] = { "root", BLOBMSG_TYPE_TABLE },
2067 [OCI_MOUNTS] = { "mounts", BLOBMSG_TYPE_ARRAY },
2068 [OCI_HOOKS] = { "hooks", BLOBMSG_TYPE_TABLE },
2069 [OCI_LINUX] = { "linux", BLOBMSG_TYPE_TABLE },
2070 };
2071
2072 static int parseOCI(const char *jsonfile)
2073 {
2074 struct blob_attr *tb[__OCI_MAX];
2075 struct blob_attr *cur;
2076 int rem;
2077 int res;
2078
2079 blob_buf_init(&ocibuf, 0);
2080 if (!blobmsg_add_json_from_file(&ocibuf, jsonfile))
2081 return ENOENT;
2082
2083 blobmsg_parse(oci_policy, __OCI_MAX, tb, blob_data(ocibuf.head), blob_len(ocibuf.head));
2084
2085 if (!tb[OCI_VERSION])
2086 return ENOMSG;
2087
2088 if (strncmp("1.0", blobmsg_get_string(tb[OCI_VERSION]), 3)) {
2089 ERROR("unsupported ociVersion %s\n", blobmsg_get_string(tb[OCI_VERSION]));
2090 return ENOTSUP;
2091 }
2092
2093 if (tb[OCI_HOSTNAME])
2094 opts.hostname = strdup(blobmsg_get_string(tb[OCI_HOSTNAME]));
2095
2096 if (!tb[OCI_PROCESS])
2097 return ENODATA;
2098
2099 if ((res = parseOCIprocess(tb[OCI_PROCESS])))
2100 return res;
2101
2102 if (!tb[OCI_ROOT])
2103 return ENODATA;
2104
2105 if ((res = parseOCIroot(jsonfile, tb[OCI_ROOT])))
2106 return res;
2107
2108 if (!tb[OCI_MOUNTS])
2109 return ENODATA;
2110
2111 blobmsg_for_each_attr(cur, tb[OCI_MOUNTS], rem)
2112 if ((res = parseOCImount(cur)))
2113 return res;
2114
2115 if (tb[OCI_LINUX] && (res = parseOCIlinux(tb[OCI_LINUX])))
2116 return res;
2117
2118 if (tb[OCI_HOOKS] && (res = parseOCIhooks(tb[OCI_HOOKS])))
2119 return res;
2120
2121 blob_buf_free(&ocibuf);
2122
2123 return 0;
2124 }
2125
2126 static int set_oom_score_adj(void)
2127 {
2128 int f;
2129 char fname[32];
2130
2131 if (!opts.set_oom_score_adj)
2132 return 0;
2133
2134 snprintf(fname, sizeof(fname), "/proc/%u/oom_score_adj", jail_process.pid);
2135 f = open(fname, O_WRONLY | O_TRUNC);
2136 if (f == -1)
2137 return errno;
2138
2139 dprintf(f, "%d", opts.oom_score_adj);
2140 close(f);
2141
2142 return 0;
2143 }
2144
2145
2146 int main(int argc, char **argv)
2147 {
2148 sigset_t sigmask;
2149 uid_t uid = getuid();
2150 const char log[] = "/dev/log";
2151 const char ubus[] = "/var/run/ubus.sock";
2152 char *jsonfile = NULL;
2153 int i, ch;
2154 int pipes[4];
2155 char sig_buf[1];
2156 int netns_fd;
2157 int pidns_fd;
2158
2159 if (uid) {
2160 ERROR("not root, aborting: %m\n");
2161 return EXIT_FAILURE;
2162 }
2163
2164 umask(022);
2165 mount_list_init();
2166 init_library_search();
2167
2168 while ((ch = getopt(argc, argv, OPT_ARGS)) != -1) {
2169 switch (ch) {
2170 case 'd':
2171 debug = atoi(optarg);
2172 break;
2173 case 'p':
2174 opts.namespace |= CLONE_NEWNS;
2175 opts.procfs = 1;
2176 break;
2177 case 'o':
2178 opts.namespace |= CLONE_NEWNS;
2179 opts.ronly = 1;
2180 break;
2181 case 'f':
2182 opts.namespace |= CLONE_NEWUSER;
2183 break;
2184 case 'F':
2185 opts.namespace |= CLONE_NEWCGROUP;
2186 break;
2187 case 'R':
2188 opts.extroot = strdup(optarg);
2189 break;
2190 case 's':
2191 opts.namespace |= CLONE_NEWNS;
2192 opts.sysfs = 1;
2193 break;
2194 case 'S':
2195 opts.seccomp = optarg;
2196 add_mount_bind(optarg, 1, -1);
2197 break;
2198 case 'C':
2199 opts.capabilities = optarg;
2200 break;
2201 case 'c':
2202 opts.no_new_privs = 1;
2203 break;
2204 case 'n':
2205 opts.name = optarg;
2206 break;
2207 case 'N':
2208 opts.namespace |= CLONE_NEWNET;
2209 break;
2210 case 'h':
2211 opts.namespace |= CLONE_NEWUTS;
2212 opts.hostname = strdup(optarg);
2213 break;
2214 case 'r':
2215 opts.namespace |= CLONE_NEWNS;
2216 add_path_and_deps(optarg, 1, 0, 0);
2217 break;
2218 case 'w':
2219 opts.namespace |= CLONE_NEWNS;
2220 add_path_and_deps(optarg, 0, 0, 0);
2221 break;
2222 case 'u':
2223 opts.namespace |= CLONE_NEWNS;
2224 add_mount_bind(ubus, 0, -1);
2225 break;
2226 case 'l':
2227 opts.namespace |= CLONE_NEWNS;
2228 add_mount_bind(log, 0, -1);
2229 break;
2230 case 'U':
2231 opts.user = optarg;
2232 break;
2233 case 'G':
2234 opts.group = optarg;
2235 break;
2236 case 'O':
2237 opts.overlaydir = optarg;
2238 break;
2239 case 'T':
2240 opts.tmpoverlaysize = optarg;
2241 break;
2242 case 'E':
2243 opts.require_jail = 1;
2244 break;
2245 case 'y':
2246 opts.console = 1;
2247 break;
2248 case 'J':
2249 asprintf(&jsonfile, "%s/config.json", optarg);
2250 break;
2251 }
2252 }
2253
2254 if (opts.namespace && !jsonfile)
2255 opts.namespace |= CLONE_NEWIPC | CLONE_NEWPID;
2256
2257 /* those are filehandlers, so -1 indicates unused */
2258 opts.setns.pid = -1;
2259 opts.setns.net = -1;
2260 opts.setns.ns = -1;
2261 opts.setns.ipc = -1;
2262 opts.setns.uts = -1;
2263 opts.setns.user = -1;
2264 opts.setns.cgroup = -1;
2265 #ifdef CLONE_NEWTIME
2266 opts.setns.time = -1;
2267 #endif
2268
2269 if (jsonfile) {
2270 int ocires;
2271 ocires = parseOCI(jsonfile);
2272 free(jsonfile);
2273 if (ocires) {
2274 ERROR("parsing of OCI JSON spec has failed: %s (%d)\n", strerror(ocires), ocires);
2275 return ocires;
2276 }
2277 }
2278
2279 if (opts.tmpoverlaysize && strlen(opts.tmpoverlaysize) > 8) {
2280 ERROR("size parameter too long: \"%s\"\n", opts.tmpoverlaysize);
2281 return -1;
2282 }
2283
2284 /* no <binary> param found */
2285 if (!jsonfile && (argc - optind < 1)) {
2286 usage();
2287 return EXIT_FAILURE;
2288 }
2289 if (!(jsonfile||opts.namespace||opts.capabilities||opts.seccomp)) {
2290 ERROR("Not using namespaces, capabilities or seccomp !!!\n\n");
2291 usage();
2292 return EXIT_FAILURE;
2293 }
2294 DEBUG("Using namespaces(0x%08x), capabilities(%d), seccomp(%d)\n",
2295 opts.namespace,
2296 opts.capabilities != 0 || opts.capset.apply,
2297 opts.seccomp != 0 || opts.ociseccomp != 0);
2298
2299 if (!jsonfile) {
2300 /* allocate NULL-terminated array for argv */
2301 opts.jail_argv = calloc(1 + argc - optind, sizeof(char**));
2302 if (!opts.jail_argv)
2303 return EXIT_FAILURE;
2304
2305 for (size_t s = optind; s < argc; s++)
2306 opts.jail_argv[s - optind] = strdup(argv[s]);
2307
2308 if (opts.namespace & CLONE_NEWUSER)
2309 get_jail_user(&opts.pw_uid, &opts.pw_gid, &opts.gr_gid);
2310 }
2311
2312 if (!opts.extroot) {
2313 if (opts.namespace && add_path_and_deps(*opts.jail_argv, 1, -1, 0)) {
2314 ERROR("failed to load dependencies\n");
2315 return -1;
2316 }
2317 }
2318
2319 if (opts.namespace && opts.seccomp && add_path_and_deps("libpreload-seccomp.so", 1, -1, 1)) {
2320 ERROR("failed to load libpreload-seccomp.so\n");
2321 opts.seccomp = 0;
2322 if (opts.require_jail)
2323 return -1;
2324 }
2325
2326 if (apply_rlimits()) {
2327 ERROR("error applying resource limits\n");
2328 exit(EXIT_FAILURE);
2329 }
2330
2331 if (opts.name)
2332 prctl(PR_SET_NAME, opts.name, NULL, NULL, NULL);
2333
2334 sigfillset(&sigmask);
2335 for (i = 0; i < _NSIG; i++) {
2336 struct sigaction s = { 0 };
2337
2338 if (!sigismember(&sigmask, i))
2339 continue;
2340 if ((i == SIGCHLD) || (i == SIGPIPE) || (i == SIGSEGV))
2341 continue;
2342
2343 s.sa_handler = jail_handle_signal;
2344 sigaction(i, &s, NULL);
2345 }
2346
2347 if (pipe(&pipes[0]) < 0 || pipe(&pipes[2]) < 0)
2348 return -1;
2349
2350 if (has_namespaces()) {
2351 if (opts.namespace & CLONE_NEWNS) {
2352 if (!opts.extroot && (opts.user || opts.group)) {
2353 add_mount_bind("/etc/passwd", 0, -1);
2354 add_mount_bind("/etc/group", 0, -1);
2355 }
2356
2357 #if defined(__GLIBC__)
2358 if (!opts.extroot)
2359 add_mount_bind("/etc/nsswitch.conf", 0, -1);
2360 #endif
2361
2362 if (!(opts.namespace & CLONE_NEWNET)) {
2363 add_mount_bind("/etc/resolv.conf", 0, -1);
2364 } else if (opts.setns.net == -1) {
2365 char hostdir[PATH_MAX];
2366
2367 snprintf(hostdir, PATH_MAX, "/tmp/resolv.conf-%s.d", opts.name);
2368 mkdir_p(hostdir, 0755);
2369 add_mount(hostdir, "/dev/resolv.conf.d", NULL, MS_BIND | MS_NOEXEC | MS_NOATIME | MS_NOSUID | MS_NODEV | MS_RDONLY, NULL, -1);
2370 }
2371
2372 /* default mounts */
2373 add_mount(NULL, "/dev", "tmpfs", MS_NOATIME | MS_NOEXEC | MS_NOSUID, "size=1M", -1);
2374 add_mount(NULL, "/dev/pts", "devpts", MS_NOATIME | MS_NOEXEC | MS_NOSUID, "newinstance,ptmxmode=0666,mode=0620,gid=5", 0);
2375
2376 if (opts.procfs || jsonfile) {
2377 add_mount("proc", "/proc", "proc", MS_NOATIME | MS_NODEV | MS_NOEXEC | MS_NOSUID, NULL, -1);
2378
2379 /*
2380 * hack to make /proc/sys/net read-write while the rest of /proc/sys is read-only
2381 * which cannot be expressed with OCI spec, but happends to be very useful.
2382 * Only apply it if '/proc/sys' is not already listed as mount, maskedPath or
2383 * readonlyPath.
2384 * If not running in a new network namespace, only make /proc/sys read-only.
2385 * If running in a new network namespace, temporarily stash (ie. mount-bind)
2386 * /proc/sys/net into (totally unrelated, but surely existing) /proc/self/net.
2387 * Then we mount-bind /proc/sys read-only and then mount-move /proc/self/net into
2388 * /proc/sys/net.
2389 * This works because mounts are executed in incrementing strcmp() order and
2390 * /proc/self/net appears there before /proc/sys/net and hence the operation
2391 * succeeds as the bind-mount of /proc/self/net is performed first and then
2392 * move-mount of /proc/sys/net follows because 'e' precedes 'y' in the ASCII
2393 * table (and in the alphabet).
2394 */
2395 if (!add_mount(NULL, "/proc/sys", NULL, MS_BIND | MS_RDONLY, NULL, -1))
2396 if (opts.namespace & CLONE_NEWNET)
2397 if (!add_mount_inner("/proc/self/net", "/proc/sys/net", NULL, MS_MOVE, NULL, -1))
2398 add_mount_inner("/proc/sys/net", "/proc/self/net", NULL, MS_BIND, NULL, -1);
2399
2400 }
2401 if (opts.sysfs || jsonfile)
2402 add_mount("sysfs", "/sys", "sysfs", MS_NOATIME | MS_NODEV | MS_NOEXEC | MS_NOSUID | MS_RDONLY, NULL, -1);
2403
2404 if (jsonfile)
2405 add_mount("shm", "/dev/shm", "tmpfs", MS_NOSUID | MS_NOEXEC | MS_NODEV, "mode=1777", -1);
2406
2407 }
2408
2409 if (opts.setns.pid != -1) {
2410 pidns_fd = pidns_open_pid(getpid());
2411 setns_open(CLONE_NEWPID);
2412 } else {
2413 pidns_fd = -1;
2414 }
2415
2416 jail_process.pid = clone(exec_jail, child_stack + STACK_SIZE, SIGCHLD | opts.namespace, &pipes);
2417 } else {
2418 jail_process.pid = fork();
2419 }
2420
2421 if (jail_process.pid > 0) {
2422 /* parent process */
2423 jail_running = 1;
2424 seteuid(0);
2425 if (pidns_fd != -1) {
2426 setns(pidns_fd, CLONE_NEWPID);
2427 close(pidns_fd);
2428 }
2429 if (opts.setns.net != -1)
2430 close(opts.setns.net);
2431 if (opts.setns.ns != -1)
2432 close(opts.setns.ns);
2433 if (opts.setns.ipc != -1)
2434 close(opts.setns.ipc);
2435 if (opts.setns.uts != -1)
2436 close(opts.setns.uts);
2437 if (opts.setns.user != -1)
2438 close(opts.setns.user);
2439 if (opts.setns.cgroup != -1)
2440 close(opts.setns.cgroup);
2441 #ifdef CLONE_NEWTIME
2442 if (opts.setns.time != -1)
2443 close(opts.setns.time);
2444 #endif
2445 close(pipes[1]);
2446 close(pipes[2]);
2447 run_hooks(opts.hooks.createRuntime);
2448 if (read(pipes[0], sig_buf, 1) < 1) {
2449 ERROR("can't read from child\n");
2450 return -1;
2451 }
2452 close(pipes[0]);
2453 set_oom_score_adj();
2454
2455 if (opts.namespace & CLONE_NEWUSER) {
2456 if (write_setgroups(jail_process.pid, true)) {
2457 ERROR("can't write setgroups\n");
2458 return -1;
2459 }
2460 if (!opts.uidmap) {
2461 bool has_gr = (opts.gr_gid != -1);
2462 if (opts.pw_uid != -1) {
2463 write_single_uid_gid_map(jail_process.pid, 0, opts.pw_uid);
2464 write_single_uid_gid_map(jail_process.pid, 1, has_gr?opts.gr_gid:opts.pw_gid);
2465 } else {
2466 write_single_uid_gid_map(jail_process.pid, 0, 65534);
2467 write_single_uid_gid_map(jail_process.pid, 1, has_gr?opts.gr_gid:65534);
2468 }
2469 } else {
2470 write_uid_gid_map(jail_process.pid, 0, opts.uidmap);
2471 if (opts.gidmap)
2472 write_uid_gid_map(jail_process.pid, 1, opts.gidmap);
2473 }
2474 }
2475
2476 if (opts.namespace & CLONE_NEWNET) {
2477 if (!opts.name) {
2478 ERROR("netns needs a named jail\n");
2479 return -1;
2480 }
2481 netns_fd = netns_open_pid(jail_process.pid);
2482 netns_updown(jail_process.pid, true);
2483 }
2484
2485 sig_buf[0] = 'O';
2486 if (write(pipes[3], sig_buf, 1) < 0) {
2487 ERROR("can't write to child\n");
2488 return -1;
2489 }
2490 close(pipes[3]);
2491 run_hooks(opts.hooks.poststart);
2492
2493 uloop_init();
2494 uloop_process_add(&jail_process);
2495 uloop_run();
2496 if (jail_running) {
2497 DEBUG("uloop interrupted, killing jail process\n");
2498 kill(jail_process.pid, SIGTERM);
2499 uloop_timeout_set(&jail_process_timeout, 1000);
2500 uloop_run();
2501 }
2502 uloop_done();
2503 if (opts.namespace & CLONE_NEWNET) {
2504 setns(netns_fd, CLONE_NEWNET);
2505 netns_updown(getpid(), false);
2506 close(netns_fd);
2507 }
2508 run_hooks(opts.hooks.poststop);
2509 free_opts(true);
2510 return jail_return_code;
2511 } else if (jail_process.pid == 0) {
2512 /* fork child process */
2513 return exec_jail(&pipes);
2514 } else {
2515 ERROR("failed to clone/fork: %m\n");
2516 return EXIT_FAILURE;
2517 }
2518 }