seccomp.c (12597B)
1 /*
2 * Copyright (c) 2019-2022 Joris Vink <joris@coders.se>
3 *
4 * Permission to use, copy, modify, and distribute this software for any
5 * purpose with or without fee is hereby granted, provided that the above
6 * copyright notice and this permission notice appear in all copies.
7 *
8 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
9 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
10 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
11 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
12 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
13 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
14 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
15 */
16
17 #include <sys/param.h>
18 #include <sys/mman.h>
19 #include <sys/epoll.h>
20 #include <sys/ptrace.h>
21 #include <sys/prctl.h>
22 #include <sys/user.h>
23 #include <sys/syscall.h>
24
25 #include <linux/ptrace.h>
26 #include <linux/seccomp.h>
27 #include <linux/filter.h>
28 #include <linux/audit.h>
29
30 #include <stddef.h>
31 #include <sched.h>
32
33 #include "kore.h"
34 #include "seccomp.h"
35 #include "platform.h"
36
37 #if defined(KORE_USE_PYTHON)
38 #include "python_api.h"
39 #endif
40
41 #if !defined(SECCOMP_KILL_POLICY)
42 #define SECCOMP_KILL_POLICY SECCOMP_RET_KILL
43 #endif
44
45 /*
46 * The bare minimum to be able to run kore. These are added last and can
47 * be overwritten by a filter program that is added before hand.
48 */
49 static struct sock_filter filter_kore[] = {
50 /* Deny these, but with EACCESS instead of dying. */
51 KORE_SYSCALL_DENY(ioctl, EACCES),
52
53 /* File related. */
54 #if defined(SYS_open)
55 KORE_SYSCALL_ALLOW(open),
56 #endif
57 KORE_SYSCALL_ALLOW(read),
58 #if defined(SYS_stat)
59 KORE_SYSCALL_ALLOW(stat),
60 #endif
61 #if defined(SYS_stat64)
62 KORE_SYSCALL_ALLOW(stat64),
63 #endif
64 #if defined(SYS_lstat)
65 KORE_SYSCALL_ALLOW(lstat),
66 #endif
67 KORE_SYSCALL_ALLOW(fstat),
68 #if defined(SYS_fstat64)
69 KORE_SYSCALL_ALLOW(fstat64),
70 #endif
71 #if defined(SYS_newfstatat)
72 KORE_SYSCALL_ALLOW(newfstatat),
73 #endif
74 KORE_SYSCALL_ALLOW(write),
75 KORE_SYSCALL_ALLOW(fcntl),
76 #if defined(SYS_fcntl64)
77 KORE_SYSCALL_ALLOW(fcntl64),
78 #endif
79 KORE_SYSCALL_ALLOW(lseek),
80 #if defined(SYS__llseek)
81 KORE_SYSCALL_ALLOW(_llseek),
82 #endif
83 KORE_SYSCALL_ALLOW(close),
84 KORE_SYSCALL_ALLOW(openat),
85 #if defined(SYS_access)
86 KORE_SYSCALL_ALLOW(access),
87 #endif
88 KORE_SYSCALL_ALLOW(writev),
89 KORE_SYSCALL_ALLOW(getcwd),
90 #if defined(SYS_unlink)
91 KORE_SYSCALL_ALLOW(unlink),
92 #endif
93 #if defined(SYS_readlink)
94 KORE_SYSCALL_ALLOW(readlink),
95 #endif
96 #if defined(SYS_readlinkat)
97 KORE_SYSCALL_ALLOW(readlinkat),
98 #endif
99
100 /* Process related. */
101 KORE_SYSCALL_ALLOW(exit),
102 KORE_SYSCALL_ALLOW(kill),
103 KORE_SYSCALL_ALLOW(getpid),
104 KORE_SYSCALL_ALLOW(getuid),
105 KORE_SYSCALL_ALLOW(geteuid),
106 KORE_SYSCALL_ALLOW(exit_group),
107 KORE_SYSCALL_ALLOW(nanosleep),
108 #if defined(SYS_clock_gettime64)
109 KORE_SYSCALL_ALLOW(clock_gettime64),
110 #endif
111 #if defined(SYS_clock_nanosleep)
112 KORE_SYSCALL_ALLOW(clock_nanosleep),
113 #endif
114 #if defined(SYS_sigreturn)
115 KORE_SYSCALL_ALLOW(sigreturn),
116 #endif
117
118 /* Memory related. */
119 KORE_SYSCALL_ALLOW(brk),
120 KORE_SYSCALL_ALLOW(munmap),
121
122 /* Deny mmap/mprotect calls with PROT_EXEC/PROT_WRITE protection. */
123 #if defined(SYS_mmap)
124 KORE_SYSCALL_DENY_WITH_FLAG(mmap, 2, PROT_EXEC | PROT_WRITE, EINVAL),
125 #endif
126 #if defined(SYS_mmap2)
127 KORE_SYSCALL_DENY_WITH_FLAG(mmap2, 2, PROT_EXEC | PROT_WRITE, EINVAL),
128 #endif
129 KORE_SYSCALL_DENY_WITH_FLAG(mprotect, 2, PROT_EXEC, EINVAL),
130
131 #if defined(SYS_mmap)
132 KORE_SYSCALL_ALLOW(mmap),
133 #endif
134 #if defined(SYS_mmap2)
135 KORE_SYSCALL_ALLOW(mmap2),
136 #endif
137 KORE_SYSCALL_ALLOW(madvise),
138 KORE_SYSCALL_ALLOW(mprotect),
139
140 /* Net related. */
141 #if defined(SYS_poll)
142 KORE_SYSCALL_ALLOW(poll),
143 #endif
144 KORE_SYSCALL_ALLOW(ppoll),
145 #if defined(SYS_send)
146 KORE_SYSCALL_ALLOW(send),
147 #endif
148 KORE_SYSCALL_ALLOW(sendto),
149 KORE_SYSCALL_ALLOW(accept),
150 KORE_SYSCALL_ALLOW(sendfile),
151 #if defined(SYS_recv)
152 KORE_SYSCALL_ALLOW(recv),
153 #endif
154 KORE_SYSCALL_ALLOW(recvfrom),
155 KORE_SYSCALL_ALLOW(epoll_ctl),
156 KORE_SYSCALL_ALLOW(setsockopt),
157 #if defined(SYS_epoll_wait)
158 KORE_SYSCALL_ALLOW(epoll_wait),
159 #endif
160 KORE_SYSCALL_ALLOW(epoll_pwait),
161
162 /* Signal related. */
163 KORE_SYSCALL_ALLOW(sigaltstack),
164 KORE_SYSCALL_ALLOW(rt_sigreturn),
165 KORE_SYSCALL_ALLOW(rt_sigaction),
166 KORE_SYSCALL_ALLOW(rt_sigprocmask),
167
168 /* "Other" without clear category. */
169 KORE_SYSCALL_ALLOW(futex),
170 #if defined(SYS_clock_gettime)
171 KORE_SYSCALL_ALLOW(clock_gettime),
172 #endif
173
174 #if defined(__NR_getrandom)
175 KORE_SYSCALL_ALLOW(getrandom),
176 #endif
177 };
178
/*
 * bpf program prologue: verify the audit architecture before anything
 * else so syscall numbers from a foreign ABI cannot slip past the rules.
 */
static struct sock_filter filter_prologue[] = {
	/* Load arch member into accumulator (A) (arch is __u32). */
	KORE_BPF_LOAD(arch, 0),

	/* Compare accumulator against constant, if false jump over kill. */
	KORE_BPF_CMP(SECCOMP_AUDIT_ARCH, 1, 0),
	KORE_BPF_RET(SECCOMP_RET_KILL),

	/* Load the system call number into the accumulator. */
	KORE_BPF_LOAD(nr, 0),
};
191
/*
 * bpf program epilogue: the default policy applied when no earlier rule
 * matched the system call. SECCOMP_KILL_POLICY defaults to
 * SECCOMP_RET_KILL; kore_seccomp_enable() rewrites the .k member to
 * SECCOMP_RET_TRACE when seccomp tracing is enabled.
 */
static struct sock_filter filter_epilogue[] = {
	/* Return hit if no system calls matched our list. */
	BPF_STMT(BPF_RET+BPF_K, SECCOMP_KILL_POLICY)
};
197
static struct sock_filter	*seccomp_filter_update(struct sock_filter *,
				    const char *, size_t);

/* Instruction counts for the fixed prologue / epilogue programs. */
#define filter_prologue_len	KORE_FILTER_LEN(filter_prologue)
#define filter_epilogue_len	KORE_FILTER_LEN(filter_epilogue)

static void	seccomp_register_violation(pid_t);
205
/*
 * A registered seccomp filter program. Note that prog is stored by
 * reference, not copied: callers must keep the instructions alive until
 * the filters are installed (or dropped).
 */
struct filter {
	char			*name;		/* unique name (heap) */
	struct sock_filter	*prog;		/* BPF instructions (not owned) */
	size_t			instructions;	/* entries in prog */
	TAILQ_ENTRY(filter)	list;
};
212
/* All registered filters, concatenated in list order at enable time. */
static TAILQ_HEAD(, filter)	filters;

/*
 * While a user seccomp hook runs this points at the first pre-existing
 * filter so that kore_seccomp_filter() inserts user filters before it,
 * letting them override the built-in rules.
 */
static struct filter		*ufilter = NULL;

/*
 * If enabled will instruct the parent process to ptrace its children and
 * log any seccomp SECCOMP_RET_TRACE rule.
 */
int	kore_seccomp_tracing = 0;
221
/* Initialize the (empty) list of seccomp filters. */
void
kore_seccomp_init(void)
{
	TAILQ_INIT(&filters);
}
227
228 void
229 kore_seccomp_drop(void)
230 {
231 struct filter *filter;
232
233 while ((filter = TAILQ_FIRST(&filters)) != NULL) {
234 if (!kore_quiet) {
235 kore_log(LOG_INFO,
236 "seccomp filter '%s' dropped", filter->name);
237 }
238 TAILQ_REMOVE(&filters, filter, list);
239 kore_free(filter->name);
240 kore_free(filter);
241 }
242
243 TAILQ_INIT(&filters);
244 }
245
246 void
247 kore_seccomp_enable(void)
248 {
249 struct sock_filter *sf;
250 struct sock_fprog prog;
251 struct kore_runtime_call *rcall;
252 struct filter *filter;
253 size_t prog_len, off, i;
254
255 /*
256 * If kore_seccomp_tracing is turned on, set the default policy to
257 * SECCOMP_RET_TRACE so we can log the system calls.
258 */
259 if (kore_seccomp_tracing) {
260 filter_epilogue[0].k = SECCOMP_RET_TRACE;
261 kore_log(LOG_NOTICE, "seccomp tracing enabled");
262 }
263
264 #if defined(KORE_USE_PYTHON)
265 ufilter = TAILQ_FIRST(&filters);
266 kore_python_seccomp_hook("koreapp.seccomp");
267 ufilter = NULL;
268 #endif
269
270 /* Allow application to add its own filters. */
271 if ((rcall = kore_runtime_getcall("kore_seccomp_hook")) != NULL) {
272 ufilter = TAILQ_FIRST(&filters);
273 kore_runtime_execute(rcall);
274 kore_free(rcall);
275 ufilter = NULL;
276 }
277
278 if (worker->id != KORE_WORKER_KEYMGR) {
279 /* Add worker required syscalls. */
280 kore_seccomp_filter("worker", filter_kore,
281 KORE_FILTER_LEN(filter_kore));
282 }
283
284 /* Start with the prologue. */
285 prog_len = filter_prologue_len;
286
287 /* Now account for all enabled filters. */
288 TAILQ_FOREACH(filter, &filters, list)
289 prog_len += filter->instructions;
290
291 /* Finally add the epilogue. */
292 prog_len += filter_epilogue_len;
293
294 /* Build the entire bpf program now. */
295 if ((sf = calloc(prog_len, sizeof(*sf))) == NULL)
296 fatalx("calloc");
297
298 off = 0;
299 for (i = 0; i < filter_prologue_len; i++)
300 sf[off++] = filter_prologue[i];
301
302 TAILQ_FOREACH(filter, &filters, list) {
303 for (i = 0; i < filter->instructions; i++)
304 sf[off++] = filter->prog[i];
305 }
306
307 for (i = 0; i < filter_epilogue_len; i++)
308 sf[off++] = filter_epilogue[i];
309
310 /* Lock and load it. */
311 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) == -1)
312 fatalx("prctl: %s", errno_s);
313
314 prog.filter = sf;
315 prog.len = prog_len;
316
317 if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog) == -1)
318 fatalx("prctl: %s", errno_s);
319
320 #if defined(KORE_USE_PYTHON)
321 kore_python_seccomp_cleanup();
322 #endif
323 }
324
325 int
326 kore_seccomp_filter(const char *name, void *prog, size_t len)
327 {
328 struct filter *filter;
329
330 TAILQ_FOREACH(filter, &filters, list) {
331 if (!strcmp(filter->name, name))
332 return (KORE_RESULT_ERROR);
333 }
334
335 filter = kore_calloc(1, sizeof(*filter));
336
337 filter->prog = prog;
338 filter->instructions = len;
339 filter->name = kore_strdup(name);
340
341 if (ufilter) {
342 TAILQ_INSERT_BEFORE(ufilter, filter, list);
343 } else {
344 TAILQ_INSERT_TAIL(&filters, filter, list);
345 }
346
347 return (KORE_RESULT_OK);
348 }
349
350 void
351 kore_seccomp_traceme(void)
352 {
353 if (kore_seccomp_tracing == 0)
354 return;
355
356 if (ptrace(PTRACE_TRACEME, 0, NULL, NULL) == -1)
357 fatalx("ptrace: %s", errno_s);
358 if (kill(worker->pid, SIGSTOP) == -1)
359 fatalx("kill: %s", errno_s);
360 }
361
362 int
363 kore_seccomp_trace(pid_t pid, int status)
364 {
365 int evt;
366
367 if (kore_seccomp_tracing == 0)
368 return (KORE_RESULT_ERROR);
369
370 if (WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP) {
371 if (ptrace(PTRACE_SETOPTIONS, pid, NULL,
372 PTRACE_O_TRACESECCOMP | PTRACE_O_TRACECLONE |
373 PTRACE_O_TRACEFORK) == -1)
374 fatal("ptrace: %s", errno_s);
375 if (ptrace(PTRACE_CONT, pid, NULL, NULL) == -1)
376 fatal("ptrace: %s", errno_s);
377 return (KORE_RESULT_OK);
378 }
379
380 if (WIFSTOPPED(status) && WSTOPSIG(status) == SIGTRAP) {
381 evt = status >> 8;
382 if (evt == (SIGTRAP | (PTRACE_EVENT_SECCOMP << 8)))
383 seccomp_register_violation(pid);
384 if (ptrace(PTRACE_CONT, pid, NULL, NULL) == -1)
385 fatal("ptrace: %s", errno_s);
386 return (KORE_RESULT_OK);
387 }
388
389 if (WIFSTOPPED(status)) {
390 if (ptrace(PTRACE_CONT, pid, NULL, WSTOPSIG(status)) == -1)
391 fatal("ptrace: %s", errno_s);
392 return (KORE_RESULT_OK);
393 }
394
395 return (KORE_RESULT_ERROR);
396 }
397
398 int
399 kore_seccomp_syscall_resolve(const char *name)
400 {
401 int i;
402
403 for (i = 0; kore_syscall_map[i].name != NULL; i++) {
404 if (!strcmp(name, kore_syscall_map[i].name))
405 return (kore_syscall_map[i].nr);
406 }
407
408 return (-1);
409 }
410
411 const char *
412 kore_seccomp_syscall_name(long sysnr)
413 {
414 int i;
415
416 for (i = 0; kore_syscall_map[i].name != NULL; i++) {
417 if (kore_syscall_map[i].nr == sysnr)
418 return (kore_syscall_map[i].name);
419 }
420
421 return ("unknown");
422 }
423
424 struct sock_filter *
425 kore_seccomp_syscall_filter(const char *name, int action)
426 {
427 struct sock_filter filter[] = {
428 KORE_SYSCALL_FILTER(exit, action),
429 KORE_BPF_GUARD
430 };
431
432 return (seccomp_filter_update(filter, name, KORE_FILTER_LEN(filter)));
433 }
434
435 struct sock_filter *
436 kore_seccomp_syscall_arg(const char *name, int action, int arg, int value)
437 {
438 struct sock_filter filter[] = {
439 KORE_SYSCALL_ARG(exit, arg, value, action),
440 KORE_BPF_GUARD
441 };
442
443 return (seccomp_filter_update(filter, name, KORE_FILTER_LEN(filter)));
444 }
445
446 struct sock_filter *
447 kore_seccomp_syscall_mask(const char *name, int action, int arg, int value)
448 {
449 struct sock_filter filter[] = {
450 KORE_SYSCALL_MASK(exit, arg, value, action),
451 KORE_BPF_GUARD
452 };
453
454 return (seccomp_filter_update(filter, name, KORE_FILTER_LEN(filter)));
455 }
456
457 struct sock_filter *
458 kore_seccomp_syscall_flag(const char *name, int action, int arg, int value)
459 {
460 struct sock_filter filter[] = {
461 KORE_SYSCALL_WITH_FLAG(exit, arg, value, action),
462 KORE_BPF_GUARD
463 };
464
465 return (seccomp_filter_update(filter, name, KORE_FILTER_LEN(filter)));
466 }
467
468 static void
469 seccomp_register_violation(pid_t pid)
470 {
471 int idx;
472 struct kore_worker *kw;
473 struct iovec iov;
474 #if defined(__arm__)
475 struct pt_regs regs;
476 #else
477 struct user_regs_struct regs;
478 #endif
479 long sysnr;
480 const char *name;
481
482 iov.iov_base = ®s;
483 iov.iov_len = sizeof(regs);
484
485 if (ptrace(PTRACE_GETREGSET, pid, 1, &iov) == -1)
486 fatal("ptrace: %s", errno_s);
487
488 #if SECCOMP_AUDIT_ARCH == AUDIT_ARCH_X86_64
489 sysnr = regs.orig_rax;
490 #elif SECCOMP_AUDIT_ARCH == AUDIT_ARCH_AARCH64
491 sysnr = regs.regs[8];
492 #elif SECCOMP_AUDIT_ARCH == AUDIT_ARCH_ARM
493 sysnr = regs.uregs[7];
494 #else
495 #error "platform not supported"
496 #endif
497
498 name = NULL;
499 for (idx = 0; idx < worker_count; idx++) {
500 kw = kore_worker_data(idx);
501 if (kw->pid == pid) {
502 name = kore_worker_name(kw->id);
503 break;
504 }
505 }
506
507 if (name == NULL)
508 name = "<child>";
509
510 kore_log(LOG_INFO, "seccomp violation, %s pid=%d, syscall=%ld:%s",
511 name, pid, sysnr, kore_seccomp_syscall_name(sysnr));
512 }
513
/*
 * Duplicate a template filter of elm instructions onto the heap and
 * patch its first instruction's constant to the syscall number that
 * name resolves to. Returns NULL when the name is unknown; otherwise
 * the caller owns the returned allocation.
 */
static struct sock_filter *
seccomp_filter_update(struct sock_filter *filter, const char *name, size_t elm)
{
	struct sock_filter	*copy;
	int			sysnr;

	sysnr = kore_seccomp_syscall_resolve(name);
	if (sysnr == -1)
		return (NULL);

	copy = kore_calloc(elm, sizeof(*copy));
	memcpy(copy, filter, elm * sizeof(*copy));

	/* Update the syscall number to the one specified. */
	copy[0].k = sysnr;

	return (copy);
}