seccomp.c (12665B)
1 /*
2 * Copyright (c) 2019-2022 Joris Vink <joris@coders.se>
3 *
4 * Permission to use, copy, modify, and distribute this software for any
5 * purpose with or without fee is hereby granted, provided that the above
6 * copyright notice and this permission notice appear in all copies.
7 *
8 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
9 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
10 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
11 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
12 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
13 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
14 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
15 */
16
17 #include <sys/param.h>
18 #include <sys/mman.h>
19 #include <sys/epoll.h>
20 #include <sys/ptrace.h>
21 #include <sys/prctl.h>
22 #include <sys/user.h>
23 #include <sys/syscall.h>
24
25 #include <linux/ptrace.h>
26 #include <linux/seccomp.h>
27 #include <linux/filter.h>
28 #include <linux/audit.h>
29
30 #include <stddef.h>
31 #include <sched.h>
32
33 #include "kore.h"
34 #include "seccomp.h"
35 #include "platform.h"
36
37 #if defined(KORE_USE_PYTHON)
38 #include "python_api.h"
39 #endif
40
41 #if !defined(SECCOMP_KILL_POLICY)
42 #define SECCOMP_KILL_POLICY SECCOMP_RET_KILL
43 #endif
44
/*
 * The bare minimum to be able to run kore. These are added last and can
 * be overwritten by a filter program that is added before hand.
 *
 * Several syscalls exist only on some architectures / libc versions,
 * hence the SYS_* guards around individual entries.
 */
static struct sock_filter filter_kore[] = {
	/* Deny these, but with EACCESS instead of dying. */
	KORE_SYSCALL_DENY(ioctl, EACCES),

	/* File related. */
#if defined(SYS_open)
	KORE_SYSCALL_ALLOW(open),
#endif
	KORE_SYSCALL_ALLOW(read),
#if defined(SYS_stat)
	KORE_SYSCALL_ALLOW(stat),
#endif
#if defined(SYS_stat64)
	KORE_SYSCALL_ALLOW(stat64),
#endif
#if defined(SYS_lstat)
	KORE_SYSCALL_ALLOW(lstat),
#endif
	KORE_SYSCALL_ALLOW(fstat),
#if defined(SYS_fstat64)
	KORE_SYSCALL_ALLOW(fstat64),
#endif
#if defined(SYS_newfstatat)
	KORE_SYSCALL_ALLOW(newfstatat),
#endif
#if defined(SYS_faccessat2)
	KORE_SYSCALL_ALLOW(faccessat2),
#endif
	KORE_SYSCALL_ALLOW(write),
	KORE_SYSCALL_ALLOW(fcntl),
#if defined(SYS_fcntl64)
	KORE_SYSCALL_ALLOW(fcntl64),
#endif
	KORE_SYSCALL_ALLOW(lseek),
#if defined(SYS__llseek)
	KORE_SYSCALL_ALLOW(_llseek),
#endif
	KORE_SYSCALL_ALLOW(close),
	KORE_SYSCALL_ALLOW(openat),
#if defined(SYS_access)
	KORE_SYSCALL_ALLOW(access),
#endif
	KORE_SYSCALL_ALLOW(writev),
	KORE_SYSCALL_ALLOW(getcwd),
#if defined(SYS_unlink)
	KORE_SYSCALL_ALLOW(unlink),
#endif
#if defined(SYS_readlink)
	KORE_SYSCALL_ALLOW(readlink),
#endif
#if defined(SYS_readlinkat)
	KORE_SYSCALL_ALLOW(readlinkat),
#endif

	/* Process related. */
	KORE_SYSCALL_ALLOW(exit),
	KORE_SYSCALL_ALLOW(kill),
	KORE_SYSCALL_ALLOW(getpid),
	KORE_SYSCALL_ALLOW(getuid),
	KORE_SYSCALL_ALLOW(geteuid),
	KORE_SYSCALL_ALLOW(exit_group),
	KORE_SYSCALL_ALLOW(nanosleep),
#if defined(SYS_clock_gettime64)
	KORE_SYSCALL_ALLOW(clock_gettime64),
#endif
#if defined(SYS_clock_nanosleep)
	KORE_SYSCALL_ALLOW(clock_nanosleep),
#endif
#if defined(SYS_sigreturn)
	KORE_SYSCALL_ALLOW(sigreturn),
#endif

	/* Memory related. */
	KORE_SYSCALL_ALLOW(brk),
	KORE_SYSCALL_ALLOW(munmap),

	/*
	 * Deny mmap/mprotect calls with PROT_EXEC/PROT_WRITE protection.
	 * These deny rules are evaluated before the unconditional allow
	 * rules for the same syscalls further down, so W^X is enforced.
	 */
#if defined(SYS_mmap)
	KORE_SYSCALL_DENY_WITH_FLAG(mmap, 2, PROT_EXEC | PROT_WRITE, EINVAL),
#endif
#if defined(SYS_mmap2)
	KORE_SYSCALL_DENY_WITH_FLAG(mmap2, 2, PROT_EXEC | PROT_WRITE, EINVAL),
#endif
	KORE_SYSCALL_DENY_WITH_FLAG(mprotect, 2, PROT_EXEC, EINVAL),

#if defined(SYS_mmap)
	KORE_SYSCALL_ALLOW(mmap),
#endif
#if defined(SYS_mmap2)
	KORE_SYSCALL_ALLOW(mmap2),
#endif
	KORE_SYSCALL_ALLOW(madvise),
	KORE_SYSCALL_ALLOW(mprotect),

	/* Net related. */
#if defined(SYS_poll)
	KORE_SYSCALL_ALLOW(poll),
#endif
	KORE_SYSCALL_ALLOW(ppoll),
#if defined(SYS_send)
	KORE_SYSCALL_ALLOW(send),
#endif
	KORE_SYSCALL_ALLOW(sendto),
	KORE_SYSCALL_ALLOW(accept),
	KORE_SYSCALL_ALLOW(sendfile),
#if defined(SYS_recv)
	KORE_SYSCALL_ALLOW(recv),
#endif
	KORE_SYSCALL_ALLOW(recvfrom),
	KORE_SYSCALL_ALLOW(epoll_ctl),
	KORE_SYSCALL_ALLOW(setsockopt),
#if defined(SYS_epoll_wait)
	KORE_SYSCALL_ALLOW(epoll_wait),
#endif
	KORE_SYSCALL_ALLOW(epoll_pwait),

	/* Signal related. */
	KORE_SYSCALL_ALLOW(sigaltstack),
	KORE_SYSCALL_ALLOW(rt_sigreturn),
	KORE_SYSCALL_ALLOW(rt_sigaction),
	KORE_SYSCALL_ALLOW(rt_sigprocmask),

	/* "Other" without clear category. */
	KORE_SYSCALL_ALLOW(futex),
#if defined(SYS_clock_gettime)
	KORE_SYSCALL_ALLOW(clock_gettime),
#endif

#if defined(__NR_getrandom)
	KORE_SYSCALL_ALLOW(getrandom),
#endif
};
181
/*
 * bpf program prologue. Placed before all other filters: it kills the
 * process outright if the audit architecture does not match the one we
 * were built for, then loads the syscall number for the rules to test.
 */
static struct sock_filter filter_prologue[] = {
	/* Load arch member into accumulator (A) (arch is __u32). */
	KORE_BPF_LOAD(arch, 0),

	/* Compare accumulator against constant, if false jump over kill. */
	KORE_BPF_CMP(SECCOMP_AUDIT_ARCH, 1, 0),
	KORE_BPF_RET(SECCOMP_RET_KILL),

	/* Load the system call number into the accumulator. */
	KORE_BPF_LOAD(nr, 0),
};
194
/*
 * bpf program epilogue. The default verdict when no rule matched;
 * kore_seccomp_enable() rewrites the .k member to SECCOMP_RET_TRACE
 * when seccomp tracing is enabled.
 */
static struct sock_filter filter_epilogue[] = {
	/* Return hit if no system calls matched our list. */
	BPF_STMT(BPF_RET+BPF_K, SECCOMP_KILL_POLICY)
};
200
static struct sock_filter	*seccomp_filter_update(struct sock_filter *,
				    const char *, size_t);

#define filter_prologue_len	KORE_FILTER_LEN(filter_prologue)
#define filter_epilogue_len	KORE_FILTER_LEN(filter_epilogue)

static void	seccomp_register_violation(pid_t);

/* One registered bpf filter fragment, kept on the `filters` list. */
struct filter {
	char			*name;		/* unique filter name */
	struct sock_filter	*prog;		/* bpf instructions */
	size_t			instructions;	/* number of instructions */
	TAILQ_ENTRY(filter)	list;
};

/* All registered filters, concatenated in order by kore_seccomp_enable(). */
static TAILQ_HEAD(, filter)	filters;

/*
 * While the application seccomp hook runs, this points at the first
 * pre-registered filter so user filters are inserted before it.
 */
static struct filter		*ufilter = NULL;

/*
 * If enabled will instruct the parent process to ptrace its children and
 * log any seccomp SECCOMP_RET_TRACE rule.
 */
int	kore_seccomp_tracing = 0;
224
/* Initialize the (empty) list of registered seccomp filters. */
void
kore_seccomp_init(void)
{
	TAILQ_INIT(&filters);
}
230
231 void
232 kore_seccomp_drop(void)
233 {
234 struct filter *filter;
235
236 while ((filter = TAILQ_FIRST(&filters)) != NULL) {
237 if (!kore_quiet) {
238 kore_log(LOG_INFO,
239 "seccomp filter '%s' dropped", filter->name);
240 }
241 TAILQ_REMOVE(&filters, filter, list);
242 kore_free(filter->name);
243 kore_free(filter);
244 }
245
246 TAILQ_INIT(&filters);
247 }
248
/*
 * Build the final bpf program (prologue + all registered filters +
 * epilogue) and install it via prctl(PR_SET_SECCOMP).
 *
 * Ordering matters: application-supplied filters are registered before
 * the built-in worker filter so they can override its rules, and
 * PR_SET_NO_NEW_PRIVS must be set before the seccomp filter is loaded.
 */
void
kore_seccomp_enable(void)
{
	struct sock_filter		*sf;
	struct sock_fprog		prog;
	struct kore_runtime_call	*rcall;
	struct filter			*filter;
	size_t				prog_len, off, i;

	/*
	 * If kore_seccomp_tracing is turned on, set the default policy to
	 * SECCOMP_RET_TRACE so we can log the system calls.
	 */
	if (kore_seccomp_tracing) {
		filter_epilogue[0].k = SECCOMP_RET_TRACE;
		kore_log(LOG_NOTICE, "seccomp tracing enabled");
	}

#if defined(KORE_USE_PYTHON)
	/* Let the python app add filters; ufilter marks the insert point. */
	ufilter = TAILQ_FIRST(&filters);
	kore_python_seccomp_hook("koreapp.seccomp");
	ufilter = NULL;
#endif

	/* Allow application to add its own filters. */
	if ((rcall = kore_runtime_getcall("kore_seccomp_hook")) != NULL) {
		ufilter = TAILQ_FIRST(&filters);
		kore_runtime_execute(rcall);
		kore_free(rcall);
		ufilter = NULL;
	}

	if (worker->id != KORE_WORKER_KEYMGR) {
		/* Add worker required syscalls. */
		kore_seccomp_filter("worker", filter_kore,
		    KORE_FILTER_LEN(filter_kore));
	}

	/* Start with the prologue. */
	prog_len = filter_prologue_len;

	/* Now account for all enabled filters. */
	TAILQ_FOREACH(filter, &filters, list)
		prog_len += filter->instructions;

	/* Finally add the epilogue. */
	prog_len += filter_epilogue_len;

	/* Build the entire bpf program now. */
	if ((sf = calloc(prog_len, sizeof(*sf))) == NULL)
		fatalx("calloc");

	off = 0;
	for (i = 0; i < filter_prologue_len; i++)
		sf[off++] = filter_prologue[i];

	TAILQ_FOREACH(filter, &filters, list) {
		for (i = 0; i < filter->instructions; i++)
			sf[off++] = filter->prog[i];
	}

	for (i = 0; i < filter_epilogue_len; i++)
		sf[off++] = filter_epilogue[i];

	/* Lock and load it. */
	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) == -1)
		fatalx("prctl: %s", errno_s);

	prog.filter = sf;
	prog.len = prog_len;

	if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog) == -1)
		fatalx("prctl: %s", errno_s);

#if defined(KORE_USE_PYTHON)
	kore_python_seccomp_cleanup();
#endif
}
327
328 int
329 kore_seccomp_filter(const char *name, void *prog, size_t len)
330 {
331 struct filter *filter;
332
333 TAILQ_FOREACH(filter, &filters, list) {
334 if (!strcmp(filter->name, name))
335 return (KORE_RESULT_ERROR);
336 }
337
338 filter = kore_calloc(1, sizeof(*filter));
339
340 filter->prog = prog;
341 filter->instructions = len;
342 filter->name = kore_strdup(name);
343
344 if (ufilter) {
345 TAILQ_INSERT_BEFORE(ufilter, filter, list);
346 } else {
347 TAILQ_INSERT_TAIL(&filters, filter, list);
348 }
349
350 return (KORE_RESULT_OK);
351 }
352
/*
 * Called in a freshly forked worker when seccomp tracing is enabled:
 * request tracing by the parent and SIGSTOP ourselves so the parent
 * can set its PTRACE_O_* options before we continue.
 */
void
kore_seccomp_traceme(void)
{
	if (kore_seccomp_tracing == 0)
		return;

	if (ptrace(PTRACE_TRACEME, 0, NULL, NULL) == -1)
		fatalx("ptrace: %s", errno_s);
	if (kill(worker->pid, SIGSTOP) == -1)
		fatalx("kill: %s", errno_s);
}
364
/*
 * Handle a wait() status for a traced child when seccomp tracing is on.
 *
 * Returns KORE_RESULT_OK when the status was consumed (child resumed),
 * KORE_RESULT_ERROR when tracing is off or the status is not a stop
 * event and should be handled by the normal exit logic.
 */
int
kore_seccomp_trace(pid_t pid, int status)
{
	int	evt;

	if (kore_seccomp_tracing == 0)
		return (KORE_RESULT_ERROR);

	/* Initial SIGSTOP from kore_seccomp_traceme(): set options, resume. */
	if (WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP) {
		if (ptrace(PTRACE_SETOPTIONS, pid, NULL,
		    PTRACE_O_TRACESECCOMP | PTRACE_O_TRACECLONE |
		    PTRACE_O_TRACEFORK) == -1)
			fatal("ptrace: %s", errno_s);
		if (ptrace(PTRACE_CONT, pid, NULL, NULL) == -1)
			fatal("ptrace: %s", errno_s);
		return (KORE_RESULT_OK);
	}

	/* SIGTRAP stop: check for a seccomp event and log the violation. */
	if (WIFSTOPPED(status) && WSTOPSIG(status) == SIGTRAP) {
		evt = status >> 8;
		if (evt == (SIGTRAP | (PTRACE_EVENT_SECCOMP << 8)))
			seccomp_register_violation(pid);
		if (ptrace(PTRACE_CONT, pid, NULL, NULL) == -1)
			fatal("ptrace: %s", errno_s);
		return (KORE_RESULT_OK);
	}

	/* Any other stop: forward the original signal to the child. */
	if (WIFSTOPPED(status)) {
		if (ptrace(PTRACE_CONT, pid, NULL, WSTOPSIG(status)) == -1)
			fatal("ptrace: %s", errno_s);
		return (KORE_RESULT_OK);
	}

	return (KORE_RESULT_ERROR);
}
400
401 int
402 kore_seccomp_syscall_resolve(const char *name)
403 {
404 int i;
405
406 for (i = 0; kore_syscall_map[i].name != NULL; i++) {
407 if (!strcmp(name, kore_syscall_map[i].name))
408 return (kore_syscall_map[i].nr);
409 }
410
411 return (-1);
412 }
413
414 const char *
415 kore_seccomp_syscall_name(long sysnr)
416 {
417 int i;
418
419 for (i = 0; kore_syscall_map[i].name != NULL; i++) {
420 if (kore_syscall_map[i].nr == sysnr)
421 return (kore_syscall_map[i].name);
422 }
423
424 return ("unknown");
425 }
426
/*
 * Build an unconditional rule (action) for the named syscall.
 *
 * The template is generated for SYS_exit; seccomp_filter_update() copies
 * it to the heap and patches in the real syscall number. Returns NULL
 * when the syscall name cannot be resolved. Caller owns the result.
 */
struct sock_filter *
kore_seccomp_syscall_filter(const char *name, int action)
{
	struct sock_filter	filter[] = {
		KORE_SYSCALL_FILTER(exit, action),
		KORE_BPF_GUARD
	};

	return (seccomp_filter_update(filter, name, KORE_FILTER_LEN(filter)));
}
437
/*
 * Build a rule that applies `action` when argument `arg` of the named
 * syscall equals `value`. Template uses SYS_exit; the real syscall
 * number is patched in by seccomp_filter_update(). NULL on unknown name.
 */
struct sock_filter *
kore_seccomp_syscall_arg(const char *name, int action, int arg, int value)
{
	struct sock_filter	filter[] = {
		KORE_SYSCALL_ARG(exit, arg, value, action),
		KORE_BPF_GUARD
	};

	return (seccomp_filter_update(filter, name, KORE_FILTER_LEN(filter)));
}
448
/*
 * Build a rule that applies `action` when argument `arg` of the named
 * syscall, masked with `value`, matches. Template uses SYS_exit; the
 * real syscall number is patched in later. NULL on unknown name.
 */
struct sock_filter *
kore_seccomp_syscall_mask(const char *name, int action, int arg, int value)
{
	struct sock_filter	filter[] = {
		KORE_SYSCALL_MASK(exit, arg, value, action),
		KORE_BPF_GUARD
	};

	return (seccomp_filter_update(filter, name, KORE_FILTER_LEN(filter)));
}
459
/*
 * Build a rule that applies `action` when argument `arg` of the named
 * syscall has any of the flag bits in `value` set. Template uses
 * SYS_exit; the real syscall number is patched in later. NULL on
 * unknown name.
 */
struct sock_filter *
kore_seccomp_syscall_flag(const char *name, int action, int arg, int value)
{
	struct sock_filter	filter[] = {
		KORE_SYSCALL_WITH_FLAG(exit, arg, value, action),
		KORE_BPF_GUARD
	};

	return (seccomp_filter_update(filter, name, KORE_FILTER_LEN(filter)));
}
470
471 static void
472 seccomp_register_violation(pid_t pid)
473 {
474 int idx;
475 struct kore_worker *kw;
476 struct iovec iov;
477 #if defined(__arm__)
478 struct pt_regs regs;
479 #else
480 struct user_regs_struct regs;
481 #endif
482 long sysnr;
483 const char *name;
484
485 iov.iov_base = ®s;
486 iov.iov_len = sizeof(regs);
487
488 if (ptrace(PTRACE_GETREGSET, pid, 1, &iov) == -1)
489 fatal("ptrace: %s", errno_s);
490
491 #if SECCOMP_AUDIT_ARCH == AUDIT_ARCH_X86_64
492 sysnr = regs.orig_rax;
493 #elif SECCOMP_AUDIT_ARCH == AUDIT_ARCH_AARCH64
494 sysnr = regs.regs[8];
495 #elif SECCOMP_AUDIT_ARCH == AUDIT_ARCH_ARM
496 sysnr = regs.uregs[7];
497 #else
498 #error "platform not supported"
499 #endif
500
501 name = NULL;
502 for (idx = 0; idx < worker_count; idx++) {
503 kw = kore_worker_data(idx);
504 if (kw->pid == pid) {
505 name = kore_worker_name(kw->id);
506 break;
507 }
508 }
509
510 if (name == NULL)
511 name = "<child>";
512
513 kore_log(LOG_INFO, "seccomp violation, %s pid=%d, syscall=%ld:%s",
514 name, pid, sysnr, kore_seccomp_syscall_name(sysnr));
515 }
516
517 static struct sock_filter *
518 seccomp_filter_update(struct sock_filter *filter, const char *name, size_t elm)
519 {
520 int nr;
521 struct sock_filter *result;
522
523 if ((nr = kore_seccomp_syscall_resolve(name)) == -1)
524 return (NULL);
525
526 result = kore_calloc(elm, sizeof(struct sock_filter));
527 memcpy(result, filter, elm * sizeof(struct sock_filter));
528
529 /* Update the syscall number to the one specified. */
530 result[0].k = nr;
531
532 return (result);
533 }