seccomp.c (12451B)
1 /*
2 * Copyright (c) 2019-2022 Joris Vink <joris@coders.se>
3 *
4 * Permission to use, copy, modify, and distribute this software for any
5 * purpose with or without fee is hereby granted, provided that the above
6 * copyright notice and this permission notice appear in all copies.
7 *
8 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
9 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
10 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
11 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
12 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
13 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
14 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
15 */
16
17 #include <sys/param.h>
18 #include <sys/mman.h>
19 #include <sys/epoll.h>
20 #include <sys/ptrace.h>
21 #include <sys/prctl.h>
22 #include <sys/user.h>
23 #include <sys/syscall.h>
24
25 #include <linux/ptrace.h>
26 #include <linux/seccomp.h>
27 #include <linux/filter.h>
28 #include <linux/audit.h>
29
30 #include <stddef.h>
31 #include <sched.h>
32
33 #include "kore.h"
34 #include "seccomp.h"
35 #include "platform.h"
36
37 #if defined(KORE_USE_PYTHON)
38 #include "python_api.h"
39 #endif
40
41 #if !defined(SECCOMP_KILL_POLICY)
42 #define SECCOMP_KILL_POLICY SECCOMP_RET_KILL
43 #endif
44
45 /*
46 * The bare minimum to be able to run kore. These are added last and can
47 * be overwritten by a filter program that is added before hand.
48 */
/*
 * The bare minimum to be able to run kore. These are added last and can
 * be overwritten by a filter program that is added before hand.
 *
 * Several entries are wrapped in #if defined(SYS_xxx) because syscall
 * availability differs per architecture (32-bit platforms carry the
 * *64/mmap2/_llseek variants, newer 64-bit ABIs lack open/poll/stat).
 */
static struct sock_filter filter_kore[] = {
	/* Deny these, but with EACCES instead of dying. */
	KORE_SYSCALL_DENY(ioctl, EACCES),

	/* File related. */
#if defined(SYS_open)
	KORE_SYSCALL_ALLOW(open),
#endif
	KORE_SYSCALL_ALLOW(read),
#if defined(SYS_stat)
	KORE_SYSCALL_ALLOW(stat),
#endif
#if defined(SYS_stat64)
	KORE_SYSCALL_ALLOW(stat64),
#endif
#if defined(SYS_lstat)
	KORE_SYSCALL_ALLOW(lstat),
#endif
	KORE_SYSCALL_ALLOW(fstat),
#if defined(SYS_fstat64)
	KORE_SYSCALL_ALLOW(fstat64),
#endif
	KORE_SYSCALL_ALLOW(write),
	KORE_SYSCALL_ALLOW(fcntl),
#if defined(SYS_fcntl64)
	KORE_SYSCALL_ALLOW(fcntl64),
#endif
	KORE_SYSCALL_ALLOW(lseek),
#if defined(SYS__llseek)
	KORE_SYSCALL_ALLOW(_llseek),
#endif
	KORE_SYSCALL_ALLOW(close),
	KORE_SYSCALL_ALLOW(openat),
#if defined(SYS_access)
	KORE_SYSCALL_ALLOW(access),
#endif
	KORE_SYSCALL_ALLOW(writev),
	KORE_SYSCALL_ALLOW(getcwd),
#if defined(SYS_unlink)
	KORE_SYSCALL_ALLOW(unlink),
#endif
#if defined(SYS_readlink)
	KORE_SYSCALL_ALLOW(readlink),
#endif
#if defined(SYS_readlinkat)
	KORE_SYSCALL_ALLOW(readlinkat),
#endif

	/* Process related. */
	KORE_SYSCALL_ALLOW(exit),
	KORE_SYSCALL_ALLOW(kill),
	KORE_SYSCALL_ALLOW(getpid),
	KORE_SYSCALL_ALLOW(getuid),
	KORE_SYSCALL_ALLOW(geteuid),
	KORE_SYSCALL_ALLOW(exit_group),
	KORE_SYSCALL_ALLOW(nanosleep),
#if defined(SYS_clock_nanosleep)
	KORE_SYSCALL_ALLOW(clock_nanosleep),
#endif
#if defined(SYS_sigreturn)
	KORE_SYSCALL_ALLOW(sigreturn),
#endif

	/* Memory related. */
	KORE_SYSCALL_ALLOW(brk),
	KORE_SYSCALL_ALLOW(munmap),

	/*
	 * Deny mmap/mprotect calls with PROT_EXEC/PROT_WRITE protection.
	 * These deny rules must come before the corresponding allow rules
	 * below, since the first matching rule in the program wins.
	 */
#if defined(SYS_mmap)
	KORE_SYSCALL_DENY_WITH_FLAG(mmap, 2, PROT_EXEC | PROT_WRITE, EINVAL),
#endif
#if defined(SYS_mmap2)
	KORE_SYSCALL_DENY_WITH_FLAG(mmap2, 2, PROT_EXEC | PROT_WRITE, EINVAL),
#endif
	KORE_SYSCALL_DENY_WITH_FLAG(mprotect, 2, PROT_EXEC, EINVAL),

#if defined(SYS_mmap)
	KORE_SYSCALL_ALLOW(mmap),
#endif
#if defined(SYS_mmap2)
	KORE_SYSCALL_ALLOW(mmap2),
#endif
	KORE_SYSCALL_ALLOW(madvise),
	KORE_SYSCALL_ALLOW(mprotect),

	/* Net related. */
#if defined(SYS_poll)
	KORE_SYSCALL_ALLOW(poll),
#endif
	KORE_SYSCALL_ALLOW(ppoll),
#if defined(SYS_send)
	KORE_SYSCALL_ALLOW(send),
#endif
	KORE_SYSCALL_ALLOW(sendto),
	KORE_SYSCALL_ALLOW(accept),
	KORE_SYSCALL_ALLOW(sendfile),
#if defined(SYS_recv)
	KORE_SYSCALL_ALLOW(recv),
#endif
	KORE_SYSCALL_ALLOW(recvfrom),
	KORE_SYSCALL_ALLOW(epoll_ctl),
	KORE_SYSCALL_ALLOW(setsockopt),
#if defined(SYS_epoll_wait)
	KORE_SYSCALL_ALLOW(epoll_wait),
#endif
	KORE_SYSCALL_ALLOW(epoll_pwait),

	/* Signal related. */
	KORE_SYSCALL_ALLOW(sigaltstack),
	KORE_SYSCALL_ALLOW(rt_sigreturn),
	KORE_SYSCALL_ALLOW(rt_sigaction),
	KORE_SYSCALL_ALLOW(rt_sigprocmask),

	/* "Other" without clear category. */
	KORE_SYSCALL_ALLOW(futex),
#if defined(SYS_clock_gettime)
	KORE_SYSCALL_ALLOW(clock_gettime),
#endif

#if defined(__NR_getrandom)
	KORE_SYSCALL_ALLOW(getrandom),
#endif
};
172
173 /* bpf program prologue. */
/*
 * bpf program prologue: verify the audit architecture and fetch the
 * syscall number before any of the per-syscall rules run.
 */
static struct sock_filter filter_prologue[] = {
	/* Load arch member into accumulator (A) (arch is __u32). */
	KORE_BPF_LOAD(arch, 0),

	/*
	 * Compare accumulator against constant, if false jump over kill.
	 * A foreign architecture means syscall numbers cannot be trusted,
	 * so the process is killed outright.
	 */
	KORE_BPF_CMP(SECCOMP_AUDIT_ARCH, 1, 0),
	KORE_BPF_RET(SECCOMP_RET_KILL),

	/* Load the system call number into the accumulator. */
	KORE_BPF_LOAD(nr, 0),
};
185
186 /* bpf program epilogue. */
/*
 * bpf program epilogue. Note that kore_seccomp_enable() rewrites the
 * k field to SECCOMP_RET_TRACE when seccomp tracing is enabled.
 */
static struct sock_filter filter_epilogue[] = {
	/* Return hit if no system calls matched our list. */
	BPF_STMT(BPF_RET+BPF_K, SECCOMP_KILL_POLICY)
};
191
192 static struct sock_filter *seccomp_filter_update(struct sock_filter *,
193 const char *, size_t);
194
195 #define filter_prologue_len KORE_FILTER_LEN(filter_prologue)
196 #define filter_epilogue_len KORE_FILTER_LEN(filter_epilogue)
197
198 static void seccomp_register_violation(pid_t);
199
/*
 * A registered seccomp filter program; kept on the global filters list
 * until kore_seccomp_enable() concatenates all of them into one bpf
 * program. The name must be unique (see kore_seccomp_filter()).
 */
struct filter {
	char			*name;		/* unique filter name */
	struct sock_filter	*prog;		/* bpf instruction array */
	size_t			instructions;	/* entries in prog */
	TAILQ_ENTRY(filter)	list;
};
206
207 static TAILQ_HEAD(, filter) filters;
208 static struct filter *ufilter = NULL;
209
210 /*
211 * If enabled will instruct the parent process to ptrace its children and
212 * log any seccomp SECCOMP_RET_TRACE rule.
213 */
214 int kore_seccomp_tracing = 0;
215
/*
 * Initialize the global filter list; must run before any filter is
 * registered via kore_seccomp_filter().
 */
void
kore_seccomp_init(void)
{
	TAILQ_INIT(&filters);
}
221
222 void
223 kore_seccomp_drop(void)
224 {
225 struct filter *filter;
226
227 while ((filter = TAILQ_FIRST(&filters)) != NULL) {
228 if (!kore_quiet) {
229 kore_log(LOG_INFO,
230 "seccomp filter '%s' dropped", filter->name);
231 }
232 TAILQ_REMOVE(&filters, filter, list);
233 kore_free(filter->name);
234 kore_free(filter);
235 }
236
237 TAILQ_INIT(&filters);
238 }
239
240 void
241 kore_seccomp_enable(void)
242 {
243 struct sock_filter *sf;
244 struct sock_fprog prog;
245 struct kore_runtime_call *rcall;
246 struct filter *filter;
247 size_t prog_len, off, i;
248
249 /*
250 * If kore_seccomp_tracing is turned on, set the default policy to
251 * SECCOMP_RET_TRACE so we can log the system calls.
252 */
253 if (kore_seccomp_tracing) {
254 filter_epilogue[0].k = SECCOMP_RET_TRACE;
255 kore_log(LOG_NOTICE, "seccomp tracing enabled");
256 }
257
258 #if defined(KORE_USE_PYTHON)
259 ufilter = TAILQ_FIRST(&filters);
260 kore_python_seccomp_hook("koreapp.seccomp");
261 ufilter = NULL;
262 #endif
263
264 /* Allow application to add its own filters. */
265 if ((rcall = kore_runtime_getcall("kore_seccomp_hook")) != NULL) {
266 ufilter = TAILQ_FIRST(&filters);
267 kore_runtime_execute(rcall);
268 kore_free(rcall);
269 ufilter = NULL;
270 }
271
272 if (worker->id != KORE_WORKER_KEYMGR) {
273 /* Add worker required syscalls. */
274 kore_seccomp_filter("worker", filter_kore,
275 KORE_FILTER_LEN(filter_kore));
276 }
277
278 /* Start with the prologue. */
279 prog_len = filter_prologue_len;
280
281 /* Now account for all enabled filters. */
282 TAILQ_FOREACH(filter, &filters, list)
283 prog_len += filter->instructions;
284
285 /* Finally add the epilogue. */
286 prog_len += filter_epilogue_len;
287
288 /* Build the entire bpf program now. */
289 if ((sf = calloc(prog_len, sizeof(*sf))) == NULL)
290 fatalx("calloc");
291
292 off = 0;
293 for (i = 0; i < filter_prologue_len; i++)
294 sf[off++] = filter_prologue[i];
295
296 TAILQ_FOREACH(filter, &filters, list) {
297 for (i = 0; i < filter->instructions; i++)
298 sf[off++] = filter->prog[i];
299 }
300
301 for (i = 0; i < filter_epilogue_len; i++)
302 sf[off++] = filter_epilogue[i];
303
304 /* Lock and load it. */
305 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) == -1)
306 fatalx("prctl: %s", errno_s);
307
308 prog.filter = sf;
309 prog.len = prog_len;
310
311 if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog) == -1)
312 fatalx("prctl: %s", errno_s);
313
314 #if defined(KORE_USE_PYTHON)
315 kore_python_seccomp_cleanup();
316 #endif
317 }
318
319 int
320 kore_seccomp_filter(const char *name, void *prog, size_t len)
321 {
322 struct filter *filter;
323
324 TAILQ_FOREACH(filter, &filters, list) {
325 if (!strcmp(filter->name, name))
326 return (KORE_RESULT_ERROR);
327 }
328
329 filter = kore_calloc(1, sizeof(*filter));
330
331 filter->prog = prog;
332 filter->instructions = len;
333 filter->name = kore_strdup(name);
334
335 if (ufilter) {
336 TAILQ_INSERT_BEFORE(ufilter, filter, list);
337 } else {
338 TAILQ_INSERT_TAIL(&filters, filter, list);
339 }
340
341 return (KORE_RESULT_OK);
342 }
343
344 void
345 kore_seccomp_traceme(void)
346 {
347 if (kore_seccomp_tracing == 0)
348 return;
349
350 if (ptrace(PTRACE_TRACEME, 0, NULL, NULL) == -1)
351 fatalx("ptrace: %s", errno_s);
352 if (kill(worker->pid, SIGSTOP) == -1)
353 fatalx("kill: %s", errno_s);
354 }
355
/*
 * Parent-side handler for a wait() status of a traced child. Returns
 * KORE_RESULT_OK when the status was consumed (child resumed), or
 * KORE_RESULT_ERROR when tracing is off or the status was not a stop.
 */
int
kore_seccomp_trace(pid_t pid, int status)
{
	int	evt;

	if (kore_seccomp_tracing == 0)
		return (KORE_RESULT_ERROR);

	/*
	 * First SIGSTOP comes from kore_seccomp_traceme(); use it to set
	 * the ptrace options we need (seccomp events, follow clones/forks).
	 */
	if (WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP) {
		if (ptrace(PTRACE_SETOPTIONS, pid, NULL,
		    PTRACE_O_TRACESECCOMP | PTRACE_O_TRACECLONE |
		    PTRACE_O_TRACEFORK) == -1)
			fatal("ptrace: %s", errno_s);
		if (ptrace(PTRACE_CONT, pid, NULL, NULL) == -1)
			fatal("ptrace: %s", errno_s);
		return (KORE_RESULT_OK);
	}

	/*
	 * SIGTRAP with PTRACE_EVENT_SECCOMP in bits 8+ of the status is a
	 * SECCOMP_RET_TRACE hit (idiom from ptrace(2)): log the violation.
	 */
	if (WIFSTOPPED(status) && WSTOPSIG(status) == SIGTRAP) {
		evt = status >> 8;
		if (evt == (SIGTRAP | (PTRACE_EVENT_SECCOMP << 8)))
			seccomp_register_violation(pid);
		if (ptrace(PTRACE_CONT, pid, NULL, NULL) == -1)
			fatal("ptrace: %s", errno_s);
		return (KORE_RESULT_OK);
	}

	/* Any other stop: forward the original signal to the child. */
	if (WIFSTOPPED(status)) {
		if (ptrace(PTRACE_CONT, pid, NULL, WSTOPSIG(status)) == -1)
			fatal("ptrace: %s", errno_s);
		return (KORE_RESULT_OK);
	}

	return (KORE_RESULT_ERROR);
}
391
392 int
393 kore_seccomp_syscall_resolve(const char *name)
394 {
395 int i;
396
397 for (i = 0; kore_syscall_map[i].name != NULL; i++) {
398 if (!strcmp(name, kore_syscall_map[i].name))
399 return (kore_syscall_map[i].nr);
400 }
401
402 return (-1);
403 }
404
405 const char *
406 kore_seccomp_syscall_name(long sysnr)
407 {
408 int i;
409
410 for (i = 0; kore_syscall_map[i].name != NULL; i++) {
411 if (kore_syscall_map[i].nr == sysnr)
412 return (kore_syscall_map[i].name);
413 }
414
415 return ("unknown");
416 }
417
/*
 * Build a heap-allocated filter applying `action` to syscall `name`.
 * The macro is instantiated with a placeholder syscall (exit);
 * seccomp_filter_update() copies the template and rewrites the first
 * instruction's syscall number to that of `name`. Returns NULL if the
 * name cannot be resolved; caller owns the returned memory.
 */
struct sock_filter *
kore_seccomp_syscall_filter(const char *name, int action)
{
	struct sock_filter	filter[] = {
		KORE_SYSCALL_FILTER(exit, action),
		KORE_BPF_GUARD
	};

	return (seccomp_filter_update(filter, name, KORE_FILTER_LEN(filter)));
}
428
/*
 * Like kore_seccomp_syscall_filter() but the rule only matches when
 * syscall argument `arg` equals `value` exactly. The exit placeholder
 * is rewritten to `name` by seccomp_filter_update(); NULL on unknown
 * name.
 */
struct sock_filter *
kore_seccomp_syscall_arg(const char *name, int action, int arg, int value)
{
	struct sock_filter	filter[] = {
		KORE_SYSCALL_ARG(exit, arg, value, action),
		KORE_BPF_GUARD
	};

	return (seccomp_filter_update(filter, name, KORE_FILTER_LEN(filter)));
}
439
/*
 * Variant matching on a masked comparison of syscall argument `arg`
 * against `value` (see KORE_SYSCALL_MASK). The exit placeholder is
 * rewritten to `name` by seccomp_filter_update(); NULL on unknown name.
 */
struct sock_filter *
kore_seccomp_syscall_mask(const char *name, int action, int arg, int value)
{
	struct sock_filter	filter[] = {
		KORE_SYSCALL_MASK(exit, arg, value, action),
		KORE_BPF_GUARD
	};

	return (seccomp_filter_update(filter, name, KORE_FILTER_LEN(filter)));
}
450
/*
 * Variant matching when flag bits `value` are set in syscall argument
 * `arg` (see KORE_SYSCALL_WITH_FLAG). The exit placeholder is rewritten
 * to `name` by seccomp_filter_update(); NULL on unknown name.
 */
struct sock_filter *
kore_seccomp_syscall_flag(const char *name, int action, int arg, int value)
{
	struct sock_filter	filter[] = {
		KORE_SYSCALL_WITH_FLAG(exit, arg, value, action),
		KORE_BPF_GUARD
	};

	return (seccomp_filter_update(filter, name, KORE_FILTER_LEN(filter)));
}
461
462 static void
463 seccomp_register_violation(pid_t pid)
464 {
465 int idx;
466 struct kore_worker *kw;
467 struct iovec iov;
468 #if defined(__arm__)
469 struct pt_regs regs;
470 #else
471 struct user_regs_struct regs;
472 #endif
473 long sysnr;
474 const char *name;
475
476 iov.iov_base = ®s;
477 iov.iov_len = sizeof(regs);
478
479 if (ptrace(PTRACE_GETREGSET, pid, 1, &iov) == -1)
480 fatal("ptrace: %s", errno_s);
481
482 #if SECCOMP_AUDIT_ARCH == AUDIT_ARCH_X86_64
483 sysnr = regs.orig_rax;
484 #elif SECCOMP_AUDIT_ARCH == AUDIT_ARCH_AARCH64
485 sysnr = regs.regs[8];
486 #elif SECCOMP_AUDIT_ARCH == AUDIT_ARCH_ARM
487 sysnr = regs.uregs[7];
488 #else
489 #error "platform not supported"
490 #endif
491
492 name = NULL;
493 for (idx = 0; idx < worker_count; idx++) {
494 kw = kore_worker_data(idx);
495 if (kw->pid == pid) {
496 name = kore_worker_name(kw->id);
497 break;
498 }
499 }
500
501 if (name == NULL)
502 name = "<child>";
503
504 kore_log(LOG_INFO, "seccomp violation, %s pid=%d, syscall=%ld:%s",
505 name, pid, sysnr, kore_seccomp_syscall_name(sysnr));
506 }
507
/*
 * Duplicate a template filter of `elm` instructions onto the heap and
 * patch its first instruction to carry the syscall number of `name`.
 * Returns NULL when the name cannot be resolved; otherwise the caller
 * owns the returned array.
 */
static struct sock_filter *
seccomp_filter_update(struct sock_filter *filter, const char *name, size_t elm)
{
	struct sock_filter	*copy;
	int			sysnr;

	sysnr = kore_seccomp_syscall_resolve(name);
	if (sysnr == -1)
		return (NULL);

	copy = kore_calloc(elm, sizeof(*copy));
	memcpy(copy, filter, elm * sizeof(*copy));

	/* Update the syscall number to the one specified. */
	copy[0].k = sysnr;

	return (copy);
}