kore

Kore is a web application platform for writing scalable, concurrent web based processes in C or Python.
Commits | Files | Refs | README | LICENSE | git clone https://git.kore.io/kore.git

seccomp.c (12665B)



      1 /*
      2  * Copyright (c) 2019-2022 Joris Vink <joris@coders.se>
      3  *
      4  * Permission to use, copy, modify, and distribute this software for any
      5  * purpose with or without fee is hereby granted, provided that the above
      6  * copyright notice and this permission notice appear in all copies.
      7  *
      8  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
      9  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
     10  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
     11  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
     12  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
     13  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
     14  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
     15  */
     16 
     17 #include <sys/param.h>
     18 #include <sys/mman.h>
     19 #include <sys/epoll.h>
     20 #include <sys/ptrace.h>
     21 #include <sys/prctl.h>
     22 #include <sys/user.h>
     23 #include <sys/syscall.h>
     24 
     25 #include <linux/ptrace.h>
     26 #include <linux/seccomp.h>
     27 #include <linux/filter.h>
     28 #include <linux/audit.h>
     29 
     30 #include <stddef.h>
     31 #include <sched.h>
     32 
     33 #include "kore.h"
     34 #include "seccomp.h"
     35 #include "platform.h"
     36 
     37 #if defined(KORE_USE_PYTHON)
     38 #include "python_api.h"
     39 #endif
     40 
     41 #if !defined(SECCOMP_KILL_POLICY)
     42 #define SECCOMP_KILL_POLICY		SECCOMP_RET_KILL
     43 #endif
     44 
     45 /*
     46  * The bare minimum to be able to run kore. These are added last and can
     47  * be overridden by a filter program that is added beforehand.
     48  */
     49 static struct sock_filter filter_kore[] = {
     50 	/* Deny these, but with EACCES instead of dying. */
     51 	KORE_SYSCALL_DENY(ioctl, EACCES),
     52 
     53 	/* File related. */
     54 #if defined(SYS_open)
     55 	KORE_SYSCALL_ALLOW(open),
     56 #endif
     57 	KORE_SYSCALL_ALLOW(read),
     58 #if defined(SYS_stat)
     59 	KORE_SYSCALL_ALLOW(stat),
     60 #endif
     61 #if defined(SYS_stat64)
     62 	KORE_SYSCALL_ALLOW(stat64),
     63 #endif
     64 #if defined(SYS_lstat)
     65 	KORE_SYSCALL_ALLOW(lstat),
     66 #endif
     67 	KORE_SYSCALL_ALLOW(fstat),
     68 #if defined(SYS_fstat64)
     69 	KORE_SYSCALL_ALLOW(fstat64),
     70 #endif
     71 #if defined(SYS_newfstatat)
     72 	KORE_SYSCALL_ALLOW(newfstatat),
     73 #endif
     74 #if defined(SYS_faccessat2)
     75 	KORE_SYSCALL_ALLOW(faccessat2),
     76 #endif
     77 	KORE_SYSCALL_ALLOW(write),
     78 	KORE_SYSCALL_ALLOW(fcntl),
     79 #if defined(SYS_fcntl64)
     80 	KORE_SYSCALL_ALLOW(fcntl64),
     81 #endif
     82 	KORE_SYSCALL_ALLOW(lseek),
     83 #if defined(SYS__llseek)
     84 	KORE_SYSCALL_ALLOW(_llseek),
     85 #endif
     86 	KORE_SYSCALL_ALLOW(close),
     87 	KORE_SYSCALL_ALLOW(openat),
     88 #if defined(SYS_access)
     89 	KORE_SYSCALL_ALLOW(access),
     90 #endif
     91 	KORE_SYSCALL_ALLOW(writev),
     92 	KORE_SYSCALL_ALLOW(getcwd),
     93 #if defined(SYS_unlink)
     94 	KORE_SYSCALL_ALLOW(unlink),
     95 #endif
     96 #if defined(SYS_readlink)
     97 	KORE_SYSCALL_ALLOW(readlink),
     98 #endif
     99 #if defined(SYS_readlinkat)
    100 	KORE_SYSCALL_ALLOW(readlinkat),
    101 #endif
    102 
    103 	/* Process related. */
    104 	KORE_SYSCALL_ALLOW(exit),
    105 	KORE_SYSCALL_ALLOW(kill),
    106 	KORE_SYSCALL_ALLOW(getpid),
    107 	KORE_SYSCALL_ALLOW(getuid),
    108 	KORE_SYSCALL_ALLOW(geteuid),
    109 	KORE_SYSCALL_ALLOW(exit_group),
    110 	KORE_SYSCALL_ALLOW(nanosleep),
    111 #if defined(SYS_clock_gettime64)
    112 	KORE_SYSCALL_ALLOW(clock_gettime64),
    113 #endif
    114 #if defined(SYS_clock_nanosleep)
    115 	KORE_SYSCALL_ALLOW(clock_nanosleep),
    116 #endif
    117 #if defined(SYS_sigreturn)
    118 	KORE_SYSCALL_ALLOW(sigreturn),
    119 #endif
    120 
    121 	/* Memory related. */
    122 	KORE_SYSCALL_ALLOW(brk),
    123 	KORE_SYSCALL_ALLOW(munmap),
    124 
    125 	/* Deny mmap/mprotect calls with PROT_EXEC/PROT_WRITE protection; these entries are listed ahead of the plain allows for the same system calls below. */
    126 #if defined(SYS_mmap)
    127 	KORE_SYSCALL_DENY_WITH_FLAG(mmap, 2, PROT_EXEC | PROT_WRITE, EINVAL),
    128 #endif
    129 #if defined(SYS_mmap2)
    130 	KORE_SYSCALL_DENY_WITH_FLAG(mmap2, 2, PROT_EXEC | PROT_WRITE, EINVAL),
    131 #endif
    132 	KORE_SYSCALL_DENY_WITH_FLAG(mprotect, 2, PROT_EXEC, EINVAL),
    133 
    134 #if defined(SYS_mmap)
    135 	KORE_SYSCALL_ALLOW(mmap),
    136 #endif
    137 #if defined(SYS_mmap2)
    138 	KORE_SYSCALL_ALLOW(mmap2),
    139 #endif
    140 	KORE_SYSCALL_ALLOW(madvise),
    141 	KORE_SYSCALL_ALLOW(mprotect),
    142 
    143 	/* Net related. */
    144 #if defined(SYS_poll)
    145 	KORE_SYSCALL_ALLOW(poll),
    146 #endif
    147 	KORE_SYSCALL_ALLOW(ppoll),
    148 #if defined(SYS_send)
    149 	KORE_SYSCALL_ALLOW(send),
    150 #endif
    151 	KORE_SYSCALL_ALLOW(sendto),
    152 	KORE_SYSCALL_ALLOW(accept),
    153 	KORE_SYSCALL_ALLOW(sendfile),
    154 #if defined(SYS_recv)
    155 	KORE_SYSCALL_ALLOW(recv),
    156 #endif
    157 	KORE_SYSCALL_ALLOW(recvfrom),
    158 	KORE_SYSCALL_ALLOW(epoll_ctl),
    159 	KORE_SYSCALL_ALLOW(setsockopt),
    160 #if defined(SYS_epoll_wait)
    161 	KORE_SYSCALL_ALLOW(epoll_wait),
    162 #endif
    163 	KORE_SYSCALL_ALLOW(epoll_pwait),
    164 
    165 	/* Signal related. */
    166 	KORE_SYSCALL_ALLOW(sigaltstack),
    167 	KORE_SYSCALL_ALLOW(rt_sigreturn),
    168 	KORE_SYSCALL_ALLOW(rt_sigaction),
    169 	KORE_SYSCALL_ALLOW(rt_sigprocmask),
    170 
    171 	/* "Other" without clear category. */
    172 	KORE_SYSCALL_ALLOW(futex),
    173 #if defined(SYS_clock_gettime)
    174 	KORE_SYSCALL_ALLOW(clock_gettime),
    175 #endif
    176 
    177 #if defined(__NR_getrandom)
    178 	KORE_SYSCALL_ALLOW(getrandom),
    179 #endif
    180 };
    181 
    182 /* bpf program prologue; kore_seccomp_enable() copies it in ahead of every registered filter. */
    183 static struct sock_filter filter_prologue[] = {
    184 	/* Load arch member into accumulator (A) (arch is __u32). */
    185 	KORE_BPF_LOAD(arch, 0),
    186 
    187 	/* Compare accumulator against the expected arch; with jump offsets (1, 0) a match presumably skips the kill below -- confirm against KORE_BPF_CMP. */
    188 	KORE_BPF_CMP(SECCOMP_AUDIT_ARCH, 1, 0),
    189 	KORE_BPF_RET(SECCOMP_RET_KILL),
    190 
    191 	/* Load the system call number into the accumulator. */
    192 	KORE_BPF_LOAD(nr, 0),
    193 };
    194 
    195 /* bpf program epilogue; kore_seccomp_enable() copies it in after every registered filter. */
    196 static struct sock_filter filter_epilogue[] = {
    197 	/* Reached if no system calls matched our list; kore_seccomp_enable() rewrites k to SECCOMP_RET_TRACE when tracing is on. */
    198 	BPF_STMT(BPF_RET+BPF_K, SECCOMP_KILL_POLICY)
    199 };
    200 
    /* Copies a filter template and patches the resolved syscall number into it. */
    201 static struct sock_filter	*seccomp_filter_update(struct sock_filter *,
    202 				    const char *, size_t);
    203 
    /* Used as instruction counts when sizing and copying the final program. */
    204 #define filter_prologue_len	KORE_FILTER_LEN(filter_prologue)
    205 #define filter_epilogue_len	KORE_FILTER_LEN(filter_epilogue)
    206 
    /* Logs the system call behind a seccomp trace event for the given pid. */
    207 static void	seccomp_register_violation(pid_t);
    208 
    /* A named bpf program fragment registered through kore_seccomp_filter(). */
    209 struct filter {
    210 	char			*name;	/* kore_strdup() copy, released in kore_seccomp_drop() */
    211 	struct sock_filter	*prog;	/* caller supplied instructions, stored as-is (not copied) */
    212 	size_t			instructions;	/* number of entries in prog */
    213 	TAILQ_ENTRY(filter)	list;	/* linkage on the filters list */
    214 };
    215 
    /* Every registered filter, in the order they are copied into the program. */
    216 static TAILQ_HEAD(, filter)	filters;
    /* When non-NULL, kore_seccomp_filter() inserts new entries before this one instead of at the tail. */
    217 static struct filter		*ufilter = NULL;
    218 
    219 /*
    220  * If enabled will instruct the parent process to ptrace its children and
    221  * log any seccomp SECCOMP_RET_TRACE rule.
    222  */
    223 int	kore_seccomp_tracing = 0;
    224 
    225 void
    226 kore_seccomp_init(void)
    227 {
    228 	TAILQ_INIT(&filters);
    229 }
    230 
    231 void
    232 kore_seccomp_drop(void)
    233 {
    234 	struct filter		*filter;
    235 
    236 	while ((filter = TAILQ_FIRST(&filters)) != NULL) {
    237 		if (!kore_quiet) {
    238 			kore_log(LOG_INFO,
    239 			    "seccomp filter '%s' dropped", filter->name);
    240 		}
    241 		TAILQ_REMOVE(&filters, filter, list);
    242 		kore_free(filter->name);
    243 		kore_free(filter);
    244 	}
    245 
    246 	TAILQ_INIT(&filters);
    247 }
    248 
    249 void
    250 kore_seccomp_enable(void)
    251 {
    252 	struct sock_filter		*sf;
    253 	struct sock_fprog		prog;
    254 	struct kore_runtime_call	*rcall;
    255 	struct filter			*filter;
    256 	size_t				prog_len, off, i;
    257 
    258 	/*
    259 	 * If kore_seccomp_tracing is turned on, set the default policy to
    260 	 * SECCOMP_RET_TRACE so we can log the system calls.
    261 	 */
    262 	if (kore_seccomp_tracing) {
    263 		filter_epilogue[0].k = SECCOMP_RET_TRACE;
    264 		kore_log(LOG_NOTICE, "seccomp tracing enabled");
    265 	}
    266 
    267 #if defined(KORE_USE_PYTHON)
    268 	ufilter = TAILQ_FIRST(&filters);
    269 	kore_python_seccomp_hook("koreapp.seccomp");
    270 	ufilter = NULL;
    271 #endif
    272 
    273 	/* Allow application to add its own filters. */
    274 	if ((rcall = kore_runtime_getcall("kore_seccomp_hook")) != NULL) {
    275 		ufilter = TAILQ_FIRST(&filters);
    276 		kore_runtime_execute(rcall);
    277 		kore_free(rcall);
    278 		ufilter = NULL;
    279 	}
    280 
    281 	if (worker->id != KORE_WORKER_KEYMGR) {
    282 		/* Add worker required syscalls. */
    283 		kore_seccomp_filter("worker", filter_kore,
    284 		    KORE_FILTER_LEN(filter_kore));
    285 	}
    286 
    287 	/* Start with the prologue. */
    288 	prog_len = filter_prologue_len;
    289 
    290 	/* Now account for all enabled filters. */
    291 	TAILQ_FOREACH(filter, &filters, list)
    292 		prog_len += filter->instructions;
    293 
    294 	/* Finally add the epilogue. */
    295 	prog_len += filter_epilogue_len;
    296 
    297 	/* Build the entire bpf program now. */
    298 	if ((sf = calloc(prog_len, sizeof(*sf))) == NULL)
    299 		fatalx("calloc");
    300 
    301 	off = 0;
    302 	for (i = 0; i < filter_prologue_len; i++)
    303 		sf[off++] = filter_prologue[i];
    304 
    305 	TAILQ_FOREACH(filter, &filters, list) {
    306 		for (i = 0; i < filter->instructions; i++)
    307 			sf[off++] = filter->prog[i];
    308 	}
    309 
    310 	for (i = 0; i < filter_epilogue_len; i++)
    311 		sf[off++] = filter_epilogue[i];
    312 
    313 	/* Lock and load it. */
    314 	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) == -1)
    315 		fatalx("prctl: %s", errno_s);
    316 
    317 	prog.filter = sf;
    318 	prog.len = prog_len;
    319 
    320 	if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog) == -1)
    321 		fatalx("prctl: %s", errno_s);
    322 
    323 #if defined(KORE_USE_PYTHON)
    324 	kore_python_seccomp_cleanup();
    325 #endif
    326 }
    327 
    328 int
    329 kore_seccomp_filter(const char *name, void *prog, size_t len)
    330 {
    331 	struct filter		*filter;
    332 
    333 	TAILQ_FOREACH(filter, &filters, list) {
    334 		if (!strcmp(filter->name, name))
    335 			return (KORE_RESULT_ERROR);
    336 	}
    337 
    338 	filter = kore_calloc(1, sizeof(*filter));
    339 
    340 	filter->prog = prog;
    341 	filter->instructions = len;
    342 	filter->name = kore_strdup(name);
    343 
    344 	if (ufilter) {
    345 		TAILQ_INSERT_BEFORE(ufilter, filter, list);
    346 	} else {
    347 		TAILQ_INSERT_TAIL(&filters, filter, list);
    348 	}
    349 
    350 	return (KORE_RESULT_OK);
    351 }
    352 
    353 void
    354 kore_seccomp_traceme(void)
    355 {
    356 	if (kore_seccomp_tracing == 0)
    357 		return;
    358 
    359 	if (ptrace(PTRACE_TRACEME, 0, NULL, NULL) == -1)
    360 		fatalx("ptrace: %s", errno_s);
    361 	if (kill(worker->pid, SIGSTOP) == -1)
    362 		fatalx("kill: %s", errno_s);
    363 }
    364 
    365 int
    366 kore_seccomp_trace(pid_t pid, int status)
    367 {
    368 	int	evt;
    369 
    370 	if (kore_seccomp_tracing == 0)
    371 		return (KORE_RESULT_ERROR);
    372 
    373 	if (WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP) {
    374 		if (ptrace(PTRACE_SETOPTIONS, pid, NULL,
    375 		    PTRACE_O_TRACESECCOMP | PTRACE_O_TRACECLONE |
    376 		    PTRACE_O_TRACEFORK) == -1)
    377 			fatal("ptrace: %s", errno_s);
    378 		if (ptrace(PTRACE_CONT, pid, NULL, NULL) == -1)
    379 			fatal("ptrace: %s", errno_s);
    380 		return (KORE_RESULT_OK);
    381 	}
    382 
    383 	if (WIFSTOPPED(status) && WSTOPSIG(status) == SIGTRAP) {
    384 		evt = status >> 8;
    385 		if (evt == (SIGTRAP | (PTRACE_EVENT_SECCOMP << 8)))
    386 			seccomp_register_violation(pid);
    387 		if (ptrace(PTRACE_CONT, pid, NULL, NULL) == -1)
    388 			fatal("ptrace: %s", errno_s);
    389 		return (KORE_RESULT_OK);
    390 	}
    391 
    392 	if (WIFSTOPPED(status)) {
    393 		if (ptrace(PTRACE_CONT, pid, NULL, WSTOPSIG(status)) == -1)
    394 			fatal("ptrace: %s", errno_s);
    395 		return (KORE_RESULT_OK);
    396 	}
    397 
    398 	return (KORE_RESULT_ERROR);
    399 }
    400 
    401 int
    402 kore_seccomp_syscall_resolve(const char *name)
    403 {
    404 	int		i;
    405 
    406 	for (i = 0; kore_syscall_map[i].name != NULL; i++) {
    407 		if (!strcmp(name, kore_syscall_map[i].name))
    408 			return (kore_syscall_map[i].nr);
    409 	}
    410 
    411 	return (-1);
    412 }
    413 
    414 const char *
    415 kore_seccomp_syscall_name(long sysnr)
    416 {
    417 	int		i;
    418 
    419 	for (i = 0; kore_syscall_map[i].name != NULL; i++) {
    420 		if (kore_syscall_map[i].nr == sysnr)
    421 			return (kore_syscall_map[i].name);
    422 	}
    423 
    424 	return ("unknown");
    425 }
    426 
    427 struct sock_filter *
    428 kore_seccomp_syscall_filter(const char *name, int action)
    429 {
    430 	struct sock_filter	filter[] = {
    431 		KORE_SYSCALL_FILTER(exit, action),
    432 		KORE_BPF_GUARD
    433 	};
    434 
    435 	return (seccomp_filter_update(filter, name, KORE_FILTER_LEN(filter)));
    436 }
    437 
    438 struct sock_filter *
    439 kore_seccomp_syscall_arg(const char *name, int action, int arg, int value)
    440 {
    441 	struct sock_filter	filter[] = {
    442 		KORE_SYSCALL_ARG(exit, arg, value, action),
    443 		KORE_BPF_GUARD
    444 	};
    445 
    446 	return (seccomp_filter_update(filter, name, KORE_FILTER_LEN(filter)));
    447 }
    448 
    449 struct sock_filter *
    450 kore_seccomp_syscall_mask(const char *name, int action, int arg, int value)
    451 {
    452 	struct sock_filter	filter[] = {
    453 		KORE_SYSCALL_MASK(exit, arg, value, action),
    454 		KORE_BPF_GUARD
    455 	};
    456 
    457 	return (seccomp_filter_update(filter, name, KORE_FILTER_LEN(filter)));
    458 }
    459 
    460 struct sock_filter *
    461 kore_seccomp_syscall_flag(const char *name, int action, int arg, int value)
    462 {
    463 	struct sock_filter	filter[] = {
    464 		KORE_SYSCALL_WITH_FLAG(exit, arg, value, action),
    465 		KORE_BPF_GUARD
    466 	};
    467 
    468 	return (seccomp_filter_update(filter, name, KORE_FILTER_LEN(filter)));
    469 }
    470 
    471 static void
    472 seccomp_register_violation(pid_t pid)
    473 {
    474 	int				idx;
    475 	struct kore_worker		*kw;
    476 	struct iovec			iov;
    477 #if defined(__arm__)
    478 	struct pt_regs			regs;
    479 #else
    480 	struct user_regs_struct		regs;
    481 #endif
    482 	long				sysnr;
    483 	const char			*name;
    484 
    485 	iov.iov_base = &regs;
    486 	iov.iov_len = sizeof(regs);
    487 
    488 	if (ptrace(PTRACE_GETREGSET, pid, 1, &iov) == -1)
    489 		fatal("ptrace: %s", errno_s);
    490 
    491 #if SECCOMP_AUDIT_ARCH == AUDIT_ARCH_X86_64
    492 	sysnr = regs.orig_rax;
    493 #elif SECCOMP_AUDIT_ARCH == AUDIT_ARCH_AARCH64
    494 	sysnr = regs.regs[8];
    495 #elif SECCOMP_AUDIT_ARCH == AUDIT_ARCH_ARM
    496 	sysnr = regs.uregs[7];
    497 #else
    498 #error "platform not supported"
    499 #endif
    500 
    501 	name = NULL;
    502 	for (idx = 0; idx < worker_count; idx++) {
    503 		kw = kore_worker_data(idx);
    504 		if (kw->pid == pid) {
    505 			name = kore_worker_name(kw->id);
    506 			break;
    507 		}
    508 	}
    509 
    510 	if (name == NULL)
    511 		name = "<child>";
    512 
    513 	kore_log(LOG_INFO, "seccomp violation, %s pid=%d, syscall=%ld:%s",
    514 	    name, pid, sysnr, kore_seccomp_syscall_name(sysnr));
    515 }
    516 
    /*
     * Clone a filter template and retarget it at the system call called name.
     *
     * elm instructions are copied from filter into a fresh allocation and the
     * k field of the first copied instruction is overwritten with the resolved
     * system call number.
     *
     * Returns the copy, or NULL if name cannot be resolved (nothing is
     * allocated in that case).
     */
    static struct sock_filter *
    seccomp_filter_update(struct sock_filter *filter, const char *name, size_t elm)
    {
    	struct sock_filter	*copy;
    	int			sysnr;

    	sysnr = kore_seccomp_syscall_resolve(name);
    	if (sysnr == -1)
    		return (NULL);

    	copy = kore_calloc(elm, sizeof(*copy));
    	memcpy(copy, filter, elm * sizeof(*copy));

    	/* Update the syscall number to the one specified. */
    	copy[0].k = sysnr;

    	return (copy);
    }
    533 }