Commit 804382d5 authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge tag 'vfs-6.15-rc1.overlayfs' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs

Pull vfs overlayfs updates from Christian Brauner:
 "Currently overlayfs uses the mounter's credentials for its
  override_creds() calls. That provides a consistent permission model.

  This patch series allows a caller to instruct overlayfs to use its
  credentials instead. The caller must be located in the same user
  namespace hierarchy as the user namespace the overlayfs instance will
  be mounted in. This provides a consistent and simple security model.

  With this it is possible to e.g., mount an overlayfs instance where
  the mounter must have CAP_SYS_ADMIN but the credentials used for
  override_creds() have dropped CAP_SYS_ADMIN. It also allows the usage
  of custom fs{g,u}id different from the caller's and other tweaks"

* tag 'vfs-6.15-rc1.overlayfs' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs:
  selftests/ovl: add third selftest for "override_creds"
  selftests/ovl: add second selftest for "override_creds"
  selftests/filesystems: add utils.{c,h}
  selftests/ovl: add first selftest for "override_creds"
  ovl: allow to specify override credentials
parents 0ec0d4ec 9c27e5cc
Loading
Loading
Loading
Loading
+19 −5
Original line number Diff line number Diff line
@@ -292,13 +292,27 @@ rename or unlink will of course be noticed and handled).
Permission model
----------------

An overlay filesystem stashes credentials that will be used when
accessing lower or upper filesystems.

In the old mount api the credentials of the task calling mount(2) are
stashed. In the new mount api the credentials of the task creating the
superblock through FSCONFIG_CMD_CREATE command of fsconfig(2) are
stashed.

Starting with kernel v6.15 it is possible to use the "override_creds"
mount option which will cause the credentials of the calling task to be
recorded. Note that "override_creds" is only meaningful when used with
the new mount api as the old mount api combines setting options and
superblock creation in a single mount(2) syscall.

Permission checking in the overlay filesystem follows these principles:

 1) permission check SHOULD return the same result before and after copy up

 2) task creating the overlay mount MUST NOT gain additional privileges

 3) non-mounting task MAY gain additional privileges through the overlay,
 3) task[*] MAY gain additional privileges through the overlay,
    compared to direct access on underlying lower or upper filesystems

This is achieved by performing two permission checks on each access:
@@ -306,7 +320,7 @@ This is achieved by performing two permission checks on each access:
 a) check if current task is allowed access based on local DAC (owner,
    group, mode and posix acl), as well as MAC checks

 b) check if mounting task would be allowed real operation on lower or
 b) check if stashed credentials would be allowed real operation on lower or
    upper layer based on underlying filesystem permissions, again including
    MAC checks

@@ -315,10 +329,10 @@ are copied up. On the other hand it can result in server enforced
permissions (used by NFS, for example) being ignored (3).

Check (b) ensures that no task gains permissions to underlying layers that
the mounting task does not have (2).  This also means that it is possible
the stashed credentials do not have (2).  This also means that it is possible
to create setups where the consistency rule (1) does not hold; normally,
however, the mounting task will have sufficient privileges to perform all
operations.
however, the stashed credentials will have sufficient privileges to
perform all operations.

Another way to demonstrate this model is drawing parallels between::

+25 −0
Original line number Diff line number Diff line
@@ -59,6 +59,7 @@ enum ovl_opt {
	Opt_metacopy,
	Opt_verity,
	Opt_volatile,
	Opt_override_creds,
};

static const struct constant_table ovl_parameter_bool[] = {
@@ -155,6 +156,7 @@ const struct fs_parameter_spec ovl_parameter_spec[] = {
	fsparam_enum("metacopy",            Opt_metacopy, ovl_parameter_bool),
	fsparam_enum("verity",              Opt_verity, ovl_parameter_verity),
	fsparam_flag("volatile",            Opt_volatile),
	fsparam_flag_no("override_creds",   Opt_override_creds),
	{}
};

@@ -662,6 +664,29 @@ static int ovl_parse_param(struct fs_context *fc, struct fs_parameter *param)
	case Opt_userxattr:
		config->userxattr = true;
		break;
	case Opt_override_creds: {
		const struct cred *cred = NULL;

		if (result.negated) {
			swap(cred, ofs->creator_cred);
			put_cred(cred);
			break;
		}

		if (!current_in_userns(fc->user_ns)) {
			err = -EINVAL;
			break;
		}

		cred = prepare_creds();
		if (cred)
			swap(cred, ofs->creator_cred);
		else
			err = -ENOMEM;

		put_cred(cred);
		break;
	}
	default:
		pr_err("unrecognized mount option \"%s\" or missing value\n",
		       param->key);
+15 −1
Original line number Diff line number Diff line
@@ -1305,6 +1305,7 @@ int ovl_fill_super(struct super_block *sb, struct fs_context *fc)
{
	struct ovl_fs *ofs = sb->s_fs_info;
	struct ovl_fs_context *ctx = fc->fs_private;
	const struct cred *old_cred = NULL;
	struct dentry *root_dentry;
	struct ovl_entry *oe;
	struct ovl_layer *layers;
@@ -1318,10 +1319,15 @@ int ovl_fill_super(struct super_block *sb, struct fs_context *fc)
	sb->s_d_op = &ovl_dentry_operations;

	err = -ENOMEM;
	if (!ofs->creator_cred)
		ofs->creator_cred = cred = prepare_creds();
	else
		cred = (struct cred *)ofs->creator_cred;
	if (!cred)
		goto out_err;

	old_cred = ovl_override_creds(sb);

	err = ovl_fs_params_verify(ctx, &ofs->config);
	if (err)
		goto out_err;
@@ -1481,11 +1487,19 @@ int ovl_fill_super(struct super_block *sb, struct fs_context *fc)

	sb->s_root = root_dentry;

	ovl_revert_creds(old_cred);
	return 0;

out_free_oe:
	ovl_free_entry(oe);
out_err:
	/*
	 * Revert creds before calling ovl_free_fs() which will call
	 * put_cred() and put_cred() requires that the cred's that are
	 * put are not the caller's creds, i.e., current->cred.
	 */
	if (old_cred)
		ovl_revert_creds(old_cred);
	ovl_free_fs(ofs);
	sb->s_fs_info = NULL;
	return err;
+9 −2
Original line number Diff line number Diff line
# SPDX-License-Identifier: GPL-2.0

TEST_GEN_PROGS := dev_in_maps set_layers_via_fds
CFLAGS += -Wall
CFLAGS += $(KHDR_INCLUDES)
LDLIBS += -lcap

CFLAGS := -Wall -Werror
LOCAL_HDRS += wrappers.h log.h

TEST_GEN_PROGS := dev_in_maps
TEST_GEN_PROGS += set_layers_via_fds

include ../../lib.mk

$(OUTPUT)/set_layers_via_fds: ../utils.c
+312 −4
Original line number Diff line number Diff line
@@ -6,30 +6,40 @@
#include <sched.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/stat.h>
#include <sys/sysmacros.h>
#include <sys/mount.h>
#include <unistd.h>

#include "../../kselftest_harness.h"
#include "../../pidfd/pidfd.h"
#include "log.h"
#include "../utils.h"
#include "wrappers.h"

FIXTURE(set_layers_via_fds) {
	int pidfd;
};

FIXTURE_SETUP(set_layers_via_fds)
{
	ASSERT_EQ(mkdir("/set_layers_via_fds", 0755), 0);
	ASSERT_EQ(mkdir("/set_layers_via_fds_tmpfs", 0755), 0);
	self->pidfd = -EBADF;
	EXPECT_EQ(mkdir("/set_layers_via_fds", 0755), 0);
	EXPECT_EQ(mkdir("/set_layers_via_fds_tmpfs", 0755), 0);
}

FIXTURE_TEARDOWN(set_layers_via_fds)
{
	if (self->pidfd >= 0) {
		EXPECT_EQ(sys_pidfd_send_signal(self->pidfd, SIGKILL, NULL, 0), 0);
		EXPECT_EQ(close(self->pidfd), 0);
	}
	umount2("/set_layers_via_fds", 0);
	ASSERT_EQ(rmdir("/set_layers_via_fds"), 0);
	EXPECT_EQ(rmdir("/set_layers_via_fds"), 0);

	umount2("/set_layers_via_fds_tmpfs", 0);
	ASSERT_EQ(rmdir("/set_layers_via_fds_tmpfs"), 0);
	EXPECT_EQ(rmdir("/set_layers_via_fds_tmpfs"), 0);
}

TEST_F(set_layers_via_fds, set_layers_via_fds)
@@ -218,6 +228,304 @@ TEST_F(set_layers_via_fds, set_500_layers_via_fds)
	ASSERT_EQ(close(fd_overlay), 0);
}

/*
 * Check that "override_creds" can be set, cleared again through
 * "nooverride_creds" and set once more, each time from a separate
 * short-lived child task, before the superblock is created, and that
 * the overlayfs instance can still be created and mounted afterwards.
 */
TEST_F(set_layers_via_fds, set_override_creds)
{
	int fd_context, fd_tmpfs, fd_overlay;
	int layer_fds[] = { [0 ... 3] = -EBADF };
	pid_t pid;
	int pidfd;

	/* Work in a private mount namespace so nothing propagates to the host. */
	ASSERT_EQ(unshare(CLONE_NEWNS), 0);
	ASSERT_EQ(sys_mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL), 0);

	/* A detached tmpfs provides the work, upper and lower directories. */
	fd_context = sys_fsopen("tmpfs", 0);
	ASSERT_GE(fd_context, 0);

	ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_CMD_CREATE, NULL, NULL, 0), 0);
	fd_tmpfs = sys_fsmount(fd_context, 0, 0);
	ASSERT_GE(fd_tmpfs, 0);
	ASSERT_EQ(close(fd_context), 0);

	ASSERT_EQ(mkdirat(fd_tmpfs, "w", 0755), 0);
	ASSERT_EQ(mkdirat(fd_tmpfs, "u", 0755), 0);
	ASSERT_EQ(mkdirat(fd_tmpfs, "l1", 0755), 0);
	ASSERT_EQ(mkdirat(fd_tmpfs, "l2", 0755), 0);

	layer_fds[0] = openat(fd_tmpfs, "w", O_DIRECTORY);
	ASSERT_GE(layer_fds[0], 0);

	layer_fds[1] = openat(fd_tmpfs, "u", O_DIRECTORY);
	ASSERT_GE(layer_fds[1], 0);

	layer_fds[2] = openat(fd_tmpfs, "l1", O_DIRECTORY);
	ASSERT_GE(layer_fds[2], 0);

	layer_fds[3] = openat(fd_tmpfs, "l2", O_DIRECTORY);
	ASSERT_GE(layer_fds[3], 0);

	ASSERT_EQ(sys_move_mount(fd_tmpfs, "", -EBADF, "/tmp", MOVE_MOUNT_F_EMPTY_PATH), 0);
	ASSERT_EQ(close(fd_tmpfs), 0);

	fd_context = sys_fsopen("overlay", 0);
	ASSERT_GE(fd_context, 0);

	/* Plain "lowerdir" is expected to reject a file descriptor. */
	ASSERT_NE(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir", NULL, layer_fds[2]), 0);

	ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "workdir",   NULL, layer_fds[0]), 0);
	ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "upperdir",  NULL, layer_fds[1]), 0);
	ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir+", NULL, layer_fds[2]), 0);
	ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir+", NULL, layer_fds[3]), 0);

	/*
	 * The layer fds have been handed to the context and are not used
	 * again; close them like set_override_creds_invalid does.
	 */
	for (int i = 0; i < ARRAY_SIZE(layer_fds); i++)
		ASSERT_EQ(close(layer_fds[i]), 0);

	ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_STRING, "metacopy", "on", 0), 0);

	/* First child: request that its credentials are recorded. */
	pid = create_child(&pidfd, 0);
	ASSERT_GE(pid, 0);
	if (pid == 0) {
		if (sys_fsconfig(fd_context, FSCONFIG_SET_FLAG, "override_creds", NULL, 0)) {
			TH_LOG("sys_fsconfig should have succeeded");
			_exit(EXIT_FAILURE);
		}

		_exit(EXIT_SUCCESS);
	}
	/*
	 * Reap through wait_for_pid() as set_override_creds_invalid does so
	 * that a child exiting with EXIT_FAILURE fails the test instead of
	 * being reaped without its exit status ever being looked at.
	 */
	ASSERT_EQ(wait_for_pid(pid), 0);
	ASSERT_EQ(close(pidfd), 0);

	/* Second child: drop the previously recorded credentials again. */
	pid = create_child(&pidfd, 0);
	ASSERT_GE(pid, 0);
	if (pid == 0) {
		if (sys_fsconfig(fd_context, FSCONFIG_SET_FLAG, "nooverride_creds", NULL, 0)) {
			TH_LOG("sys_fsconfig should have succeeded");
			_exit(EXIT_FAILURE);
		}

		_exit(EXIT_SUCCESS);
	}
	ASSERT_EQ(wait_for_pid(pid), 0);
	ASSERT_EQ(close(pidfd), 0);

	/* Third child: setting the option again after clearing must work. */
	pid = create_child(&pidfd, 0);
	ASSERT_GE(pid, 0);
	if (pid == 0) {
		if (sys_fsconfig(fd_context, FSCONFIG_SET_FLAG, "override_creds", NULL, 0)) {
			TH_LOG("sys_fsconfig should have succeeded");
			_exit(EXIT_FAILURE);
		}

		_exit(EXIT_SUCCESS);
	}
	ASSERT_EQ(wait_for_pid(pid), 0);
	ASSERT_EQ(close(pidfd), 0);

	/* Superblock creation happens in the parent, after the children exited. */
	ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_CMD_CREATE, NULL, NULL, 0), 0);

	fd_overlay = sys_fsmount(fd_context, 0, 0);
	ASSERT_GE(fd_overlay, 0);

	ASSERT_EQ(sys_move_mount(fd_overlay, "", -EBADF, "/set_layers_via_fds", MOVE_MOUNT_F_EMPTY_PATH), 0);

	ASSERT_EQ(close(fd_context), 0);
	ASSERT_EQ(close(fd_overlay), 0);
}

/*
 * Check that "override_creds" is refused when it is requested by a task
 * that switched to a different user namespace (fd_userns2) than the one
 * the task creating the overlayfs context switched to (fd_userns1), and
 * that the overlayfs instance can still be created and mounted by the
 * parent afterwards.
 *
 * NOTE(review): the child closes ipc_sockets[0] while the parent keeps
 * writing to it, so the two tasks do not appear to share a file descriptor
 * table. predictable_fd_context_nr is only created in the parent (dup3())
 * after the child exists, so the child's fsconfig() may be failing with
 * EBADF rather than because of the user namespace check - confirm against
 * create_child() and consider asserting the expected errno.
 */
TEST_F(set_layers_via_fds, set_override_creds_invalid)
{
	int fd_context, fd_tmpfs, fd_overlay, ret;
	int layer_fds[] = { [0 ... 3] = -EBADF };
	pid_t pid;
	int fd_userns1, fd_userns2;
	int ipc_sockets[2];
	char c;
	/* Fixed fd number so the child can name the context without being told. */
	const unsigned int predictable_fd_context_nr = 123;

	/*
	 * Two user namespace fds obtained with different arguments
	 * (presumably different host id mappings - see get_userns_fd()).
	 */
	fd_userns1 = get_userns_fd(0, 0, 10000);
	ASSERT_GE(fd_userns1, 0);

	fd_userns2 = get_userns_fd(0, 1234, 10000);
	ASSERT_GE(fd_userns2, 0);

	/* Socket pair used to tell the child when the overlayfs context exists. */
	ret = socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
	ASSERT_GE(ret, 0);

	/* The pidfd is stored in the fixture so teardown can kill a stuck child. */
	pid = create_child(&self->pidfd, 0);
	ASSERT_GE(pid, 0);
	if (pid == 0) {
		if (close(ipc_sockets[0])) {
			TH_LOG("close should have succeeded");
			_exit(EXIT_FAILURE);
		}

		if (!switch_userns(fd_userns2, 0, 0, false)) {
			TH_LOG("switch_userns should have succeeded");
			_exit(EXIT_FAILURE);
		}

		/* Block until the parent signals that the context fd is in place. */
		if (read_nointr(ipc_sockets[1], &c, 1) != 1) {
			TH_LOG("read_nointr should have succeeded");
			_exit(EXIT_FAILURE);
		}

		if (close(ipc_sockets[1])) {
			TH_LOG("close should have succeeded");
			_exit(EXIT_FAILURE);
		}

		/* Success here would be the failure case for this test. */
		if (!sys_fsconfig(predictable_fd_context_nr, FSCONFIG_SET_FLAG, "override_creds", NULL, 0)) {
			TH_LOG("sys_fsconfig should have failed");
			_exit(EXIT_FAILURE);
		}

		_exit(EXIT_SUCCESS);
	}

	/* Parent: move into the first user namespace and a private mount namespace. */
	ASSERT_EQ(close(ipc_sockets[1]), 0);
	ASSERT_EQ(switch_userns(fd_userns1, 0, 0, false), true);
	ASSERT_EQ(unshare(CLONE_NEWNS), 0);
	ASSERT_EQ(sys_mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL), 0);

	/* A detached tmpfs provides the work, upper and lower directories. */
	fd_context = sys_fsopen("tmpfs", 0);
	ASSERT_GE(fd_context, 0);

	ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_CMD_CREATE, NULL, NULL, 0), 0);
	fd_tmpfs = sys_fsmount(fd_context, 0, 0);
	ASSERT_GE(fd_tmpfs, 0);
	ASSERT_EQ(close(fd_context), 0);

	ASSERT_EQ(mkdirat(fd_tmpfs, "w", 0755), 0);
	ASSERT_EQ(mkdirat(fd_tmpfs, "u", 0755), 0);
	ASSERT_EQ(mkdirat(fd_tmpfs, "l1", 0755), 0);
	ASSERT_EQ(mkdirat(fd_tmpfs, "l2", 0755), 0);

	layer_fds[0] = openat(fd_tmpfs, "w", O_DIRECTORY);
	ASSERT_GE(layer_fds[0], 0);

	layer_fds[1] = openat(fd_tmpfs, "u", O_DIRECTORY);
	ASSERT_GE(layer_fds[1], 0);

	layer_fds[2] = openat(fd_tmpfs, "l1", O_DIRECTORY);
	ASSERT_GE(layer_fds[2], 0);

	layer_fds[3] = openat(fd_tmpfs, "l2", O_DIRECTORY);
	ASSERT_GE(layer_fds[3], 0);

	ASSERT_EQ(sys_move_mount(fd_tmpfs, "", -EBADF, "/tmp", MOVE_MOUNT_F_EMPTY_PATH), 0);
	ASSERT_EQ(close(fd_tmpfs), 0);

	/* Create the overlayfs context and pin it to the agreed fd number. */
	fd_context = sys_fsopen("overlay", 0);
	ASSERT_GE(fd_context, 0);
	ASSERT_EQ(dup3(fd_context, predictable_fd_context_nr, 0), predictable_fd_context_nr);
	ASSERT_EQ(close(fd_context), 0);
	fd_context = predictable_fd_context_nr;
	/* Release the child, which is blocked in read_nointr(). */
	ASSERT_EQ(write_nointr(ipc_sockets[0], "1", 1), 1);
	ASSERT_EQ(close(ipc_sockets[0]), 0);

	ASSERT_EQ(wait_for_pid(pid), 0);
	ASSERT_EQ(close(self->pidfd), 0);
	/* Child is gone; reset so teardown does not signal or close a stale pidfd. */
	self->pidfd = -EBADF;

	/* Plain "lowerdir" is expected to reject a file descriptor. */
	ASSERT_NE(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir", NULL, layer_fds[2]), 0);
	ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "workdir",   NULL, layer_fds[0]), 0);
	ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "upperdir",  NULL, layer_fds[1]), 0);
	ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir+", NULL, layer_fds[2]), 0);
	ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir+", NULL, layer_fds[3]), 0);

	/* The layer fds are no longer needed once handed to the context. */
	for (int i = 0; i < ARRAY_SIZE(layer_fds); i++)
		ASSERT_EQ(close(layer_fds[i]), 0);

	ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FLAG, "userxattr", NULL, 0), 0);

	/* The context must still be usable after the child's rejected request. */
	ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_CMD_CREATE, NULL, NULL, 0), 0);

	fd_overlay = sys_fsmount(fd_context, 0, 0);
	ASSERT_GE(fd_overlay, 0);

	ASSERT_EQ(sys_move_mount(fd_overlay, "", -EBADF, "/set_layers_via_fds", MOVE_MOUNT_F_EMPTY_PATH), 0);

	ASSERT_EQ(close(fd_context), 0);
	ASSERT_EQ(close(fd_overlay), 0);
	ASSERT_EQ(close(fd_userns1), 0);
	ASSERT_EQ(close(fd_userns2), 0);
}

/*
 * Check that credentials recorded through "override_creds" are the ones
 * overlayfs uses afterwards: a child drops CAP_MKNOD and CAP_SYS_ADMIN
 * before setting "override_creds", the parent then creates and mounts the
 * overlayfs instance, and creating a character device on it is expected
 * to fail with EPERM.
 */
TEST_F(set_layers_via_fds, set_override_creds_nomknod)
{
	int fd_context, fd_tmpfs, fd_overlay;
	int layer_fds[] = { [0 ... 3] = -EBADF };
	pid_t pid;
	int pidfd;

	/* Work in a private mount namespace so nothing propagates to the host. */
	ASSERT_EQ(unshare(CLONE_NEWNS), 0);
	ASSERT_EQ(sys_mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL), 0);

	/* A detached tmpfs provides the work, upper and lower directories. */
	fd_context = sys_fsopen("tmpfs", 0);
	ASSERT_GE(fd_context, 0);

	ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_CMD_CREATE, NULL, NULL, 0), 0);
	fd_tmpfs = sys_fsmount(fd_context, 0, 0);
	ASSERT_GE(fd_tmpfs, 0);
	ASSERT_EQ(close(fd_context), 0);

	ASSERT_EQ(mkdirat(fd_tmpfs, "w", 0755), 0);
	ASSERT_EQ(mkdirat(fd_tmpfs, "u", 0755), 0);
	ASSERT_EQ(mkdirat(fd_tmpfs, "l1", 0755), 0);
	ASSERT_EQ(mkdirat(fd_tmpfs, "l2", 0755), 0);

	layer_fds[0] = openat(fd_tmpfs, "w", O_DIRECTORY);
	ASSERT_GE(layer_fds[0], 0);

	layer_fds[1] = openat(fd_tmpfs, "u", O_DIRECTORY);
	ASSERT_GE(layer_fds[1], 0);

	layer_fds[2] = openat(fd_tmpfs, "l1", O_DIRECTORY);
	ASSERT_GE(layer_fds[2], 0);

	layer_fds[3] = openat(fd_tmpfs, "l2", O_DIRECTORY);
	ASSERT_GE(layer_fds[3], 0);

	ASSERT_EQ(sys_move_mount(fd_tmpfs, "", -EBADF, "/tmp", MOVE_MOUNT_F_EMPTY_PATH), 0);
	ASSERT_EQ(close(fd_tmpfs), 0);

	fd_context = sys_fsopen("overlay", 0);
	ASSERT_GE(fd_context, 0);

	/* Plain "lowerdir" is expected to reject a file descriptor. */
	ASSERT_NE(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir", NULL, layer_fds[2]), 0);

	ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "workdir",   NULL, layer_fds[0]), 0);
	ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "upperdir",  NULL, layer_fds[1]), 0);
	ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir+", NULL, layer_fds[2]), 0);
	ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FD, "lowerdir+", NULL, layer_fds[3]), 0);

	/*
	 * The layer fds have been handed to the context and are not used
	 * again; close them like set_override_creds_invalid does.
	 */
	for (int i = 0; i < ARRAY_SIZE(layer_fds); i++)
		ASSERT_EQ(close(layer_fds[i]), 0);

	ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_SET_FLAG, "userxattr", NULL, 0), 0);

	/* Child: shed the capabilities, then have its credentials recorded. */
	pid = create_child(&pidfd, 0);
	ASSERT_GE(pid, 0);
	if (pid == 0) {
		if (!cap_down(CAP_MKNOD))
			_exit(EXIT_FAILURE);

		if (!cap_down(CAP_SYS_ADMIN))
			_exit(EXIT_FAILURE);

		if (sys_fsconfig(fd_context, FSCONFIG_SET_FLAG, "override_creds", NULL, 0))
			_exit(EXIT_FAILURE);

		_exit(EXIT_SUCCESS);
	}
	/*
	 * Reap through wait_for_pid() as set_override_creds_invalid does so
	 * that a child which failed to drop a capability or to set
	 * "override_creds" is reported here, rather than surfacing later as a
	 * confusing mknodat() result.
	 */
	ASSERT_EQ(wait_for_pid(pid), 0);
	ASSERT_EQ(close(pidfd), 0);

	/* Superblock creation and mounting are done by the unrestricted parent. */
	ASSERT_EQ(sys_fsconfig(fd_context, FSCONFIG_CMD_CREATE, NULL, NULL, 0), 0);

	fd_overlay = sys_fsmount(fd_context, 0, 0);
	ASSERT_GE(fd_overlay, 0);

	ASSERT_EQ(sys_move_mount(fd_overlay, "", -EBADF, "/set_layers_via_fds", MOVE_MOUNT_F_EMPTY_PATH), 0);
	/* The recorded credentials lack CAP_MKNOD, so device creation must fail. */
	ASSERT_EQ(mknodat(fd_overlay, "dev-zero", S_IFCHR | 0644, makedev(1, 5)), -1);
	ASSERT_EQ(errno, EPERM);

	ASSERT_EQ(close(fd_context), 0);
	ASSERT_EQ(close(fd_overlay), 0);
}

TEST_F(set_layers_via_fds, set_500_layers_via_opath_fds)
{
	int fd_context, fd_tmpfs, fd_overlay, fd_work, fd_upper, fd_lower;
Loading