Commit 7f851936 authored by Linus Torvalds's avatar Linus Torvalds
Browse files
Pull overlayfs updates from Amir Goldstein:

 - Overlayfs aio cleanups and fixes

   Cleanups and minor fixes in preparation for factoring out of
   read/write passthrough code.

 - Overlayfs lock ordering changes

   Hold mnt_writers only throughout copy up instead of a long lived
   elevated refcount.

 - Add support for nesting overlayfs private xattrs

   There are cases where you want to use an overlayfs mount as a
   lowerdir for another overlayfs mount. For example, if the system
   rootfs is on overlayfs due to composefs, or to make it volatile (via
   tmpfs), then you cannot currently store a lowerdir on the rootfs,
   because the inner overlayfs will eat all the whiteouts and overlay
   xattrs. This means you can't e.g. store on the rootfs a prepared
   container image for use with overlayfs.

   This adds support for nesting of overlayfs mounts by escaping the
   problematic features and unescaping them when exposing to the
   overlayfs user.

 - Add new mount options for appending lowerdirs

* tag 'ovl-update-6.7' of git://git.kernel.org/pub/scm/linux/kernel/git/overlayfs/vfs:
  ovl: add support for appending lowerdirs one by one
  ovl: refactor layer parsing helpers
  ovl: store and show the user provided lowerdir mount option
  ovl: remove unused code in lowerdir param parsing
  ovl: Add documentation on nesting of overlayfs mounts
  ovl: Add an alternative type of whiteout
  ovl: Support escaped overlay.* xattrs
  ovl: Add OVL_XATTR_TRUSTED/USER_PREFIX_LEN macros
  ovl: Move xattr support to new xattrs.c file
  ovl: do not encode lower fh with upper sb_writers held
  ovl: do not open/llseek lower file with upper sb_writers held
  ovl: reorder ovl_want_write() after ovl_inode_lock()
  ovl: split ovl_want_write() into two helpers
  ovl: add helper ovl_file_modified()
  ovl: protect copying of realinode attributes to ovl inode
  ovl: punt write aio completion to workqueue
  ovl: propagate IOCB_APPEND flag on writes to realfile
  ovl: use simpler function to convert iocb to rw flags
parents c9d01179 24e16e38
Loading
Loading
Loading
Loading
+37 −3
Original line number Diff line number Diff line
@@ -344,10 +344,11 @@ escaping the colons with a single backslash. For example:

  mount -t overlay overlay -olowerdir=/a\:lower\:\:dir /merged

Since kernel version v6.5, directory names containing colons can also
be provided as lower layer using the fsconfig syscall from new mount api:
Since kernel version v6.8, directory names containing colons can also
be configured as lower layer using the "lowerdir+" mount options and the
fsconfig syscall from new mount api.  For example:

  fsconfig(fs_fd, FSCONFIG_SET_STRING, "lowerdir", "/a:lower::dir", 0);
  fsconfig(fs_fd, FSCONFIG_SET_STRING, "lowerdir+", "/a:lower::dir", 0);

In the latter case, colons in lower layer directory names will be escaped
as an octal characters (\072) when displayed in /proc/self/mountinfo.
@@ -416,6 +417,16 @@ Only the data of the files in the "data-only" lower layers may be visible
when a "metacopy" file in one of the lower layers above it, has a "redirect"
to the absolute path of the "lower data" file in the "data-only" lower layer.

Since kernel version v6.8, "data-only" lower layers can also be added using
the "datadir+" mount options and the fsconfig syscall from new mount api.
For example:

  fsconfig(fs_fd, FSCONFIG_SET_STRING, "lowerdir+", "/l1", 0);
  fsconfig(fs_fd, FSCONFIG_SET_STRING, "lowerdir+", "/l2", 0);
  fsconfig(fs_fd, FSCONFIG_SET_STRING, "lowerdir+", "/l3", 0);
  fsconfig(fs_fd, FSCONFIG_SET_STRING, "datadir+", "/do1", 0);
  fsconfig(fs_fd, FSCONFIG_SET_STRING, "datadir+", "/do2", 0);


fs-verity support
----------------------
@@ -504,6 +515,29 @@ directory tree on the same or different underlying filesystem, and even
to a different machine.  With the "inodes index" feature, trying to mount
the copied layers will fail the verification of the lower root file handle.

Nesting overlayfs mounts
------------------------

It is possible to use a lower directory that is stored on an overlayfs
mount. For regular files this does not need any special care. However, files
that have overlayfs attributes, such as whiteouts or "overlay.*" xattrs will be
interpreted by the underlying overlayfs mount and stripped out. In order to
allow the second overlayfs mount to see the attributes they must be escaped.

Overlayfs specific xattrs are escaped by using a special prefix of
"overlay.overlay.". So, a file with a "trusted.overlay.overlay.metacopy" xattr
in the lower dir will be exposed as a regular file with a
"trusted.overlay.metacopy" xattr in the overlayfs mount. This can be nested by
repeating the prefix multiple time, as each instance only removes one prefix.

A lower dir with a regular whiteout will always be handled by the overlayfs
mount, so to support storing an effective whiteout file in an overlayfs mount an
alternative form of whiteout is supported. This form is a regular, zero-size
file with the "overlay.whiteout" xattr set, inside a directory with the
"overlay.whiteouts" xattr set. Such whiteouts are never created by overlayfs,
but can be used by userspace tools (like containers) that generate lower layers.
These alternative whiteouts can be escaped using the standard xattr escape
mechanism in order to properly nest to any depth.

Non-standard behavior
---------------------
+1 −1
Original line number Diff line number Diff line
@@ -6,4 +6,4 @@
obj-$(CONFIG_OVERLAY_FS) += overlay.o

overlay-objs := super.o namei.o util.o inode.o file.o dir.o readdir.o \
		copy_up.o export.o params.o
		copy_up.o export.o params.o xattrs.o
+91 −51
Original line number Diff line number Diff line
@@ -252,7 +252,9 @@ static int ovl_copy_up_file(struct ovl_fs *ofs, struct dentry *dentry,
		return PTR_ERR(old_file);

	/* Try to use clone_file_range to clone up within the same fs */
	ovl_start_write(dentry);
	cloned = do_clone_file_range(old_file, 0, new_file, 0, len, 0);
	ovl_end_write(dentry);
	if (cloned == len)
		goto out_fput;
	/* Couldn't clone, so now we try to copy the data */
@@ -287,8 +289,12 @@ static int ovl_copy_up_file(struct ovl_fs *ofs, struct dentry *dentry,
		 * it may not recognize all kind of holes and sometimes
		 * only skips partial of hole area. However, it will be
		 * enough for most of the use cases.
		 *
		 * We do not hold upper sb_writers throughout the loop to avert
		 * lockdep warning with llseek of lower file in nested overlay:
		 * - upper sb_writers
		 * -- lower ovl_inode_lock (ovl_llseek)
		 */

		if (skip_hole && data_pos < old_pos) {
			data_pos = vfs_llseek(old_file, old_pos, SEEK_DATA);
			if (data_pos > old_pos) {
@@ -303,9 +309,11 @@ static int ovl_copy_up_file(struct ovl_fs *ofs, struct dentry *dentry,
			}
		}

		ovl_start_write(dentry);
		bytes = do_splice_direct(old_file, &old_pos,
					 new_file, &new_pos,
					 this_len, SPLICE_F_MOVE);
		ovl_end_write(dentry);
		if (bytes <= 0) {
			error = bytes;
			break;
@@ -426,29 +434,29 @@ struct ovl_fh *ovl_encode_real_fh(struct ovl_fs *ofs, struct dentry *real,
	return ERR_PTR(err);
}

int ovl_set_origin(struct ovl_fs *ofs, struct dentry *lower,
		   struct dentry *upper)
struct ovl_fh *ovl_get_origin_fh(struct ovl_fs *ofs, struct dentry *origin)
{
	const struct ovl_fh *fh = NULL;
	int err;

	/*
	 * When lower layer doesn't support export operations store a 'null' fh,
	 * so we can use the overlay.origin xattr to distignuish between a copy
	 * up and a pure upper inode.
	 */
	if (ovl_can_decode_fh(lower->d_sb)) {
		fh = ovl_encode_real_fh(ofs, lower, false);
		if (IS_ERR(fh))
			return PTR_ERR(fh);
	if (!ovl_can_decode_fh(origin->d_sb))
		return NULL;

	return ovl_encode_real_fh(ofs, origin, false);
}

int ovl_set_origin_fh(struct ovl_fs *ofs, const struct ovl_fh *fh,
		      struct dentry *upper)
{
	int err;

	/*
	 * Do not fail when upper doesn't support xattrs.
	 */
	err = ovl_check_setxattr(ofs, upper, OVL_XATTR_ORIGIN, fh->buf,
				 fh ? fh->fb.len : 0, 0);
	kfree(fh);

	/* Ignore -EPERM from setting "user.*" on symlink/special */
	return err == -EPERM ? 0 : err;
@@ -476,7 +484,7 @@ static int ovl_set_upper_fh(struct ovl_fs *ofs, struct dentry *upper,
 *
 * Caller must hold i_mutex on indexdir.
 */
static int ovl_create_index(struct dentry *dentry, struct dentry *origin,
static int ovl_create_index(struct dentry *dentry, const struct ovl_fh *fh,
			    struct dentry *upper)
{
	struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
@@ -502,7 +510,7 @@ static int ovl_create_index(struct dentry *dentry, struct dentry *origin,
	if (WARN_ON(ovl_test_flag(OVL_INDEX, d_inode(dentry))))
		return -EIO;

	err = ovl_get_index_name(ofs, origin, &name);
	err = ovl_get_index_name_fh(fh, &name);
	if (err)
		return err;

@@ -541,6 +549,7 @@ struct ovl_copy_up_ctx {
	struct dentry *destdir;
	struct qstr destname;
	struct dentry *workdir;
	const struct ovl_fh *origin_fh;
	bool origin;
	bool indexed;
	bool metacopy;
@@ -555,14 +564,16 @@ static int ovl_link_up(struct ovl_copy_up_ctx *c)
	struct ovl_fs *ofs = OVL_FS(c->dentry->d_sb);
	struct inode *udir = d_inode(upperdir);

	ovl_start_write(c->dentry);

	/* Mark parent "impure" because it may now contain non-pure upper */
	err = ovl_set_impure(c->parent, upperdir);
	if (err)
		return err;
		goto out;

	err = ovl_set_nlink_lower(c->dentry);
	if (err)
		return err;
		goto out;

	inode_lock_nested(udir, I_MUTEX_PARENT);
	upper = ovl_lookup_upper(ofs, c->dentry->d_name.name, upperdir,
@@ -581,10 +592,12 @@ static int ovl_link_up(struct ovl_copy_up_ctx *c)
	}
	inode_unlock(udir);
	if (err)
		return err;
		goto out;

	err = ovl_set_nlink_upper(c->dentry);

out:
	ovl_end_write(c->dentry);
	return err;
}

@@ -637,7 +650,7 @@ static int ovl_copy_up_metadata(struct ovl_copy_up_ctx *c, struct dentry *temp)
	 * hard link.
	 */
	if (c->origin) {
		err = ovl_set_origin(ofs, c->lowerpath.dentry, temp);
		err = ovl_set_origin_fh(ofs, c->origin_fh, temp);
		if (err)
			return err;
	}
@@ -719,21 +732,19 @@ static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c)
		.link = c->link
	};

	/* workdir and destdir could be the same when copying up to indexdir */
	err = -EIO;
	if (lock_rename(c->workdir, c->destdir) != NULL)
		goto unlock;

	err = ovl_prep_cu_creds(c->dentry, &cc);
	if (err)
		goto unlock;
		return err;

	ovl_start_write(c->dentry);
	inode_lock(wdir);
	temp = ovl_create_temp(ofs, c->workdir, &cattr);
	inode_unlock(wdir);
	ovl_end_write(c->dentry);
	ovl_revert_cu_creds(&cc);

	err = PTR_ERR(temp);
	if (IS_ERR(temp))
		goto unlock;
		return PTR_ERR(temp);

	/*
	 * Copy up data first and then xattrs. Writing data after
@@ -741,15 +752,28 @@ static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c)
	 */
	path.dentry = temp;
	err = ovl_copy_up_data(c, &path);
	if (err)
	/*
	 * We cannot hold lock_rename() throughout this helper, because or
	 * lock ordering with sb_writers, which shouldn't be held when calling
	 * ovl_copy_up_data(), so lock workdir and destdir and make sure that
	 * temp wasn't moved before copy up completion or cleanup.
	 * If temp was moved, abort without the cleanup.
	 */
	ovl_start_write(c->dentry);
	if (lock_rename(c->workdir, c->destdir) != NULL ||
	    temp->d_parent != c->workdir) {
		err = -EIO;
		goto unlock;
	} else if (err) {
		goto cleanup;
	}

	err = ovl_copy_up_metadata(c, temp);
	if (err)
		goto cleanup;

	if (S_ISDIR(c->stat.mode) && c->indexed) {
		err = ovl_create_index(c->dentry, c->lowerpath.dentry, temp);
		err = ovl_create_index(c->dentry, c->origin_fh, temp);
		if (err)
			goto cleanup;
	}
@@ -779,6 +803,7 @@ static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c)
		ovl_set_flag(OVL_WHITEOUTS, inode);
unlock:
	unlock_rename(c->workdir, c->destdir);
	ovl_end_write(c->dentry);

	return err;

@@ -802,9 +827,10 @@ static int ovl_copy_up_tmpfile(struct ovl_copy_up_ctx *c)
	if (err)
		return err;

	ovl_start_write(c->dentry);
	tmpfile = ovl_do_tmpfile(ofs, c->workdir, c->stat.mode);
	ovl_end_write(c->dentry);
	ovl_revert_cu_creds(&cc);

	if (IS_ERR(tmpfile))
		return PTR_ERR(tmpfile);

@@ -815,9 +841,11 @@ static int ovl_copy_up_tmpfile(struct ovl_copy_up_ctx *c)
			goto out_fput;
	}

	ovl_start_write(c->dentry);

	err = ovl_copy_up_metadata(c, temp);
	if (err)
		goto out_fput;
		goto out;

	inode_lock_nested(udir, I_MUTEX_PARENT);

@@ -831,7 +859,7 @@ static int ovl_copy_up_tmpfile(struct ovl_copy_up_ctx *c)
	inode_unlock(udir);

	if (err)
		goto out_fput;
		goto out;

	if (c->metacopy_digest)
		ovl_set_flag(OVL_HAS_DIGEST, d_inode(c->dentry));
@@ -843,6 +871,8 @@ static int ovl_copy_up_tmpfile(struct ovl_copy_up_ctx *c)
		ovl_set_upperdata(d_inode(c->dentry));
	ovl_inode_update(d_inode(c->dentry), dget(temp));

out:
	ovl_end_write(c->dentry);
out_fput:
	fput(tmpfile);
	return err;
@@ -861,6 +891,8 @@ static int ovl_do_copy_up(struct ovl_copy_up_ctx *c)
{
	int err;
	struct ovl_fs *ofs = OVL_FS(c->dentry->d_sb);
	struct dentry *origin = c->lowerpath.dentry;
	struct ovl_fh *fh = NULL;
	bool to_index = false;

	/*
@@ -877,25 +909,35 @@ static int ovl_do_copy_up(struct ovl_copy_up_ctx *c)
			to_index = true;
	}

	if (S_ISDIR(c->stat.mode) || c->stat.nlink == 1 || to_index)
	if (S_ISDIR(c->stat.mode) || c->stat.nlink == 1 || to_index) {
		fh = ovl_get_origin_fh(ofs, origin);
		if (IS_ERR(fh))
			return PTR_ERR(fh);

		/* origin_fh may be NULL */
		c->origin_fh = fh;
		c->origin = true;
	}

	if (to_index) {
		c->destdir = ovl_indexdir(c->dentry->d_sb);
		err = ovl_get_index_name(ofs, c->lowerpath.dentry, &c->destname);
		err = ovl_get_index_name(ofs, origin, &c->destname);
		if (err)
			return err;
			goto out_free_fh;
	} else if (WARN_ON(!c->parent)) {
		/* Disconnected dentry must be copied up to index dir */
		return -EIO;
		err = -EIO;
		goto out_free_fh;
	} else {
		/*
		 * Mark parent "impure" because it may now contain non-pure
		 * upper
		 */
		ovl_start_write(c->dentry);
		err = ovl_set_impure(c->parent, c->destdir);
		ovl_end_write(c->dentry);
		if (err)
			return err;
			goto out_free_fh;
	}

	/* Should we copyup with O_TMPFILE or with workdir? */
@@ -909,6 +951,7 @@ static int ovl_do_copy_up(struct ovl_copy_up_ctx *c)
	if (c->indexed)
		ovl_set_flag(OVL_INDEX, d_inode(c->dentry));

	ovl_start_write(c->dentry);
	if (to_index) {
		/* Initialize nlink for copy up of disconnected dentry */
		err = ovl_set_nlink_upper(c->dentry);
@@ -923,10 +966,13 @@ static int ovl_do_copy_up(struct ovl_copy_up_ctx *c)
		ovl_dentry_set_upper_alias(c->dentry);
		ovl_dentry_update_reval(c->dentry, ovl_dentry_upper(c->dentry));
	}
	ovl_end_write(c->dentry);

out:
	if (to_index)
		kfree(c->destname.name);
out_free_fh:
	kfree(fh);
	return err;
}

@@ -1011,15 +1057,16 @@ static int ovl_copy_up_meta_inode_data(struct ovl_copy_up_ctx *c)
	 * Writing to upper file will clear security.capability xattr. We
	 * don't want that to happen for normal copy-up operation.
	 */
	ovl_start_write(c->dentry);
	if (capability) {
		err = ovl_do_setxattr(ofs, upperpath.dentry, XATTR_NAME_CAPS,
				      capability, cap_size, 0);
		if (err)
			goto out_free;
	}


	err = ovl_removexattr(ofs, upperpath.dentry, OVL_XATTR_METACOPY);
	if (!err) {
		err = ovl_removexattr(ofs, upperpath.dentry,
				      OVL_XATTR_METACOPY);
	}
	ovl_end_write(c->dentry);
	if (err)
		goto out_free;

@@ -1170,17 +1217,10 @@ static bool ovl_open_need_copy_up(struct dentry *dentry, int flags)

int ovl_maybe_copy_up(struct dentry *dentry, int flags)
{
	int err = 0;

	if (ovl_open_need_copy_up(dentry, flags)) {
		err = ovl_want_write(dentry);
		if (!err) {
			err = ovl_copy_up_flags(dentry, flags);
			ovl_drop_write(dentry);
		}
	}
	if (!ovl_open_need_copy_up(dentry, flags))
		return 0;

	return err;
	return ovl_copy_up_flags(dentry, flags);
}

int ovl_copy_up_with_data(struct dentry *dentry)
+28 −36
Original line number Diff line number Diff line
@@ -477,7 +477,7 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
		goto out_unlock;

	err = -ESTALE;
	if (d_is_negative(upper) || !IS_WHITEOUT(d_inode(upper)))
	if (d_is_negative(upper) || !ovl_upper_is_whiteout(ofs, upper))
		goto out_dput;

	newdentry = ovl_create_temp(ofs, workdir, cattr);
@@ -559,10 +559,6 @@ static int ovl_create_or_link(struct dentry *dentry, struct inode *inode,
	struct cred *override_cred;
	struct dentry *parent = dentry->d_parent;

	err = ovl_copy_up(parent);
	if (err)
		return err;

	old_cred = ovl_override_creds(dentry->d_sb);

	/*
@@ -626,6 +622,10 @@ static int ovl_create_object(struct dentry *dentry, int mode, dev_t rdev,
		.link = link,
	};

	err = ovl_copy_up(dentry->d_parent);
	if (err)
		return err;

	err = ovl_want_write(dentry);
	if (err)
		goto out;
@@ -700,28 +700,24 @@ static int ovl_link(struct dentry *old, struct inode *newdir,
	int err;
	struct inode *inode;

	err = ovl_want_write(old);
	err = ovl_copy_up(old);
	if (err)
		goto out;

	err = ovl_copy_up(old);
	err = ovl_copy_up(new->d_parent);
	if (err)
		goto out_drop_write;
		goto out;

	err = ovl_copy_up(new->d_parent);
	err = ovl_nlink_start(old);
	if (err)
		goto out_drop_write;
		goto out;

	if (ovl_is_metacopy_dentry(old)) {
		err = ovl_set_link_redirect(old);
		if (err)
			goto out_drop_write;
			goto out_nlink_end;
	}

	err = ovl_nlink_start(old);
	if (err)
		goto out_drop_write;

	inode = d_inode(old);
	ihold(inode);

@@ -731,9 +727,8 @@ static int ovl_link(struct dentry *old, struct inode *newdir,
	if (err)
		iput(inode);

out_nlink_end:
	ovl_nlink_end(old);
out_drop_write:
	ovl_drop_write(old);
out:
	return err;
}
@@ -891,17 +886,13 @@ static int ovl_do_remove(struct dentry *dentry, bool is_dir)
			goto out;
	}

	err = ovl_want_write(dentry);
	if (err)
		goto out;

	err = ovl_copy_up(dentry->d_parent);
	if (err)
		goto out_drop_write;
		goto out;

	err = ovl_nlink_start(dentry);
	if (err)
		goto out_drop_write;
		goto out;

	old_cred = ovl_override_creds(dentry->d_sb);
	if (!lower_positive)
@@ -926,8 +917,6 @@ static int ovl_do_remove(struct dentry *dentry, bool is_dir)
	if (ovl_dentry_upper(dentry))
		ovl_copyattr(d_inode(dentry));

out_drop_write:
	ovl_drop_write(dentry);
out:
	ovl_cache_free(&list);
	return err;
@@ -1131,29 +1120,32 @@ static int ovl_rename(struct mnt_idmap *idmap, struct inode *olddir,
		}
	}

	err = ovl_want_write(old);
	if (err)
		goto out;

	err = ovl_copy_up(old);
	if (err)
		goto out_drop_write;
		goto out;

	err = ovl_copy_up(new->d_parent);
	if (err)
		goto out_drop_write;
		goto out;
	if (!overwrite) {
		err = ovl_copy_up(new);
		if (err)
			goto out_drop_write;
			goto out;
	} else if (d_inode(new)) {
		err = ovl_nlink_start(new);
		if (err)
			goto out_drop_write;
			goto out;

		update_nlink = true;
	}

	if (!update_nlink) {
		/* ovl_nlink_start() took ovl_want_write() */
		err = ovl_want_write(old);
		if (err)
			goto out;
	}

	old_cred = ovl_override_creds(old->d_sb);

	if (!list_empty(&list)) {
@@ -1219,7 +1211,7 @@ static int ovl_rename(struct mnt_idmap *idmap, struct inode *olddir,
		}
	} else {
		if (!d_is_negative(newdentry)) {
			if (!new_opaque || !ovl_is_whiteout(newdentry))
			if (!new_opaque || !ovl_upper_is_whiteout(ofs, newdentry))
				goto out_dput;
		} else {
			if (flags & RENAME_EXCHANGE)
@@ -1286,7 +1278,7 @@ static int ovl_rename(struct mnt_idmap *idmap, struct inode *olddir,
	revert_creds(old_cred);
	if (update_nlink)
		ovl_nlink_end(new);
out_drop_write:
	else
		ovl_drop_write(old);
out:
	dput(opaquedir);
+1 −6
Original line number Diff line number Diff line
@@ -23,12 +23,7 @@ static int ovl_encode_maybe_copy_up(struct dentry *dentry)
	if (ovl_dentry_upper(dentry))
		return 0;

	err = ovl_want_write(dentry);
	if (!err) {
	err = ovl_copy_up(dentry);
		ovl_drop_write(dentry);
	}

	if (err) {
		pr_warn_ratelimited("failed to copy up on encode (%pd2, err=%i)\n",
				    dentry, err);
Loading