Unverified Commit 40ebc18b authored by Christian Brauner's avatar Christian Brauner
Browse files

Merge series 'Open block devices as files' of...

Merge series 'Open block devices as files' of https://lore.kernel.org/r/20240123-vfs-bdev-file-v2-0-adbd023e19cc@kernel.org

Pull block devices as files from Christian Brauner:

This opens block devices as files. Instead of introducing a separate
indirection into bdev_open_by_*() vis struct bdev_handle we can just
make bdev_file_open_by_*() return a struct file. Opening and closing a
block device from setup_bdev_super() and in all other places just
becomes equivalent to opening and closing a file.

This has held up in xfstests and in blktests so far and it seems stable
and clean. The equivalence of opening and closing block devices to
regular files is a win in and of itself imho. Added to that is the
ability to do away with struct bdev_handle completely and make various
low-level helpers private to the block layer.

All places were we currently stash a struct bdev_handle we just stash a
file and use an accessor such as file_bdev() akin to I_BDEV() to get to
the block device.

It's now also possible to use file->f_mapping as a replacement for
bdev->bd_inode->i_mapping and file->f_inode or file->f_mapping->host as
an alternative to bdev->bd_inode allowing us to significantly reduce or
even fully remove bdev->bd_inode in follow-up patches.

In addition, we could get rid of sb->s_bdev and various other places
that stash the block device directly and instead stash the block device
file. Again, this is follow-up work if we want this.

* series 'Open block devices as files' of https://lore.kernel.org/r/20240123-vfs-bdev-file-v2-0-adbd023e19cc@kernel.org

: (35 commits)
  file: add alloc_file_pseudo_noaccount()
  file: prepare for new helper
  init: flush async file closing
  block: remove bdev_handle completely
  block: don't rely on BLK_OPEN_RESTRICT_WRITES when yielding write access
  bdev: remove bdev pointer from struct bdev_handle
  bdev: make struct bdev_handle private to the block layer
  bdev: make bdev_{release, open_by_dev}() private to block layer
  bdev: remove bdev_open_by_path()
  reiserfs: port block device access to file
  ocfs2: port block device access to file
  nfs: port block device access to files
  jfs: port block device access to file
  f2fs: port block device access to files
  ext4: port block device access to file
  erofs: port device access to file
  btrfs: port device access to file
  bcachefs: port block device access to file
  target: port block device access to file
  s390: port block device access to file
  nvme: port block device access to file
  block2mtd: port device access to files
  bcache: port block device access to files
  ...

Signed-off-by: default avatarChristian Brauner <brauner@kernel.org>
parents 6613476e ab838b3f
Loading
Loading
Loading
Loading
+155 −97
Original line number Diff line number Diff line
@@ -49,6 +49,12 @@ struct block_device *I_BDEV(struct inode *inode)
}
EXPORT_SYMBOL(I_BDEV);

struct block_device *file_bdev(struct file *bdev_file)
{
	return I_BDEV(bdev_file->f_mapping->host);
}
EXPORT_SYMBOL(file_bdev);

static void bdev_write_inode(struct block_device *bdev)
{
	struct inode *inode = bdev->bd_inode;
@@ -368,12 +374,12 @@ static struct file_system_type bd_type = {
};

struct super_block *blockdev_superblock __ro_after_init;
struct vfsmount *blockdev_mnt __ro_after_init;
EXPORT_SYMBOL_GPL(blockdev_superblock);

void __init bdev_cache_init(void)
{
	int err;
	static struct vfsmount *bd_mnt __ro_after_init;

	bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode),
			0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
@@ -382,10 +388,10 @@ void __init bdev_cache_init(void)
	err = register_filesystem(&bd_type);
	if (err)
		panic("Cannot register bdev pseudo-fs");
	bd_mnt = kern_mount(&bd_type);
	if (IS_ERR(bd_mnt))
	blockdev_mnt = kern_mount(&bd_type);
	if (IS_ERR(blockdev_mnt))
		panic("Cannot create bdev pseudo-fs");
	blockdev_superblock = bd_mnt->mnt_sb;   /* For writeback */
	blockdev_superblock = blockdev_mnt->mnt_sb;   /* For writeback */
}

struct block_device *bdev_alloc(struct gendisk *disk, u8 partno)
@@ -696,6 +702,31 @@ static int blkdev_get_part(struct block_device *part, blk_mode_t mode)
	return ret;
}

int bdev_permission(dev_t dev, blk_mode_t mode, void *holder)
{
	int ret;

	ret = devcgroup_check_permission(DEVCG_DEV_BLOCK,
			MAJOR(dev), MINOR(dev),
			((mode & BLK_OPEN_READ) ? DEVCG_ACC_READ : 0) |
			((mode & BLK_OPEN_WRITE) ? DEVCG_ACC_WRITE : 0));
	if (ret)
		return ret;

	/* Blocking writes requires exclusive opener */
	if (mode & BLK_OPEN_RESTRICT_WRITES && !holder)
		return -EINVAL;

	/*
	 * We're using error pointers to indicate to ->release() when we
	 * failed to open that block device. Also this doesn't make sense.
	 */
	if (WARN_ON_ONCE(IS_ERR(holder)))
		return -EINVAL;

	return 0;
}

static void blkdev_put_part(struct block_device *part)
{
	struct block_device *whole = bdev_whole(part);
@@ -775,83 +806,55 @@ static void bdev_claim_write_access(struct block_device *bdev, blk_mode_t mode)
		bdev->bd_writers++;
}

static void bdev_yield_write_access(struct block_device *bdev, blk_mode_t mode)
static void bdev_yield_write_access(struct file *bdev_file)
{
	struct block_device *bdev;

	if (bdev_allow_write_mounted)
		return;

	bdev = file_bdev(bdev_file);
	/* Yield exclusive or shared write access. */
	if (mode & BLK_OPEN_RESTRICT_WRITES)
	if (bdev_file->f_mode & FMODE_WRITE) {
		if (bdev_writes_blocked(bdev))
			bdev_unblock_writes(bdev);
	else if (mode & BLK_OPEN_WRITE)
		else
			bdev->bd_writers--;
	}
}

/**
 * bdev_open_by_dev - open a block device by device number
 * @dev: device number of block device to open
 * bdev_open - open a block device
 * @bdev: block device to open
 * @mode: open mode (BLK_OPEN_*)
 * @holder: exclusive holder identifier
 * @hops: holder operations
 * @bdev_file: file for the block device
 *
 * Open the block device described by device number @dev. If @holder is not
 * %NULL, the block device is opened with exclusive access.  Exclusive opens may
 * nest for the same @holder.
 *
 * Use this interface ONLY if you really do not have anything better - i.e. when
 * you are behind a truly sucky interface and all you are given is a device
 * number.  Everything else should use bdev_open_by_path().
 * Open the block device. If @holder is not %NULL, the block device is opened
 * with exclusive access.  Exclusive opens may nest for the same @holder.
 *
 * CONTEXT:
 * Might sleep.
 *
 * RETURNS:
 * Handle with a reference to the block_device on success, ERR_PTR(-errno) on
 * failure.
 * zero on success, -errno on failure.
 */
struct bdev_handle *bdev_open_by_dev(dev_t dev, blk_mode_t mode, void *holder,
				     const struct blk_holder_ops *hops)
int bdev_open(struct block_device *bdev, blk_mode_t mode, void *holder,
	      const struct blk_holder_ops *hops, struct file *bdev_file)
{
	struct bdev_handle *handle = kmalloc(sizeof(struct bdev_handle),
					     GFP_KERNEL);
	struct block_device *bdev;
	bool unblock_events = true;
	struct gendisk *disk;
	struct gendisk *disk = bdev->bd_disk;
	int ret;

	if (!handle)
		return ERR_PTR(-ENOMEM);

	ret = devcgroup_check_permission(DEVCG_DEV_BLOCK,
			MAJOR(dev), MINOR(dev),
			((mode & BLK_OPEN_READ) ? DEVCG_ACC_READ : 0) |
			((mode & BLK_OPEN_WRITE) ? DEVCG_ACC_WRITE : 0));
	if (ret)
		goto free_handle;

	/* Blocking writes requires exclusive opener */
	if (mode & BLK_OPEN_RESTRICT_WRITES && !holder) {
		ret = -EINVAL;
		goto free_handle;
	}

	bdev = blkdev_get_no_open(dev);
	if (!bdev) {
		ret = -ENXIO;
		goto free_handle;
	}
	disk = bdev->bd_disk;

	if (holder) {
		mode |= BLK_OPEN_EXCL;
		ret = bd_prepare_to_claim(bdev, holder, hops);
		if (ret)
			goto put_blkdev;
			return ret;
	} else {
		if (WARN_ON_ONCE(mode & BLK_OPEN_EXCL)) {
			ret = -EIO;
			goto put_blkdev;
		}
		if (WARN_ON_ONCE(mode & BLK_OPEN_EXCL))
			return -EIO;
	}

	disk_block_events(disk);
@@ -892,10 +895,16 @@ struct bdev_handle *bdev_open_by_dev(dev_t dev, blk_mode_t mode, void *holder,

	if (unblock_events)
		disk_unblock_events(disk);
	handle->bdev = bdev;
	handle->holder = holder;
	handle->mode = mode;
	return handle;

	bdev_file->f_flags |= O_LARGEFILE;
	bdev_file->f_mode |= FMODE_BUF_RASYNC | FMODE_CAN_ODIRECT;
	if (bdev_nowait(bdev))
		bdev_file->f_mode |= FMODE_NOWAIT;
	bdev_file->f_mapping = bdev->bd_inode->i_mapping;
	bdev_file->f_wb_err = filemap_sample_wb_err(bdev_file->f_mapping);
	bdev_file->private_data = holder;

	return 0;
put_module:
	module_put(disk->fops->owner);
abort_claiming:
@@ -903,36 +912,80 @@ struct bdev_handle *bdev_open_by_dev(dev_t dev, blk_mode_t mode, void *holder,
		bd_abort_claiming(bdev, holder);
	mutex_unlock(&disk->open_mutex);
	disk_unblock_events(disk);
put_blkdev:
	blkdev_put_no_open(bdev);
free_handle:
	kfree(handle);
	return ERR_PTR(ret);
	return ret;
}
EXPORT_SYMBOL(bdev_open_by_dev);

/**
 * bdev_open_by_path - open a block device by name
 * @path: path to the block device to open
 * @mode: open mode (BLK_OPEN_*)
 * @holder: exclusive holder identifier
 * @hops: holder operations
 *
 * Open the block device described by the device file at @path.  If @holder is
 * not %NULL, the block device is opened with exclusive access.  Exclusive opens
 * may nest for the same @holder.
 *
 * CONTEXT:
 * Might sleep.
/*
 * If BLK_OPEN_WRITE_IOCTL is set then this is a historical quirk
 * associated with the floppy driver where it has allowed ioctls if the
 * file was opened for writing, but does not allow reads or writes.
 * Make sure that this quirk is reflected in @f_flags.
 *
 * RETURNS:
 * Handle with a reference to the block_device on success, ERR_PTR(-errno) on
 * failure.
 * It can also happen if a block device is opened as O_RDWR | O_WRONLY.
 */
struct bdev_handle *bdev_open_by_path(const char *path, blk_mode_t mode,
		void *holder, const struct blk_holder_ops *hops)
static unsigned blk_to_file_flags(blk_mode_t mode)
{
	unsigned int flags = 0;

	if ((mode & (BLK_OPEN_READ | BLK_OPEN_WRITE)) ==
	    (BLK_OPEN_READ | BLK_OPEN_WRITE))
		flags |= O_RDWR;
	else if (mode & BLK_OPEN_WRITE_IOCTL)
		flags |= O_RDWR | O_WRONLY;
	else if (mode & BLK_OPEN_WRITE)
		flags |= O_WRONLY;
	else if (mode & BLK_OPEN_READ)
		flags |= O_RDONLY; /* homeopathic, because O_RDONLY is 0 */
	else
		WARN_ON_ONCE(true);

	if (mode & BLK_OPEN_NDELAY)
		flags |= O_NDELAY;

	return flags;
}

struct file *bdev_file_open_by_dev(dev_t dev, blk_mode_t mode, void *holder,
				   const struct blk_holder_ops *hops)
{
	struct file *bdev_file;
	struct block_device *bdev;
	unsigned int flags;
	int ret;

	ret = bdev_permission(dev, mode, holder);
	if (ret)
		return ERR_PTR(ret);

	bdev = blkdev_get_no_open(dev);
	if (!bdev)
		return ERR_PTR(-ENXIO);

	flags = blk_to_file_flags(mode);
	bdev_file = alloc_file_pseudo_noaccount(bdev->bd_inode,
			blockdev_mnt, "", flags | O_LARGEFILE, &def_blk_fops);
	if (IS_ERR(bdev_file)) {
		blkdev_put_no_open(bdev);
		return bdev_file;
	}
	ihold(bdev->bd_inode);

	ret = bdev_open(bdev, mode, holder, hops, bdev_file);
	if (ret) {
		/* We failed to open the block device. Let ->release() know. */
		bdev_file->private_data = ERR_PTR(ret);
		fput(bdev_file);
		return ERR_PTR(ret);
	}
	return bdev_file;
}
EXPORT_SYMBOL(bdev_file_open_by_dev);

struct file *bdev_file_open_by_path(const char *path, blk_mode_t mode,
				    void *holder,
				    const struct blk_holder_ops *hops)
{
	struct bdev_handle *handle;
	struct file *file;
	dev_t dev;
	int error;

@@ -940,22 +993,28 @@ struct bdev_handle *bdev_open_by_path(const char *path, blk_mode_t mode,
	if (error)
		return ERR_PTR(error);

	handle = bdev_open_by_dev(dev, mode, holder, hops);
	if (!IS_ERR(handle) && (mode & BLK_OPEN_WRITE) &&
	    bdev_read_only(handle->bdev)) {
		bdev_release(handle);
		return ERR_PTR(-EACCES);
	file = bdev_file_open_by_dev(dev, mode, holder, hops);
	if (!IS_ERR(file) && (mode & BLK_OPEN_WRITE)) {
		if (bdev_read_only(file_bdev(file))) {
			fput(file);
			file = ERR_PTR(-EACCES);
		}
	}

	return handle;
	return file;
}
EXPORT_SYMBOL(bdev_open_by_path);
EXPORT_SYMBOL(bdev_file_open_by_path);

void bdev_release(struct bdev_handle *handle)
void bdev_release(struct file *bdev_file)
{
	struct block_device *bdev = handle->bdev;
	struct block_device *bdev = file_bdev(bdev_file);
	void *holder = bdev_file->private_data;
	struct gendisk *disk = bdev->bd_disk;

	/* We failed to open that block device. */
	if (IS_ERR(holder))
		goto put_no_open;

	/*
	 * Sync early if it looks like we're the last one.  If someone else
	 * opens the block device between now and the decrement of bd_openers
@@ -967,10 +1026,10 @@ void bdev_release(struct bdev_handle *handle)
		sync_blockdev(bdev);

	mutex_lock(&disk->open_mutex);
	bdev_yield_write_access(bdev, handle->mode);
	bdev_yield_write_access(bdev_file);

	if (handle->holder)
		bd_end_claim(bdev, handle->holder);
	if (holder)
		bd_end_claim(bdev, holder);

	/*
	 * Trigger event checking and tell drivers to flush MEDIA_CHANGE
@@ -986,10 +1045,9 @@ void bdev_release(struct bdev_handle *handle)
	mutex_unlock(&disk->open_mutex);

	module_put(disk->fops->owner);
put_no_open:
	blkdev_put_no_open(bdev);
	kfree(handle);
}
EXPORT_SYMBOL(bdev_release);

/**
 * lookup_bdev() - Look up a struct block_device by name.
+4 −0
Original line number Diff line number Diff line
@@ -516,4 +516,8 @@ static inline int req_ref_read(struct request *req)
	return atomic_read(&req->ref);
}

void bdev_release(struct file *bdev_file);
int bdev_open(struct block_device *bdev, blk_mode_t mode, void *holder,
	      const struct blk_holder_ops *hops, struct file *bdev_file);
int bdev_permission(dev_t dev, blk_mode_t mode, void *holder);
#endif /* BLK_INTERNAL_H */
+20 −26
Original line number Diff line number Diff line
@@ -569,18 +569,17 @@ static int blkdev_fsync(struct file *filp, loff_t start, loff_t end,
blk_mode_t file_to_blk_mode(struct file *file)
{
	blk_mode_t mode = 0;
	struct bdev_handle *handle = file->private_data;

	if (file->f_mode & FMODE_READ)
		mode |= BLK_OPEN_READ;
	if (file->f_mode & FMODE_WRITE)
		mode |= BLK_OPEN_WRITE;
	/*
	 * do_dentry_open() clears O_EXCL from f_flags, use handle->mode to
	 * determine whether the open was exclusive for already open files.
	 * do_dentry_open() clears O_EXCL from f_flags, use file->private_data
	 * to determine whether the open was exclusive for already open files.
	 */
	if (handle)
		mode |= handle->mode & BLK_OPEN_EXCL;
	if (file->private_data)
		mode |= BLK_OPEN_EXCL;
	else if (file->f_flags & O_EXCL)
		mode |= BLK_OPEN_EXCL;
	if (file->f_flags & O_NDELAY)
@@ -599,36 +598,31 @@ blk_mode_t file_to_blk_mode(struct file *file)

static int blkdev_open(struct inode *inode, struct file *filp)
{
	struct bdev_handle *handle;
	struct block_device *bdev;
	blk_mode_t mode;

	/*
	 * Preserve backwards compatibility and allow large file access
	 * even if userspace doesn't ask for it explicitly. Some mkfs
	 * binary needs it. We might want to drop this workaround
	 * during an unstable branch.
	 */
	filp->f_flags |= O_LARGEFILE;
	filp->f_mode |= FMODE_BUF_RASYNC | FMODE_CAN_ODIRECT;
	int ret;

	mode = file_to_blk_mode(filp);
	handle = bdev_open_by_dev(inode->i_rdev, mode,
			mode & BLK_OPEN_EXCL ? filp : NULL, NULL);
	if (IS_ERR(handle))
		return PTR_ERR(handle);
	/* Use the file as the holder. */
	if (mode & BLK_OPEN_EXCL)
		filp->private_data = filp;
	ret = bdev_permission(inode->i_rdev, mode, filp->private_data);
	if (ret)
		return ret;

	if (bdev_nowait(handle->bdev))
		filp->f_mode |= FMODE_NOWAIT;
	bdev = blkdev_get_no_open(inode->i_rdev);
	if (!bdev)
		return -ENXIO;

	filp->f_mapping = handle->bdev->bd_inode->i_mapping;
	filp->f_wb_err = filemap_sample_wb_err(filp->f_mapping);
	filp->private_data = handle;
	return 0;
	ret = bdev_open(bdev, mode, filp->private_data, NULL, filp);
	if (ret)
		blkdev_put_no_open(bdev);
	return ret;
}

static int blkdev_release(struct inode *inode, struct file *filp)
{
	bdev_release(filp->private_data);
	bdev_release(filp);
	return 0;
}

+6 −6
Original line number Diff line number Diff line
@@ -342,7 +342,7 @@ EXPORT_SYMBOL_GPL(disk_uevent);

int disk_scan_partitions(struct gendisk *disk, blk_mode_t mode)
{
	struct bdev_handle *handle;
	struct file *file;
	int ret = 0;

	if (disk->flags & (GENHD_FL_NO_PART | GENHD_FL_HIDDEN))
@@ -366,12 +366,12 @@ int disk_scan_partitions(struct gendisk *disk, blk_mode_t mode)
	}

	set_bit(GD_NEED_PART_SCAN, &disk->state);
	handle = bdev_open_by_dev(disk_devt(disk), mode & ~BLK_OPEN_EXCL, NULL,
				  NULL);
	if (IS_ERR(handle))
		ret = PTR_ERR(handle);
	file = bdev_file_open_by_dev(disk_devt(disk), mode & ~BLK_OPEN_EXCL,
				     NULL, NULL);
	if (IS_ERR(file))
		ret = PTR_ERR(file);
	else
		bdev_release(handle);
		fput(file);

	/*
	 * If blkdev_get_by_dev() failed early, GD_NEED_PART_SCAN is still set,
+4 −5
Original line number Diff line number Diff line
@@ -471,7 +471,7 @@ static int blkdev_bszset(struct block_device *bdev, blk_mode_t mode,
		int __user *argp)
{
	int ret, n;
	struct bdev_handle *handle;
	struct file *file;

	if (!capable(CAP_SYS_ADMIN))
		return -EACCES;
@@ -483,12 +483,11 @@ static int blkdev_bszset(struct block_device *bdev, blk_mode_t mode,
	if (mode & BLK_OPEN_EXCL)
		return set_blocksize(bdev, n);

	handle = bdev_open_by_dev(bdev->bd_dev, mode, &bdev, NULL);
	if (IS_ERR(handle))
	file = bdev_file_open_by_dev(bdev->bd_dev, mode, &bdev, NULL);
	if (IS_ERR(file))
		return -EBUSY;
	ret = set_blocksize(bdev, n);
	bdev_release(handle);

	fput(file);
	return ret;
}

Loading