Commit 2cfa582b authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge tag 'for-5.14/dm-changes' of...

Merge tag 'for-5.14/dm-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm

Pull device mapper updates from Mike Snitzer:

 - Various DM persistent-data library improvements and fixes that
   benefit both the DM thinp and cache targets.

 - A few small DM kcopyd efficiency improvements.

 - Significant zoned related block core, DM core and DM zoned target
   changes that culminate with adding zoned append emulation (which is
   required to properly fix DM crypt's zoned support).

 - Various DM writecache target changes that improve efficiency. Adds an
   optional "metadata_only" feature that only promotes bios flagged with
   REQ_META. But the most significant improvement is writecache's
   ability to pause writeback, for a configurable time, if/when the
   working set is larger than the cache (and the cache is full) -- this
   ensures performance is no worse than the slower origin device.

* tag 'for-5.14/dm-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm: (35 commits)
  dm writecache: make writeback pause configurable
  dm writecache: pause writeback if cache full and origin being written directly
  dm io tracker: factor out IO tracker
  dm btree remove: assign new_root only when removal succeeds
  dm zone: fix dm_revalidate_zones() memory allocation
  dm ps io affinity: remove redundant continue statement
  dm writecache: add optional "metadata_only" parameter
  dm writecache: add "cleaner" and "max_age" to Documentation
  dm writecache: write at least 4k when committing
  dm writecache: flush origin device when writing and cache is full
  dm writecache: have ssd writeback wait if the kcopyd workqueue is busy
  dm writecache: use list_move instead of list_del/list_add in writecache_writeback()
  dm writecache: commit just one block, not a full page
  dm writecache: remove unused gfp_t argument from wc_add_block()
  dm crypt: Fix zoned block device support
  dm: introduce zone append emulation
  dm: rearrange core declarations for extended use from dm-zone.c
  block: introduce BIO_ZONE_WRITE_LOCKED bio flag
  block: introduce bio zone helpers
  block: improve handling of all zones reset operation
  ...
parents dbe69e43 5c0de3d7
Loading
Loading
Loading
Loading
+23 −2
Original line number Diff line number Diff line
@@ -12,7 +12,6 @@ first sector should contain valid superblock from previous invocation.
Constructor parameters:

1. type of the cache device - "p" or "s"

	- p - persistent memory
	- s - SSD
2. the underlying device that will be cached
@@ -21,7 +20,6 @@ Constructor parameters:
   size)
5. the number of optional parameters (the parameters with an argument
   count as two)

	start_sector n		(default: 0)
		offset from the start of cache device in 512-byte sectors
	high_watermark n	(default: 50)
@@ -53,6 +51,27 @@ Constructor parameters:

		- some underlying devices perform better with fua, some
		  with nofua. The user should test it
	cleaner
		when this option is activated (either in the constructor
		arguments or by a message), the cache will not promote
		new writes (however, writes to already cached blocks are
		promoted, to avoid data corruption due to misordered
		writes) and it will gradually writeback any cached
		data. The userspace can then monitor the cleaning
		process with "dmsetup status". When the number of cached
		blocks drops to zero, userspace can unload the
		dm-writecache target and replace it with dm-linear or
		other targets.
	max_age n
		specifies the maximum age of a block in milliseconds. If
		a block is stored in the cache for too long, it will be
		written to the underlying device and cleaned up.
	metadata_only
		only metadata is promoted to the cache. This option
		improves performance for heavier REQ_META workloads.
	pause_writeback n	(default: 3000)
		pause writeback if there was some write I/O redirected to
		the origin volume in the last n milliseconds

Status:
1. error indicator - 0 if there was no error, otherwise error number
@@ -77,3 +96,5 @@ Messages:
		5. resume the device, so that it will use the linear
		   target
		6. the cache device is now inactive and it can be deleted
	cleaner
		See above "cleaner" constructor documentation.
+92 −27
Original line number Diff line number Diff line
@@ -161,18 +161,89 @@ int blkdev_report_zones(struct block_device *bdev, sector_t sector,
}
EXPORT_SYMBOL_GPL(blkdev_report_zones);

static inline bool blkdev_allow_reset_all_zones(struct block_device *bdev,
						sector_t sector,
						sector_t nr_sectors)
static inline unsigned long *blk_alloc_zone_bitmap(int node,
						   unsigned int nr_zones)
{
	if (!blk_queue_zone_resetall(bdev_get_queue(bdev)))
		return false;
	return kcalloc_node(BITS_TO_LONGS(nr_zones), sizeof(unsigned long),
			    GFP_NOIO, node);
}

static int blk_zone_need_reset_cb(struct blk_zone *zone, unsigned int idx,
				  void *data)
{
	/*
	 * REQ_OP_ZONE_RESET_ALL can be executed only if the number of sectors
	 * of the applicable zone range is the entire disk.
	 * For an all-zones reset, ignore conventional, empty, read-only
	 * and offline zones.
	 */
	return !sector && nr_sectors == get_capacity(bdev->bd_disk);
	switch (zone->cond) {
	case BLK_ZONE_COND_NOT_WP:
	case BLK_ZONE_COND_EMPTY:
	case BLK_ZONE_COND_READONLY:
	case BLK_ZONE_COND_OFFLINE:
		return 0;
	default:
		set_bit(idx, (unsigned long *)data);
		return 0;
	}
}

/*
 * Emulate an all-zones reset by issuing one REQ_OP_ZONE_RESET bio per zone
 * that needs it.
 *
 * A bitmap with one bit per zone is filled in by the disk's report_zones()
 * method through blk_zone_need_reset_cb(). The device is then walked one
 * zone at a time and a reset bio is chained for every zone whose bit is set.
 * Only the last bio of the chain is waited on here.
 *
 * Returns -ENOMEM if the bitmap cannot be allocated, the negative value
 * returned by report_zones() on failure, the result of submit_bio_wait()
 * when at least one bio was built, and 0 when no zone needed a reset.
 */
static int blkdev_zone_reset_all_emulated(struct block_device *bdev,
					  gfp_t gfp_mask)
{
	struct request_queue *q = bdev_get_queue(bdev);
	sector_t capacity = get_capacity(bdev->bd_disk);
	sector_t zone_sectors = blk_queue_zone_sectors(q);
	unsigned long *reset_map;
	struct bio *chain = NULL;
	sector_t pos;
	int err;

	reset_map = blk_alloc_zone_bitmap(q->node, q->nr_zones);
	if (!reset_map)
		return -ENOMEM;

	err = bdev->bd_disk->fops->report_zones(bdev->bd_disk, 0,
						q->nr_zones,
						blk_zone_need_reset_cb,
						reset_map);
	if (err < 0)
		goto free_map;

	/* A non-negative report_zones() result is not an error; start clean. */
	err = 0;

	for (pos = 0; pos < capacity; pos += zone_sectors) {
		/* Zones whose bit is clear are skipped without any I/O. */
		if (!test_bit(blk_queue_zone_no(q, pos), reset_map))
			continue;

		chain = blk_next_bio(chain, 0, gfp_mask);
		bio_set_dev(chain, bdev);
		chain->bi_opf = REQ_OP_ZONE_RESET | REQ_SYNC;
		chain->bi_iter.bi_sector = pos;

		/* This may take a while, so be nice to others */
		cond_resched();
	}

	if (chain) {
		err = submit_bio_wait(chain);
		bio_put(chain);
	}

free_map:
	kfree(reset_map);
	return err;
}

/*
 * Reset all zones of @bdev with a single REQ_OP_ZONE_RESET_ALL bio.
 *
 * The bio lives on the stack and is initialised without any bio_vec, then
 * submitted synchronously. @gfp_mask is accepted for symmetry with
 * blkdev_zone_reset_all_emulated() but is not used here.
 *
 * Returns the value of submit_bio_wait().
 */
static int blkdev_zone_reset_all(struct block_device *bdev, gfp_t gfp_mask)
{
	struct bio reset_bio;
	int ret;

	bio_init(&reset_bio, NULL, 0);
	bio_set_dev(&reset_bio, bdev);
	reset_bio.bi_opf = REQ_OP_ZONE_RESET_ALL | REQ_SYNC;

	ret = submit_bio_wait(&reset_bio);
	return ret;
}

/**
@@ -200,7 +271,7 @@ int blkdev_zone_mgmt(struct block_device *bdev, enum req_opf op,
	sector_t capacity = get_capacity(bdev->bd_disk);
	sector_t end_sector = sector + nr_sectors;
	struct bio *bio = NULL;
	int ret;
	int ret = 0;

	if (!blk_queue_is_zoned(q))
		return -EOPNOTSUPP;
@@ -222,20 +293,21 @@ int blkdev_zone_mgmt(struct block_device *bdev, enum req_opf op,
	if ((nr_sectors & (zone_sectors - 1)) && end_sector != capacity)
		return -EINVAL;

	while (sector < end_sector) {
		bio = blk_next_bio(bio, 0, gfp_mask);
		bio_set_dev(bio, bdev);

	/*
		 * Special case for the zone reset operation that reset all
		 * zones, this is useful for applications like mkfs.
	 * In the case of a zone reset operation over all zones,
	 * REQ_OP_ZONE_RESET_ALL can be used with devices supporting this
	 * command. For other devices, we emulate this command behavior by
	 * identifying the zones needing a reset.
	 */
		if (op == REQ_OP_ZONE_RESET &&
		    blkdev_allow_reset_all_zones(bdev, sector, nr_sectors)) {
			bio->bi_opf = REQ_OP_ZONE_RESET_ALL | REQ_SYNC;
			break;
	if (op == REQ_OP_ZONE_RESET && sector == 0 && nr_sectors == capacity) {
		if (!blk_queue_zone_resetall(q))
			return blkdev_zone_reset_all_emulated(bdev, gfp_mask);
		return blkdev_zone_reset_all(bdev, gfp_mask);
	}

	while (sector < end_sector) {
		bio = blk_next_bio(bio, 0, gfp_mask);
		bio_set_dev(bio, bdev);
		bio->bi_opf = op | REQ_SYNC;
		bio->bi_iter.bi_sector = sector;
		sector += zone_sectors;
@@ -396,13 +468,6 @@ int blkdev_zone_mgmt_ioctl(struct block_device *bdev, fmode_t mode,
	return ret;
}

/*
 * Allocate a bitmap large enough to hold one bit per zone.
 *
 * The array has BITS_TO_LONGS(@nr_zones) unsigned longs and is obtained with
 * kcalloc_node() using GFP_NOIO on NUMA node @node. Returns the allocation
 * result as-is; callers in this file check it for NULL and release it with
 * kfree().
 */
static inline unsigned long *blk_alloc_zone_bitmap(int node,
						   unsigned int nr_zones)
{
	unsigned long *bitmap;

	bitmap = kcalloc_node(BITS_TO_LONGS(nr_zones), sizeof(unsigned long),
			      GFP_NOIO, node);
	return bitmap;
}

void blk_queue_free_zone_bitmaps(struct request_queue *q)
{
	kfree(q->conv_zones_bitmap);
+4 −0
Original line number Diff line number Diff line
@@ -92,6 +92,10 @@ ifeq ($(CONFIG_DM_UEVENT),y)
dm-mod-objs			+= dm-uevent.o
endif

ifeq ($(CONFIG_BLK_DEV_ZONED),y)
dm-mod-objs			+= dm-zone.o
endif

ifeq ($(CONFIG_DM_VERITY_FEC),y)
dm-verity-objs			+= dm-verity-fec.o
endif
+6 −76
Original line number Diff line number Diff line
@@ -8,6 +8,7 @@
#include "dm-bio-prison-v2.h"
#include "dm-bio-record.h"
#include "dm-cache-metadata.h"
#include "dm-io-tracker.h"

#include <linux/dm-io.h>
#include <linux/dm-kcopyd.h>
@@ -39,77 +40,6 @@ DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(cache_copy_throttle,

/*----------------------------------------------------------------*/

/*
 * Tracks how much IO is in flight and when the device last went idle.
 */
struct io_tracker {
	/*
	 * Taken by iot_idle_for(), iot_io_begin() and iot_io_end() around
	 * their accesses to in_flight and idle_time.
	 */
	spinlock_t lock;

	/*
	 * Sectors of in-flight IO.
	 */
	sector_t in_flight;

	/*
	 * The time, in jiffies, when this device became idle (if it is
	 * indeed idle).
	 */
	unsigned long idle_time;
	/*
	 * Set to jiffies by iot_init(); no other access is visible in this
	 * part of the file.
	 */
	unsigned long last_update_time;
};

/*
 * Put a tracker into its initial state: nothing in flight, idle_time
 * zeroed, last_update_time stamped with the current jiffies, and the lock
 * initialised.
 */
static void iot_init(struct io_tracker *iot)
{
	iot->last_update_time = jiffies;
	iot->idle_time = 0;
	iot->in_flight = 0;
	spin_lock_init(&iot->lock);
}

/*
 * Unlocked idle check. Returns true only when no sectors are in flight and
 * the current jiffies value is after idle_time + @jifs.
 */
static bool __iot_idle_for(struct io_tracker *iot, unsigned long jifs)
{
	return !iot->in_flight &&
	       time_after(jiffies, iot->idle_time + jifs);
}

/*
 * Locked wrapper around __iot_idle_for(): the check runs with iot->lock
 * held via spin_lock_irq().
 */
static bool iot_idle_for(struct io_tracker *iot, unsigned long jifs)
{
	bool idle;

	spin_lock_irq(&iot->lock);
	idle = __iot_idle_for(iot, jifs);
	spin_unlock_irq(&iot->lock);

	return idle;
}

/*
 * Account for the start of an IO of @len sectors by adding it to the
 * in-flight total while holding iot->lock.
 */
static void iot_io_begin(struct io_tracker *iot, sector_t len)
{
	spin_lock_irq(&iot->lock);
	iot->in_flight += len;
	spin_unlock_irq(&iot->lock);
}

/*
 * Unlocked completion accounting. A zero @len is ignored; otherwise @len
 * sectors are subtracted from the in-flight total and, if that total
 * reaches zero, idle_time is stamped with the current jiffies.
 */
static void __iot_io_end(struct io_tracker *iot, sector_t len)
{
	if (len) {
		iot->in_flight -= len;
		if (iot->in_flight == 0)
			iot->idle_time = jiffies;
	}
}

/*
 * Locked wrapper around __iot_io_end(). Unlike iot_io_begin() this uses the
 * irqsave/irqrestore lock variants, so the caller's interrupt state is
 * preserved.
 */
static void iot_io_end(struct io_tracker *iot, sector_t len)
{
	unsigned long irq_flags;

	spin_lock_irqsave(&iot->lock, irq_flags);
	__iot_io_end(iot, len);
	spin_unlock_irqrestore(&iot->lock, irq_flags);
}

/*----------------------------------------------------------------*/

/*
 * Represents a chunk of future work.  'input' allows continuations to pass
 * values between themselves, typically error values.
@@ -470,7 +400,7 @@ struct cache {
	struct batcher committer;
	struct work_struct commit_ws;

	struct io_tracker tracker;
	struct dm_io_tracker tracker;

	mempool_t migration_pool;

@@ -866,7 +796,7 @@ static void accounted_begin(struct cache *cache, struct bio *bio)
	if (accountable_bio(cache, bio)) {
		pb = get_per_bio_data(bio);
		pb->len = bio_sectors(bio);
		iot_io_begin(&cache->tracker, pb->len);
		dm_iot_io_begin(&cache->tracker, pb->len);
	}
}

@@ -874,7 +804,7 @@ static void accounted_complete(struct cache *cache, struct bio *bio)
{
	struct per_bio_data *pb = get_per_bio_data(bio);

	iot_io_end(&cache->tracker, pb->len);
	dm_iot_io_end(&cache->tracker, pb->len);
}

static void accounted_request(struct cache *cache, struct bio *bio)
@@ -1642,7 +1572,7 @@ enum busy {

static enum busy spare_migration_bandwidth(struct cache *cache)
{
	bool idle = iot_idle_for(&cache->tracker, HZ);
	bool idle = dm_iot_idle_for(&cache->tracker, HZ);
	sector_t current_volume = (atomic_read(&cache->nr_io_migrations) + 1) *
		cache->sectors_per_block;

@@ -2603,7 +2533,7 @@ static int cache_create(struct cache_args *ca, struct cache **result)

	batcher_init(&cache->committer, commit_op, cache,
		     issue_op, cache, cache->wq);
	iot_init(&cache->tracker);
	dm_iot_init(&cache->tracker);

	init_rwsem(&cache->background_work_lock);
	prevent_background_work(cache);
+65 −0
Original line number Diff line number Diff line
@@ -114,8 +114,27 @@ struct mapped_device {
	bool init_tio_pdu:1;

	struct srcu_struct io_barrier;

#ifdef CONFIG_BLK_DEV_ZONED
	unsigned int nr_zones;
	unsigned int *zwp_offset;
#endif
};

/*
 * Bits for the flags field of struct mapped_device.
 *
 * These are bit numbers, not masks: dm_emulate_zone_append() passes
 * DMF_EMULATE_ZONE_APPEND to test_bit() on &md->flags.
 */
#define DMF_BLOCK_IO_FOR_SUSPEND 0
#define DMF_SUSPENDED 1
#define DMF_FROZEN 2
#define DMF_FREEING 3
#define DMF_DELETING 4
#define DMF_NOFLUSH_SUSPENDING 5
#define DMF_DEFERRED_REMOVE 6
#define DMF_SUSPENDED_INTERNALLY 7
#define DMF_POST_SUSPENDING 8
#define DMF_EMULATE_ZONE_APPEND 9

void disable_discard(struct mapped_device *md);
void disable_write_same(struct mapped_device *md);
void disable_write_zeroes(struct mapped_device *md);
@@ -130,6 +149,13 @@ static inline struct dm_stats *dm_get_stats(struct mapped_device *md)
	return &md->stats;
}

/*
 * Returns true when @md's queue is zoned and the DMF_EMULATE_ZONE_APPEND
 * bit is set in md->flags; false otherwise. The flag is not consulted for
 * a queue that is not zoned.
 */
static inline bool dm_emulate_zone_append(struct mapped_device *md)
{
	return blk_queue_is_zoned(md->queue) &&
	       test_bit(DMF_EMULATE_ZONE_APPEND, &md->flags);
}

#define DM_TABLE_MAX_DEPTH 16

struct dm_table {
@@ -173,6 +199,45 @@ struct dm_table {
#endif
};

/*
 * One of these is allocated per clone bio.
 */
#define DM_TIO_MAGIC 7282014
struct dm_target_io {
	/* presumably set to DM_TIO_MAGIC - confirm against the allocation code */
	unsigned int magic;
	/* the struct dm_io this target io is associated with */
	struct dm_io *io;
	struct dm_target *ti;
	unsigned int target_bio_nr;
	unsigned int *len_ptr;
	bool inside_dm_io;
	/*
	 * Must stay the last member: struct dm_io embeds this structure and
	 * notes that its final member is the 'struct bio'.
	 */
	struct bio clone;
};

/*
 * One of these is allocated per original bio.
 * It contains the first clone used for that original.
 */
#define DM_IO_MAGIC 5191977
struct dm_io {
	/* presumably set to DM_IO_MAGIC - confirm against the allocation code */
	unsigned int magic;
	struct mapped_device *md;
	blk_status_t status;
	/* incremented by dm_io_inc_pending() */
	atomic_t io_count;
	struct bio *orig_bio;
	unsigned long start_time;
	spinlock_t endio_lock;
	struct dm_stats_aux stats_aux;
	/* last member of dm_target_io is 'struct bio' */
	struct dm_target_io tio;
};

/*
 * Atomically increment @io's io_count.
 */
static inline void dm_io_inc_pending(struct dm_io *io)
{
	atomic_inc(&io->io_count);
}

void dm_io_dec_pending(struct dm_io *io, blk_status_t error);

static inline struct completion *dm_get_completion_from_kobject(struct kobject *kobj)
{
	return &container_of(kobj, struct dm_kobject_holder, kobj)->completion;
Loading