Commit 3253aba3 authored by Andreas Hindborg's avatar Andreas Hindborg Committed by Jens Axboe
Browse files

rust: block: introduce `kernel::block::mq` module

Add initial abstractions for working with blk-mq.

This patch is a maintained, refactored subset of code originally published
by Wedson Almeida Filho <wedsonaf@gmail.com> [1].

[1] https://github.com/wedsonaf/linux/tree/f2cfd2fe0e2ca4e90994f96afe268bbd4382a891/rust/kernel/blk/mq.rs



Cc: Wedson Almeida Filho <wedsonaf@gmail.com>
Signed-off-by: Andreas Hindborg <a.hindborg@samsung.com>
Reviewed-by: Benno Lossin <benno.lossin@proton.me>
Link: https://lore.kernel.org/r/20240611114551.228679-2-nmi@metaspace.dk


Signed-off-by: Jens Axboe <axboe@kernel.dk>
parent c2670cf7
Loading
Loading
Loading
Loading
+3 −0
Original line number Diff line number Diff line
@@ -7,6 +7,8 @@
 */

#include <kunit/test.h>
#include <linux/blk_types.h>
#include <linux/blk-mq.h>
#include <linux/errname.h>
#include <linux/ethtool.h>
#include <linux/jiffies.h>
@@ -20,6 +22,7 @@

/*
 * `bindgen` gets confused at certain things.
 *
 * Each constant below gives an explicit C type and a
 * `RUST_CONST_HELPER_`-prefixed name to the value of the kernel macro named
 * by the rest of the identifier.
 * NOTE(review): presumably the prefix is stripped again when the Rust
 * bindings are generated -- confirm against the bindings build rules.
 */
const size_t RUST_CONST_HELPER_ARCH_SLAB_MINALIGN = ARCH_SLAB_MINALIGN;
const size_t RUST_CONST_HELPER_PAGE_SIZE = PAGE_SIZE;
const gfp_t RUST_CONST_HELPER_GFP_ATOMIC = GFP_ATOMIC;
const gfp_t RUST_CONST_HELPER_GFP_KERNEL = GFP_KERNEL;
const gfp_t RUST_CONST_HELPER_GFP_KERNEL_ACCOUNT = GFP_KERNEL_ACCOUNT;
+16 −0
Original line number Diff line number Diff line
@@ -186,3 +186,19 @@ static_assert(
	__alignof__(size_t) == __alignof__(uintptr_t),
	"Rust code expects C `size_t` to match Rust `usize`"
);

// This will soon be moved to a separate file, so no need to merge with above.
#include <linux/blk-mq.h>
#include <linux/blkdev.h>

/*
 * Forward to blk_mq_rq_to_pdu() and hand back whatever it returns for @rq.
 * Exported (GPL-only) under a `rust_helper_` name.
 */
void *rust_helper_blk_mq_rq_to_pdu(struct request *rq)
{
	void *pdu = blk_mq_rq_to_pdu(rq);

	return pdu;
}
EXPORT_SYMBOL_GPL(rust_helper_blk_mq_rq_to_pdu);

/*
 * Forward to blk_mq_rq_from_pdu() and hand back the request it returns for
 * @pdu. Exported (GPL-only) under a `rust_helper_` name.
 */
struct request *rust_helper_blk_mq_rq_from_pdu(void *pdu)
{
	struct request *rq = blk_mq_rq_from_pdu(pdu);

	return rq;
}
EXPORT_SYMBOL_GPL(rust_helper_blk_mq_rq_from_pdu);

rust/kernel/block.rs

0 → 100644
+5 −0
Original line number Diff line number Diff line
// SPDX-License-Identifier: GPL-2.0

//! Types for working with the block layer.

pub mod mq;
+98 −0
Original line number Diff line number Diff line
// SPDX-License-Identifier: GPL-2.0

//! This module provides types for implementing block drivers that interface the
//! blk-mq subsystem.
//!
//! To implement a block device driver, a Rust module must do the following:
//!
//! - Implement [`Operations`] for a type `T`.
//! - Create a [`TagSet<T>`].
//! - Create a [`GenDisk<T>`], via the [`GenDiskBuilder`].
//! - Add the disk to the system by calling [`GenDiskBuilder::build`] passing in
//!   the `TagSet` reference.
//!
//! The types available in this module that have direct C counterparts are:
//!
//! - The [`TagSet`] type that abstracts the C type `struct blk_mq_tag_set`.
//! - The [`GenDisk`] type that abstracts the C type `struct gendisk`.
//! - The [`Request`] type that abstracts the C type `struct request`.
//!
//! The kernel will interface with the block device driver by calling the method
//! implementations of the `Operations` trait.
//!
//! IO requests are passed to the driver as [`kernel::types::ARef<Request>`]
//! instances. The `Request` type is a wrapper around the C `struct request`.
//! The driver must mark end of processing by calling one of the
//! `Request::end` methods. Failure to do so can lead to deadlock or timeout
//! errors. Please note that the C function `blk_mq_start_request` is implicitly
//! called when the request is queued with the driver.
//!
//! The `TagSet` is responsible for creating and maintaining a mapping between
//! `Request`s and integer ids as well as carrying a pointer to the vtable
//! generated by `Operations`. This mapping is useful for associating
//! completions from hardware with the correct `Request` instance. The `TagSet`
//! determines the maximum queue depth by setting the number of `Request`
//! instances available to the driver, and it determines the number of queues to
//! instantiate for the driver. If possible, a driver should allocate one queue
//! per core, to keep queue data local to a core.
//!
//! One `TagSet` instance can be shared between multiple `GenDisk` instances.
//! This can be useful when implementing drivers where one piece of hardware
//! with one set of IO resources is represented to the user as multiple disks.
//!
//! One significant difference between block device drivers implemented with
//! these Rust abstractions and drivers implemented in C, is that the Rust
//! drivers have to own a reference count on the `Request` type when the IO is
//! in flight. This is to ensure that the C `struct request` instances backing
//! the Rust `Request` instances are live while the Rust driver holds a
//! reference to the `Request`. In addition, the conversion of an integer tag to
//! a `Request` via the `TagSet` would not be sound without this bookkeeping.
//!
//! [`GenDisk`]: gen_disk::GenDisk
//! [`GenDisk<T>`]: gen_disk::GenDisk
//! [`GenDiskBuilder`]: gen_disk::GenDiskBuilder
//! [`GenDiskBuilder::build`]: gen_disk::GenDiskBuilder::build
//!
//! # Example
//!
//! ```rust
//! use kernel::{
//!     alloc::flags,
//!     block::mq::*,
//!     new_mutex,
//!     prelude::*,
//!     sync::{Arc, Mutex},
//!     types::{ARef, ForeignOwnable},
//! };
//!
//! struct MyBlkDevice;
//!
//! #[vtable]
//! impl Operations for MyBlkDevice {
//!
//!     fn queue_rq(rq: ARef<Request<Self>>, _is_last: bool) -> Result {
//!         Request::end_ok(rq);
//!         Ok(())
//!     }
//!
//!     fn commit_rqs() {}
//! }
//!
//! let tagset: Arc<TagSet<MyBlkDevice>> =
//!     Arc::pin_init(TagSet::new(1, 256, 1), flags::GFP_KERNEL)?;
//! let mut disk = gen_disk::GenDiskBuilder::new()
//!     .capacity_sectors(4096)
//!     .build(format_args!("myblk"), tagset)?;
//!
//! # Ok::<(), kernel::error::Error>(())
//! ```

pub mod gen_disk;
mod operations;
mod raw_writer;
mod request;
mod tag_set;

pub use operations::Operations;
pub use request::Request;
pub use tag_set::TagSet;
+215 −0
Original line number Diff line number Diff line
// SPDX-License-Identifier: GPL-2.0

//! Generic disk abstraction.
//!
//! C header: [`include/linux/blkdev.h`](srctree/include/linux/blkdev.h)
//! C header: [`include/linux/blk-mq.h`](srctree/include/linux/blk-mq.h)

use crate::block::mq::{raw_writer::RawWriter, Operations, TagSet};
use crate::error;
use crate::{bindings, error::from_err_ptr, error::Result, sync::Arc};
use core::fmt::{self, Write};

/// A builder for [`GenDisk`].
///
/// Use this struct to configure and add new [`GenDisk`] to the VFS.
///
/// The settings are collected by the chained setter methods and only applied
/// to the underlying `struct gendisk` when [`GenDiskBuilder::build`] is
/// called.
pub struct GenDiskBuilder {
    // Whether the device is rotational media. When `false`, `build` sets
    // `QUEUE_FLAG_NONROT` on the request queue; when `true`, it clears it.
    rotational: bool,
    // Logical block size in bytes, passed to `blk_queue_logical_block_size`.
    logical_block_size: u32,
    // Physical block size in bytes, passed to `blk_queue_physical_block_size`.
    physical_block_size: u32,
    // Device capacity in 512 byte sectors, passed to `set_capacity`.
    capacity_sectors: u64,
}

impl Default for GenDiskBuilder {
    /// Default configuration: non-rotational media, zero capacity, and both
    /// the logical and the physical block size equal to `PAGE_SIZE`.
    fn default() -> Self {
        let page_size = bindings::PAGE_SIZE as u32;

        Self {
            capacity_sectors: 0,
            physical_block_size: page_size,
            logical_block_size: page_size,
            rotational: false,
        }
    }
}

impl GenDiskBuilder {
    /// Create a new instance.
    ///
    /// The instance starts out with the settings of [`Default::default`].
    pub fn new() -> Self {
        Self::default()
    }

    /// Set the rotational media attribute for the device to be built.
    pub fn rotational(mut self, rotational: bool) -> Self {
        self.rotational = rotational;
        self
    }

    /// Validate block size by verifying that it is between 512 and `PAGE_SIZE`
    /// (both inclusive), and that it is a power of two.
    ///
    /// Returns `EINVAL` if the size is rejected.
    fn validate_block_size(size: u32) -> Result<()> {
        if !(512..=bindings::PAGE_SIZE as u32).contains(&size) || !size.is_power_of_two() {
            Err(error::code::EINVAL)
        } else {
            Ok(())
        }
    }

    /// Set the logical block size of the device to be built.
    ///
    /// This method will check that block size is a power of two and between 512
    /// and `PAGE_SIZE`. If not, `EINVAL` is returned and the block size is not
    /// set.
    ///
    /// This is the smallest unit the storage device can address. It is
    /// typically 4096 bytes.
    pub fn logical_block_size(mut self, block_size: u32) -> Result<Self> {
        Self::validate_block_size(block_size)?;
        self.logical_block_size = block_size;
        Ok(self)
    }

    /// Set the physical block size of the device to be built.
    ///
    /// This method will check that block size is a power of two and between 512
    /// and `PAGE_SIZE`. If not, `EINVAL` is returned and the block size is not
    /// set.
    ///
    /// This is the smallest unit a physical storage device can write
    /// atomically. It is usually the same as the logical block size but may be
    /// bigger. One example is SATA drives with 4096 byte physical block size
    /// that expose a 512 byte logical block size to the operating system.
    pub fn physical_block_size(mut self, block_size: u32) -> Result<Self> {
        Self::validate_block_size(block_size)?;
        self.physical_block_size = block_size;
        Ok(self)
    }

    /// Set the capacity of the device to be built, in sectors (512 bytes).
    pub fn capacity_sectors(mut self, capacity: u64) -> Self {
        self.capacity_sectors = capacity;
        self
    }

    /// Apply the builder settings to `gendisk` and add it to the VFS.
    ///
    /// This installs the block device operations table, writes the formatted
    /// `name` into the disk name buffer, sets block sizes, capacity and the
    /// rotational flag, and finally calls `device_add_disk`.
    ///
    /// On error the disk has not been added to the VFS and the caller still
    /// owns the reference to `gendisk`.
    ///
    /// # Safety
    ///
    /// `gendisk` must point to a valid and initialized `struct gendisk` that
    /// has not been added to the VFS yet, and the caller must have exclusive
    /// access to it.
    unsafe fn configure_and_add(
        &self,
        gendisk: *mut bindings::gendisk,
        name: fmt::Arguments<'_>,
    ) -> Result<()> {
        const TABLE: bindings::block_device_operations = bindings::block_device_operations {
            submit_bio: None,
            open: None,
            release: None,
            ioctl: None,
            compat_ioctl: None,
            check_events: None,
            unlock_native_capacity: None,
            getgeo: None,
            set_read_only: None,
            swap_slot_free_notify: None,
            report_zones: None,
            devnode: None,
            alternative_gpt_sector: None,
            get_unique_id: None,
            // TODO: Set to THIS_MODULE. Waiting for const_refs_to_static feature to
            // be merged (unstable in rustc 1.78 which is staged for linux 6.10)
            // https://github.com/rust-lang/rust/issues/119618
            owner: core::ptr::null_mut(),
            pr_ops: core::ptr::null_mut(),
            free_disk: None,
            poll_bio: None,
        };

        // SAFETY: By the safety requirements of this function, `gendisk` is a
        // valid pointer and we have exclusive access.
        unsafe { (*gendisk).fops = &TABLE };

        let mut raw_writer = RawWriter::from_array(
            // SAFETY: `gendisk` points to a valid and initialized instance. We
            // have exclusive access, since the disk is not added to the VFS
            // yet.
            unsafe { &mut (*gendisk).disk_name },
        )?;
        raw_writer.write_fmt(name)?;
        raw_writer.write_char('\0')?;

        // SAFETY: `gendisk` points to a valid and initialized instance of
        // `struct gendisk`. We have exclusive access, so we cannot race.
        unsafe {
            bindings::blk_queue_logical_block_size((*gendisk).queue, self.logical_block_size)
        };

        // SAFETY: `gendisk` points to a valid and initialized instance of
        // `struct gendisk`. We have exclusive access, so we cannot race.
        unsafe {
            bindings::blk_queue_physical_block_size((*gendisk).queue, self.physical_block_size)
        };

        // SAFETY: `gendisk` points to a valid and initialized instance of
        // `struct gendisk`. `set_capacity` takes a lock to synchronize this
        // operation, so we will not race.
        unsafe { bindings::set_capacity(gendisk, self.capacity_sectors) };

        if !self.rotational {
            // SAFETY: `gendisk` points to a valid and initialized instance of
            // `struct gendisk`. This operation uses a relaxed atomic bit flip
            // operation, so there is no race on this field.
            unsafe { bindings::blk_queue_flag_set(bindings::QUEUE_FLAG_NONROT, (*gendisk).queue) };
        } else {
            // SAFETY: `gendisk` points to a valid and initialized instance of
            // `struct gendisk`. This operation uses a relaxed atomic bit flip
            // operation, so there is no race on this field.
            unsafe {
                bindings::blk_queue_flag_clear(bindings::QUEUE_FLAG_NONROT, (*gendisk).queue)
            };
        }

        crate::error::to_result(
            // SAFETY: `gendisk` points to a valid and initialized instance of
            // `struct gendisk`.
            unsafe {
                bindings::device_add_disk(core::ptr::null_mut(), gendisk, core::ptr::null_mut())
            },
        )
    }

    /// Build a new `GenDisk` and add it to the VFS.
    ///
    /// `name` is formatted into the disk name of the new device and `tagset`
    /// is the tag set the disk is allocated from; the returned [`GenDisk`]
    /// keeps a reference to it.
    ///
    /// # Errors
    ///
    /// Returns an error if allocating the disk fails, if `name` cannot be
    /// written to the disk name buffer, or if adding the disk fails. In the
    /// latter two cases the allocated disk is released again before
    /// returning.
    pub fn build<T: Operations>(
        self,
        name: fmt::Arguments<'_>,
        tagset: Arc<TagSet<T>>,
    ) -> Result<GenDisk<T>> {
        // NOTE(review): this key is a local of this function; confirm that the
        // C side does not retain the pointer beyond the lifetime of the key.
        let lock_class_key = crate::sync::LockClassKey::new();

        // SAFETY: `tagset.raw_tag_set()` points to a valid and initialized tag set
        let gendisk = from_err_ptr(unsafe {
            bindings::__blk_mq_alloc_disk(
                tagset.raw_tag_set(),
                core::ptr::null_mut(), // TODO: We can pass queue limits right here
                core::ptr::null_mut(),
                lock_class_key.as_ptr(),
            )
        })?;

        // SAFETY: `gendisk` was successfully allocated above, so it is valid
        // and initialized. It has not been added to the VFS and the pointer
        // has not been shared, so we have exclusive access.
        if let Err(err) = unsafe { self.configure_and_add(gendisk, name) } {
            // The disk never became visible, so drop the reference obtained
            // from the allocation instead of leaking it. This happens while
            // `tagset` is still alive.
            //
            // SAFETY: `gendisk` is valid, we hold the reference returned by
            // `__blk_mq_alloc_disk`, and the pointer is not used afterwards.
            unsafe { bindings::put_disk(gendisk) };
            return Err(err);
        }

        // INVARIANT: `gendisk` was initialized above.
        // INVARIANT: `gendisk` was added to the VFS via `device_add_disk` above.
        Ok(GenDisk {
            _tagset: tagset,
            gendisk,
        })
    }
}

/// A generic block device.
///
/// Instances are created by `GenDiskBuilder::build`. Dropping the value calls
/// `bindings::del_gendisk` on the wrapped disk.
///
/// # Invariants
///
///  - `gendisk` must always point to an initialized and valid `struct gendisk`.
///  - `gendisk` was added to the VFS through a call to
///     `bindings::device_add_disk`.
pub struct GenDisk<T: Operations> {
    // The tag set that was passed to `__blk_mq_alloc_disk` when the disk was
    // allocated. Never read; presumably held so the tag set stays alive for as
    // long as the disk exists.
    _tagset: Arc<TagSet<T>>,
    // Raw pointer to the C disk, see the type invariants.
    gendisk: *mut bindings::gendisk,
}

// SAFETY: A `GenDisk` consists of an owned pointer to a `struct gendisk` and
// an `Arc` to a `TagSet`. Moving it to another thread is safe as long as `T`
// is `Send`.
unsafe impl<T> Send for GenDisk<T> where T: Operations + Send {}

impl<T: Operations> Drop for GenDisk<T> {
    /// Remove the disk from the VFS and release the reference to it.
    ///
    /// This runs before the fields are dropped, so the disk is released while
    /// the tag set reference in `_tagset` is still held.
    fn drop(&mut self) {
        // SAFETY: By type invariant, `self.gendisk` points to a valid and
        // initialized instance of `struct gendisk`, and it was previously added
        // to the VFS.
        unsafe { bindings::del_gendisk(self.gendisk) };

        // `del_gendisk` only removes the disk; the reference obtained when the
        // disk was allocated must be dropped separately, otherwise the
        // `struct gendisk` is leaked.
        //
        // SAFETY: By type invariant, `self.gendisk` is valid. `self` owns the
        // reference obtained at allocation time, and the pointer is not used
        // again after this call.
        unsafe { bindings::put_disk(self.gendisk) };
    }
}
Loading