diff --git a/Documentation/core-api/dma-api-howto.rst b/Documentation/core-api/dma-api-howto.rst index 96fce2a9aa90..e97743ab0f26 100644 --- a/Documentation/core-api/dma-api-howto.rst +++ b/Documentation/core-api/dma-api-howto.rst @@ -146,6 +146,58 @@ What about block I/O and networking buffers? The block I/O and networking subsystems make sure that the buffers they use are valid for you to DMA from/to. +__dma_from_device_group_begin/end annotations +============================================= + +As explained previously, when a structure contains a DMA_FROM_DEVICE / +DMA_BIDIRECTIONAL buffer (device writes to memory) alongside fields that the +CPU writes to, cache line sharing between the DMA buffer and CPU-written fields +can cause data corruption on CPUs with DMA-incoherent caches. + +The ``__dma_from_device_group_begin(GROUP)/__dma_from_device_group_end(GROUP)`` +macros ensure proper alignment to prevent this:: + + struct my_device { + spinlock_t lock1; + __dma_from_device_group_begin(); + char dma_buffer1[16]; + char dma_buffer2[16]; + __dma_from_device_group_end(); + spinlock_t lock2; + }; + +To isolate a DMA buffer from adjacent fields, use +``__dma_from_device_group_begin(GROUP)`` before the first DMA buffer +field and ``__dma_from_device_group_end(GROUP)`` after the last DMA +buffer field (with the same GROUP name). This protects both the head +and tail of the buffer from cache line sharing. + +The GROUP parameter is an optional identifier that names the DMA buffer group +(in case you have several in the same structure):: + + struct my_device { + spinlock_t lock1; + __dma_from_device_group_begin(buffer1); + char dma_buffer1[16]; + __dma_from_device_group_end(buffer1); + spinlock_t lock2; + __dma_from_device_group_begin(buffer2); + char dma_buffer2[16]; + __dma_from_device_group_end(buffer2); + }; + +On cache-coherent platforms these macros expand to zero-length array markers. 
+On non-coherent platforms, they also ensure the minimal DMA alignment, which +can be as large as 128 bytes. + +.. note:: + + It is allowed (though somewhat fragile) to include extra fields, not + intended for DMA from the device, within the group (in order to pack the + structure tightly) - but only as long as the CPU does not write these + fields while any fields in the group are mapped for DMA_FROM_DEVICE or + DMA_BIDIRECTIONAL. + DMA addressing capabilities =========================== diff --git a/Documentation/core-api/dma-attributes.rst b/Documentation/core-api/dma-attributes.rst index 0bdc2be65e57..1d7bfad73b1c 100644 --- a/Documentation/core-api/dma-attributes.rst +++ b/Documentation/core-api/dma-attributes.rst @@ -148,3 +148,12 @@ DMA_ATTR_MMIO is appropriate. For architectures that require cache flushing for DMA coherence DMA_ATTR_MMIO will not perform any cache flushing. The address provided must never be mapped cacheable into the CPU. + +DMA_ATTR_CPU_CACHE_CLEAN +------------------------ + +This attribute indicates the CPU will not dirty any cacheline overlapping this +DMA_FROM_DEVICE/DMA_BIDIRECTIONAL buffer while it is mapped. This allows +multiple small buffers to safely share a cacheline without risk of data +corruption, suppressing DMA debug warnings about overlapping mappings. +All mappings sharing a cacheline should have this attribute. diff --git a/Documentation/userspace-api/vduse.rst b/Documentation/userspace-api/vduse.rst index bdb880e01132..81479d47c8b9 100644 --- a/Documentation/userspace-api/vduse.rst +++ b/Documentation/userspace-api/vduse.rst @@ -230,4 +230,57 @@ able to start the dataplane processing as follows: 5. Inject an interrupt for specific virtqueue with the VDUSE_INJECT_VQ_IRQ ioctl after the used ring is filled. +Enabling ASID (API version 1) +------------------------------ + +VDUSE supports per-address-space identifiers (ASIDs) starting with API +version 1. 
Set it up with ioctl(VDUSE_SET_API_VERSION) on `/dev/vduse/control` +and pass `VDUSE_API_VERSION_1` before creating a new VDUSE instance with +ioctl(VDUSE_CREATE_DEV). + +Afterwards, you can use the member asid of ioctl(VDUSE_VQ_SETUP) argument to +select the address space of the IOTLB you are querying. The driver could +change the address space of any virtqueue group by using the +VDUSE_SET_VQ_GROUP_ASID VDUSE message type, and the VDUSE instance needs to +reply with VDUSE_REQ_RESULT_OK if it was possible to change it. + +Similarly, you can use ioctl(VDUSE_IOTLB_GET_FD2) to obtain the file descriptor +describing an IOVA region of a specific ASID. Example usage: + +.. code-block:: c + + static void *iova_to_va(int dev_fd, uint32_t asid, uint64_t iova, + uint64_t *len) + { + int fd; + void *addr; + size_t size; + struct vduse_iotlb_entry_v2 entry = { 0 }; + + entry.v1.start = iova; + entry.v1.last = iova; + entry.asid = asid; + + fd = ioctl(dev_fd, VDUSE_IOTLB_GET_FD2, &entry); + if (fd < 0) + return NULL; + + size = entry.v1.last - entry.v1.start + 1; + *len = entry.v1.last - iova + 1; + addr = mmap(0, size, perm_to_prot(entry.v1.perm), MAP_SHARED, + fd, entry.v1.offset); + close(fd); + if (addr == MAP_FAILED) + return NULL; + + /* + * Using some data structures such as linked list to store + * the iotlb mapping. The munmap(2) should be called for the + * cached mapping when the corresponding VDUSE_UPDATE_IOTLB + * message is received or the device is reset. + */ + + return addr + iova - entry.v1.start; + } + For more details on the uAPI, please see include/uapi/linux/vduse.h. 
diff --git a/drivers/char/hw_random/virtio-rng.c b/drivers/char/hw_random/virtio-rng.c index dd998f4fe4f2..eb80a031c7be 100644 --- a/drivers/char/hw_random/virtio-rng.c +++ b/drivers/char/hw_random/virtio-rng.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -28,11 +29,13 @@ struct virtrng_info { unsigned int data_avail; unsigned int data_idx; /* minimal size returned by rng_buffer_size() */ + __dma_from_device_group_begin(); #if SMP_CACHE_BYTES < 32 u8 data[32]; #else u8 data[SMP_CACHE_BYTES]; #endif + __dma_from_device_group_end(); }; static void random_recv_done(struct virtqueue *vq) diff --git a/drivers/gpio/gpio-virtio.c b/drivers/gpio/gpio-virtio.c index 17e040991e46..ed6e0e90fa8a 100644 --- a/drivers/gpio/gpio-virtio.c +++ b/drivers/gpio/gpio-virtio.c @@ -10,6 +10,7 @@ */ #include +#include #include #include #include @@ -24,9 +25,13 @@ struct virtio_gpio_line { struct mutex lock; /* Protects line operation */ struct completion completion; - struct virtio_gpio_request req ____cacheline_aligned; - struct virtio_gpio_response res ____cacheline_aligned; + unsigned int rxlen; + + __dma_from_device_group_begin(); + struct virtio_gpio_request req; + struct virtio_gpio_response res; + __dma_from_device_group_end(); }; struct vgpio_irq_line { @@ -37,8 +42,10 @@ struct vgpio_irq_line { bool update_pending; bool queue_pending; - struct virtio_gpio_irq_request ireq ____cacheline_aligned; - struct virtio_gpio_irq_response ires ____cacheline_aligned; + __dma_from_device_group_begin(); + struct virtio_gpio_irq_request ireq; + struct virtio_gpio_irq_response ires; + __dma_from_device_group_end(); }; struct virtio_gpio { diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c index 6b1d8bcd06b9..89322717b181 100644 --- a/drivers/scsi/virtio_scsi.c +++ b/drivers/scsi/virtio_scsi.c @@ -29,6 +29,7 @@ #include #include #include +#include #include "sd.h" @@ -61,7 +62,7 @@ struct virtio_scsi_cmd { struct virtio_scsi_event_node { struct 
virtio_scsi *vscsi; - struct virtio_scsi_event event; + struct virtio_scsi_event *event; struct work_struct work; }; @@ -89,6 +90,11 @@ struct virtio_scsi { struct virtio_scsi_vq ctrl_vq; struct virtio_scsi_vq event_vq; + + __dma_from_device_group_begin(); + struct virtio_scsi_event events[VIRTIO_SCSI_EVENT_LEN]; + __dma_from_device_group_end(); + struct virtio_scsi_vq req_vqs[]; }; @@ -237,12 +243,12 @@ static int virtscsi_kick_event(struct virtio_scsi *vscsi, unsigned long flags; INIT_WORK(&event_node->work, virtscsi_handle_event); - sg_init_one(&sg, &event_node->event, sizeof(struct virtio_scsi_event)); + sg_init_one(&sg, event_node->event, sizeof(struct virtio_scsi_event)); spin_lock_irqsave(&vscsi->event_vq.vq_lock, flags); - err = virtqueue_add_inbuf(vscsi->event_vq.vq, &sg, 1, event_node, - GFP_ATOMIC); + err = virtqueue_add_inbuf_cache_clean(vscsi->event_vq.vq, &sg, 1, event_node, + GFP_ATOMIC); if (!err) virtqueue_kick(vscsi->event_vq.vq); @@ -257,6 +263,7 @@ static int virtscsi_kick_event_all(struct virtio_scsi *vscsi) for (i = 0; i < VIRTIO_SCSI_EVENT_LEN; i++) { vscsi->event_list[i].vscsi = vscsi; + vscsi->event_list[i].event = &vscsi->events[i]; virtscsi_kick_event(vscsi, &vscsi->event_list[i]); } @@ -380,7 +387,7 @@ static void virtscsi_handle_event(struct work_struct *work) struct virtio_scsi_event_node *event_node = container_of(work, struct virtio_scsi_event_node, work); struct virtio_scsi *vscsi = event_node->vscsi; - struct virtio_scsi_event *event = &event_node->event; + struct virtio_scsi_event *event = event_node->event; if (event->event & cpu_to_virtio32(vscsi->vdev, VIRTIO_SCSI_T_EVENTS_MISSED)) { diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index ddaa1366704b..b7e46338815f 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -2125,6 +2125,74 @@ static void teardown_steering(struct mlx5_vdpa_net *ndev) mlx5_destroy_flow_table(ndev->rxft); } +static int 
mlx5_vdpa_change_mac(struct mlx5_vdpa_net *ndev, + struct mlx5_core_dev *pfmdev, + const u8 *new_mac) +{ + struct mlx5_vdpa_dev *mvdev = &ndev->mvdev; + u8 old_mac[ETH_ALEN]; + + if (is_zero_ether_addr(new_mac)) + return -EINVAL; + + if (!is_zero_ether_addr(ndev->config.mac)) { + if (mlx5_mpfs_del_mac(pfmdev, ndev->config.mac)) { + mlx5_vdpa_warn(mvdev, "failed to delete old MAC %pM from MPFS table\n", + ndev->config.mac); + return -EIO; + } + } + + if (mlx5_mpfs_add_mac(pfmdev, (u8 *)new_mac)) { + mlx5_vdpa_warn(mvdev, "failed to insert new MAC %pM into MPFS table\n", + new_mac); + return -EIO; + } + + /* backup the original mac address so that if failed to add the forward rules + * we could restore it + */ + ether_addr_copy(old_mac, ndev->config.mac); + + ether_addr_copy(ndev->config.mac, new_mac); + + /* Need recreate the flow table entry, so that the packet could forward back + */ + mac_vlan_del(ndev, old_mac, 0, false); + + if (mac_vlan_add(ndev, ndev->config.mac, 0, false)) { + mlx5_vdpa_warn(mvdev, "failed to insert forward rules, try to restore\n"); + + /* Although it hardly run here, we still need double check */ + if (is_zero_ether_addr(old_mac)) { + mlx5_vdpa_warn(mvdev, "restore mac failed: Original MAC is zero\n"); + return -EIO; + } + + /* Try to restore original mac address to MFPS table, and try to restore + * the forward rule entry. 
+ */ + if (mlx5_mpfs_del_mac(pfmdev, ndev->config.mac)) { + mlx5_vdpa_warn(mvdev, "restore mac failed: delete MAC %pM from MPFS table failed\n", + ndev->config.mac); + } + + if (mlx5_mpfs_add_mac(pfmdev, old_mac)) { + mlx5_vdpa_warn(mvdev, "restore mac failed: insert old MAC %pM into MPFS table failed\n", + old_mac); + } + + ether_addr_copy(ndev->config.mac, old_mac); + + if (mac_vlan_add(ndev, ndev->config.mac, 0, false)) + mlx5_vdpa_warn(mvdev, "restore forward rules failed: insert forward rules failed\n"); + + return -EIO; + } + + return 0; +} + static virtio_net_ctrl_ack handle_ctrl_mac(struct mlx5_vdpa_dev *mvdev, u8 cmd) { struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); @@ -2132,12 +2200,13 @@ static virtio_net_ctrl_ack handle_ctrl_mac(struct mlx5_vdpa_dev *mvdev, u8 cmd) virtio_net_ctrl_ack status = VIRTIO_NET_ERR; struct mlx5_core_dev *pfmdev; size_t read; - u8 mac[ETH_ALEN], mac_back[ETH_ALEN]; + u8 mac[ETH_ALEN]; pfmdev = pci_get_drvdata(pci_physfn(mvdev->mdev->pdev)); switch (cmd) { case VIRTIO_NET_CTRL_MAC_ADDR_SET: - read = vringh_iov_pull_iotlb(&cvq->vring, &cvq->riov, (void *)mac, ETH_ALEN); + read = vringh_iov_pull_iotlb(&cvq->vring, &cvq->riov, + (void *)mac, ETH_ALEN); if (read != ETH_ALEN) break; @@ -2145,66 +2214,8 @@ static virtio_net_ctrl_ack handle_ctrl_mac(struct mlx5_vdpa_dev *mvdev, u8 cmd) status = VIRTIO_NET_OK; break; } - - if (is_zero_ether_addr(mac)) - break; - - if (!is_zero_ether_addr(ndev->config.mac)) { - if (mlx5_mpfs_del_mac(pfmdev, ndev->config.mac)) { - mlx5_vdpa_warn(mvdev, "failed to delete old MAC %pM from MPFS table\n", - ndev->config.mac); - break; - } - } - - if (mlx5_mpfs_add_mac(pfmdev, mac)) { - mlx5_vdpa_warn(mvdev, "failed to insert new MAC %pM into MPFS table\n", - mac); - break; - } - - /* backup the original mac address so that if failed to add the forward rules - * we could restore it - */ - memcpy(mac_back, ndev->config.mac, ETH_ALEN); - - memcpy(ndev->config.mac, mac, ETH_ALEN); - - /* Need recreate the 
flow table entry, so that the packet could forward back - */ - mac_vlan_del(ndev, mac_back, 0, false); - - if (mac_vlan_add(ndev, ndev->config.mac, 0, false)) { - mlx5_vdpa_warn(mvdev, "failed to insert forward rules, try to restore\n"); - - /* Although it hardly run here, we still need double check */ - if (is_zero_ether_addr(mac_back)) { - mlx5_vdpa_warn(mvdev, "restore mac failed: Original MAC is zero\n"); - break; - } - - /* Try to restore original mac address to MFPS table, and try to restore - * the forward rule entry. - */ - if (mlx5_mpfs_del_mac(pfmdev, ndev->config.mac)) { - mlx5_vdpa_warn(mvdev, "restore mac failed: delete MAC %pM from MPFS table failed\n", - ndev->config.mac); - } - - if (mlx5_mpfs_add_mac(pfmdev, mac_back)) { - mlx5_vdpa_warn(mvdev, "restore mac failed: insert old MAC %pM into MPFS table failed\n", - mac_back); - } - - memcpy(ndev->config.mac, mac_back, ETH_ALEN); - - if (mac_vlan_add(ndev, ndev->config.mac, 0, false)) - mlx5_vdpa_warn(mvdev, "restore forward rules failed: insert forward rules failed\n"); - - break; - } - - status = VIRTIO_NET_OK; + status = mlx5_vdpa_change_mac(ndev, pfmdev, mac) ? 
VIRTIO_NET_ERR : + VIRTIO_NET_OK; break; default: @@ -3640,9 +3651,6 @@ static int mlx5_set_group_asid(struct vdpa_device *vdev, u32 group, struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); int err = 0; - if (group >= MLX5_VDPA_NUMVQ_GROUPS) - return -EINVAL; - mvdev->mres.group2asid[group] = asid; mutex_lock(&mvdev->mres.lock); @@ -4044,7 +4052,6 @@ static void mlx5_vdpa_dev_del(struct vdpa_mgmt_dev *v_mdev, struct vdpa_device * static int mlx5_vdpa_set_attr(struct vdpa_mgmt_dev *v_mdev, struct vdpa_device *dev, const struct vdpa_dev_set_config *add_config) { - struct virtio_net_config *config; struct mlx5_core_dev *pfmdev; struct mlx5_vdpa_dev *mvdev; struct mlx5_vdpa_net *ndev; @@ -4054,16 +4061,23 @@ static int mlx5_vdpa_set_attr(struct vdpa_mgmt_dev *v_mdev, struct vdpa_device * mvdev = to_mvdev(dev); ndev = to_mlx5_vdpa_ndev(mvdev); mdev = mvdev->mdev; - config = &ndev->config; down_write(&ndev->reslock); - if (add_config->mask & (1 << VDPA_ATTR_DEV_NET_CFG_MACADDR)) { + + if (add_config->mask & BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MACADDR)) { + if (!(ndev->mvdev.status & VIRTIO_CONFIG_S_DRIVER_OK)) { + ndev->mvdev.mlx_features |= BIT_ULL(VIRTIO_NET_F_MAC); + } else { + mlx5_vdpa_warn(mvdev, "device running, skip updating MAC\n"); + err = -EBUSY; + goto out; + } pfmdev = pci_get_drvdata(pci_physfn(mdev->pdev)); - err = mlx5_mpfs_add_mac(pfmdev, config->mac); - if (!err) - ether_addr_copy(config->mac, add_config->net.mac); + err = mlx5_vdpa_change_mac(ndev, pfmdev, + (u8 *)add_config->net.mac); } +out: up_write(&ndev->reslock); return err; } diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c index c1c6431950e1..df9c7ddc5d78 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c @@ -606,12 +606,6 @@ static int vdpasim_set_group_asid(struct vdpa_device *vdpa, unsigned int group, struct vhost_iotlb *iommu; int i; - if (group > vdpasim->dev_attr.ngroups) - return -EINVAL; - - if (asid >= vdpasim->dev_attr.nas) - 
return -EINVAL; - iommu = &vdpasim->iommu[asid]; mutex_lock(&vdpasim->mutex); diff --git a/drivers/vdpa/vdpa_user/iova_domain.c b/drivers/vdpa/vdpa_user/iova_domain.c index 4352b5cf74f0..0a9f668467a8 100644 --- a/drivers/vdpa/vdpa_user/iova_domain.c +++ b/drivers/vdpa/vdpa_user/iova_domain.c @@ -493,17 +493,15 @@ void vduse_domain_unmap_page(struct vduse_iova_domain *domain, vduse_domain_free_iova(iovad, dma_addr, size); } -void *vduse_domain_alloc_coherent(struct vduse_iova_domain *domain, - size_t size, dma_addr_t *dma_addr, - gfp_t flag) +dma_addr_t vduse_domain_alloc_coherent(struct vduse_iova_domain *domain, + size_t size, void *orig) { struct iova_domain *iovad = &domain->consistent_iovad; unsigned long limit = domain->iova_limit; dma_addr_t iova = vduse_domain_alloc_iova(iovad, size, limit); - void *orig = alloc_pages_exact(size, flag); - if (!iova || !orig) - goto err; + if (!iova) + return DMA_MAPPING_ERROR; spin_lock(&domain->iotlb_lock); if (vduse_iotlb_add_range(domain, (u64)iova, (u64)iova + size - 1, @@ -514,27 +512,20 @@ void *vduse_domain_alloc_coherent(struct vduse_iova_domain *domain, } spin_unlock(&domain->iotlb_lock); - *dma_addr = iova; + return iova; - return orig; err: - *dma_addr = DMA_MAPPING_ERROR; - if (orig) - free_pages_exact(orig, size); - if (iova) - vduse_domain_free_iova(iovad, iova, size); + vduse_domain_free_iova(iovad, iova, size); - return NULL; + return DMA_MAPPING_ERROR; } void vduse_domain_free_coherent(struct vduse_iova_domain *domain, size_t size, - void *vaddr, dma_addr_t dma_addr, - unsigned long attrs) + dma_addr_t dma_addr, unsigned long attrs) { struct iova_domain *iovad = &domain->consistent_iovad; struct vhost_iotlb_map *map; struct vdpa_map_file *map_file; - phys_addr_t pa; spin_lock(&domain->iotlb_lock); map = vhost_iotlb_itree_first(domain->iotlb, (u64)dma_addr, @@ -546,12 +537,10 @@ void vduse_domain_free_coherent(struct vduse_iova_domain *domain, size_t size, map_file = (struct vdpa_map_file *)map->opaque; 
fput(map_file->file); kfree(map_file); - pa = map->addr; vhost_iotlb_map_free(domain->iotlb, map); spin_unlock(&domain->iotlb_lock); vduse_domain_free_iova(iovad, dma_addr, size); - free_pages_exact(phys_to_virt(pa), size); } static vm_fault_t vduse_domain_mmap_fault(struct vm_fault *vmf) diff --git a/drivers/vdpa/vdpa_user/iova_domain.h b/drivers/vdpa/vdpa_user/iova_domain.h index a923971a64f5..e50e55d1396f 100644 --- a/drivers/vdpa/vdpa_user/iova_domain.h +++ b/drivers/vdpa/vdpa_user/iova_domain.h @@ -65,13 +65,11 @@ void vduse_domain_unmap_page(struct vduse_iova_domain *domain, dma_addr_t dma_addr, size_t size, enum dma_data_direction dir, unsigned long attrs); -void *vduse_domain_alloc_coherent(struct vduse_iova_domain *domain, - size_t size, dma_addr_t *dma_addr, - gfp_t flag); +dma_addr_t vduse_domain_alloc_coherent(struct vduse_iova_domain *domain, + size_t size, void *orig); void vduse_domain_free_coherent(struct vduse_iova_domain *domain, size_t size, - void *vaddr, dma_addr_t dma_addr, - unsigned long attrs); + dma_addr_t dma_addr, unsigned long attrs); void vduse_domain_reset_bounce_map(struct vduse_iova_domain *domain); diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c index ae357d014564..405d59610f76 100644 --- a/drivers/vdpa/vdpa_user/vduse_dev.c +++ b/drivers/vdpa/vdpa_user/vduse_dev.c @@ -9,6 +9,7 @@ */ #include "linux/virtio_net.h" +#include #include #include #include @@ -22,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -39,6 +41,8 @@ #define DRV_LICENSE "GPL v2" #define VDUSE_DEV_MAX (1U << MINORBITS) +#define VDUSE_DEV_MAX_GROUPS 0xffff +#define VDUSE_DEV_MAX_AS 0xffff #define VDUSE_MAX_BOUNCE_SIZE (1024 * 1024 * 1024) #define VDUSE_MIN_BOUNCE_SIZE (1024 * 1024) #define VDUSE_BOUNCE_SIZE (64 * 1024 * 1024) @@ -48,6 +52,15 @@ #define IRQ_UNBOUND -1 +/* + * VDUSE instance have not asked the vduse API version, so assume 0. 
+ * + * Old devices may not ask for the device version and assume it is 0. Keep + this value for these. From the moment the VDUSE instance asks for the + version, convert to the latest supported one and continue regular flow. + */ +#define VDUSE_API_VERSION_NOT_ASKED U64_MAX + struct vduse_virtqueue { u16 index; u16 num_max; @@ -58,6 +71,7 @@ struct vduse_virtqueue { struct vdpa_vq_state state; bool ready; bool kicked; + u32 group; spinlock_t kick_lock; spinlock_t irq_lock; struct eventfd_ctx *kickfd; @@ -83,11 +97,23 @@ struct vduse_umem { struct mm_struct *mm; }; +struct vduse_as { + struct vduse_iova_domain *domain; + struct vduse_umem *umem; + struct mutex mem_lock; +}; + +struct vduse_vq_group { + rwlock_t as_lock; + struct vduse_as *as; /* Protected by as_lock */ + struct vduse_dev *dev; +}; + struct vduse_dev { struct vduse_vdpa *vdev; struct device *dev; struct vduse_virtqueue **vqs; - struct vduse_iova_domain *domain; + struct vduse_as *as; char *name; struct mutex lock; spinlock_t msg_lock; @@ -114,8 +140,9 @@ struct vduse_dev { u8 status; u32 vq_num; u32 vq_align; - struct vduse_umem *umem; - struct mutex mem_lock; + u32 ngroups; + u32 nas; + struct vduse_vq_group *groups; unsigned int bounce_size; struct mutex domain_lock; }; @@ -305,7 +332,7 @@ static int vduse_dev_set_status(struct vduse_dev *dev, u8 status) return vduse_dev_msg_sync(dev, &msg); } -static int vduse_dev_update_iotlb(struct vduse_dev *dev, +static int vduse_dev_update_iotlb(struct vduse_dev *dev, u32 asid, u64 start, u64 last) { struct vduse_dev_msg msg = { 0 }; @@ -314,8 +341,14 @@ static int vduse_dev_update_iotlb(struct vduse_dev *dev, return -EINVAL; msg.req.type = VDUSE_UPDATE_IOTLB; - msg.req.iova.start = start; - msg.req.iova.last = last; + if (dev->api_version < VDUSE_API_VERSION_1) { + msg.req.iova.start = start; + msg.req.iova.last = last; + } else { + msg.req.iova_v2.start = start; + msg.req.iova_v2.last = last; + msg.req.iova_v2.asid = asid; + } return 
vduse_dev_msg_sync(dev, &msg); } @@ -430,11 +463,14 @@ static __poll_t vduse_dev_poll(struct file *file, poll_table *wait) static void vduse_dev_reset(struct vduse_dev *dev) { int i; - struct vduse_iova_domain *domain = dev->domain; /* The coherent mappings are handled in vduse_dev_free_coherent() */ - if (domain && domain->bounce_map) - vduse_domain_reset_bounce_map(domain); + for (i = 0; i < dev->nas; i++) { + struct vduse_iova_domain *domain = dev->as[i].domain; + + if (domain && domain->bounce_map) + vduse_domain_reset_bounce_map(domain); + } down_write(&dev->rwsem); @@ -592,6 +628,63 @@ static int vduse_vdpa_set_vq_state(struct vdpa_device *vdpa, u16 idx, return 0; } +static u32 vduse_get_vq_group(struct vdpa_device *vdpa, u16 idx) +{ + struct vduse_dev *dev = vdpa_to_vduse(vdpa); + + if (dev->api_version < VDUSE_API_VERSION_1) + return 0; + + return dev->vqs[idx]->group; +} + +static union virtio_map vduse_get_vq_map(struct vdpa_device *vdpa, u16 idx) +{ + struct vduse_dev *dev = vdpa_to_vduse(vdpa); + u32 vq_group = vduse_get_vq_group(vdpa, idx); + union virtio_map ret = { + .group = &dev->groups[vq_group], + }; + + return ret; +} + +DEFINE_GUARD(vq_group_as_read_lock, struct vduse_vq_group *, + if (_T->dev->nas > 1) + read_lock(&_T->as_lock), + if (_T->dev->nas > 1) + read_unlock(&_T->as_lock)) + +DEFINE_GUARD(vq_group_as_write_lock, struct vduse_vq_group *, + if (_T->dev->nas > 1) + write_lock(&_T->as_lock), + if (_T->dev->nas > 1) + write_unlock(&_T->as_lock)) + +static int vduse_set_group_asid(struct vdpa_device *vdpa, unsigned int group, + unsigned int asid) +{ + struct vduse_dev *dev = vdpa_to_vduse(vdpa); + struct vduse_dev_msg msg = { 0 }; + int r; + + if (dev->api_version < VDUSE_API_VERSION_1) + return -EINVAL; + + msg.req.type = VDUSE_SET_VQ_GROUP_ASID; + msg.req.vq_group_asid.group = group; + msg.req.vq_group_asid.asid = asid; + + r = vduse_dev_msg_sync(dev, &msg); + if (r < 0) + return r; + + guard(vq_group_as_write_lock)(&dev->groups[group]); + 
dev->groups[group].as = &dev->as[asid]; + + return 0; +} + static int vduse_vdpa_get_vq_state(struct vdpa_device *vdpa, u16 idx, struct vdpa_vq_state *state) { @@ -763,13 +856,13 @@ static int vduse_vdpa_set_map(struct vdpa_device *vdpa, struct vduse_dev *dev = vdpa_to_vduse(vdpa); int ret; - ret = vduse_domain_set_map(dev->domain, iotlb); + ret = vduse_domain_set_map(dev->as[asid].domain, iotlb); if (ret) return ret; - ret = vduse_dev_update_iotlb(dev, 0ULL, ULLONG_MAX); + ret = vduse_dev_update_iotlb(dev, asid, 0ULL, ULLONG_MAX); if (ret) { - vduse_domain_clear_map(dev->domain, iotlb); + vduse_domain_clear_map(dev->as[asid].domain, iotlb); return ret; } @@ -789,6 +882,7 @@ static const struct vdpa_config_ops vduse_vdpa_config_ops = { .set_vq_cb = vduse_vdpa_set_vq_cb, .set_vq_num = vduse_vdpa_set_vq_num, .get_vq_size = vduse_vdpa_get_vq_size, + .get_vq_group = vduse_get_vq_group, .set_vq_ready = vduse_vdpa_set_vq_ready, .get_vq_ready = vduse_vdpa_get_vq_ready, .set_vq_state = vduse_vdpa_set_vq_state, @@ -811,6 +905,8 @@ static const struct vdpa_config_ops vduse_vdpa_config_ops = { .get_vq_affinity = vduse_vdpa_get_vq_affinity, .reset = vduse_vdpa_reset, .set_map = vduse_vdpa_set_map, + .set_group_asid = vduse_set_group_asid, + .get_vq_map = vduse_get_vq_map, .free = vduse_vdpa_free, }; @@ -818,8 +914,13 @@ static void vduse_dev_sync_single_for_device(union virtio_map token, dma_addr_t dma_addr, size_t size, enum dma_data_direction dir) { - struct vduse_iova_domain *domain = token.iova_domain; + struct vduse_iova_domain *domain; + if (!token.group) + return; + + guard(vq_group_as_read_lock)(token.group); + domain = token.group->as->domain; vduse_domain_sync_single_for_device(domain, dma_addr, size, dir); } @@ -827,8 +928,13 @@ static void vduse_dev_sync_single_for_cpu(union virtio_map token, dma_addr_t dma_addr, size_t size, enum dma_data_direction dir) { - struct vduse_iova_domain *domain = token.iova_domain; + struct vduse_iova_domain *domain; + if 
(!token.group) + return; + + guard(vq_group_as_read_lock)(token.group); + domain = token.group->as->domain; vduse_domain_sync_single_for_cpu(domain, dma_addr, size, dir); } @@ -837,8 +943,13 @@ static dma_addr_t vduse_dev_map_page(union virtio_map token, struct page *page, enum dma_data_direction dir, unsigned long attrs) { - struct vduse_iova_domain *domain = token.iova_domain; + struct vduse_iova_domain *domain; + if (!token.group) + return DMA_MAPPING_ERROR; + + guard(vq_group_as_read_lock)(token.group); + domain = token.group->as->domain; return vduse_domain_map_page(domain, page, offset, size, dir, attrs); } @@ -846,43 +957,71 @@ static void vduse_dev_unmap_page(union virtio_map token, dma_addr_t dma_addr, size_t size, enum dma_data_direction dir, unsigned long attrs) { - struct vduse_iova_domain *domain = token.iova_domain; + struct vduse_iova_domain *domain; - return vduse_domain_unmap_page(domain, dma_addr, size, dir, attrs); + if (!token.group) + return; + + guard(vq_group_as_read_lock)(token.group); + domain = token.group->as->domain; + vduse_domain_unmap_page(domain, dma_addr, size, dir, attrs); } static void *vduse_dev_alloc_coherent(union virtio_map token, size_t size, dma_addr_t *dma_addr, gfp_t flag) { - struct vduse_iova_domain *domain = token.iova_domain; - unsigned long iova; void *addr; *dma_addr = DMA_MAPPING_ERROR; - addr = vduse_domain_alloc_coherent(domain, size, - (dma_addr_t *)&iova, flag); + if (!token.group) + return NULL; + + addr = alloc_pages_exact(size, flag); if (!addr) return NULL; - *dma_addr = (dma_addr_t)iova; + { + struct vduse_iova_domain *domain; + + guard(vq_group_as_read_lock)(token.group); + domain = token.group->as->domain; + *dma_addr = vduse_domain_alloc_coherent(domain, size, addr); + if (*dma_addr == DMA_MAPPING_ERROR) + goto err; + } return addr; + +err: + free_pages_exact(addr, size); + return NULL; } static void vduse_dev_free_coherent(union virtio_map token, size_t size, void *vaddr, dma_addr_t dma_addr, unsigned 
long attrs) { - struct vduse_iova_domain *domain = token.iova_domain; + if (!token.group) + return; - vduse_domain_free_coherent(domain, size, vaddr, dma_addr, attrs); + { + struct vduse_iova_domain *domain; + + guard(vq_group_as_read_lock)(token.group); + domain = token.group->as->domain; + vduse_domain_free_coherent(domain, size, dma_addr, attrs); + } + + free_pages_exact(vaddr, size); } static bool vduse_dev_need_sync(union virtio_map token, dma_addr_t dma_addr) { - struct vduse_iova_domain *domain = token.iova_domain; + if (!token.group) + return false; - return dma_addr < domain->bounce_size; + guard(vq_group_as_read_lock)(token.group); + return dma_addr < token.group->as->domain->bounce_size; } static int vduse_dev_mapping_error(union virtio_map token, dma_addr_t dma_addr) @@ -894,9 +1033,11 @@ static int vduse_dev_mapping_error(union virtio_map token, dma_addr_t dma_addr) static size_t vduse_dev_max_mapping_size(union virtio_map token) { - struct vduse_iova_domain *domain = token.iova_domain; + if (!token.group) + return 0; - return domain->bounce_size; + guard(vq_group_as_read_lock)(token.group); + return token.group->as->domain->bounce_size; } static const struct virtio_map_ops vduse_map_ops = { @@ -1036,39 +1177,40 @@ unlock: return ret; } -static int vduse_dev_dereg_umem(struct vduse_dev *dev, +static int vduse_dev_dereg_umem(struct vduse_dev *dev, u32 asid, u64 iova, u64 size) { int ret; - mutex_lock(&dev->mem_lock); + mutex_lock(&dev->as[asid].mem_lock); ret = -ENOENT; - if (!dev->umem) + if (!dev->as[asid].umem) goto unlock; ret = -EINVAL; - if (!dev->domain) + if (!dev->as[asid].domain) goto unlock; - if (dev->umem->iova != iova || size != dev->domain->bounce_size) + if (dev->as[asid].umem->iova != iova || + size != dev->as[asid].domain->bounce_size) goto unlock; - vduse_domain_remove_user_bounce_pages(dev->domain); - unpin_user_pages_dirty_lock(dev->umem->pages, - dev->umem->npages, true); - atomic64_sub(dev->umem->npages, 
&dev->umem->mm->pinned_vm); - mmdrop(dev->umem->mm); - vfree(dev->umem->pages); - kfree(dev->umem); - dev->umem = NULL; + vduse_domain_remove_user_bounce_pages(dev->as[asid].domain); + unpin_user_pages_dirty_lock(dev->as[asid].umem->pages, + dev->as[asid].umem->npages, true); + atomic64_sub(dev->as[asid].umem->npages, &dev->as[asid].umem->mm->pinned_vm); + mmdrop(dev->as[asid].umem->mm); + vfree(dev->as[asid].umem->pages); + kfree(dev->as[asid].umem); + dev->as[asid].umem = NULL; ret = 0; unlock: - mutex_unlock(&dev->mem_lock); + mutex_unlock(&dev->as[asid].mem_lock); return ret; } static int vduse_dev_reg_umem(struct vduse_dev *dev, - u64 iova, u64 uaddr, u64 size) + u32 asid, u64 iova, u64 uaddr, u64 size) { struct page **page_list = NULL; struct vduse_umem *umem = NULL; @@ -1076,14 +1218,14 @@ static int vduse_dev_reg_umem(struct vduse_dev *dev, unsigned long npages, lock_limit; int ret; - if (!dev->domain || !dev->domain->bounce_map || - size != dev->domain->bounce_size || + if (!dev->as[asid].domain || !dev->as[asid].domain->bounce_map || + size != dev->as[asid].domain->bounce_size || iova != 0 || uaddr & ~PAGE_MASK) return -EINVAL; - mutex_lock(&dev->mem_lock); + mutex_lock(&dev->as[asid].mem_lock); ret = -EEXIST; - if (dev->umem) + if (dev->as[asid].umem) goto unlock; ret = -ENOMEM; @@ -1107,7 +1249,7 @@ static int vduse_dev_reg_umem(struct vduse_dev *dev, goto out; } - ret = vduse_domain_add_user_bounce_pages(dev->domain, + ret = vduse_domain_add_user_bounce_pages(dev->as[asid].domain, page_list, pinned); if (ret) goto out; @@ -1120,7 +1262,7 @@ static int vduse_dev_reg_umem(struct vduse_dev *dev, umem->mm = current->mm; mmgrab(current->mm); - dev->umem = umem; + dev->as[asid].umem = umem; out: if (ret && pinned > 0) unpin_user_pages(page_list, pinned); @@ -1131,7 +1273,7 @@ unlock: vfree(page_list); kfree(umem); } - mutex_unlock(&dev->mem_lock); + mutex_unlock(&dev->as[asid].mem_lock); return ret; } @@ -1151,6 +1293,54 @@ static void 
vduse_vq_update_effective_cpu(struct vduse_virtqueue *vq) vq->irq_effective_cpu = curr_cpu; } +static int vduse_dev_iotlb_entry(struct vduse_dev *dev, + struct vduse_iotlb_entry_v2 *entry, + struct file **f, uint64_t *capability) +{ + u32 asid; + int r = -EINVAL; + struct vhost_iotlb_map *map; + + if (entry->start > entry->last || entry->asid >= dev->nas) + return -EINVAL; + + asid = array_index_nospec(entry->asid, dev->nas); + mutex_lock(&dev->domain_lock); + + if (!dev->as[asid].domain) + goto out; + + spin_lock(&dev->as[asid].domain->iotlb_lock); + map = vhost_iotlb_itree_first(dev->as[asid].domain->iotlb, + entry->start, entry->last); + if (map) { + if (f) { + const struct vdpa_map_file *map_file; + + map_file = (struct vdpa_map_file *)map->opaque; + entry->offset = map_file->offset; + *f = get_file(map_file->file); + } + entry->start = map->start; + entry->last = map->last; + entry->perm = map->perm; + if (capability) { + *capability = 0; + + if (dev->as[asid].domain->bounce_map && map->start == 0 && + map->last == dev->as[asid].domain->bounce_size - 1) + *capability |= VDUSE_IOVA_CAP_UMEM; + } + + r = 0; + } + spin_unlock(&dev->as[asid].domain->iotlb_lock); + +out: + mutex_unlock(&dev->domain_lock); + return r; +} + static long vduse_dev_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -1162,44 +1352,36 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd, return -EPERM; switch (cmd) { - case VDUSE_IOTLB_GET_FD: { - struct vduse_iotlb_entry entry; - struct vhost_iotlb_map *map; - struct vdpa_map_file *map_file; + case VDUSE_IOTLB_GET_FD: + case VDUSE_IOTLB_GET_FD2: { + struct vduse_iotlb_entry_v2 entry = {0}; struct file *f = NULL; + ret = -ENOIOCTLCMD; + if (dev->api_version < VDUSE_API_VERSION_1 && + cmd == VDUSE_IOTLB_GET_FD2) + break; + ret = -EFAULT; - if (copy_from_user(&entry, argp, sizeof(entry))) + if (copy_from_user(&entry, argp, _IOC_SIZE(cmd))) break; ret = -EINVAL; - if (entry.start > entry.last) + if 
(!is_mem_zero((const char *)entry.reserved, + sizeof(entry.reserved))) break; - mutex_lock(&dev->domain_lock); - if (!dev->domain) { - mutex_unlock(&dev->domain_lock); + ret = vduse_dev_iotlb_entry(dev, &entry, &f, NULL); + if (ret) break; - } - spin_lock(&dev->domain->iotlb_lock); - map = vhost_iotlb_itree_first(dev->domain->iotlb, - entry.start, entry.last); - if (map) { - map_file = (struct vdpa_map_file *)map->opaque; - f = get_file(map_file->file); - entry.offset = map_file->offset; - entry.start = map->start; - entry.last = map->last; - entry.perm = map->perm; - } - spin_unlock(&dev->domain->iotlb_lock); - mutex_unlock(&dev->domain_lock); + ret = -EINVAL; if (!f) break; - ret = -EFAULT; - if (copy_to_user(argp, &entry, sizeof(entry))) { + ret = copy_to_user(argp, &entry, _IOC_SIZE(cmd)); + if (ret) { + ret = -EFAULT; fput(f); break; } @@ -1252,12 +1434,24 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd, if (config.index >= dev->vq_num) break; - if (!is_mem_zero((const char *)config.reserved, - sizeof(config.reserved))) + if (dev->api_version < VDUSE_API_VERSION_1) { + if (config.group) + break; + } else { + if (config.group >= dev->ngroups) + break; + if (dev->status & VIRTIO_CONFIG_S_DRIVER_OK) + break; + } + + if (config.reserved1 || + !is_mem_zero((const char *)config.reserved2, + sizeof(config.reserved2))) break; index = array_index_nospec(config.index, dev->vq_num); dev->vqs[index]->num_max = config.max_size; + dev->vqs[index]->group = config.group; ret = 0; break; } @@ -1336,6 +1530,7 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd, } case VDUSE_IOTLB_REG_UMEM: { struct vduse_iova_umem umem; + u32 asid; ret = -EFAULT; if (copy_from_user(&umem, argp, sizeof(umem))) @@ -1343,17 +1538,21 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd, ret = -EINVAL; if (!is_mem_zero((const char *)umem.reserved, - sizeof(umem.reserved))) + sizeof(umem.reserved)) || + (dev->api_version < VDUSE_API_VERSION_1 && + 
umem.asid != 0) || umem.asid >= dev->nas) break; mutex_lock(&dev->domain_lock); - ret = vduse_dev_reg_umem(dev, umem.iova, + asid = array_index_nospec(umem.asid, dev->nas); + ret = vduse_dev_reg_umem(dev, asid, umem.iova, umem.uaddr, umem.size); mutex_unlock(&dev->domain_lock); break; } case VDUSE_IOTLB_DEREG_UMEM: { struct vduse_iova_umem umem; + u32 asid; ret = -EFAULT; if (copy_from_user(&umem, argp, sizeof(umem))) @@ -1361,51 +1560,49 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd, ret = -EINVAL; if (!is_mem_zero((const char *)umem.reserved, - sizeof(umem.reserved))) + sizeof(umem.reserved)) || + (dev->api_version < VDUSE_API_VERSION_1 && + umem.asid != 0) || + umem.asid >= dev->nas) break; + mutex_lock(&dev->domain_lock); - ret = vduse_dev_dereg_umem(dev, umem.iova, + asid = array_index_nospec(umem.asid, dev->nas); + ret = vduse_dev_dereg_umem(dev, asid, umem.iova, umem.size); mutex_unlock(&dev->domain_lock); break; } case VDUSE_IOTLB_GET_INFO: { struct vduse_iova_info info; - struct vhost_iotlb_map *map; + struct vduse_iotlb_entry_v2 entry; ret = -EFAULT; if (copy_from_user(&info, argp, sizeof(info))) break; - ret = -EINVAL; - if (info.start > info.last) - break; - if (!is_mem_zero((const char *)info.reserved, sizeof(info.reserved))) break; - mutex_lock(&dev->domain_lock); - if (!dev->domain) { - mutex_unlock(&dev->domain_lock); + if (dev->api_version < VDUSE_API_VERSION_1) { + if (info.asid) + break; + } else if (info.asid >= dev->nas) break; - } - spin_lock(&dev->domain->iotlb_lock); - map = vhost_iotlb_itree_first(dev->domain->iotlb, - info.start, info.last); - if (map) { - info.start = map->start; - info.last = map->last; - info.capability = 0; - if (dev->domain->bounce_map && map->start == 0 && - map->last == dev->domain->bounce_size - 1) - info.capability |= VDUSE_IOVA_CAP_UMEM; - } - spin_unlock(&dev->domain->iotlb_lock); - mutex_unlock(&dev->domain_lock); - if (!map) + + entry.start = info.start; + entry.last = info.last; + 
entry.asid = info.asid; + ret = vduse_dev_iotlb_entry(dev, &entry, NULL, + &info.capability); + if (ret < 0) break; + info.start = entry.start; + info.last = entry.last; + info.asid = entry.asid; + ret = -EFAULT; if (copy_to_user(argp, &info, sizeof(info))) break; @@ -1426,8 +1623,10 @@ static int vduse_dev_release(struct inode *inode, struct file *file) struct vduse_dev *dev = file->private_data; mutex_lock(&dev->domain_lock); - if (dev->domain) - vduse_dev_dereg_umem(dev, 0, dev->domain->bounce_size); + for (int i = 0; i < dev->nas; i++) + if (dev->as[i].domain) + vduse_dev_dereg_umem(dev, i, 0, + dev->as[i].domain->bounce_size); mutex_unlock(&dev->domain_lock); spin_lock(&dev->msg_lock); /* Make sure the inflight messages can processed after reconncection */ @@ -1646,7 +1845,6 @@ static struct vduse_dev *vduse_dev_create(void) return NULL; mutex_init(&dev->lock); - mutex_init(&dev->mem_lock); mutex_init(&dev->domain_lock); spin_lock_init(&dev->msg_lock); INIT_LIST_HEAD(&dev->send_list); @@ -1697,9 +1895,13 @@ static int vduse_destroy_dev(char *name) idr_remove(&vduse_idr, dev->minor); kvfree(dev->config); vduse_dev_deinit_vqs(dev); - if (dev->domain) - vduse_domain_destroy(dev->domain); + for (int i = 0; i < dev->nas; i++) { + if (dev->as[i].domain) + vduse_domain_destroy(dev->as[i].domain); + } + kfree(dev->as); kfree(dev->name); + kfree(dev->groups); vduse_dev_destroy(dev); module_put(THIS_MODULE); @@ -1737,12 +1939,25 @@ static bool features_is_valid(struct vduse_dev_config *config) return true; } -static bool vduse_validate_config(struct vduse_dev_config *config) +static bool vduse_validate_config(struct vduse_dev_config *config, + u64 api_version) { if (!is_mem_zero((const char *)config->reserved, sizeof(config->reserved))) return false; + if (api_version < VDUSE_API_VERSION_1 && + (config->ngroups || config->nas)) + return false; + + if (api_version >= VDUSE_API_VERSION_1) { + if (!config->ngroups || config->ngroups > VDUSE_DEV_MAX_GROUPS) + return false; 
+ + if (!config->nas || config->nas > VDUSE_DEV_MAX_AS) + return false; + } + if (config->vq_align > PAGE_SIZE) return false; @@ -1806,7 +2021,8 @@ static ssize_t bounce_size_store(struct device *device, ret = -EPERM; mutex_lock(&dev->domain_lock); - if (dev->domain) + /* Assuming that if the first domain is allocated, all are allocated */ + if (dev->as[0].domain) goto unlock; ret = kstrtouint(buf, 10, &bounce_size); @@ -1858,6 +2074,27 @@ static int vduse_create_dev(struct vduse_dev_config *config, dev->device_features = config->features; dev->device_id = config->device_id; dev->vendor_id = config->vendor_id; + + dev->nas = (dev->api_version < VDUSE_API_VERSION_1) ? 1 : config->nas; + dev->as = kcalloc(dev->nas, sizeof(dev->as[0]), GFP_KERNEL); + if (!dev->as) + goto err_as; + for (int i = 0; i < dev->nas; i++) + mutex_init(&dev->as[i].mem_lock); + + dev->ngroups = (dev->api_version < VDUSE_API_VERSION_1) + ? 1 + : config->ngroups; + dev->groups = kcalloc(dev->ngroups, sizeof(dev->groups[0]), + GFP_KERNEL); + if (!dev->groups) + goto err_vq_groups; + for (u32 i = 0; i < dev->ngroups; ++i) { + dev->groups[i].dev = dev; + rwlock_init(&dev->groups[i].as_lock); + dev->groups[i].as = &dev->as[0]; + } + dev->name = kstrdup(config->name, GFP_KERNEL); if (!dev->name) goto err_str; @@ -1894,6 +2131,10 @@ err_dev: err_idr: kfree(dev->name); err_str: + kfree(dev->groups); +err_vq_groups: + kfree(dev->as); +err_as: vduse_dev_destroy(dev); err: return ret; @@ -1909,6 +2150,8 @@ static long vduse_ioctl(struct file *file, unsigned int cmd, mutex_lock(&vduse_lock); switch (cmd) { case VDUSE_GET_API_VERSION: + if (control->api_version == VDUSE_API_VERSION_NOT_ASKED) + control->api_version = VDUSE_API_VERSION_1; ret = put_user(control->api_version, (u64 __user *)argp); break; case VDUSE_SET_API_VERSION: { @@ -1919,7 +2162,7 @@ static long vduse_ioctl(struct file *file, unsigned int cmd, break; ret = -EINVAL; - if (api_version > VDUSE_API_VERSION) + if (api_version > 
VDUSE_API_VERSION_1) break; ret = 0; @@ -1936,7 +2179,9 @@ static long vduse_ioctl(struct file *file, unsigned int cmd, break; ret = -EINVAL; - if (vduse_validate_config(&config) == false) + if (control->api_version == VDUSE_API_VERSION_NOT_ASKED) + control->api_version = VDUSE_API_VERSION; + if (!vduse_validate_config(&config, control->api_version)) break; buf = vmemdup_user(argp + size, config.config_size); @@ -1986,7 +2231,7 @@ static int vduse_open(struct inode *inode, struct file *file) if (!control) return -ENOMEM; - control->api_version = VDUSE_API_VERSION; + control->api_version = VDUSE_API_VERSION_NOT_ASKED; file->private_data = control; return 0; @@ -2017,7 +2262,7 @@ static int vduse_dev_init_vdpa(struct vduse_dev *dev, const char *name) vdev = vdpa_alloc_device(struct vduse_vdpa, vdpa, dev->dev, &vduse_vdpa_config_ops, &vduse_map_ops, - 1, 1, name, true); + dev->ngroups, dev->nas, name, true); if (IS_ERR(vdev)) return PTR_ERR(vdev); @@ -2032,7 +2277,8 @@ static int vdpa_dev_add(struct vdpa_mgmt_dev *mdev, const char *name, const struct vdpa_dev_set_config *config) { struct vduse_dev *dev; - int ret; + size_t domain_bounce_size; + int ret, i; mutex_lock(&vduse_lock); dev = vduse_find_dev(name); @@ -2046,27 +2292,41 @@ static int vdpa_dev_add(struct vdpa_mgmt_dev *mdev, const char *name, return ret; mutex_lock(&dev->domain_lock); - if (!dev->domain) - dev->domain = vduse_domain_create(VDUSE_IOVA_SIZE - 1, - dev->bounce_size); - mutex_unlock(&dev->domain_lock); - if (!dev->domain) { - put_device(&dev->vdev->vdpa.dev); - return -ENOMEM; + ret = 0; + + domain_bounce_size = dev->bounce_size / dev->nas; + for (i = 0; i < dev->nas; ++i) { + dev->as[i].domain = vduse_domain_create(VDUSE_IOVA_SIZE - 1, + domain_bounce_size); + if (!dev->as[i].domain) { + ret = -ENOMEM; + goto err; + } } - dev->vdev->vdpa.vmap.iova_domain = dev->domain; + mutex_unlock(&dev->domain_lock); + ret = _vdpa_register_device(&dev->vdev->vdpa, dev->vq_num); - if (ret) { - 
put_device(&dev->vdev->vdpa.dev); - mutex_lock(&dev->domain_lock); - vduse_domain_destroy(dev->domain); - dev->domain = NULL; - mutex_unlock(&dev->domain_lock); - return ret; - } + if (ret) + goto err_register; return 0; + +err_register: + mutex_lock(&dev->domain_lock); + +err: + for (int j = 0; j < i; j++) { + if (dev->as[j].domain) { + vduse_domain_destroy(dev->as[j].domain); + dev->as[j].domain = NULL; + } + } + mutex_unlock(&dev->domain_lock); + + put_device(&dev->vdev->vdpa.dev); + + return ret; } static void vdpa_dev_del(struct vdpa_mgmt_dev *mdev, struct vdpa_device *dev) diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index 05a481e4c385..cdee8f320dca 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -680,8 +680,10 @@ static long vhost_vdpa_vring_ioctl(struct vhost_vdpa *v, unsigned int cmd, case VHOST_VDPA_SET_GROUP_ASID: if (copy_from_user(&s, argp, sizeof(s))) return -EFAULT; - if (s.num >= vdpa->nas) + if (idx >= vdpa->ngroups || s.num >= vdpa->nas) return -EINVAL; + if (ops->get_status(vdpa) & VIRTIO_CONFIG_S_DRIVER_OK) + return -EBUSY; if (!ops->set_group_asid) return -EOPNOTSUPP; return ops->set_group_asid(vdpa, idx, s.num); @@ -1527,6 +1529,7 @@ static int vhost_vdpa_mmap(struct file *file, struct vm_area_struct *vma) if (vma->vm_end - vma->vm_start != notify.size) return -ENOTSUPP; + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP); vma->vm_ops = &vhost_vdpa_vm_ops; return 0; diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index bccdc9eab267..fcf7f10adbbf 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -1444,13 +1444,13 @@ static inline void __user *__vhost_get_user(struct vhost_virtqueue *vq, ({ \ int ret; \ if (!vq->iotlb) { \ - ret = __put_user(x, ptr); \ + ret = put_user(x, ptr); \ } else { \ __typeof__(ptr) to = \ (__typeof__(ptr)) __vhost_get_user(vq, ptr, \ sizeof(*ptr), VHOST_ADDR_USED); \ if (to != NULL) \ - ret = 
__put_user(x, to); \ + ret = put_user(x, to); \ else \ ret = -EFAULT; \ } \ @@ -1489,14 +1489,14 @@ static inline int vhost_put_used_idx(struct vhost_virtqueue *vq) ({ \ int ret; \ if (!vq->iotlb) { \ - ret = __get_user(x, ptr); \ + ret = get_user(x, ptr); \ } else { \ __typeof__(ptr) from = \ (__typeof__(ptr)) __vhost_get_user(vq, ptr, \ sizeof(*ptr), \ type); \ if (from != NULL) \ - ret = __get_user(x, from); \ + ret = get_user(x, from); \ else \ ret = -EFAULT; \ } \ diff --git a/drivers/virtio/virtio_input.c b/drivers/virtio/virtio_input.c index d0728285b6ce..74df16677da8 100644 --- a/drivers/virtio/virtio_input.c +++ b/drivers/virtio/virtio_input.c @@ -4,6 +4,7 @@ #include #include #include +#include #include #include @@ -16,7 +17,9 @@ struct virtio_input { char serial[64]; char phys[64]; struct virtqueue *evt, *sts; + __dma_from_device_group_begin(); struct virtio_input_event evts[64]; + __dma_from_device_group_end(); spinlock_t lock; bool ready; }; @@ -27,7 +30,7 @@ static void virtinput_queue_evtbuf(struct virtio_input *vi, struct scatterlist sg[1]; sg_init_one(sg, evtbuf, sizeof(*evtbuf)); - virtqueue_add_inbuf(vi->evt, sg, 1, evtbuf, GFP_ATOMIC); + virtqueue_add_inbuf_cache_clean(vi->evt, sg, 1, evtbuf, GFP_ATOMIC); } static void virtinput_recv_events(struct virtqueue *vq) diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index ddab68959671..4fe0f78df5ec 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -67,6 +67,13 @@ #define LAST_ADD_TIME_INVALID(vq) #endif +enum vq_layout { + VQ_LAYOUT_SPLIT = 0, + VQ_LAYOUT_PACKED, + VQ_LAYOUT_SPLIT_IN_ORDER, + VQ_LAYOUT_PACKED_IN_ORDER, +}; + struct vring_desc_state_split { void *data; /* Data for callback. */ @@ -74,6 +81,7 @@ struct vring_desc_state_split { * allocated together. So we won't stress more to the memory allocator. 
*/ struct vring_desc *indir_desc; + u32 total_in_len; }; struct vring_desc_state_packed { @@ -85,6 +93,7 @@ struct vring_desc_state_packed { struct vring_packed_desc *indir_desc; u16 num; /* Descriptor list length. */ u16 last; /* The last desc state in a list. */ + u32 total_in_len; /* In length for the skipped buffer. */ }; struct vring_desc_extra { @@ -159,12 +168,30 @@ struct vring_virtqueue_packed { size_t event_size_in_bytes; }; +struct vring_virtqueue; + +struct virtqueue_ops { + int (*add)(struct vring_virtqueue *vq, struct scatterlist *sgs[], + unsigned int total_sg, unsigned int out_sgs, + unsigned int in_sgs, void *data, + void *ctx, bool premapped, gfp_t gfp, + unsigned long attr); + void *(*get)(struct vring_virtqueue *vq, unsigned int *len, void **ctx); + bool (*kick_prepare)(struct vring_virtqueue *vq); + void (*disable_cb)(struct vring_virtqueue *vq); + bool (*enable_cb_delayed)(struct vring_virtqueue *vq); + unsigned int (*enable_cb_prepare)(struct vring_virtqueue *vq); + bool (*poll)(const struct vring_virtqueue *vq, + unsigned int last_used_idx); + void *(*detach_unused_buf)(struct vring_virtqueue *vq); + bool (*more_used)(const struct vring_virtqueue *vq); + int (*resize)(struct vring_virtqueue *vq, u32 num); + void (*reset)(struct vring_virtqueue *vq); +}; + struct vring_virtqueue { struct virtqueue vq; - /* Is this a packed ring? */ - bool packed_ring; - /* Is DMA API used? */ bool use_map_api; @@ -180,8 +207,26 @@ struct vring_virtqueue { /* Host publishes avail event idx */ bool event; - /* Head of free buffer list. */ + enum vq_layout layout; + + /* + * Without IN_ORDER it's the head of free buffer list. With + * IN_ORDER and SPLIT, it's the next available buffer + * index. With IN_ORDER and PACKED, it's unused. + */ unsigned int free_head; + + /* + * With IN_ORDER, once we see an in-order batch, this stores + * this last entry, and until we return the last buffer. + * After this, id is set to UINT_MAX to mark it invalid. 
+ * Unused without IN_ORDER. + */ + struct used_entry { + u32 id; + u32 len; + } batch_last; + /* Number we've added since last sync. */ unsigned int num_added; @@ -193,6 +238,11 @@ struct vring_virtqueue { */ u16 last_used_idx; + /* With IN_ORDER and SPLIT, last descriptor id we used to + * detach buffer. + */ + u16 last_used; + /* Hint for event idx: already triggered no need to disable. */ bool event_triggered; @@ -231,6 +281,19 @@ static void vring_free(struct virtqueue *_vq); #define to_vvq(_vq) container_of_const(_vq, struct vring_virtqueue, vq) + +static inline bool virtqueue_is_packed(const struct vring_virtqueue *vq) +{ + return vq->layout == VQ_LAYOUT_PACKED || + vq->layout == VQ_LAYOUT_PACKED_IN_ORDER; +} + +static inline bool virtqueue_is_in_order(const struct vring_virtqueue *vq) +{ + return vq->layout == VQ_LAYOUT_SPLIT_IN_ORDER || + vq->layout == VQ_LAYOUT_PACKED_IN_ORDER; +} + static bool virtqueue_use_indirect(const struct vring_virtqueue *vq, unsigned int total_sg) { @@ -382,7 +445,7 @@ static int vring_mapping_error(const struct vring_virtqueue *vq, /* Map one sg entry. 
*/ static int vring_map_one_sg(const struct vring_virtqueue *vq, struct scatterlist *sg, enum dma_data_direction direction, dma_addr_t *addr, - u32 *len, bool premapped) + u32 *len, bool premapped, unsigned long attr) { if (premapped) { *addr = sg_dma_address(sg); @@ -410,7 +473,7 @@ static int vring_map_one_sg(const struct vring_virtqueue *vq, struct scatterlist */ *addr = virtqueue_map_page_attrs(&vq->vq, sg_page(sg), sg->offset, sg->length, - direction, 0); + direction, attr); if (vring_mapping_error(vq, *addr)) return -ENOMEM; @@ -433,11 +496,13 @@ static void virtqueue_init(struct vring_virtqueue *vq, u32 num) { vq->vq.num_free = num; - if (vq->packed_ring) + if (virtqueue_is_packed(vq)) vq->last_used_idx = 0 | (1 << VRING_PACKED_EVENT_F_WRAP_CTR); else vq->last_used_idx = 0; + vq->last_used = 0; + vq->event_triggered = false; vq->num_added = 0; @@ -476,7 +541,7 @@ out: return extra->next; } -static struct vring_desc *alloc_indirect_split(struct virtqueue *_vq, +static struct vring_desc *alloc_indirect_split(struct vring_virtqueue *vq, unsigned int total_sg, gfp_t gfp) { @@ -505,7 +570,7 @@ static struct vring_desc *alloc_indirect_split(struct virtqueue *_vq, return desc; } -static inline unsigned int virtqueue_add_desc_split(struct virtqueue *vq, +static inline unsigned int virtqueue_add_desc_split(struct vring_virtqueue *vq, struct vring_desc *desc, struct vring_desc_extra *extra, unsigned int i, @@ -513,11 +578,12 @@ static inline unsigned int virtqueue_add_desc_split(struct virtqueue *vq, unsigned int len, u16 flags, bool premapped) { + struct virtio_device *vdev = vq->vq.vdev; u16 next; - desc[i].flags = cpu_to_virtio16(vq->vdev, flags); - desc[i].addr = cpu_to_virtio64(vq->vdev, addr); - desc[i].len = cpu_to_virtio32(vq->vdev, len); + desc[i].flags = cpu_to_virtio16(vdev, flags); + desc[i].addr = cpu_to_virtio64(vdev, addr); + desc[i].len = cpu_to_virtio32(vdev, len); extra[i].addr = premapped ? 
DMA_MAPPING_ERROR : addr; extra[i].len = len; @@ -525,12 +591,12 @@ static inline unsigned int virtqueue_add_desc_split(struct virtqueue *vq, next = extra[i].next; - desc[i].next = cpu_to_virtio16(vq->vdev, next); + desc[i].next = cpu_to_virtio16(vdev, next); return next; } -static inline int virtqueue_add_split(struct virtqueue *_vq, +static inline int virtqueue_add_split(struct vring_virtqueue *vq, struct scatterlist *sgs[], unsigned int total_sg, unsigned int out_sgs, @@ -538,13 +604,15 @@ static inline int virtqueue_add_split(struct virtqueue *_vq, void *data, void *ctx, bool premapped, - gfp_t gfp) + gfp_t gfp, + unsigned long attr) { - struct vring_virtqueue *vq = to_vvq(_vq); struct vring_desc_extra *extra; struct scatterlist *sg; struct vring_desc *desc; - unsigned int i, n, avail, descs_used, prev, err_idx; + unsigned int i, n, avail, descs_used, err_idx, sg_count = 0; + /* Total length for in-order */ + unsigned int total_in_len = 0; int head; bool indirect; @@ -565,7 +633,7 @@ static inline int virtqueue_add_split(struct virtqueue *_vq, head = vq->free_head; if (virtqueue_use_indirect(vq, total_sg)) - desc = alloc_indirect_split(_vq, total_sg, gfp); + desc = alloc_indirect_split(vq, total_sg, gfp); else { desc = NULL; WARN_ON_ONCE(total_sg > vq->split.vring.num && !vq->indirect); @@ -604,42 +672,43 @@ static inline int virtqueue_add_split(struct virtqueue *_vq, for (sg = sgs[n]; sg; sg = sg_next(sg)) { dma_addr_t addr; u32 len; + u16 flags = 0; - if (vring_map_one_sg(vq, sg, DMA_TO_DEVICE, &addr, &len, premapped)) + if (++sg_count != total_sg) + flags |= VRING_DESC_F_NEXT; + + if (vring_map_one_sg(vq, sg, DMA_TO_DEVICE, &addr, &len, + premapped, attr)) goto unmap_release; - prev = i; /* Note that we trust indirect descriptor * table since it use stream DMA mapping. 
*/ - i = virtqueue_add_desc_split(_vq, desc, extra, i, addr, len, - VRING_DESC_F_NEXT, - premapped); + i = virtqueue_add_desc_split(vq, desc, extra, i, addr, + len, flags, premapped); } } for (; n < (out_sgs + in_sgs); n++) { for (sg = sgs[n]; sg; sg = sg_next(sg)) { dma_addr_t addr; u32 len; + u16 flags = VRING_DESC_F_WRITE; - if (vring_map_one_sg(vq, sg, DMA_FROM_DEVICE, &addr, &len, premapped)) + if (++sg_count != total_sg) + flags |= VRING_DESC_F_NEXT; + + if (vring_map_one_sg(vq, sg, DMA_FROM_DEVICE, &addr, &len, + premapped, attr)) goto unmap_release; - prev = i; /* Note that we trust indirect descriptor * table since it use stream DMA mapping. */ - i = virtqueue_add_desc_split(_vq, desc, extra, i, addr, len, - VRING_DESC_F_NEXT | - VRING_DESC_F_WRITE, - premapped); + i = virtqueue_add_desc_split(vq, desc, extra, i, addr, + len, flags, premapped); + total_in_len += len; } } - /* Last one doesn't continue. */ - desc[prev].flags &= cpu_to_virtio16(_vq->vdev, ~VRING_DESC_F_NEXT); - if (!indirect && vring_need_unmap_buffer(vq, &extra[prev])) - vq->split.desc_extra[prev & (vq->split.vring.num - 1)].flags &= - ~VRING_DESC_F_NEXT; if (indirect) { /* Now that the indirect table is filled in, map it. 
*/ @@ -649,7 +718,7 @@ static inline int virtqueue_add_split(struct virtqueue *_vq, if (vring_mapping_error(vq, addr)) goto unmap_release; - virtqueue_add_desc_split(_vq, vq->split.vring.desc, + virtqueue_add_desc_split(vq, vq->split.vring.desc, vq->split.desc_extra, head, addr, total_sg * sizeof(struct vring_desc), @@ -660,7 +729,12 @@ static inline int virtqueue_add_split(struct virtqueue *_vq, vq->vq.num_free -= descs_used; /* Update free pointer */ - if (indirect) + if (virtqueue_is_in_order(vq)) { + vq->free_head += descs_used; + if (vq->free_head >= vq->split.vring.num) + vq->free_head -= vq->split.vring.num; + vq->split.desc_state[head].total_in_len = total_in_len; + } else if (indirect) vq->free_head = vq->split.desc_extra[head].next; else vq->free_head = i; @@ -675,13 +749,13 @@ static inline int virtqueue_add_split(struct virtqueue *_vq, /* Put entry in available array (but don't update avail->idx until they * do sync). */ avail = vq->split.avail_idx_shadow & (vq->split.vring.num - 1); - vq->split.vring.avail->ring[avail] = cpu_to_virtio16(_vq->vdev, head); + vq->split.vring.avail->ring[avail] = cpu_to_virtio16(vq->vq.vdev, head); /* Descriptors and available array need to be set before we expose the * new available array entries. */ virtio_wmb(vq->weak_barriers); vq->split.avail_idx_shadow++; - vq->split.vring.avail->idx = cpu_to_virtio16(_vq->vdev, + vq->split.vring.avail->idx = cpu_to_virtio16(vq->vq.vdev, vq->split.avail_idx_shadow); vq->num_added++; @@ -691,7 +765,7 @@ static inline int virtqueue_add_split(struct virtqueue *_vq, /* This is very unlikely, but theoretically possible. Kick * just in case. 
*/ if (unlikely(vq->num_added == (1 << 16) - 1)) - virtqueue_kick(_vq); + virtqueue_kick(&vq->vq); return 0; @@ -717,9 +791,8 @@ unmap_release: return -ENOMEM; } -static bool virtqueue_kick_prepare_split(struct virtqueue *_vq) +static bool virtqueue_kick_prepare_split(struct vring_virtqueue *vq) { - struct vring_virtqueue *vq = to_vvq(_vq); u16 new, old; bool needs_kick; @@ -736,23 +809,54 @@ static bool virtqueue_kick_prepare_split(struct virtqueue *_vq) LAST_ADD_TIME_INVALID(vq); if (vq->event) { - needs_kick = vring_need_event(virtio16_to_cpu(_vq->vdev, + needs_kick = vring_need_event(virtio16_to_cpu(vq->vq.vdev, vring_avail_event(&vq->split.vring)), new, old); } else { needs_kick = !(vq->split.vring.used->flags & - cpu_to_virtio16(_vq->vdev, + cpu_to_virtio16(vq->vq.vdev, VRING_USED_F_NO_NOTIFY)); } END_USE(vq); return needs_kick; } -static void detach_buf_split(struct vring_virtqueue *vq, unsigned int head, - void **ctx) +static void detach_indirect_split(struct vring_virtqueue *vq, + unsigned int head) +{ + struct vring_desc_extra *extra = vq->split.desc_extra; + struct vring_desc *indir_desc = vq->split.desc_state[head].indir_desc; + unsigned int j; + u32 len, num; + + /* Free the indirect table, if any, now that it's unmapped. 
*/ + if (!indir_desc) + return; + len = vq->split.desc_extra[head].len; + + BUG_ON(!(vq->split.desc_extra[head].flags & + VRING_DESC_F_INDIRECT)); + BUG_ON(len == 0 || len % sizeof(struct vring_desc)); + + num = len / sizeof(struct vring_desc); + + extra = (struct vring_desc_extra *)&indir_desc[num]; + + if (vq->use_map_api) { + for (j = 0; j < num; j++) + vring_unmap_one_split(vq, &extra[j]); + } + + kfree(indir_desc); + vq->split.desc_state[head].indir_desc = NULL; +} + +static unsigned detach_buf_split_in_order(struct vring_virtqueue *vq, + unsigned int head, + void **ctx) { struct vring_desc_extra *extra; - unsigned int i, j; + unsigned int i; __virtio16 nextflag = cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_NEXT); /* Clear data ptr. */ @@ -764,59 +868,56 @@ static void detach_buf_split(struct vring_virtqueue *vq, unsigned int head, i = head; while (vq->split.vring.desc[i].flags & nextflag) { - vring_unmap_one_split(vq, &extra[i]); - i = vq->split.desc_extra[i].next; + i = vring_unmap_one_split(vq, &extra[i]); vq->vq.num_free++; } vring_unmap_one_split(vq, &extra[i]); - vq->split.desc_extra[i].next = vq->free_head; - vq->free_head = head; /* Plus final descriptor */ vq->vq.num_free++; - if (vq->indirect) { - struct vring_desc *indir_desc = - vq->split.desc_state[head].indir_desc; - u32 len, num; - - /* Free the indirect table, if any, now that it's unmapped. 
*/ - if (!indir_desc) - return; - len = vq->split.desc_extra[head].len; - - BUG_ON(!(vq->split.desc_extra[head].flags & - VRING_DESC_F_INDIRECT)); - BUG_ON(len == 0 || len % sizeof(struct vring_desc)); - - num = len / sizeof(struct vring_desc); - - extra = (struct vring_desc_extra *)&indir_desc[num]; - - if (vq->use_map_api) { - for (j = 0; j < num; j++) - vring_unmap_one_split(vq, &extra[j]); - } - - kfree(indir_desc); - vq->split.desc_state[head].indir_desc = NULL; - } else if (ctx) { + if (vq->indirect) + detach_indirect_split(vq, head); + else if (ctx) *ctx = vq->split.desc_state[head].indir_desc; - } + + return i; +} + +static void detach_buf_split(struct vring_virtqueue *vq, unsigned int head, + void **ctx) +{ + unsigned int i = detach_buf_split_in_order(vq, head, ctx); + + vq->split.desc_extra[i].next = vq->free_head; + vq->free_head = head; +} + +static bool virtqueue_poll_split(const struct vring_virtqueue *vq, + unsigned int last_used_idx) +{ + return (u16)last_used_idx != virtio16_to_cpu(vq->vq.vdev, + vq->split.vring.used->idx); } static bool more_used_split(const struct vring_virtqueue *vq) { - return vq->last_used_idx != virtio16_to_cpu(vq->vq.vdev, - vq->split.vring.used->idx); + return virtqueue_poll_split(vq, vq->last_used_idx); } -static void *virtqueue_get_buf_ctx_split(struct virtqueue *_vq, +static bool more_used_split_in_order(const struct vring_virtqueue *vq) +{ + if (vq->batch_last.id != UINT_MAX) + return true; + + return virtqueue_poll_split(vq, vq->last_used_idx); +} + +static void *virtqueue_get_buf_ctx_split(struct vring_virtqueue *vq, unsigned int *len, void **ctx) { - struct vring_virtqueue *vq = to_vvq(_vq); void *ret; unsigned int i; u16 last_used; @@ -838,9 +939,9 @@ static void *virtqueue_get_buf_ctx_split(struct virtqueue *_vq, virtio_rmb(vq->weak_barriers); last_used = (vq->last_used_idx & (vq->split.vring.num - 1)); - i = virtio32_to_cpu(_vq->vdev, + i = virtio32_to_cpu(vq->vq.vdev, vq->split.vring.used->ring[last_used].id); - 
*len = virtio32_to_cpu(_vq->vdev, + *len = virtio32_to_cpu(vq->vq.vdev, vq->split.vring.used->ring[last_used].len); if (unlikely(i >= vq->split.vring.num)) { @@ -862,7 +963,7 @@ static void *virtqueue_get_buf_ctx_split(struct virtqueue *_vq, if (!(vq->split.avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT)) virtio_store_mb(vq->weak_barriers, &vring_used_event(&vq->split.vring), - cpu_to_virtio16(_vq->vdev, vq->last_used_idx)); + cpu_to_virtio16(vq->vq.vdev, vq->last_used_idx)); LAST_ADD_TIME_INVALID(vq); @@ -870,10 +971,78 @@ static void *virtqueue_get_buf_ctx_split(struct virtqueue *_vq, return ret; } -static void virtqueue_disable_cb_split(struct virtqueue *_vq) +static void *virtqueue_get_buf_ctx_split_in_order(struct vring_virtqueue *vq, + unsigned int *len, + void **ctx) { - struct vring_virtqueue *vq = to_vvq(_vq); + void *ret; + unsigned int num = vq->split.vring.num; + unsigned int num_free = vq->vq.num_free; + u16 last_used, last_used_idx; + START_USE(vq); + + if (unlikely(vq->broken)) { + END_USE(vq); + return NULL; + } + + last_used = vq->last_used & (num - 1); + last_used_idx = vq->last_used_idx & (num - 1); + + if (vq->batch_last.id == UINT_MAX) { + if (!more_used_split_in_order(vq)) { + pr_debug("No more buffers in queue\n"); + END_USE(vq); + return NULL; + } + + /* + * Only get used array entries after they have been + * exposed by host. + */ + virtio_rmb(vq->weak_barriers); + + vq->batch_last.id = virtio32_to_cpu(vq->vq.vdev, + vq->split.vring.used->ring[last_used_idx].id); + vq->batch_last.len = virtio32_to_cpu(vq->vq.vdev, + vq->split.vring.used->ring[last_used_idx].len); + } + + if (vq->batch_last.id == last_used) { + vq->batch_last.id = UINT_MAX; + *len = vq->batch_last.len; + } else { + *len = vq->split.desc_state[last_used].total_in_len; + } + + if (unlikely(!vq->split.desc_state[last_used].data)) { + BAD_RING(vq, "id %u is not a head!\n", last_used); + return NULL; + } + + /* detach_buf_split clears data, so grab it now. 
*/ + ret = vq->split.desc_state[last_used].data; + detach_buf_split_in_order(vq, last_used, ctx); + + vq->last_used_idx++; + vq->last_used += (vq->vq.num_free - num_free); + /* If we expect an interrupt for the next entry, tell host + * by writing event index and flush out the write before + * the read in the next get_buf call. */ + if (!(vq->split.avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT)) + virtio_store_mb(vq->weak_barriers, + &vring_used_event(&vq->split.vring), + cpu_to_virtio16(vq->vq.vdev, vq->last_used_idx)); + + LAST_ADD_TIME_INVALID(vq); + + END_USE(vq); + return ret; +} + +static void virtqueue_disable_cb_split(struct vring_virtqueue *vq) +{ if (!(vq->split.avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT)) { vq->split.avail_flags_shadow |= VRING_AVAIL_F_NO_INTERRUPT; @@ -889,14 +1058,13 @@ static void virtqueue_disable_cb_split(struct virtqueue *_vq) vring_used_event(&vq->split.vring) = 0x0; else vq->split.vring.avail->flags = - cpu_to_virtio16(_vq->vdev, + cpu_to_virtio16(vq->vq.vdev, vq->split.avail_flags_shadow); } } -static unsigned int virtqueue_enable_cb_prepare_split(struct virtqueue *_vq) +static unsigned int virtqueue_enable_cb_prepare_split(struct vring_virtqueue *vq) { - struct vring_virtqueue *vq = to_vvq(_vq); u16 last_used_idx; START_USE(vq); @@ -910,26 +1078,17 @@ static unsigned int virtqueue_enable_cb_prepare_split(struct virtqueue *_vq) vq->split.avail_flags_shadow &= ~VRING_AVAIL_F_NO_INTERRUPT; if (!vq->event) vq->split.vring.avail->flags = - cpu_to_virtio16(_vq->vdev, + cpu_to_virtio16(vq->vq.vdev, vq->split.avail_flags_shadow); } - vring_used_event(&vq->split.vring) = cpu_to_virtio16(_vq->vdev, + vring_used_event(&vq->split.vring) = cpu_to_virtio16(vq->vq.vdev, last_used_idx = vq->last_used_idx); END_USE(vq); return last_used_idx; } -static bool virtqueue_poll_split(struct virtqueue *_vq, unsigned int last_used_idx) +static bool virtqueue_enable_cb_delayed_split(struct vring_virtqueue *vq) { - struct vring_virtqueue *vq = 
to_vvq(_vq); - - return (u16)last_used_idx != virtio16_to_cpu(_vq->vdev, - vq->split.vring.used->idx); -} - -static bool virtqueue_enable_cb_delayed_split(struct virtqueue *_vq) -{ - struct vring_virtqueue *vq = to_vvq(_vq); u16 bufs; START_USE(vq); @@ -943,7 +1102,7 @@ static bool virtqueue_enable_cb_delayed_split(struct virtqueue *_vq) vq->split.avail_flags_shadow &= ~VRING_AVAIL_F_NO_INTERRUPT; if (!vq->event) vq->split.vring.avail->flags = - cpu_to_virtio16(_vq->vdev, + cpu_to_virtio16(vq->vq.vdev, vq->split.avail_flags_shadow); } /* TODO: tune this threshold */ @@ -951,9 +1110,9 @@ static bool virtqueue_enable_cb_delayed_split(struct virtqueue *_vq) virtio_store_mb(vq->weak_barriers, &vring_used_event(&vq->split.vring), - cpu_to_virtio16(_vq->vdev, vq->last_used_idx + bufs)); + cpu_to_virtio16(vq->vq.vdev, vq->last_used_idx + bufs)); - if (unlikely((u16)(virtio16_to_cpu(_vq->vdev, vq->split.vring.used->idx) + if (unlikely((u16)(virtio16_to_cpu(vq->vq.vdev, vq->split.vring.used->idx) - vq->last_used_idx) > bufs)) { END_USE(vq); return false; @@ -963,9 +1122,8 @@ static bool virtqueue_enable_cb_delayed_split(struct virtqueue *_vq) return true; } -static void *virtqueue_detach_unused_buf_split(struct virtqueue *_vq) +static void *virtqueue_detach_unused_buf_split(struct vring_virtqueue *vq) { - struct vring_virtqueue *vq = to_vvq(_vq); unsigned int i; void *buf; @@ -976,9 +1134,12 @@ static void *virtqueue_detach_unused_buf_split(struct virtqueue *_vq) continue; /* detach_buf_split clears data, so grab it now. 
*/ buf = vq->split.desc_state[i].data; - detach_buf_split(vq, i, NULL); + if (virtqueue_is_in_order(vq)) + detach_buf_split_in_order(vq, i, NULL); + else + detach_buf_split(vq, i, NULL); vq->split.avail_idx_shadow--; - vq->split.vring.avail->idx = cpu_to_virtio16(_vq->vdev, + vq->split.vring.avail->idx = cpu_to_virtio16(vq->vq.vdev, vq->split.avail_idx_shadow); END_USE(vq); return buf; @@ -1009,7 +1170,7 @@ static void virtqueue_vring_init_split(struct vring_virtqueue_split *vring_split } } -static void virtqueue_reinit_split(struct vring_virtqueue *vq) +static void virtqueue_reset_split(struct vring_virtqueue *vq) { int num; @@ -1039,6 +1200,7 @@ static void virtqueue_vring_attach_split(struct vring_virtqueue *vq, /* Put everything in free lists. */ vq->free_head = 0; + vq->batch_last.id = UINT_MAX; } static int vring_alloc_state_extra_split(struct vring_virtqueue_split *vring_split) @@ -1131,6 +1293,8 @@ static int vring_alloc_queue_split(struct vring_virtqueue_split *vring_split, return 0; } +static const struct virtqueue_ops split_ops; + static struct virtqueue *__vring_new_virtqueue_split(unsigned int index, struct vring_virtqueue_split *vring_split, struct virtio_device *vdev, @@ -1148,7 +1312,6 @@ static struct virtqueue *__vring_new_virtqueue_split(unsigned int index, if (!vq) return NULL; - vq->packed_ring = false; vq->vq.callback = callback; vq->vq.vdev = vdev; vq->vq.name = name; @@ -1168,6 +1331,8 @@ static struct virtqueue *__vring_new_virtqueue_split(unsigned int index, vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC) && !context; vq->event = virtio_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX); + vq->layout = virtio_has_feature(vdev, VIRTIO_F_IN_ORDER) ? 
+ VQ_LAYOUT_SPLIT_IN_ORDER : VQ_LAYOUT_SPLIT; if (virtio_has_feature(vdev, VIRTIO_F_ORDER_PLATFORM)) vq->weak_barriers = false; @@ -1223,11 +1388,10 @@ static struct virtqueue *vring_create_virtqueue_split( return vq; } -static int virtqueue_resize_split(struct virtqueue *_vq, u32 num) +static int virtqueue_resize_split(struct vring_virtqueue *vq, u32 num) { struct vring_virtqueue_split vring_split = {}; - struct vring_virtqueue *vq = to_vvq(_vq); - struct virtio_device *vdev = _vq->vdev; + struct virtio_device *vdev = vq->vq.vdev; int err; err = vring_alloc_queue_split(&vring_split, vdev, num, @@ -1253,7 +1417,7 @@ static int virtqueue_resize_split(struct virtqueue *_vq, u32 num) err_state_extra: vring_free_split(&vring_split, vdev, vq->map); err: - virtqueue_reinit_split(vq); + virtqueue_reset_split(vq); return -ENOMEM; } @@ -1326,13 +1490,15 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq, unsigned int in_sgs, void *data, bool premapped, - gfp_t gfp) + gfp_t gfp, + u16 id, + unsigned long attr) { struct vring_desc_extra *extra; struct vring_packed_desc *desc; struct scatterlist *sg; - unsigned int i, n, err_idx, len; - u16 head, id; + unsigned int i, n, err_idx, len, total_in_len = 0; + u16 head; dma_addr_t addr; head = vq->packed.next_avail_idx; @@ -1350,14 +1516,12 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq, } i = 0; - id = vq->free_head; - BUG_ON(id == vq->packed.vring.num); for (n = 0; n < out_sgs + in_sgs; n++) { for (sg = sgs[n]; sg; sg = sg_next(sg)) { if (vring_map_one_sg(vq, sg, n < out_sgs ? DMA_TO_DEVICE : DMA_FROM_DEVICE, - &addr, &len, premapped)) + &addr, &len, premapped, attr)) goto unmap_release; desc[i].flags = cpu_to_le16(n < out_sgs ? @@ -1371,6 +1535,8 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq, extra[i].flags = n < out_sgs ? 
0 : VRING_DESC_F_WRITE; } + if (n >= out_sgs) + total_in_len += len; i++; } } @@ -1417,13 +1583,15 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq, 1 << VRING_PACKED_DESC_F_USED; } vq->packed.next_avail_idx = n; - vq->free_head = vq->packed.desc_extra[id].next; + if (!virtqueue_is_in_order(vq)) + vq->free_head = vq->packed.desc_extra[id].next; /* Store token and indirect buffer state. */ vq->packed.desc_state[id].num = 1; vq->packed.desc_state[id].data = data; vq->packed.desc_state[id].indir_desc = desc; vq->packed.desc_state[id].last = id; + vq->packed.desc_state[id].total_in_len = total_in_len; vq->num_added += 1; @@ -1444,7 +1612,7 @@ unmap_release: return -ENOMEM; } -static inline int virtqueue_add_packed(struct virtqueue *_vq, +static inline int virtqueue_add_packed(struct vring_virtqueue *vq, struct scatterlist *sgs[], unsigned int total_sg, unsigned int out_sgs, @@ -1452,9 +1620,9 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq, void *data, void *ctx, bool premapped, - gfp_t gfp) + gfp_t gfp, + unsigned long attr) { - struct vring_virtqueue *vq = to_vvq(_vq); struct vring_packed_desc *desc; struct scatterlist *sg; unsigned int i, n, c, descs_used, err_idx, len; @@ -1477,8 +1645,11 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq, BUG_ON(total_sg == 0); if (virtqueue_use_indirect(vq, total_sg)) { + id = vq->free_head; + BUG_ON(id == vq->packed.vring.num); err = virtqueue_add_indirect_packed(vq, sgs, total_sg, out_sgs, - in_sgs, data, premapped, gfp); + in_sgs, data, premapped, gfp, + id, attr); if (err != -ENOMEM) { END_USE(vq); return err; @@ -1514,7 +1685,7 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq, if (vring_map_one_sg(vq, sg, n < out_sgs ? 
DMA_TO_DEVICE : DMA_FROM_DEVICE, - &addr, &len, premapped)) + &addr, &len, premapped, attr)) goto unmap_release; flags = cpu_to_le16(vq->packed.avail_used_flags | @@ -1599,9 +1770,164 @@ unmap_release: return -EIO; } -static bool virtqueue_kick_prepare_packed(struct virtqueue *_vq) +static inline int virtqueue_add_packed_in_order(struct vring_virtqueue *vq, + struct scatterlist *sgs[], + unsigned int total_sg, + unsigned int out_sgs, + unsigned int in_sgs, + void *data, + void *ctx, + bool premapped, + gfp_t gfp, + unsigned long attr) +{ + struct vring_packed_desc *desc; + struct scatterlist *sg; + unsigned int i, n, sg_count, err_idx, total_in_len = 0; + __le16 head_flags, flags; + u16 head, avail_used_flags; + bool avail_wrap_counter; + int err; + + START_USE(vq); + + BUG_ON(data == NULL); + BUG_ON(ctx && vq->indirect); + + if (unlikely(vq->broken)) { + END_USE(vq); + return -EIO; + } + + LAST_ADD_TIME_UPDATE(vq); + + BUG_ON(total_sg == 0); + + if (virtqueue_use_indirect(vq, total_sg)) { + err = virtqueue_add_indirect_packed(vq, sgs, total_sg, out_sgs, + in_sgs, data, premapped, gfp, + vq->packed.next_avail_idx, + attr); + if (err != -ENOMEM) { + END_USE(vq); + return err; + } + + /* fall back on direct */ + } + + head = vq->packed.next_avail_idx; + avail_used_flags = vq->packed.avail_used_flags; + avail_wrap_counter = vq->packed.avail_wrap_counter; + + WARN_ON_ONCE(total_sg > vq->packed.vring.num && !vq->indirect); + + desc = vq->packed.vring.desc; + i = head; + + if (unlikely(vq->vq.num_free < total_sg)) { + pr_debug("Can't add buf len %i - avail = %i\n", + total_sg, vq->vq.num_free); + END_USE(vq); + return -ENOSPC; + } + + sg_count = 0; + for (n = 0; n < out_sgs + in_sgs; n++) { + for (sg = sgs[n]; sg; sg = sg_next(sg)) { + dma_addr_t addr; + u32 len; + + flags = 0; + if (++sg_count != total_sg) + flags |= cpu_to_le16(VRING_DESC_F_NEXT); + if (n >= out_sgs) + flags |= cpu_to_le16(VRING_DESC_F_WRITE); + + if (vring_map_one_sg(vq, sg, n < out_sgs ? 
+ DMA_TO_DEVICE : DMA_FROM_DEVICE, + &addr, &len, premapped, attr)) + goto unmap_release; + + flags |= cpu_to_le16(vq->packed.avail_used_flags); + + if (i == head) + head_flags = flags; + else + desc[i].flags = flags; + + desc[i].addr = cpu_to_le64(addr); + desc[i].len = cpu_to_le32(len); + desc[i].id = cpu_to_le16(head); + + if (unlikely(vq->use_map_api)) { + vq->packed.desc_extra[i].addr = premapped ? + DMA_MAPPING_ERROR : addr; + vq->packed.desc_extra[i].len = len; + vq->packed.desc_extra[i].flags = + le16_to_cpu(flags); + } + + if ((unlikely(++i >= vq->packed.vring.num))) { + i = 0; + vq->packed.avail_used_flags ^= + 1 << VRING_PACKED_DESC_F_AVAIL | + 1 << VRING_PACKED_DESC_F_USED; + vq->packed.avail_wrap_counter ^= 1; + } + + if (n >= out_sgs) + total_in_len += len; + } + } + + /* We're using some buffers from the free list. */ + vq->vq.num_free -= total_sg; + + /* Update free pointer */ + vq->packed.next_avail_idx = i; + + /* Store token. */ + vq->packed.desc_state[head].num = total_sg; + vq->packed.desc_state[head].data = data; + vq->packed.desc_state[head].indir_desc = ctx; + vq->packed.desc_state[head].total_in_len = total_in_len; + + /* + * A driver MUST NOT make the first descriptor in the list + * available before all subsequent descriptors comprising + * the list are made available. 
+ */ + virtio_wmb(vq->weak_barriers); + vq->packed.vring.desc[head].flags = head_flags; + vq->num_added += total_sg; + + pr_debug("Added buffer head %i to %p\n", head, vq); + END_USE(vq); + + return 0; + +unmap_release: + err_idx = i; + i = head; + vq->packed.avail_used_flags = avail_used_flags; + vq->packed.avail_wrap_counter = avail_wrap_counter; + + for (n = 0; n < total_sg; n++) { + if (i == err_idx) + break; + vring_unmap_extra_packed(vq, &vq->packed.desc_extra[i]); + i++; + if (i >= vq->packed.vring.num) + i = 0; + } + + END_USE(vq); + return -EIO; +} + +static bool virtqueue_kick_prepare_packed(struct vring_virtqueue *vq) { - struct vring_virtqueue *vq = to_vvq(_vq); u16 new, old, off_wrap, flags, wrap_counter, event_idx; bool needs_kick; union { @@ -1648,8 +1974,8 @@ out: return needs_kick; } -static void detach_buf_packed(struct vring_virtqueue *vq, - unsigned int id, void **ctx) +static void detach_buf_packed_in_order(struct vring_virtqueue *vq, + unsigned int id, void **ctx) { struct vring_desc_state_packed *state = NULL; struct vring_packed_desc *desc; @@ -1660,8 +1986,6 @@ static void detach_buf_packed(struct vring_virtqueue *vq, /* Clear data ptr. 
*/ state->data = NULL; - vq->packed.desc_extra[state->last].next = vq->free_head; - vq->free_head = id; vq->vq.num_free += state->num; if (unlikely(vq->use_map_api)) { @@ -1698,6 +2022,17 @@ static void detach_buf_packed(struct vring_virtqueue *vq, } } +static void detach_buf_packed(struct vring_virtqueue *vq, + unsigned int id, void **ctx) +{ + struct vring_desc_state_packed *state = &vq->packed.desc_state[id]; + + vq->packed.desc_extra[state->last].next = vq->free_head; + vq->free_head = id; + + detach_buf_packed_in_order(vq, id, ctx); +} + static inline bool is_used_desc_packed(const struct vring_virtqueue *vq, u16 idx, bool used_wrap_counter) { @@ -1711,23 +2046,123 @@ static inline bool is_used_desc_packed(const struct vring_virtqueue *vq, return avail == used && used == used_wrap_counter; } -static bool more_used_packed(const struct vring_virtqueue *vq) +static bool virtqueue_poll_packed(const struct vring_virtqueue *vq, + unsigned int off_wrap) { - u16 last_used; - u16 last_used_idx; - bool used_wrap_counter; + bool wrap_counter; + u16 used_idx; - last_used_idx = READ_ONCE(vq->last_used_idx); - last_used = packed_last_used(last_used_idx); - used_wrap_counter = packed_used_wrap_counter(last_used_idx); - return is_used_desc_packed(vq, last_used, used_wrap_counter); + wrap_counter = off_wrap >> VRING_PACKED_EVENT_F_WRAP_CTR; + used_idx = off_wrap & ~(1 << VRING_PACKED_EVENT_F_WRAP_CTR); + + return is_used_desc_packed(vq, used_idx, wrap_counter); } -static void *virtqueue_get_buf_ctx_packed(struct virtqueue *_vq, +static bool more_used_packed(const struct vring_virtqueue *vq) +{ + return virtqueue_poll_packed(vq, READ_ONCE(vq->last_used_idx)); +} + +static void update_last_used_idx_packed(struct vring_virtqueue *vq, + u16 id, u16 last_used, + u16 used_wrap_counter) +{ + last_used += vq->packed.desc_state[id].num; + if (unlikely(last_used >= vq->packed.vring.num)) { + last_used -= vq->packed.vring.num; + used_wrap_counter ^= 1; + } + + last_used = (last_used | 
(used_wrap_counter << VRING_PACKED_EVENT_F_WRAP_CTR)); + WRITE_ONCE(vq->last_used_idx, last_used); + + /* + * If we expect an interrupt for the next entry, tell host + * by writing event index and flush out the write before + * the read in the next get_buf call. + */ + if (vq->packed.event_flags_shadow == VRING_PACKED_EVENT_FLAG_DESC) + virtio_store_mb(vq->weak_barriers, + &vq->packed.vring.driver->off_wrap, + cpu_to_le16(vq->last_used_idx)); +} + +static bool more_used_packed_in_order(const struct vring_virtqueue *vq) +{ + if (vq->batch_last.id != UINT_MAX) + return true; + + return virtqueue_poll_packed(vq, READ_ONCE(vq->last_used_idx)); +} + +static void *virtqueue_get_buf_ctx_packed_in_order(struct vring_virtqueue *vq, + unsigned int *len, + void **ctx) +{ + unsigned int num = vq->packed.vring.num; + u16 last_used, last_used_idx; + bool used_wrap_counter; + void *ret; + + START_USE(vq); + + if (unlikely(vq->broken)) { + END_USE(vq); + return NULL; + } + + last_used_idx = vq->last_used_idx; + used_wrap_counter = packed_used_wrap_counter(last_used_idx); + last_used = packed_last_used(last_used_idx); + + if (vq->batch_last.id == UINT_MAX) { + if (!more_used_packed_in_order(vq)) { + pr_debug("No more buffers in queue\n"); + END_USE(vq); + return NULL; + } + /* Only get used elements after they have been exposed by host. */ + virtio_rmb(vq->weak_barriers); + vq->batch_last.id = + le16_to_cpu(vq->packed.vring.desc[last_used].id); + vq->batch_last.len = + le32_to_cpu(vq->packed.vring.desc[last_used].len); + } + + if (vq->batch_last.id == last_used) { + vq->batch_last.id = UINT_MAX; + *len = vq->batch_last.len; + } else { + *len = vq->packed.desc_state[last_used].total_in_len; + } + + if (unlikely(last_used >= num)) { + BAD_RING(vq, "id %u out of range\n", last_used); + return NULL; + } + if (unlikely(!vq->packed.desc_state[last_used].data)) { + BAD_RING(vq, "id %u is not a head!\n", last_used); + return NULL; + } + + /* detach_buf_packed clears data, so grab it now. 
*/ + ret = vq->packed.desc_state[last_used].data; + detach_buf_packed_in_order(vq, last_used, ctx); + + update_last_used_idx_packed(vq, last_used, last_used, + used_wrap_counter); + + LAST_ADD_TIME_INVALID(vq); + + END_USE(vq); + return ret; +} + +static void *virtqueue_get_buf_ctx_packed(struct vring_virtqueue *vq, unsigned int *len, void **ctx) { - struct vring_virtqueue *vq = to_vvq(_vq); + unsigned int num = vq->packed.vring.num; u16 last_used, id, last_used_idx; bool used_wrap_counter; void *ret; @@ -1754,7 +2189,7 @@ static void *virtqueue_get_buf_ctx_packed(struct virtqueue *_vq, id = le16_to_cpu(vq->packed.vring.desc[last_used].id); *len = le32_to_cpu(vq->packed.vring.desc[last_used].len); - if (unlikely(id >= vq->packed.vring.num)) { + if (unlikely(id >= num)) { BAD_RING(vq, "id %u out of range\n", id); return NULL; } @@ -1767,24 +2202,7 @@ static void *virtqueue_get_buf_ctx_packed(struct virtqueue *_vq, ret = vq->packed.desc_state[id].data; detach_buf_packed(vq, id, ctx); - last_used += vq->packed.desc_state[id].num; - if (unlikely(last_used >= vq->packed.vring.num)) { - last_used -= vq->packed.vring.num; - used_wrap_counter ^= 1; - } - - last_used = (last_used | (used_wrap_counter << VRING_PACKED_EVENT_F_WRAP_CTR)); - WRITE_ONCE(vq->last_used_idx, last_used); - - /* - * If we expect an interrupt for the next entry, tell host - * by writing event index and flush out the write before - * the read in the next get_buf call. 
- */ - if (vq->packed.event_flags_shadow == VRING_PACKED_EVENT_FLAG_DESC) - virtio_store_mb(vq->weak_barriers, - &vq->packed.vring.driver->off_wrap, - cpu_to_le16(vq->last_used_idx)); + update_last_used_idx_packed(vq, id, last_used, used_wrap_counter); LAST_ADD_TIME_INVALID(vq); @@ -1792,10 +2210,8 @@ static void *virtqueue_get_buf_ctx_packed(struct virtqueue *_vq, return ret; } -static void virtqueue_disable_cb_packed(struct virtqueue *_vq) +static void virtqueue_disable_cb_packed(struct vring_virtqueue *vq) { - struct vring_virtqueue *vq = to_vvq(_vq); - if (vq->packed.event_flags_shadow != VRING_PACKED_EVENT_FLAG_DISABLE) { vq->packed.event_flags_shadow = VRING_PACKED_EVENT_FLAG_DISABLE; @@ -1811,10 +2227,8 @@ static void virtqueue_disable_cb_packed(struct virtqueue *_vq) } } -static unsigned int virtqueue_enable_cb_prepare_packed(struct virtqueue *_vq) +static unsigned int virtqueue_enable_cb_prepare_packed(struct vring_virtqueue *vq) { - struct vring_virtqueue *vq = to_vvq(_vq); - START_USE(vq); /* @@ -1844,21 +2258,8 @@ static unsigned int virtqueue_enable_cb_prepare_packed(struct virtqueue *_vq) return vq->last_used_idx; } -static bool virtqueue_poll_packed(struct virtqueue *_vq, u16 off_wrap) +static bool virtqueue_enable_cb_delayed_packed(struct vring_virtqueue *vq) { - struct vring_virtqueue *vq = to_vvq(_vq); - bool wrap_counter; - u16 used_idx; - - wrap_counter = off_wrap >> VRING_PACKED_EVENT_F_WRAP_CTR; - used_idx = off_wrap & ~(1 << VRING_PACKED_EVENT_F_WRAP_CTR); - - return is_used_desc_packed(vq, used_idx, wrap_counter); -} - -static bool virtqueue_enable_cb_delayed_packed(struct virtqueue *_vq) -{ - struct vring_virtqueue *vq = to_vvq(_vq); u16 used_idx, wrap_counter, last_used_idx; u16 bufs; @@ -1917,9 +2318,8 @@ static bool virtqueue_enable_cb_delayed_packed(struct virtqueue *_vq) return true; } -static void *virtqueue_detach_unused_buf_packed(struct virtqueue *_vq) +static void *virtqueue_detach_unused_buf_packed(struct vring_virtqueue *vq) { - 
struct vring_virtqueue *vq = to_vvq(_vq); unsigned int i; void *buf; @@ -1930,7 +2330,10 @@ static void *virtqueue_detach_unused_buf_packed(struct virtqueue *_vq) continue; /* detach_buf clears data, so grab it now. */ buf = vq->packed.desc_state[i].data; - detach_buf_packed(vq, i, NULL); + if (virtqueue_is_in_order(vq)) + detach_buf_packed_in_order(vq, i, NULL); + else + detach_buf_packed(vq, i, NULL); END_USE(vq); return buf; } @@ -1956,6 +2359,8 @@ static struct vring_desc_extra *vring_alloc_desc_extra(unsigned int num) for (i = 0; i < num - 1; i++) desc_extra[i].next = i + 1; + desc_extra[num - 1].next = 0; + return desc_extra; } @@ -2087,22 +2492,30 @@ static void virtqueue_vring_attach_packed(struct vring_virtqueue *vq, { vq->packed = *vring_packed; - /* Put everything in free lists. */ - vq->free_head = 0; + if (virtqueue_is_in_order(vq)) { + vq->batch_last.id = UINT_MAX; + } else { + /* + * Put everything in free lists. Note that + * next_avail_idx is sufficient with IN_ORDER so + * free_head is unused. + */ + vq->free_head = 0; + } } - -static void virtqueue_reinit_packed(struct vring_virtqueue *vq) +static void virtqueue_reset_packed(struct vring_virtqueue *vq) { memset(vq->packed.vring.device, 0, vq->packed.event_size_in_bytes); memset(vq->packed.vring.driver, 0, vq->packed.event_size_in_bytes); /* we need to reset the desc.flags. 
For more, see is_used_desc_packed() */ memset(vq->packed.vring.desc, 0, vq->packed.ring_size_in_bytes); - virtqueue_init(vq, vq->packed.vring.num); virtqueue_vring_init_packed(&vq->packed, !!vq->vq.callback); } +static const struct virtqueue_ops packed_ops; + static struct virtqueue *__vring_new_virtqueue_packed(unsigned int index, struct vring_virtqueue_packed *vring_packed, struct virtio_device *vdev, @@ -2133,13 +2546,14 @@ static struct virtqueue *__vring_new_virtqueue_packed(unsigned int index, #else vq->broken = false; #endif - vq->packed_ring = true; vq->map = map; vq->use_map_api = vring_use_map_api(vdev); vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC) && !context; vq->event = virtio_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX); + vq->layout = virtio_has_feature(vdev, VIRTIO_F_IN_ORDER) ? + VQ_LAYOUT_PACKED_IN_ORDER : VQ_LAYOUT_PACKED; if (virtio_has_feature(vdev, VIRTIO_F_ORDER_PLATFORM)) vq->weak_barriers = false; @@ -2192,11 +2606,10 @@ static struct virtqueue *vring_create_virtqueue_packed( return vq; } -static int virtqueue_resize_packed(struct virtqueue *_vq, u32 num) +static int virtqueue_resize_packed(struct vring_virtqueue *vq, u32 num) { struct vring_virtqueue_packed vring_packed = {}; - struct vring_virtqueue *vq = to_vvq(_vq); - struct virtio_device *vdev = _vq->vdev; + struct virtio_device *vdev = vq->vq.vdev; int err; if (vring_alloc_queue_packed(&vring_packed, vdev, num, vq->map)) @@ -2218,10 +2631,66 @@ static int virtqueue_resize_packed(struct virtqueue *_vq, u32 num) err_state_extra: vring_free_packed(&vring_packed, vdev, vq->map); err_ring: - virtqueue_reinit_packed(vq); + virtqueue_reset_packed(vq); return -ENOMEM; } +static const struct virtqueue_ops split_ops = { + .add = virtqueue_add_split, + .get = virtqueue_get_buf_ctx_split, + .kick_prepare = virtqueue_kick_prepare_split, + .disable_cb = virtqueue_disable_cb_split, + .enable_cb_delayed = virtqueue_enable_cb_delayed_split, + .enable_cb_prepare = 
virtqueue_enable_cb_prepare_split, + .poll = virtqueue_poll_split, + .detach_unused_buf = virtqueue_detach_unused_buf_split, + .more_used = more_used_split, + .resize = virtqueue_resize_split, + .reset = virtqueue_reset_split, +}; + +static const struct virtqueue_ops packed_ops = { + .add = virtqueue_add_packed, + .get = virtqueue_get_buf_ctx_packed, + .kick_prepare = virtqueue_kick_prepare_packed, + .disable_cb = virtqueue_disable_cb_packed, + .enable_cb_delayed = virtqueue_enable_cb_delayed_packed, + .enable_cb_prepare = virtqueue_enable_cb_prepare_packed, + .poll = virtqueue_poll_packed, + .detach_unused_buf = virtqueue_detach_unused_buf_packed, + .more_used = more_used_packed, + .resize = virtqueue_resize_packed, + .reset = virtqueue_reset_packed, +}; + +static const struct virtqueue_ops split_in_order_ops = { + .add = virtqueue_add_split, + .get = virtqueue_get_buf_ctx_split_in_order, + .kick_prepare = virtqueue_kick_prepare_split, + .disable_cb = virtqueue_disable_cb_split, + .enable_cb_delayed = virtqueue_enable_cb_delayed_split, + .enable_cb_prepare = virtqueue_enable_cb_prepare_split, + .poll = virtqueue_poll_split, + .detach_unused_buf = virtqueue_detach_unused_buf_split, + .more_used = more_used_split_in_order, + .resize = virtqueue_resize_split, + .reset = virtqueue_reset_split, +}; + +static const struct virtqueue_ops packed_in_order_ops = { + .add = virtqueue_add_packed_in_order, + .get = virtqueue_get_buf_ctx_packed_in_order, + .kick_prepare = virtqueue_kick_prepare_packed, + .disable_cb = virtqueue_disable_cb_packed, + .enable_cb_delayed = virtqueue_enable_cb_delayed_packed, + .enable_cb_prepare = virtqueue_enable_cb_prepare_packed, + .poll = virtqueue_poll_packed, + .detach_unused_buf = virtqueue_detach_unused_buf_packed, + .more_used = more_used_packed_in_order, + .resize = virtqueue_resize_packed, + .reset = virtqueue_reset_packed, +}; + static int virtqueue_disable_and_recycle(struct virtqueue *_vq, void (*recycle)(struct virtqueue *vq, void 
*buf)) { @@ -2264,6 +2733,54 @@ static int virtqueue_enable_after_reset(struct virtqueue *_vq) * Generic functions and exported symbols. */ +#define VIRTQUEUE_CALL(vq, op, ...) \ + ({ \ + typeof(vq) __VIRTQUEUE_CALL_vq = (vq); \ + typeof(split_ops.op(__VIRTQUEUE_CALL_vq, ##__VA_ARGS__)) ret; \ + \ + switch (__VIRTQUEUE_CALL_vq->layout) { \ + case VQ_LAYOUT_SPLIT: \ + ret = split_ops.op(__VIRTQUEUE_CALL_vq, ##__VA_ARGS__); \ + break; \ + case VQ_LAYOUT_PACKED: \ + ret = packed_ops.op(__VIRTQUEUE_CALL_vq, ##__VA_ARGS__);\ + break; \ + case VQ_LAYOUT_SPLIT_IN_ORDER: \ + ret = split_in_order_ops.op(__VIRTQUEUE_CALL_vq, ##__VA_ARGS__); \ + break; \ + case VQ_LAYOUT_PACKED_IN_ORDER: \ + ret = packed_in_order_ops.op(__VIRTQUEUE_CALL_vq, ##__VA_ARGS__); \ + break; \ + default: \ + BUG(); \ + break; \ + } \ + ret; \ +}) + +#define VOID_VIRTQUEUE_CALL(vq, op, ...) \ + ({ \ + typeof(vq) __VIRTQUEUE_CALL_vq = (vq); \ + \ + switch (__VIRTQUEUE_CALL_vq->layout) { \ + case VQ_LAYOUT_SPLIT: \ + split_ops.op(__VIRTQUEUE_CALL_vq, ##__VA_ARGS__); \ + break; \ + case VQ_LAYOUT_PACKED: \ + packed_ops.op(__VIRTQUEUE_CALL_vq, ##__VA_ARGS__); \ + break; \ + case VQ_LAYOUT_SPLIT_IN_ORDER: \ + split_in_order_ops.op(__VIRTQUEUE_CALL_vq, ##__VA_ARGS__); \ + break; \ + case VQ_LAYOUT_PACKED_IN_ORDER: \ + packed_in_order_ops.op(__VIRTQUEUE_CALL_vq, ##__VA_ARGS__); \ + break; \ + default: \ + BUG(); \ + break; \ + } \ +}) + static inline int virtqueue_add(struct virtqueue *_vq, struct scatterlist *sgs[], unsigned int total_sg, @@ -2272,14 +2789,14 @@ static inline int virtqueue_add(struct virtqueue *_vq, void *data, void *ctx, bool premapped, - gfp_t gfp) + gfp_t gfp, + unsigned long attr) { struct vring_virtqueue *vq = to_vvq(_vq); - return vq->packed_ring ? 
virtqueue_add_packed(_vq, sgs, total_sg, - out_sgs, in_sgs, data, ctx, premapped, gfp) : - virtqueue_add_split(_vq, sgs, total_sg, - out_sgs, in_sgs, data, ctx, premapped, gfp); + return VIRTQUEUE_CALL(vq, add, sgs, total_sg, + out_sgs, in_sgs, data, + ctx, premapped, gfp, attr); } /** @@ -2317,7 +2834,7 @@ int virtqueue_add_sgs(struct virtqueue *_vq, total_sg++; } return virtqueue_add(_vq, sgs, total_sg, out_sgs, in_sgs, - data, NULL, false, gfp); + data, NULL, false, gfp, 0); } EXPORT_SYMBOL_GPL(virtqueue_add_sgs); @@ -2339,7 +2856,7 @@ int virtqueue_add_outbuf(struct virtqueue *vq, void *data, gfp_t gfp) { - return virtqueue_add(vq, &sg, num, 1, 0, data, NULL, false, gfp); + return virtqueue_add(vq, &sg, num, 1, 0, data, NULL, false, gfp, 0); } EXPORT_SYMBOL_GPL(virtqueue_add_outbuf); @@ -2362,7 +2879,7 @@ int virtqueue_add_outbuf_premapped(struct virtqueue *vq, void *data, gfp_t gfp) { - return virtqueue_add(vq, &sg, num, 1, 0, data, NULL, true, gfp); + return virtqueue_add(vq, &sg, num, 1, 0, data, NULL, true, gfp, 0); } EXPORT_SYMBOL_GPL(virtqueue_add_outbuf_premapped); @@ -2384,10 +2901,38 @@ int virtqueue_add_inbuf(struct virtqueue *vq, void *data, gfp_t gfp) { - return virtqueue_add(vq, &sg, num, 0, 1, data, NULL, false, gfp); + return virtqueue_add(vq, &sg, num, 0, 1, data, NULL, false, gfp, 0); } EXPORT_SYMBOL_GPL(virtqueue_add_inbuf); +/** + * virtqueue_add_inbuf_cache_clean - expose input buffers with cache clean + * @vq: the struct virtqueue we're talking about. + * @sg: scatterlist (must be well-formed and terminated!) + * @num: the number of entries in @sg writable by other side + * @data: the token identifying the buffer. + * @gfp: how to do memory allocations (if necessary). + * + * Same as virtqueue_add_inbuf but passes DMA_ATTR_CPU_CACHE_CLEAN to indicate + * that the CPU will not dirty any cacheline overlapping this buffer while it + * is available, and to suppress overlapping cacheline warnings in DMA debug + * builds. 
+ * + * Caller must ensure we don't call this with other virtqueue operations + * at the same time (except where noted). + * + * Returns zero or a negative error (ie. ENOSPC, ENOMEM, EIO). + */ +int virtqueue_add_inbuf_cache_clean(struct virtqueue *vq, + struct scatterlist *sg, unsigned int num, + void *data, + gfp_t gfp) +{ + return virtqueue_add(vq, &sg, num, 0, 1, data, NULL, false, gfp, + DMA_ATTR_CPU_CACHE_CLEAN); +} +EXPORT_SYMBOL_GPL(virtqueue_add_inbuf_cache_clean); + /** * virtqueue_add_inbuf_ctx - expose input buffers to other end * @vq: the struct virtqueue we're talking about. @@ -2408,7 +2953,7 @@ int virtqueue_add_inbuf_ctx(struct virtqueue *vq, void *ctx, gfp_t gfp) { - return virtqueue_add(vq, &sg, num, 0, 1, data, ctx, false, gfp); + return virtqueue_add(vq, &sg, num, 0, 1, data, ctx, false, gfp, 0); } EXPORT_SYMBOL_GPL(virtqueue_add_inbuf_ctx); @@ -2433,7 +2978,7 @@ int virtqueue_add_inbuf_premapped(struct virtqueue *vq, void *ctx, gfp_t gfp) { - return virtqueue_add(vq, &sg, num, 0, 1, data, ctx, true, gfp); + return virtqueue_add(vq, &sg, num, 0, 1, data, ctx, true, gfp, 0); } EXPORT_SYMBOL_GPL(virtqueue_add_inbuf_premapped); @@ -2469,8 +3014,7 @@ bool virtqueue_kick_prepare(struct virtqueue *_vq) { struct vring_virtqueue *vq = to_vvq(_vq); - return vq->packed_ring ? virtqueue_kick_prepare_packed(_vq) : - virtqueue_kick_prepare_split(_vq); + return VIRTQUEUE_CALL(vq, kick_prepare); } EXPORT_SYMBOL_GPL(virtqueue_kick_prepare); @@ -2540,8 +3084,7 @@ void *virtqueue_get_buf_ctx(struct virtqueue *_vq, unsigned int *len, { struct vring_virtqueue *vq = to_vvq(_vq); - return vq->packed_ring ? 
virtqueue_get_buf_ctx_packed(_vq, len, ctx) : - virtqueue_get_buf_ctx_split(_vq, len, ctx); + return VIRTQUEUE_CALL(vq, get, len, ctx); } EXPORT_SYMBOL_GPL(virtqueue_get_buf_ctx); @@ -2563,10 +3106,7 @@ void virtqueue_disable_cb(struct virtqueue *_vq) { struct vring_virtqueue *vq = to_vvq(_vq); - if (vq->packed_ring) - virtqueue_disable_cb_packed(_vq); - else - virtqueue_disable_cb_split(_vq); + VOID_VIRTQUEUE_CALL(vq, disable_cb); } EXPORT_SYMBOL_GPL(virtqueue_disable_cb); @@ -2589,8 +3129,7 @@ unsigned int virtqueue_enable_cb_prepare(struct virtqueue *_vq) if (vq->event_triggered) vq->event_triggered = false; - return vq->packed_ring ? virtqueue_enable_cb_prepare_packed(_vq) : - virtqueue_enable_cb_prepare_split(_vq); + return VIRTQUEUE_CALL(vq, enable_cb_prepare); } EXPORT_SYMBOL_GPL(virtqueue_enable_cb_prepare); @@ -2611,8 +3150,8 @@ bool virtqueue_poll(struct virtqueue *_vq, unsigned int last_used_idx) return false; virtio_mb(vq->weak_barriers); - return vq->packed_ring ? virtqueue_poll_packed(_vq, last_used_idx) : - virtqueue_poll_split(_vq, last_used_idx); + + return VIRTQUEUE_CALL(vq, poll, last_used_idx); } EXPORT_SYMBOL_GPL(virtqueue_poll); @@ -2655,8 +3194,7 @@ bool virtqueue_enable_cb_delayed(struct virtqueue *_vq) if (vq->event_triggered) data_race(vq->event_triggered = false); - return vq->packed_ring ? virtqueue_enable_cb_delayed_packed(_vq) : - virtqueue_enable_cb_delayed_split(_vq); + return VIRTQUEUE_CALL(vq, enable_cb_delayed); } EXPORT_SYMBOL_GPL(virtqueue_enable_cb_delayed); @@ -2672,14 +3210,13 @@ void *virtqueue_detach_unused_buf(struct virtqueue *_vq) { struct vring_virtqueue *vq = to_vvq(_vq); - return vq->packed_ring ? virtqueue_detach_unused_buf_packed(_vq) : - virtqueue_detach_unused_buf_split(_vq); + return VIRTQUEUE_CALL(vq, detach_unused_buf); } EXPORT_SYMBOL_GPL(virtqueue_detach_unused_buf); static inline bool more_used(const struct vring_virtqueue *vq) { - return vq->packed_ring ? 
more_used_packed(vq) : more_used_split(vq); + return VIRTQUEUE_CALL(vq, more_used); } /** @@ -2809,7 +3346,7 @@ int virtqueue_resize(struct virtqueue *_vq, u32 num, if (!num) return -EINVAL; - if ((vq->packed_ring ? vq->packed.vring.num : vq->split.vring.num) == num) + if (virtqueue_get_vring_size(_vq) == num) return 0; err = virtqueue_disable_and_recycle(_vq, recycle); @@ -2818,10 +3355,7 @@ int virtqueue_resize(struct virtqueue *_vq, u32 num, if (recycle_done) recycle_done(_vq); - if (vq->packed_ring) - err = virtqueue_resize_packed(_vq, num); - else - err = virtqueue_resize_split(_vq, num); + err = VIRTQUEUE_CALL(vq, resize, num); err_reset = virtqueue_enable_after_reset(_vq); if (err_reset) @@ -2859,10 +3393,7 @@ int virtqueue_reset(struct virtqueue *_vq, if (recycle_done) recycle_done(_vq); - if (vq->packed_ring) - virtqueue_reinit_packed(vq); - else - virtqueue_reinit_split(vq); + VOID_VIRTQUEUE_CALL(vq, reset); return virtqueue_enable_after_reset(_vq); } @@ -2905,7 +3436,7 @@ static void vring_free(struct virtqueue *_vq) struct vring_virtqueue *vq = to_vvq(_vq); if (vq->we_own_ring) { - if (vq->packed_ring) { + if (virtqueue_is_packed(vq)) { vring_free_queue(vq->vq.vdev, vq->packed.ring_size_in_bytes, vq->packed.vring.desc, @@ -2934,7 +3465,7 @@ static void vring_free(struct virtqueue *_vq) vq->map); } } - if (!vq->packed_ring) { + if (!virtqueue_is_packed(vq)) { kfree(vq->split.desc_state); kfree(vq->split.desc_extra); } @@ -2959,7 +3490,7 @@ u32 vring_notification_data(struct virtqueue *_vq) struct vring_virtqueue *vq = to_vvq(_vq); u16 next; - if (vq->packed_ring) + if (virtqueue_is_packed(vq)) next = (vq->packed.next_avail_idx & ~(-(1 << VRING_PACKED_EVENT_F_WRAP_CTR))) | vq->packed.avail_wrap_counter << @@ -2992,6 +3523,8 @@ void vring_transport_features(struct virtio_device *vdev) break; case VIRTIO_F_NOTIFICATION_DATA: break; + case VIRTIO_F_IN_ORDER: + break; default: /* We don't understand this bit. 
*/ __virtio_clear_bit(vdev, i); @@ -3012,7 +3545,8 @@ unsigned int virtqueue_get_vring_size(const struct virtqueue *_vq) const struct vring_virtqueue *vq = to_vvq(_vq); - return vq->packed_ring ? vq->packed.vring.num : vq->split.vring.num; + return virtqueue_is_packed(vq) ? vq->packed.vring.num : + vq->split.vring.num; } EXPORT_SYMBOL_GPL(virtqueue_get_vring_size); @@ -3095,7 +3629,7 @@ dma_addr_t virtqueue_get_desc_addr(const struct virtqueue *_vq) BUG_ON(!vq->we_own_ring); - if (vq->packed_ring) + if (virtqueue_is_packed(vq)) return vq->packed.ring_dma_addr; return vq->split.queue_dma_addr; @@ -3108,7 +3642,7 @@ dma_addr_t virtqueue_get_avail_addr(const struct virtqueue *_vq) BUG_ON(!vq->we_own_ring); - if (vq->packed_ring) + if (virtqueue_is_packed(vq)) return vq->packed.driver_event_dma_addr; return vq->split.queue_dma_addr + @@ -3122,7 +3656,7 @@ dma_addr_t virtqueue_get_used_addr(const struct virtqueue *_vq) BUG_ON(!vq->we_own_ring); - if (vq->packed_ring) + if (virtqueue_is_packed(vq)) return vq->packed.device_event_dma_addr; return vq->split.queue_dma_addr + diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index aa36a0d1d9df..29973baa0581 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -7,6 +7,7 @@ #include #include #include +#include /** * List of possible attributes associated with a DMA mapping. The semantics @@ -78,6 +79,13 @@ */ #define DMA_ATTR_MMIO (1UL << 10) +/* + * DMA_ATTR_CPU_CACHE_CLEAN: Indicates the CPU will not dirty any cacheline + * overlapping this buffer while it is mapped for DMA. All mappings sharing + * a cacheline must have this attribute for this to be considered safe. + */ +#define DMA_ATTR_CPU_CACHE_CLEAN (1UL << 11) + /* * A dma_addr_t can hold any valid DMA or bus address for the platform. It can * be given to a device to use as a DMA source or target. 
It is specific to a @@ -703,6 +711,18 @@ static inline int dma_get_cache_alignment(void) } #endif +#ifdef ARCH_HAS_DMA_MINALIGN +#define ____dma_from_device_aligned __aligned(ARCH_DMA_MINALIGN) +#else +#define ____dma_from_device_aligned +#endif +/* Mark start of DMA buffer */ +#define __dma_from_device_group_begin(GROUP) \ + __cacheline_group_begin(GROUP) ____dma_from_device_aligned +/* Mark end of DMA buffer */ +#define __dma_from_device_group_end(GROUP) \ + __cacheline_group_end(GROUP) ____dma_from_device_aligned + static inline void *dmam_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp) { diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h index 4cf21d6e9cfd..2bfe3baa63f4 100644 --- a/include/linux/vdpa.h +++ b/include/linux/vdpa.h @@ -312,7 +312,9 @@ struct vdpa_map_file { * @idx: virtqueue index * Returns the affinity mask * @set_group_asid: Set address space identifier for a - * virtqueue group (optional) + * virtqueue group (optional). Caller must + * prevent this from being executed concurrently + * with set_status. 
* @vdev: vdpa device * @group: virtqueue group * @asid: address space id for this group diff --git a/include/linux/virtio.h b/include/linux/virtio.h index 3626eb694728..3bbc4cb6a672 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -43,13 +43,13 @@ struct virtqueue { void *priv; }; -struct vduse_iova_domain; +struct vduse_vq_group; union virtio_map { /* Device that performs DMA */ struct device *dma_dev; - /* VDUSE specific mapping data */ - struct vduse_iova_domain *iova_domain; + /* VDUSE specific virtqueue group for doing map */ + struct vduse_vq_group *group; }; int virtqueue_add_outbuf(struct virtqueue *vq, @@ -62,6 +62,11 @@ int virtqueue_add_inbuf(struct virtqueue *vq, void *data, gfp_t gfp); +int virtqueue_add_inbuf_cache_clean(struct virtqueue *vq, + struct scatterlist sg[], unsigned int num, + void *data, + gfp_t gfp); + int virtqueue_add_inbuf_ctx(struct virtqueue *vq, struct scatterlist sg[], unsigned int num, void *data, diff --git a/include/uapi/linux/vduse.h b/include/uapi/linux/vduse.h index 10ad71aa00d6..361eea511c21 100644 --- a/include/uapi/linux/vduse.h +++ b/include/uapi/linux/vduse.h @@ -10,6 +10,10 @@ #define VDUSE_API_VERSION 0 +/* VQ groups and ASID support */ + +#define VDUSE_API_VERSION_1 1 + /* * Get the version of VDUSE API that kernel supported (VDUSE_API_VERSION). * This is used for future extension. 
@@ -27,6 +31,8 @@ * @features: virtio features * @vq_num: the number of virtqueues * @vq_align: the allocation alignment of virtqueue's metadata + * @ngroups: number of vq groups that VDUSE device declares + * @nas: number of address spaces that VDUSE device declares * @reserved: for future use, needs to be initialized to zero * @config_size: the size of the configuration space * @config: the buffer of the configuration space @@ -41,7 +47,9 @@ struct vduse_dev_config { __u64 features; __u32 vq_num; __u32 vq_align; - __u32 reserved[13]; + __u32 ngroups; /* if VDUSE_API_VERSION >= 1 */ + __u32 nas; /* if VDUSE_API_VERSION >= 1 */ + __u32 reserved[11]; __u32 config_size; __u8 config[]; }; @@ -118,14 +126,18 @@ struct vduse_config_data { * struct vduse_vq_config - basic configuration of a virtqueue * @index: virtqueue index * @max_size: the max size of virtqueue - * @reserved: for future use, needs to be initialized to zero + * @reserved1: for future use, needs to be initialized to zero + * @group: virtqueue group + * @reserved2: for future use, needs to be initialized to zero * * Structure used by VDUSE_VQ_SETUP ioctl to setup a virtqueue. 
*/ struct vduse_vq_config { __u32 index; __u16 max_size; - __u16 reserved[13]; + __u16 reserved1; + __u32 group; + __u16 reserved2[10]; }; /* @@ -156,6 +168,16 @@ struct vduse_vq_state_packed { __u16 last_used_idx; }; +/** + * struct vduse_vq_group_asid - virtqueue group ASID + * @group: Index of the virtqueue group + * @asid: Address space ID of the group + */ +struct vduse_vq_group_asid { + __u32 group; + __u32 asid; +}; + /** * struct vduse_vq_info - information of a virtqueue * @index: virtqueue index @@ -215,6 +237,7 @@ struct vduse_vq_eventfd { * @uaddr: start address of userspace memory, it must be aligned to page size * @iova: start of the IOVA region * @size: size of the IOVA region + * @asid: Address space ID of the IOVA region * @reserved: for future use, needs to be initialized to zero * * Structure used by VDUSE_IOTLB_REG_UMEM and VDUSE_IOTLB_DEREG_UMEM @@ -224,7 +247,8 @@ struct vduse_iova_umem { __u64 uaddr; __u64 iova; __u64 size; - __u64 reserved[3]; + __u32 asid; + __u32 reserved[5]; }; /* Register userspace memory for IOVA regions */ @@ -238,6 +262,7 @@ struct vduse_iova_umem { * @start: start of the IOVA region * @last: last of the IOVA region * @capability: capability of the IOVA region + * @asid: Address space ID of the IOVA region, only if device API version >= 1 * @reserved: for future use, needs to be initialized to zero * * Structure used by VDUSE_IOTLB_GET_INFO ioctl to get information of @@ -248,7 +273,8 @@ struct vduse_iova_info { __u64 last; #define VDUSE_IOVA_CAP_UMEM (1 << 0) __u64 capability; - __u64 reserved[3]; + __u32 asid; /* Only if device API version >= 1 */ + __u32 reserved[5]; }; /* @@ -257,6 +283,32 @@ struct vduse_iova_info { */ #define VDUSE_IOTLB_GET_INFO _IOWR(VDUSE_BASE, 0x1a, struct vduse_iova_info) +/** + * struct vduse_iotlb_entry_v2 - entry of IOTLB to describe one IOVA region + * + * @v1: the original vduse_iotlb_entry + * @asid: address space ID of the IOVA region + * @reserved: for future use, needs to be 
initialized to zero + * + * Structure used by VDUSE_IOTLB_GET_FD2 ioctl to find an overlapped IOVA region. + */ +struct vduse_iotlb_entry_v2 { + __u64 offset; + __u64 start; + __u64 last; + __u8 perm; + __u8 padding[7]; + __u32 asid; + __u32 reserved[11]; +}; + +/* + * Same as VDUSE_IOTLB_GET_FD but with vduse_iotlb_entry_v2 argument that + * supports extra fields. + */ +#define VDUSE_IOTLB_GET_FD2 _IOWR(VDUSE_BASE, 0x1b, struct vduse_iotlb_entry_v2) + + /* The control messages definition for read(2)/write(2) on /dev/vduse/$NAME */ /** @@ -265,11 +317,14 @@ struct vduse_iova_info { * @VDUSE_SET_STATUS: set the device status * @VDUSE_UPDATE_IOTLB: Notify userspace to update the memory mapping for * specified IOVA range via VDUSE_IOTLB_GET_FD ioctl + * @VDUSE_SET_VQ_GROUP_ASID: Notify userspace to update the address space of a + * virtqueue group. */ enum vduse_req_type { VDUSE_GET_VQ_STATE, VDUSE_SET_STATUS, VDUSE_UPDATE_IOTLB, + VDUSE_SET_VQ_GROUP_ASID, }; /** @@ -304,6 +359,19 @@ struct vduse_iova_range { __u64 last; }; +/** + * struct vduse_iova_range_v2 - IOVA range [start, last] if API_VERSION >= 1 + * @start: start of the IOVA range + * @last: last of the IOVA range + * @asid: address space ID of the IOVA range + */ +struct vduse_iova_range_v2 { + __u64 start; + __u64 last; + __u32 asid; + __u32 padding; +}; + /** * struct vduse_dev_request - control request * @type: request type @@ -312,6 +380,8 @@ struct vduse_iova_range { * @vq_state: virtqueue state, only index field is available * @s: device status * @iova: IOVA range for updating + * @iova_v2: IOVA range for updating if API_VERSION >= 1 + * @vq_group_asid: ASID of a virtqueue group * @padding: padding * * Structure used by read(2) on /dev/vduse/$NAME. 
@@ -324,6 +394,11 @@ struct vduse_dev_request { struct vduse_vq_state vq_state; struct vduse_dev_status s; struct vduse_iova_range iova; + /* All the following members except padding exist only if vduse api + * version >= 1 + */ + struct vduse_iova_range_v2 iova_v2; + struct vduse_vq_group_asid vq_group_asid; __u32 padding[32]; }; }; diff --git a/include/uapi/linux/virtio_ring.h b/include/uapi/linux/virtio_ring.h index f8c20d3de8da..3c478582a3c2 100644 --- a/include/uapi/linux/virtio_ring.h +++ b/include/uapi/linux/virtio_ring.h @@ -31,9 +31,6 @@ * SUCH DAMAGE. * * Copyright Rusty Russell IBM Corporation 2007. */ -#ifndef __KERNEL__ -#include -#endif #include #include @@ -202,7 +199,7 @@ static inline void vring_init(struct vring *vr, unsigned int num, void *p, vr->num = num; vr->desc = p; vr->avail = (struct vring_avail *)((char *)p + num * sizeof(struct vring_desc)); - vr->used = (void *)(((uintptr_t)&vr->avail->ring[num] + sizeof(__virtio16) + vr->used = (void *)(((unsigned long)&vr->avail->ring[num] + sizeof(__virtio16) + align-1) & ~(align - 1)); } diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index 138ede653de4..43d6a996d7a7 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -63,6 +63,7 @@ enum map_err_types { * @sg_mapped_ents: 'mapped_ents' from dma_map_sg * @paddr: physical start address of the mapping * @map_err_type: track whether dma_mapping_error() was checked + * @is_cache_clean: driver promises not to write to buffer while mapped * @stack_len: number of backtrace entries in @stack_entries * @stack_entries: stack of backtrace history */ @@ -76,7 +77,8 @@ struct dma_debug_entry { int sg_call_ents; int sg_mapped_ents; phys_addr_t paddr; - enum map_err_types map_err_type; + enum map_err_types map_err_type; + bool is_cache_clean; #ifdef CONFIG_STACKTRACE unsigned int stack_len; unsigned long stack_entries[DMA_DEBUG_STACKTRACE_ENTRIES]; @@ -472,12 +474,15 @@ static int active_cacheline_dec_overlap(phys_addr_t cln) return 
active_cacheline_set_overlap(cln, --overlap); } -static int active_cacheline_insert(struct dma_debug_entry *entry) +static int active_cacheline_insert(struct dma_debug_entry *entry, + bool *overlap_cache_clean) { phys_addr_t cln = to_cacheline_number(entry); unsigned long flags; int rc; + *overlap_cache_clean = false; + /* If the device is not writing memory then we don't have any * concerns about the cpu consuming stale data. This mitigates * legitimate usages of overlapping mappings. @@ -487,8 +492,16 @@ static int active_cacheline_insert(struct dma_debug_entry *entry) spin_lock_irqsave(&radix_lock, flags); rc = radix_tree_insert(&dma_active_cacheline, cln, entry); - if (rc == -EEXIST) + if (rc == -EEXIST) { + struct dma_debug_entry *existing; + active_cacheline_inc_overlap(cln); + existing = radix_tree_lookup(&dma_active_cacheline, cln); + /* A lookup failure here after we got -EEXIST is unexpected. */ + WARN_ON(!existing); + if (existing) + *overlap_cache_clean = existing->is_cache_clean; + } spin_unlock_irqrestore(&radix_lock, flags); return rc; @@ -583,19 +596,24 @@ DEFINE_SHOW_ATTRIBUTE(dump); */ static void add_dma_entry(struct dma_debug_entry *entry, unsigned long attrs) { + bool overlap_cache_clean; struct hash_bucket *bucket; unsigned long flags; int rc; + entry->is_cache_clean = !!(attrs & DMA_ATTR_CPU_CACHE_CLEAN); + bucket = get_hash_bucket(entry, &flags); hash_bucket_add(bucket, entry); put_hash_bucket(bucket, flags); - rc = active_cacheline_insert(entry); + rc = active_cacheline_insert(entry, &overlap_cache_clean); if (rc == -ENOMEM) { pr_err_once("cacheline tracking ENOMEM, dma-debug disabled\n"); global_disable = true; - } else if (rc == -EEXIST && !(attrs & DMA_ATTR_SKIP_CPU_SYNC) && + } else if (rc == -EEXIST && + !(attrs & DMA_ATTR_SKIP_CPU_SYNC) && + !(entry->is_cache_clean && overlap_cache_clean) && !(IS_ENABLED(CONFIG_DMA_BOUNCE_UNALIGNED_KMALLOC) && is_swiotlb_active(entry->dev))) { err_printk(entry->dev, entry, diff --git 
a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c index 3f7ea2db9bd7..357e80ac3f3a 100644 --- a/net/vmw_vsock/virtio_transport.c +++ b/net/vmw_vsock/virtio_transport.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -54,13 +55,6 @@ struct virtio_vsock { int rx_buf_nr; int rx_buf_max_nr; - /* The following fields are protected by event_lock. - * vqs[VSOCK_VQ_EVENT] must be accessed with event_lock held. - */ - struct mutex event_lock; - bool event_run; - struct virtio_vsock_event event_list[8]; - u32 guest_cid; bool seqpacket_allow; @@ -74,6 +68,15 @@ struct virtio_vsock { */ struct scatterlist *out_sgs[MAX_SKB_FRAGS + 1]; struct scatterlist out_bufs[MAX_SKB_FRAGS + 1]; + + /* The following fields are protected by event_lock. + * vqs[VSOCK_VQ_EVENT] must be accessed with event_lock held. + */ + struct mutex event_lock; + bool event_run; + __dma_from_device_group_begin(); + struct virtio_vsock_event event_list[8]; + __dma_from_device_group_end(); }; static u32 virtio_transport_get_local_cid(void) @@ -390,7 +393,7 @@ static int virtio_vsock_event_fill_one(struct virtio_vsock *vsock, sg_init_one(&sg, event, sizeof(*event)); - return virtqueue_add_inbuf(vq, &sg, 1, event, GFP_KERNEL); + return virtqueue_add_inbuf_cache_clean(vq, &sg, 1, event, GFP_KERNEL); } /* event_lock must be held */ diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 15f0556eeafd..e56374662ff7 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -1102,7 +1102,9 @@ our $declaration_macros = qr{(?x: (?:$Storage\s+)?(?:[A-Z_][A-Z0-9]*_){0,2}(?:DEFINE|DECLARE)(?:_[A-Z0-9]+){1,6}\s*\(| (?:$Storage\s+)?[HLP]?LIST_HEAD\s*\(| (?:SKCIPHER_REQUEST|SHASH_DESC|AHASH_REQUEST)_ON_STACK\s*\(| - (?:$Storage\s+)?(?:XA_STATE|XA_STATE_ORDER)\s*\( + (?:$Storage\s+)?(?:XA_STATE|XA_STATE_ORDER)\s*\(| + __cacheline_group_(?:begin|end)(?:_aligned)?\s*\(| + __dma_from_device_group_(?:begin|end)\s*\( )}; our %allow_repeated_words = (