summaryrefslogtreecommitdiff
path: root/block/bio.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2025-05-26 11:39:36 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2025-05-26 11:39:36 -0700
commit6f59de9bc0d576eb5a5edfea470527902315e924 (patch)
treeeeb477da0caa8e9569074f448169fb792fac90d9 /block/bio.c
parent3e406741b19890c3d8a2ed126aa7c23b106ca9e1 (diff)
parent533c87e2ed742454957f14d7bef9f48d5a72e72d (diff)
Merge tag 'for-6.16/block-20250523' of git://git.kernel.dk/linux
Pull block updates from Jens Axboe: - ublk updates: - Add support for updating the size of a ublk instance - Zero-copy improvements - Auto-registering of buffers for zero-copy - Series simplifying and improving GET_DATA and request lookup - Series adding quiesce support - Lots of selftests additions - Various cleanups - NVMe updates via Christoph: - add per-node DMA pools and use them for PRP/SGL allocations (Caleb Sander Mateos, Keith Busch) - nvme-fcloop refcounting fixes (Daniel Wagner) - support delayed removal of the multipath node and optionally support the multipath node for private namespaces (Nilay Shroff) - support shared CQs in the PCI endpoint target code (Wilfred Mallawa) - support admin-queue only authentication (Hannes Reinecke) - use the crc32c library instead of the crypto API (Eric Biggers) - misc cleanups (Christoph Hellwig, Marcelo Moreira, Hannes Reinecke, Leon Romanovsky, Gustavo A. R. Silva) - MD updates via Yu: - Fix that normal IO can be starved by sync IO, found by mkfs on newly created large raid5, with some clean up patches for bdev inflight counters - Clean up brd, getting rid of atomic kmaps and bvec poking - Add loop driver specifically for zoned IO testing - Eliminate blk-rq-qos calls with a static key, if not enabled - Improve hctx locking for when a plug has IO for multiple queues pending - Remove block layer bouncing support, which in turn means we can remove the per-node bounce stat as well - Improve blk-throttle support - Improve delay support for blk-throttle - Improve brd discard support - Unify IO scheduler switching. This should also fix a bunch of lockdep warnings we've been seeing, after enabling lockdep support for queue freezing/unfreezeing - Add support for block write streams via FDP (flexible data placement) on NVMe - Add a bunch of block helpers, facilitating the removal of a bunch of duplicated boilerplate code - Remove obsolete BLK_MQ pci and virtio Kconfig options - Add atomic/untorn write support to blktrace - Various little cleanups and fixes * tag 'for-6.16/block-20250523' of git://git.kernel.dk/linux: (186 commits) selftests: ublk: add test for UBLK_F_QUIESCE ublk: add feature UBLK_F_QUIESCE selftests: ublk: add test case for UBLK_U_CMD_UPDATE_SIZE traceevent/block: Add REQ_ATOMIC flag to block trace events ublk: run auto buf unregisgering in same io_ring_ctx with registering io_uring: add helper io_uring_cmd_ctx_handle() ublk: remove io argument from ublk_auto_buf_reg_fallback() ublk: handle ublk_set_auto_buf_reg() failure correctly in ublk_fetch() selftests: ublk: add test for covering UBLK_AUTO_BUF_REG_FALLBACK selftests: ublk: support UBLK_F_AUTO_BUF_REG ublk: support UBLK_AUTO_BUF_REG_FALLBACK ublk: register buffer to local io_uring with provided buf index via UBLK_F_AUTO_BUF_REG ublk: prepare for supporting to register request buffer automatically ublk: convert to refcount_t selftests: ublk: make IO & device removal test more stressful nvme: rename nvme_mpath_shutdown_disk to nvme_mpath_remove_disk nvme: introduce multipath_always_on module param nvme-multipath: introduce delayed removal of the multipath head node nvme-pci: derive and better document max segments limits nvme-pci: use struct_size for allocation struct nvme_dev ...
Diffstat (limited to 'block/bio.c')
-rw-r--r--block/bio.c158
1 files changed, 126 insertions, 32 deletions
diff --git a/block/bio.c b/block/bio.c
index 4be592d37fb6..3c0a558c90f5 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -251,6 +251,7 @@ void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table,
bio->bi_flags = 0;
bio->bi_ioprio = 0;
bio->bi_write_hint = 0;
+ bio->bi_write_stream = 0;
bio->bi_status = 0;
bio->bi_iter.bi_sector = 0;
bio->bi_iter.bi_size = 0;
@@ -827,6 +828,7 @@ static int __bio_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp)
bio_set_flag(bio, BIO_CLONED);
bio->bi_ioprio = bio_src->bi_ioprio;
bio->bi_write_hint = bio_src->bi_write_hint;
+ bio->bi_write_stream = bio_src->bi_write_stream;
bio->bi_iter = bio_src->bi_iter;
if (bio->bi_bdev) {
@@ -918,7 +920,7 @@ static inline bool bio_full(struct bio *bio, unsigned len)
}
static bool bvec_try_merge_page(struct bio_vec *bv, struct page *page,
- unsigned int len, unsigned int off, bool *same_page)
+ unsigned int len, unsigned int off)
{
size_t bv_end = bv->bv_offset + bv->bv_len;
phys_addr_t vec_end_addr = page_to_phys(bv->bv_page) + bv_end - 1;
@@ -931,9 +933,7 @@ static bool bvec_try_merge_page(struct bio_vec *bv, struct page *page,
if (!zone_device_pages_have_same_pgmap(bv->bv_page, page))
return false;
- *same_page = ((vec_end_addr & PAGE_MASK) == ((page_addr + off) &
- PAGE_MASK));
- if (!*same_page) {
+ if ((vec_end_addr & PAGE_MASK) != ((page_addr + off) & PAGE_MASK)) {
if (IS_ENABLED(CONFIG_KMSAN))
return false;
if (bv->bv_page + bv_end / PAGE_SIZE != page + off / PAGE_SIZE)
@@ -953,8 +953,7 @@ static bool bvec_try_merge_page(struct bio_vec *bv, struct page *page,
* helpers to split. Hopefully this will go away soon.
*/
bool bvec_try_merge_hw_page(struct request_queue *q, struct bio_vec *bv,
- struct page *page, unsigned len, unsigned offset,
- bool *same_page)
+ struct page *page, unsigned len, unsigned offset)
{
unsigned long mask = queue_segment_boundary(q);
phys_addr_t addr1 = bvec_phys(bv);
@@ -964,7 +963,7 @@ bool bvec_try_merge_hw_page(struct request_queue *q, struct bio_vec *bv,
return false;
if (len > queue_max_segment_size(q) - bv->bv_len)
return false;
- return bvec_try_merge_page(bv, page, len, offset, same_page);
+ return bvec_try_merge_page(bv, page, len, offset);
}
/**
@@ -990,6 +989,22 @@ void __bio_add_page(struct bio *bio, struct page *page,
EXPORT_SYMBOL_GPL(__bio_add_page);
/**
+ * bio_add_virt_nofail - add data in the direct kernel mapping to a bio
+ * @bio: destination bio
+ * @vaddr: data to add
+ * @len: length of the data to add, may cross pages
+ *
+ * Add the data at @vaddr to @bio. The caller must have ensure a segment
+ * is available for the added data. No merging into an existing segment
+ * will be performed.
+ */
+void bio_add_virt_nofail(struct bio *bio, void *vaddr, unsigned len)
+{
+ __bio_add_page(bio, virt_to_page(vaddr), len, offset_in_page(vaddr));
+}
+EXPORT_SYMBOL_GPL(bio_add_virt_nofail);
+
+/**
* bio_add_page - attempt to add page(s) to bio
* @bio: destination bio
* @page: start page to add
@@ -1002,8 +1017,6 @@ EXPORT_SYMBOL_GPL(__bio_add_page);
int bio_add_page(struct bio *bio, struct page *page,
unsigned int len, unsigned int offset)
{
- bool same_page = false;
-
if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
return 0;
if (bio->bi_iter.bi_size > UINT_MAX - len)
@@ -1011,7 +1024,7 @@ int bio_add_page(struct bio *bio, struct page *page,
if (bio->bi_vcnt > 0 &&
bvec_try_merge_page(&bio->bi_io_vec[bio->bi_vcnt - 1],
- page, len, offset, &same_page)) {
+ page, len, offset)) {
bio->bi_iter.bi_size += len;
return len;
}
@@ -1058,6 +1071,61 @@ bool bio_add_folio(struct bio *bio, struct folio *folio, size_t len,
}
EXPORT_SYMBOL(bio_add_folio);
+/**
+ * bio_add_vmalloc_chunk - add a vmalloc chunk to a bio
+ * @bio: destination bio
+ * @vaddr: vmalloc address to add
+ * @len: total length in bytes of the data to add
+ *
+ * Add data starting at @vaddr to @bio and return how many bytes were added.
+ * This may be less than the amount originally asked. Returns 0 if no data
+ * could be added to @bio.
+ *
+ * This helper calls flush_kernel_vmap_range() for the range added. For reads
+ * the caller still needs to manually call invalidate_kernel_vmap_range() in
+ * the completion handler.
+ */
+unsigned int bio_add_vmalloc_chunk(struct bio *bio, void *vaddr, unsigned len)
+{
+ unsigned int offset = offset_in_page(vaddr);
+
+ len = min(len, PAGE_SIZE - offset);
+ if (bio_add_page(bio, vmalloc_to_page(vaddr), len, offset) < len)
+ return 0;
+ if (op_is_write(bio_op(bio)))
+ flush_kernel_vmap_range(vaddr, len);
+ return len;
+}
+EXPORT_SYMBOL_GPL(bio_add_vmalloc_chunk);
+
+/**
+ * bio_add_vmalloc - add a vmalloc region to a bio
+ * @bio: destination bio
+ * @vaddr: vmalloc address to add
+ * @len: total length in bytes of the data to add
+ *
+ * Add data starting at @vaddr to @bio. Return %true on success or %false if
+ * @bio does not have enough space for the payload.
+ *
+ * This helper calls flush_kernel_vmap_range() for the range added. For reads
+ * the caller still needs to manually call invalidate_kernel_vmap_range() in
+ * the completion handler.
+ */
+bool bio_add_vmalloc(struct bio *bio, void *vaddr, unsigned int len)
+{
+ do {
+ unsigned int added = bio_add_vmalloc_chunk(bio, vaddr, len);
+
+ if (!added)
+ return false;
+ vaddr += added;
+ len -= added;
+ } while (len);
+
+ return true;
+}
+EXPORT_SYMBOL_GPL(bio_add_vmalloc);
+
void __bio_release_pages(struct bio *bio, bool mark_dirty)
{
struct folio_iter fi;
@@ -1088,27 +1156,6 @@ void bio_iov_bvec_set(struct bio *bio, const struct iov_iter *iter)
bio_set_flag(bio, BIO_CLONED);
}
-static int bio_iov_add_folio(struct bio *bio, struct folio *folio, size_t len,
- size_t offset)
-{
- bool same_page = false;
-
- if (WARN_ON_ONCE(bio->bi_iter.bi_size > UINT_MAX - len))
- return -EIO;
-
- if (bio->bi_vcnt > 0 &&
- bvec_try_merge_page(&bio->bi_io_vec[bio->bi_vcnt - 1],
- folio_page(folio, 0), len, offset,
- &same_page)) {
- bio->bi_iter.bi_size += len;
- if (same_page && bio_flagged(bio, BIO_PAGE_PINNED))
- unpin_user_folio(folio, 1);
- return 0;
- }
- bio_add_folio_nofail(bio, folio, len, offset);
- return 0;
-}
-
static unsigned int get_contig_folio_len(unsigned int *num_pages,
struct page **pages, unsigned int i,
struct folio *folio, size_t left,
@@ -1203,6 +1250,7 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
for (left = size, i = 0; left > 0; left -= len, i += num_pages) {
struct page *page = pages[i];
struct folio *folio = page_folio(page);
+ unsigned int old_vcnt = bio->bi_vcnt;
folio_offset = ((size_t)folio_page_idx(folio, page) <<
PAGE_SHIFT) + offset;
@@ -1215,7 +1263,23 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
len = get_contig_folio_len(&num_pages, pages, i,
folio, left, offset);
- bio_iov_add_folio(bio, folio, len, folio_offset);
+ if (!bio_add_folio(bio, folio, len, folio_offset)) {
+ WARN_ON_ONCE(1);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (bio_flagged(bio, BIO_PAGE_PINNED)) {
+ /*
+ * We're adding another fragment of a page that already
+ * was part of the last segment. Undo our pin as the
+ * page was pinned when an earlier fragment of it was
+ * added to the bio and __bio_release_pages expects a
+ * single pin per page.
+ */
+ if (offset && bio->bi_vcnt == old_vcnt)
+ unpin_user_folio(folio, 1);
+ }
offset = 0;
}
@@ -1301,6 +1365,36 @@ int submit_bio_wait(struct bio *bio)
}
EXPORT_SYMBOL(submit_bio_wait);
+/**
+ * bdev_rw_virt - synchronously read into / write from kernel mapping
+ * @bdev: block device to access
+ * @sector: sector to access
+ * @data: data to read/write
+ * @len: length in byte to read/write
+ * @op: operation (e.g. REQ_OP_READ/REQ_OP_WRITE)
+ *
+ * Performs synchronous I/O to @bdev for @data/@len. @data must be in
+ * the kernel direct mapping and not a vmalloc address.
+ */
+int bdev_rw_virt(struct block_device *bdev, sector_t sector, void *data,
+ size_t len, enum req_op op)
+{
+ struct bio_vec bv;
+ struct bio bio;
+ int error;
+
+ if (WARN_ON_ONCE(is_vmalloc_addr(data)))
+ return -EIO;
+
+ bio_init(&bio, bdev, &bv, 1, op);
+ bio.bi_iter.bi_sector = sector;
+ bio_add_virt_nofail(&bio, data, len);
+ error = submit_bio_wait(&bio);
+ bio_uninit(&bio);
+ return error;
+}
+EXPORT_SYMBOL_GPL(bdev_rw_virt);
+
static void bio_wait_end_io(struct bio *bio)
{
complete(bio->bi_private);