Commit d4135134 authored by Filipe Manana's avatar Filipe Manana Committed by David Sterba
Browse files

btrfs: avoid blocking on space revervation when doing nowait dio writes



When doing a NOWAIT direct IO write, if we can NOCOW then it means we can
proceed with the non-blocking, NOWAIT path. However reserving the metadata
space and qgroup meta space can often result in blocking - flushing
delalloc, wait for ordered extents to complete, trigger transaction
commits, etc, going against the semantics of a NOWAIT write.

So make the NOWAIT write path to try to reserve all the metadata it needs
without resulting in a blocking behaviour - if we get -ENOSPC or -EDQUOT
then return -EAGAIN to make the caller fallback to a blocking direct IO
write.

This is part of a patchset comprised of the following patches:

  btrfs: avoid blocking on page locks with nowait dio on compressed range
  btrfs: avoid blocking nowait dio when locking file range
  btrfs: avoid double nocow check when doing nowait dio writes
  btrfs: stop allocating a path when checking if cross reference exists
  btrfs: free path at can_nocow_extent() before checking for checksum items
  btrfs: release path earlier at can_nocow_extent()
  btrfs: avoid blocking when allocating context for nowait dio read/write
  btrfs: avoid blocking on space revervation when doing nowait dio writes

The following test was run before and after applying this patchset:

  $ cat io-uring-nodatacow-test.sh
  #!/bin/bash

  DEV=/dev/sdc
  MNT=/mnt/sdc

  MOUNT_OPTIONS="-o ssd -o nodatacow"
  MKFS_OPTIONS="-R free-space-tree -O no-holes"

  NUM_JOBS=4
  FILE_SIZE=8G
  RUN_TIME=300

  cat <<EOF > /tmp/fio-job.ini
  [io_uring_rw]
  rw=randrw
  fsync=0
  fallocate=posix
  group_reporting=1
  direct=1
  ioengine=io_uring
  iodepth=64
  bssplit=4k/20:8k/20:16k/20:32k/10:64k/10:128k/5:256k/5:512k/5:1m/5
  filesize=$FILE_SIZE
  runtime=$RUN_TIME
  time_based
  filename=foobar
  directory=$MNT
  numjobs=$NUM_JOBS
  thread
  EOF

  echo performance | \
     tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor

  umount $MNT &> /dev/null
  mkfs.btrfs -f $MKFS_OPTIONS $DEV &> /dev/null
  mount $MOUNT_OPTIONS $DEV $MNT

  fio /tmp/fio-job.ini

  umount $MNT

The test was run a 12 cores box with 64G of ram, using a non-debug kernel
config (Debian's default config) and a spinning disk.

Result before the patchset:

 READ: bw=407MiB/s (427MB/s), 407MiB/s-407MiB/s (427MB/s-427MB/s), io=119GiB (128GB), run=300175-300175msec
WRITE: bw=407MiB/s (427MB/s), 407MiB/s-407MiB/s (427MB/s-427MB/s), io=119GiB (128GB), run=300175-300175msec

Result after the patchset:

 READ: bw=436MiB/s (457MB/s), 436MiB/s-436MiB/s (457MB/s-457MB/s), io=128GiB (137GB), run=300044-300044msec
WRITE: bw=435MiB/s (456MB/s), 435MiB/s-435MiB/s (456MB/s-456MB/s), io=128GiB (137GB), run=300044-300044msec

That's about +7.2% throughput for reads and +6.9% for writes.

Signed-off-by: default avatarFilipe Manana <fdmanana@suse.com>
Signed-off-by: default avatarDavid Sterba <dsterba@suse.com>
parent 4f208dcc
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -2893,7 +2893,7 @@ void btrfs_subvolume_release_metadata(struct btrfs_root *root,
void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes);

int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
				    u64 disk_num_bytes);
				    u64 disk_num_bytes, bool noflush);
u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo);
int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
				   u64 start, u64 end);
+5 −4
Original line number Diff line number Diff line
@@ -289,7 +289,7 @@ static void calc_inode_reservations(struct btrfs_fs_info *fs_info,
}

int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
				    u64 disk_num_bytes)
				    u64 disk_num_bytes, bool noflush)
{
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
@@ -308,7 +308,7 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
	 * If we have a transaction open (can happen if we call truncate_block
	 * from truncate), then we need FLUSH_LIMIT so we don't deadlock.
	 */
	if (btrfs_is_free_space_inode(inode)) {
	if (noflush || btrfs_is_free_space_inode(inode)) {
		flush = BTRFS_RESERVE_NO_FLUSH;
	} else {
		if (current->journal_info)
@@ -333,7 +333,8 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
	 */
	calc_inode_reservations(fs_info, num_bytes, disk_num_bytes,
				&meta_reserve, &qgroup_reserve);
	ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true);
	ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true,
						 noflush);
	if (ret)
		return ret;
	ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, meta_reserve, flush);
@@ -456,7 +457,7 @@ int btrfs_delalloc_reserve_space(struct btrfs_inode *inode,
	ret = btrfs_check_data_free_space(inode, reserved, start, len);
	if (ret < 0)
		return ret;
	ret = btrfs_delalloc_reserve_metadata(inode, len, len);
	ret = btrfs_delalloc_reserve_metadata(inode, len, len, false);
	if (ret < 0) {
		btrfs_free_reserved_data_space(inode, *reserved, start, len);
		extent_changeset_free(*reserved);
+1 −1
Original line number Diff line number Diff line
@@ -1684,7 +1684,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
		WARN_ON(reserve_bytes == 0);
		ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode),
						      reserve_bytes,
						      reserve_bytes);
						      reserve_bytes, false);
		if (ret) {
			if (!only_release_metadata)
				btrfs_free_reserved_data_space(BTRFS_I(inode),
+9 −4
Original line number Diff line number Diff line
@@ -4705,7 +4705,7 @@ int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len,
			goto out;
		}
	}
	ret = btrfs_delalloc_reserve_metadata(inode, blocksize, blocksize);
	ret = btrfs_delalloc_reserve_metadata(inode, blocksize, blocksize, false);
	if (ret < 0) {
		if (!only_release_metadata)
			btrfs_free_reserved_data_space(inode, data_reserved,
@@ -7415,6 +7415,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
					 u64 start, u64 len,
					 unsigned int iomap_flags)
{
	const bool nowait = (iomap_flags & IOMAP_NOWAIT);
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct extent_map *em = *map;
	int type;
@@ -7454,12 +7455,15 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
		struct extent_map *em2;

		/* We can NOCOW, so only need to reserve metadata space. */
		ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len);
		ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len,
						      nowait);
		if (ret < 0) {
			/* Our caller expects us to free the input extent map. */
			free_extent_map(em);
			*map = NULL;
			btrfs_dec_nocow_writers(fs_info, block_start);
			if (nowait && (ret == -ENOSPC || ret == -EDQUOT))
				ret = -EAGAIN;
			goto out;
		}
		space_reserved = true;
@@ -7483,7 +7487,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
		free_extent_map(em);
		*map = NULL;

		if (iomap_flags & IOMAP_NOWAIT)
		if (nowait)
			return -EAGAIN;

		/* We have to COW, so need to reserve metadata and data space. */
@@ -10801,7 +10805,8 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
	ret = btrfs_qgroup_reserve_data(inode, &data_reserved, start, num_bytes);
	if (ret)
		goto out_free_data_space;
	ret = btrfs_delalloc_reserve_metadata(inode, num_bytes, disk_num_bytes);
	ret = btrfs_delalloc_reserve_metadata(inode, num_bytes, disk_num_bytes,
					      false);
	if (ret)
		goto out_qgroup_free_data;

+3 −2
Original line number Diff line number Diff line
@@ -3939,12 +3939,13 @@ int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
}

int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
				enum btrfs_qgroup_rsv_type type, bool enforce)
				enum btrfs_qgroup_rsv_type type, bool enforce,
				bool noflush)
{
	int ret;

	ret = btrfs_qgroup_reserve_meta(root, num_bytes, type, enforce);
	if (ret <= 0 && ret != -EDQUOT)
	if ((ret <= 0 && ret != -EDQUOT) || noflush)
		return ret;

	ret = try_flush_qgroup(root);
Loading