Commit 8d1f0177 authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge tag 'for-5.16/io_uring-2021-10-29' of git://git.kernel.dk/linux-block

Pull io_uring updates from Jens Axboe:
 "Light on new features - basically just the hybrid mode support.

  Outside of that it's just fixes, cleanups, and performance
  improvements.

  In detail:

   - Add ring related information to the fdinfo output (Hao)

   - Hybrid async mode (Hao)

   - Support for batched issue on block (me)

   - sqe error trace improvement (me)

   - IOPOLL efficiency improvements (Pavel)

   - submit state cleanups and improvements (Pavel)

   - Completion side improvements (Pavel)

   - Drain improvements (Pavel)

   - Buffer selection cleanups (Pavel)

   - Fixed file node improvements (Pavel)

   - io-wq setup cancelation fix (Pavel)

   - Various other performance improvements and cleanups (Pavel)

   - Misc fixes (Arnd, Bixuan, Changcheng, Hao, me, Noah)"

* tag 'for-5.16/io_uring-2021-10-29' of git://git.kernel.dk/linux-block: (97 commits)
  io-wq: remove worker to owner tw dependency
  io_uring: harder fdinfo sq/cq ring iterating
  io_uring: don't assign write hint in the read path
  io_uring: clusterise ki_flags access in rw_prep
  io_uring: kill unused param from io_file_supports_nowait
  io_uring: clean up timeout async_data allocation
  io_uring: don't try io-wq polling if not supported
  io_uring: check if opcode needs poll first on arming
  io_uring: clean iowq submit work cancellation
  io_uring: clean io_wq_submit_work()'s main loop
  io-wq: use helper for worker refcounting
  io_uring: implement async hybrid mode for pollable requests
  io_uring: Use ERR_CAST() instead of ERR_PTR(PTR_ERR())
  io_uring: split logic of force_nonblock
  io_uring: warning about unused-but-set parameter
  io_uring: inform block layer of how many requests we are submitting
  io_uring: simplify io_file_supports_nowait()
  io_uring: combine REQ_F_NOWAIT_{READ,WRITE} flags
  io_uring: arm poll for non-nowait files
  fs/io_uring: Prioritise checking faster conditions first in io_write
  ...
parents 643a7234 1d5f5ea7
Loading
Loading
Loading
Loading
+42 −16
Original line number Diff line number Diff line
@@ -140,6 +140,7 @@ static void io_wqe_dec_running(struct io_worker *worker);
static bool io_acct_cancel_pending_work(struct io_wqe *wqe,
					struct io_wqe_acct *acct,
					struct io_cb_cancel_data *match);
static void create_worker_cb(struct callback_head *cb);

static bool io_worker_get(struct io_worker *worker)
{
@@ -174,12 +175,46 @@ static void io_worker_ref_put(struct io_wq *wq)
		complete(&wq->worker_done);
}

/*
 * Roll back the bookkeeping held on behalf of a worker-creation callback
 * that was cancelled before it could run: drop the running and worker
 * counts on the worker's accounting bucket, put the wq worker reference,
 * clear bit 0 of ->create_state and release the worker reference.
 */
static void io_worker_cancel_cb(struct io_worker *worker)
{
	struct io_wqe *wqe = worker->wqe;
	struct io_wq *wq = wqe->wq;
	struct io_wqe_acct *acct = io_wqe_get_acct(worker);

	atomic_dec(&acct->nr_running);

	/* nr_workers is only modified under the wqe lock */
	raw_spin_lock(&wqe->lock);
	acct->nr_workers--;
	raw_spin_unlock(&wqe->lock);

	io_worker_ref_put(wq);
	clear_bit_unlock(0, &worker->create_state);
	io_worker_release(worker);
}

/*
 * Match callback for task_work_cancel_match(): returns true only for a
 * task_work entry whose handler is create_worker_cb() and whose embedding
 * io_worker is the one passed in @data.
 */
static bool io_task_worker_match(struct callback_head *cb, void *data)
{
	/* Only resolve the containing worker once the handler is known to match */
	return cb->func == create_worker_cb &&
	       container_of(cb, struct io_worker, create_work) == data;
}

static void io_worker_exit(struct io_worker *worker)
{
	struct io_wqe *wqe = worker->wqe;
	struct io_wq *wq = wqe->wq;

	if (refcount_dec_and_test(&worker->ref))
		complete(&worker->ref_done);
	while (1) {
		struct callback_head *cb = task_work_cancel_match(wq->task,
						io_task_worker_match, worker);

		if (!cb)
			break;
		io_worker_cancel_cb(worker);
	}

	io_worker_release(worker);
	wait_for_completion(&worker->ref_done);

	raw_spin_lock(&wqe->lock);
@@ -323,8 +358,10 @@ static bool io_queue_worker_create(struct io_worker *worker,

	init_task_work(&worker->create_work, func);
	worker->create_index = acct->index;
	if (!task_work_add(wq->task, &worker->create_work, TWA_SIGNAL))
	if (!task_work_add(wq->task, &worker->create_work, TWA_SIGNAL)) {
		clear_bit_unlock(0, &worker->create_state);
		return true;
	}
	clear_bit_unlock(0, &worker->create_state);
fail_release:
	io_worker_release(worker);
@@ -716,12 +753,9 @@ static void io_workqueue_create(struct work_struct *work)
	struct io_worker *worker = container_of(work, struct io_worker, work);
	struct io_wqe_acct *acct = io_wqe_get_acct(worker);

	if (!io_queue_worker_create(worker, acct, create_worker_cont)) {
		clear_bit_unlock(0, &worker->create_state);
		io_worker_release(worker);
	if (!io_queue_worker_create(worker, acct, create_worker_cont))
		kfree(worker);
}
}

static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
{
@@ -1150,17 +1184,9 @@ static void io_wq_exit_workers(struct io_wq *wq)

	while ((cb = task_work_cancel_match(wq->task, io_task_work_match, wq)) != NULL) {
		struct io_worker *worker;
		struct io_wqe_acct *acct;

		worker = container_of(cb, struct io_worker, create_work);
		acct = io_wqe_get_acct(worker);
		atomic_dec(&acct->nr_running);
		raw_spin_lock(&worker->wqe->lock);
		acct->nr_workers--;
		raw_spin_unlock(&worker->wqe->lock);
		io_worker_ref_put(wq);
		clear_bit_unlock(0, &worker->create_state);
		io_worker_release(worker);
		io_worker_cancel_cb(worker);
	}

	rcu_read_lock();
+52 −7
Original line number Diff line number Diff line
@@ -29,6 +29,17 @@ struct io_wq_work_list {
	struct io_wq_work_node *last;
};

/*
 * Walk every node reachable from (head)->first. @pos is the current node
 * and @prv trails one node behind it (NULL on the first iteration).
 */
#define wq_list_for_each(pos, prv, head)			\
	for (pos = (head)->first, prv = NULL; pos; prv = pos, pos = (pos)->next)

/*
 * Continue a walk from the current values of @pos and @prv, without
 * re-initialising either of them.
 */
#define wq_list_for_each_resume(pos, prv)			\
	for (; pos; prv = pos, pos = (pos)->next)

/* True when the list has no first node; ->first is sampled with READ_ONCE() */
#define wq_list_empty(list)	(READ_ONCE((list)->first) == NULL)
/* Mark the list empty by clearing ->first only; ->last is not touched */
#define INIT_WQ_LIST(list)	do {				\
	(list)->first = NULL;					\
} while (0)

static inline void wq_list_add_after(struct io_wq_work_node *node,
				     struct io_wq_work_node *pos,
				     struct io_wq_work_list *list)
@@ -54,6 +65,15 @@ static inline void wq_list_add_tail(struct io_wq_work_node *node,
	}
}

static inline void wq_list_add_head(struct io_wq_work_node *node,
				    struct io_wq_work_list *list)
{
	node->next = list->first;
	if (!node->next)
		list->last = node;
	WRITE_ONCE(list->first, node);
}

static inline void wq_list_cut(struct io_wq_work_list *list,
			       struct io_wq_work_node *last,
			       struct io_wq_work_node *prev)
@@ -69,6 +89,31 @@ static inline void wq_list_cut(struct io_wq_work_list *list,
	last->next = NULL;
}

/*
 * Splice every node of @list in right after @to, then reset @list via
 * INIT_WQ_LIST(). list->last is dereferenced unconditionally, so @list
 * must not be empty here.
 */
static inline void __wq_list_splice(struct io_wq_work_list *list,
				    struct io_wq_work_node *to)
{
	struct io_wq_work_node *tail = list->last;

	/* Old successor of @to now follows the spliced-in tail */
	tail->next = to->next;
	to->next = list->first;
	INIT_WQ_LIST(list);
}

/*
 * Splice @list in after @to if it has any entries.
 *
 * Returns true when nodes were moved, false when @list was empty and
 * nothing was done.
 */
static inline bool wq_list_splice(struct io_wq_work_list *list,
				  struct io_wq_work_node *to)
{
	if (wq_list_empty(list))
		return false;

	__wq_list_splice(list, to);
	return true;
}

/*
 * Push @node onto the stack anchored at @stack: @node takes over the
 * previous top as its successor and becomes the new stack->next.
 */
static inline void wq_stack_add_head(struct io_wq_work_node *node,
				     struct io_wq_work_node *stack)
{
	struct io_wq_work_node *prev_top = stack->next;

	node->next = prev_top;
	stack->next = node;
}

static inline void wq_list_del(struct io_wq_work_list *list,
			       struct io_wq_work_node *node,
			       struct io_wq_work_node *prev)
@@ -76,14 +121,14 @@ static inline void wq_list_del(struct io_wq_work_list *list,
	wq_list_cut(list, node, prev);
}

#define wq_list_for_each(pos, prv, head)			\
	for (pos = (head)->first, prv = NULL; pos; prv = pos, pos = (pos)->next)
/*
 * Pop the top node off the stack anchored at @stack and return it.
 * stack->next is advanced to the popped node's successor. The popped node
 * is dereferenced without a NULL check, so the stack must not be empty.
 */
static inline
struct io_wq_work_node *wq_stack_extract(struct io_wq_work_node *stack)
{
	struct io_wq_work_node *node = stack->next;

#define wq_list_empty(list)	(READ_ONCE((list)->first) == NULL)
#define INIT_WQ_LIST(list)	do {				\
	(list)->first = NULL;					\
	(list)->last = NULL;					\
} while (0)
	stack->next = node->next;
	return node;
}

struct io_wq_work {
	struct io_wq_work_node list;
+889 −825

File changed.

Preview size limit exceeded, changes collapsed.

+61 −0
Original line number Diff line number Diff line
@@ -6,6 +6,7 @@
#define _TRACE_IO_URING_H

#include <linux/tracepoint.h>
#include <uapi/linux/io_uring.h>

struct io_wq_work;

@@ -497,6 +498,66 @@ TRACE_EVENT(io_uring_task_run,
		  (unsigned long long) __entry->user_data)
);

/*
 * io_uring_req_failed - called when an sqe is errored during submission
 *
 * @sqe:		pointer to the io_uring_sqe that failed
 * @error:		error it failed with
 *
 * Records a copy of the sqe fields together with the error code.
 * Allows easier diagnosing of malformed requests in production systems.
 */
TRACE_EVENT(io_uring_req_failed,

	TP_PROTO(const struct io_uring_sqe *sqe, int error),

	TP_ARGS(sqe, error),

	TP_STRUCT__entry (
		__field(  u8,	opcode )
		__field(  u8,	flags )
		/*
		 * sqe->ioprio is a 16-bit field in the uapi struct; a u8 slot
		 * would silently drop its upper byte.
		 */
		__field( u16,	ioprio )
		__field( u64,	off )
		__field( u64,	addr )
		__field( u32,	len )
		__field( u32,	op_flags )
		__field( u64,	user_data )
		__field( u16,	buf_index )
		__field( u16,	personality )
		__field( u32,	file_index )
		__field( u64,	pad1 )
		__field( u64,	pad2 )
		__field( int,	error )
	),

	TP_fast_assign(
		__entry->opcode		= sqe->opcode;
		__entry->flags		= sqe->flags;
		__entry->ioprio		= sqe->ioprio;
		__entry->off		= sqe->off;
		__entry->addr		= sqe->addr;
		__entry->len		= sqe->len;
		__entry->op_flags	= sqe->rw_flags;
		__entry->user_data	= sqe->user_data;
		__entry->buf_index	= sqe->buf_index;
		__entry->personality	= sqe->personality;
		__entry->file_index	= sqe->file_index;
		__entry->pad1		= sqe->__pad2[0];
		__entry->pad2		= sqe->__pad2[1];
		__entry->error		= error;
	),

	TP_printk("op %d, flags=0x%x, prio=%d, off=%llu, addr=%llu, "
		  "len=%u, rw_flags=0x%x, user_data=0x%llx, buf_index=%d, "
		  "personality=%d, file_index=%d, pad=0x%llx/%llx, error=%d",
		  __entry->opcode, __entry->flags, __entry->ioprio,
		  (unsigned long long)__entry->off,
		  (unsigned long long) __entry->addr, __entry->len,
		  __entry->op_flags, (unsigned long long) __entry->user_data,
		  __entry->buf_index, __entry->personality, __entry->file_index,
		  (unsigned long long) __entry->pad1,
		  (unsigned long long) __entry->pad2, __entry->error)
);

#endif /* _TRACE_IO_URING_H */

/* This part must be outside protection */
+1 −0
Original line number Diff line number Diff line
@@ -158,6 +158,7 @@ enum {
#define IORING_TIMEOUT_BOOTTIME		(1U << 2)
#define IORING_TIMEOUT_REALTIME		(1U << 3)
#define IORING_LINK_TIMEOUT_UPDATE	(1U << 4)
#define IORING_TIMEOUT_ETIME_SUCCESS	(1U << 5)
#define IORING_TIMEOUT_CLOCK_MASK	(IORING_TIMEOUT_BOOTTIME | IORING_TIMEOUT_REALTIME)
#define IORING_TIMEOUT_UPDATE_MASK	(IORING_TIMEOUT_UPDATE | IORING_LINK_TIMEOUT_UPDATE)
/*