Commit f3dfa40a authored by Lars Ellenberg's avatar Lars Ellenberg Committed by Philipp Reisner
Browse files

drbd: fix race when forcefully disconnecting



If a forced disconnect hits a restarting receiver right after it passed
its final "if (C_DISCONNECTING)" test in drbdd_init(), but before it was
actually restarted by drbd_thread_setup, we could be left with a
connection stuck in C_DISCONNECTING, never reaching C_STANDALONE,
which would be necessary to take it down or reconfigure it.

Move the last cleanup into w_after_conn_state_ch(), and do an additional
state change request in conn_try_disconnect(), just in case.

Signed-off-by: default avatarPhilipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: default avatarLars Ellenberg <lars.ellenberg@linbit.com>
parent 88104ca4
Loading
Loading
Loading
Loading
+48 −37
Original line number Diff line number Diff line
@@ -2115,17 +2115,13 @@ int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info)
static enum drbd_state_rv conn_try_disconnect(struct drbd_tconn *tconn, bool force)
{
	enum drbd_state_rv rv;
	if (force) {
		spin_lock_irq(&tconn->req_lock);
		rv = _conn_request_state(tconn, NS(conn, C_DISCONNECTING), CS_HARD);
		spin_unlock_irq(&tconn->req_lock);
		return rv;
	}

	rv = conn_request_state(tconn, NS(conn, C_DISCONNECTING), 0);
	rv = conn_request_state(tconn, NS(conn, C_DISCONNECTING),
			force ? CS_HARD : 0);

	switch (rv) {
	case SS_NOTHING_TO_DO:
		break;
	case SS_ALREADY_STANDALONE:
		return SS_SUCCESS;
	case SS_PRIMARY_NOP:
@@ -2138,14 +2134,35 @@ static enum drbd_state_rv conn_try_disconnect(struct drbd_tconn *tconn, bool for
		rv = conn_request_state(tconn, NS2(conn, C_DISCONNECTING,
							disk, D_OUTDATED), 0);
		if (rv == SS_IS_DISKLESS || rv == SS_LOWER_THAN_OUTDATED) {
			conn_request_state(tconn, NS(conn, C_DISCONNECTING), CS_HARD);
			rv = SS_SUCCESS;
			rv = conn_request_state(tconn, NS(conn, C_DISCONNECTING),
					CS_HARD);
		}
		break;
	default:;
		/* no special handling necessary */
	}

	if (rv >= SS_SUCCESS) {
		enum drbd_state_rv rv2;
		/* No one else can reconfigure the network while I am here.
		 * The state handling only uses drbd_thread_stop_nowait(),
		 * we want to really wait here until the receiver is no more.
		 */
		drbd_thread_stop(&adm_ctx.tconn->receiver);

		/* Race breaker.  This additional state change request may be
		 * necessary, if this was a forced disconnect during a receiver
		 * restart.  We may have "killed" the receiver thread just
		 * after drbdd_init() returned.  Typically, we should be
		 * C_STANDALONE already, now, and this becomes a no-op.
		 */
		rv2 = conn_request_state(tconn, NS(conn, C_STANDALONE),
				CS_VERBOSE | CS_HARD);
		if (rv2 < SS_SUCCESS)
			conn_err(tconn,
				"unexpected rv2=%d in conn_try_disconnect()\n",
				rv2);
	}
	return rv;
}

@@ -2176,18 +2193,8 @@ int drbd_adm_disconnect(struct sk_buff *skb, struct genl_info *info)

	rv = conn_try_disconnect(tconn, parms.force_disconnect);
	if (rv < SS_SUCCESS)
		goto fail;

	/* No one else can reconfigure the network while I am here.
	 * The state handling only uses drbd_thread_stop_nowait(),
	 * we want to really wait here until the receiver is no more. */
	drbd_thread_stop(&tconn->receiver);
	if (wait_event_interruptible(tconn->ping_wait,
				     tconn->cstate == C_STANDALONE)) {
		retcode = ERR_INTR;
		goto fail;
	}

		retcode = rv;  /* FIXME: Type mismatch. */
	else
		retcode = NO_ERROR;
 fail:
	drbd_adm_finish(info, retcode);
@@ -3049,8 +3056,7 @@ int drbd_adm_delete_minor(struct sk_buff *skb, struct genl_info *info)

int drbd_adm_down(struct sk_buff *skb, struct genl_info *info)
{
	enum drbd_ret_code retcode;
	enum drbd_state_rv rv;
	int retcode; /* enum drbd_ret_code rsp. enum drbd_state_rv */
	struct drbd_conf *mdev;
	unsigned i;

@@ -3074,30 +3080,35 @@ int drbd_adm_down(struct sk_buff *skb, struct genl_info *info)
			goto out_unlock;
		}
	}
	up_read(&drbd_cfg_rwsem);

	/* disconnect */
	rv = conn_try_disconnect(adm_ctx.tconn, 0);
	if (rv < SS_SUCCESS) {
		retcode = rv; /* enum type mismatch! */
	/* disconnect; may stop the receiver;
	 * must not hold the drbd_cfg_rwsem */
	retcode = conn_try_disconnect(adm_ctx.tconn, 0);
	if (retcode < SS_SUCCESS) {
		drbd_msg_put_info("failed to disconnect");
		goto out_unlock;
		goto out;
	}

	/* Make sure the network threads have actually stopped,
	 * state handling only does drbd_thread_stop_nowait(). */
	drbd_thread_stop(&adm_ctx.tconn->receiver);

	down_read(&drbd_cfg_rwsem);
	/* detach */
	idr_for_each_entry(&adm_ctx.tconn->volumes, mdev, i) {
		rv = adm_detach(mdev);
		if (rv < SS_SUCCESS) {
			retcode = rv; /* enum type mismatch! */
		retcode = adm_detach(mdev);
		if (retcode < SS_SUCCESS) {
			drbd_msg_put_info("failed to detach");
			goto out_unlock;
		}
	}
	up_read(&drbd_cfg_rwsem);

	/* If we reach this, all volumes (of this tconn) are Secondary,
	 * Disconnected, Diskless, aka Unconfigured. Make sure all threads have
	 * actually stopped, state handling only does drbd_thread_stop_nowait().
	 * This needs to be done without holding drbd_cfg_rwsem. */
	drbd_thread_stop(&adm_ctx.tconn->worker);

	/* Now, nothing can fail anymore */

	/* delete volumes */
	down_write(&drbd_cfg_rwsem);
	idr_for_each_entry(&adm_ctx.tconn->volumes, mdev, i) {
+1 −13
Original line number Diff line number Diff line
@@ -4213,21 +4213,9 @@ static void drbd_disconnect(struct drbd_tconn *tconn)

	spin_unlock_irq(&tconn->req_lock);

	if (oc == C_DISCONNECTING) {
		struct net_conf *old_conf;

		mutex_lock(&tconn->net_conf_update);
		old_conf = tconn->net_conf;
		rcu_assign_pointer(tconn->net_conf, NULL);
		conn_free_crypto(tconn);
		mutex_unlock(&tconn->net_conf_update);

		synchronize_rcu();
		kfree(old_conf);

	if (oc == C_DISCONNECTING)
		conn_request_state(tconn, NS(conn, C_STANDALONE), CS_VERBOSE | CS_HARD);
}
}

static int drbd_disconnected(int vnr, void *p, void *data)
{
+13 −0
Original line number Diff line number Diff line
@@ -1416,6 +1416,19 @@ static int w_after_conn_state_ch(struct drbd_work *w, int unused)
	if (oc == C_STANDALONE && ns_max.conn == C_UNCONNECTED)
		drbd_thread_start(&tconn->receiver);

	if (oc == C_DISCONNECTING && ns_max.conn == C_STANDALONE) {
		struct net_conf *old_conf;

		mutex_lock(&tconn->net_conf_update);
		old_conf = tconn->net_conf;
		rcu_assign_pointer(tconn->net_conf, NULL);
		conn_free_crypto(tconn);
		mutex_unlock(&tconn->net_conf_update);

		synchronize_rcu();
		kfree(old_conf);
	}

	if (ns_max.susp_fen) {
		/* case1: The outdate peer handler is successful: */
		if (ns_max.pdsk <= D_OUTDATED) {