Newer
Older
{
int rc;
rpcrdma_flush_cqs(ep);
rc = rdma_disconnect(ia->ri_id);
if (!rc) {
/* returns without wait if not connected */
wait_event_interruptible(ep->rep_connect_wait,
ep->rep_connected != 1);
dprintk("RPC: %s: after wait, %sconnected\n", __func__,
(ep->rep_connected == 1) ? "still " : "dis");
} else {
dprintk("RPC: %s: rdma_disconnect %i\n", __func__, rc);
ep->rep_connected = rc;
}
}
/*
 * Allocate a zeroed rpcrdma_req and point it back at the transport's
 * buffer pool (r_xprt->rx_buf).
 *
 * Returns the new request, or ERR_PTR(-ENOMEM) if the allocation fails.
 */
static struct rpcrdma_req *
rpcrdma_create_req(struct rpcrdma_xprt *r_xprt)
{
	struct rpcrdma_req *new_req = kzalloc(sizeof(*new_req), GFP_KERNEL);

	if (!new_req)
		return ERR_PTR(-ENOMEM);

	new_req->rl_buffer = &r_xprt->rx_buf;
	return new_req;
}
/*
 * Allocate a zeroed rpcrdma_rep together with a registered receive
 * buffer of r_xprt->rx_data.inline_rsize bytes, and record the owning
 * transport in it.
 *
 * Returns the new reply, or an ERR_PTR: -ENOMEM when the rep itself
 * cannot be allocated, otherwise the error reported by
 * rpcrdma_alloc_regbuf().
 */
static struct rpcrdma_rep *
rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt)
{
	struct rpcrdma_rep *new_rep;
	int err;

	new_rep = kzalloc(sizeof(*new_rep), GFP_KERNEL);
	if (!new_rep)
		return ERR_PTR(-ENOMEM);

	new_rep->rr_rdmabuf = rpcrdma_alloc_regbuf(&r_xprt->rx_ia,
						   r_xprt->rx_data.inline_rsize,
						   GFP_KERNEL);
	if (IS_ERR(new_rep->rr_rdmabuf)) {
		/* Capture the error before the rep holding it is freed */
		err = PTR_ERR(new_rep->rr_rdmabuf);
		kfree(new_rep);
		return ERR_PTR(err);
	}

	new_rep->rr_rxprt = r_xprt;
	return new_rep;
}
rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)
struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
struct rpcrdma_ia *ia = &r_xprt->rx_ia;
struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
size_t len;
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
int i, rc;
buf->rb_max_requests = cdata->max_requests;
spin_lock_init(&buf->rb_lock);
/* Need to allocate:
* 1. arrays for send and recv pointers
* 2. arrays of struct rpcrdma_req to fill in pointers
* 3. array of struct rpcrdma_rep for replies
* Send/recv buffers in req/rep need to be registered
*/
len = buf->rb_max_requests *
(sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *));
p = kzalloc(len, GFP_KERNEL);
if (p == NULL) {
dprintk("RPC: %s: req_t/rep_t/pad kzalloc(%zd) failed\n",
__func__, len);
rc = -ENOMEM;
goto out;
}
buf->rb_pool = p; /* for freeing it later */
buf->rb_send_bufs = (struct rpcrdma_req **) p;
p = (char *) &buf->rb_send_bufs[buf->rb_max_requests];
buf->rb_recv_bufs = (struct rpcrdma_rep **) p;
p = (char *) &buf->rb_recv_bufs[buf->rb_max_requests];
rc = ia->ri_ops->ro_init(r_xprt);
if (rc)
goto out;
for (i = 0; i < buf->rb_max_requests; i++) {
struct rpcrdma_req *req;
struct rpcrdma_rep *rep;
req = rpcrdma_create_req(r_xprt);
if (IS_ERR(req)) {
dprintk("RPC: %s: request buffer %d alloc"
" failed\n", __func__, i);
rc = PTR_ERR(req);
goto out;
}
buf->rb_send_bufs[i] = req;
rep = rpcrdma_create_rep(r_xprt);
if (IS_ERR(rep)) {
dprintk("RPC: %s: reply buffer %d alloc failed\n",
__func__, i);
rc = PTR_ERR(rep);
goto out;
}
buf->rb_recv_bufs[i] = rep;
}
return 0;
out:
rpcrdma_buffer_destroy(buf);
return rc;
}
static void
rpcrdma_destroy_rep(struct rpcrdma_ia *ia, struct rpcrdma_rep *rep)
{
if (!rep)
return;
rpcrdma_free_regbuf(ia, rep->rr_rdmabuf);
kfree(rep);
}
static void
rpcrdma_destroy_req(struct rpcrdma_ia *ia, struct rpcrdma_req *req)
{
if (!req)
return;
rpcrdma_free_regbuf(ia, req->rl_sendbuf);
rpcrdma_free_regbuf(ia, req->rl_rdmabuf);
kfree(req);
}
/*
 * Tear down the buffer pool built by rpcrdma_buffer_create().
 *
 * Destroys every reply and request buffer recorded in the pointer
 * arrays (NULL slots and NULL arrays are tolerated, so a partially
 * constructed pool can be passed in), then frees the single allocation
 * that backs both arrays.
 */
void
rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
{
	struct rpcrdma_ia *ia = rdmab_to_ia(buf);
	int i;

	/* clean up in reverse order from create
	 *   1.  recv mr memory (mr free, then kfree)
	 *   2.  send mr memory (mr free, then kfree)
	 */
	dprintk("RPC: %s: entering\n", __func__);

	for (i = 0; i < buf->rb_max_requests; i++) {
		if (buf->rb_recv_bufs)
			rpcrdma_destroy_rep(ia, buf->rb_recv_bufs[i]);
		if (buf->rb_send_bufs)
			rpcrdma_destroy_req(ia, buf->rb_send_bufs[i]);
	}

	/* rb_send_bufs and rb_recv_bufs both point into rb_pool, so the
	 * pool must be released exactly once, after the last slot has
	 * been read.
	 */
	kfree(buf->rb_pool);
}
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
/*
 * Take the first MW off the transport's rb_mws list.
 *
 * The list is manipulated under rb_lock with interrupts disabled.
 * Returns the MW (with its mw_list re-initialized), or NULL after
 * logging an error when the list is empty.
 */
struct rpcrdma_mw *
rpcrdma_get_mw(struct rpcrdma_xprt *r_xprt)
{
	struct rpcrdma_buffer *pool = &r_xprt->rx_buf;
	struct rpcrdma_mw *found;
	unsigned long irqflags;

	spin_lock_irqsave(&pool->rb_lock, irqflags);
	if (list_empty(&pool->rb_mws)) {
		found = NULL;
	} else {
		found = list_first_entry(&pool->rb_mws,
					 struct rpcrdma_mw, mw_list);
		list_del_init(&found->mw_list);
	}
	spin_unlock_irqrestore(&pool->rb_lock, irqflags);

	if (found == NULL)
		pr_err("RPC: %s: no MWs available\n", __func__);
	return found;
}
/*
 * Append @mw to the tail of the transport's rb_mws list, under rb_lock
 * with interrupts disabled.
 */
void
rpcrdma_put_mw(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mw *mw)
{
	struct rpcrdma_buffer *pool = &r_xprt->rx_buf;
	unsigned long irqflags;

	spin_lock_irqsave(&pool->rb_lock, irqflags);
	list_add_tail(&mw->mw_list, &pool->rb_mws);
	spin_unlock_irqrestore(&pool->rb_lock, irqflags);
}
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
/* Return one MW to the tail of buf->rb_mws and clear the caller's
 * reference to it.
 *
 * "*mw" can be NULL when a req's segments were only partially
 * populated (for example when rpcrdma_buffer_get_frmrs() runs out of
 * MWs); such entries are skipped.
 *
 * No locking is done here; the callers visible in this file invoke it
 * with rb_lock already held.
 */
static void
rpcrdma_buffer_put_mr(struct rpcrdma_mw **mw, struct rpcrdma_buffer *buf)
{
	struct rpcrdma_mw *entry = *mw;

	if (entry == NULL)
		return;

	list_add_tail(&entry->mw_list, &buf->rb_mws);
	*mw = NULL;
}
/* Give every MW attached to @req's segments back to buf->rb_mws.
 *
 * Segments 1 .. RPCRDMA_MAX_SEGS-1 are returned first and segment 0
 * last, which "spins" the MWs so that their reuse is delayed and
 * scrambled as much as possible.
 */
static void
rpcrdma_buffer_put_mrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
{
	struct rpcrdma_mr_seg *segs = req->rl_segments;
	int n;

	for (n = 1; n < RPCRDMA_MAX_SEGS; n++)
		rpcrdma_buffer_put_mr(&segs[n].rl_mw, buf);

	rpcrdma_buffer_put_mr(&segs[0].rl_mw, buf);
}
/*
 * Push @req back onto the send-buffer array and reset its iov count.
 * If a reply buffer is still attached, push it back onto the
 * recv-buffer array and detach it from the request.
 *
 * Both indexes are decremented before the slot is written.
 */
static void
rpcrdma_buffer_put_sendbuf(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
{
	struct rpcrdma_rep *attached = req->rl_reply;

	buf->rb_send_index--;
	buf->rb_send_bufs[buf->rb_send_index] = req;
	req->rl_niovs = 0;

	if (attached == NULL)
		return;

	buf->rb_recv_index--;
	buf->rb_recv_bufs[buf->rb_recv_index] = attached;
	req->rl_reply = NULL;
}
/* Re-post a LOCAL_INV work request for an FRMR that was found stale.
 *
 * rpcrdma_unmap_one() was already done during deregistration.
 * Redo only the ib_post_send().
 *
 * The FRMR is marked FRMR_IS_INVALID before the post; if ib_post_send()
 * fails it is marked FRMR_IS_STALE again so that a later
 * rpcrdma_buffer_get() retries it.
 */
static void
rpcrdma_retry_local_inv(struct rpcrdma_mw *r, struct rpcrdma_ia *ia)
{
	struct rpcrdma_xprt *r_xprt =
				container_of(ia, struct rpcrdma_xprt, rx_ia);
	struct ib_send_wr invalidate_wr, *bad_wr;
	int rc;

	dprintk("RPC: %s: FRMR %p is stale\n", __func__, r);

	/* When this FRMR is re-inserted into rb_mws, it is no longer stale */
	r->r.frmr.fr_state = FRMR_IS_INVALID;

	memset(&invalidate_wr, 0, sizeof(invalidate_wr));
	invalidate_wr.wr_id = (unsigned long)(void *)r;
	invalidate_wr.opcode = IB_WR_LOCAL_INV;
	invalidate_wr.ex.invalidate_rkey = r->r.frmr.fr_mr->rkey;
	DECR_CQCOUNT(&r_xprt->rx_ep);

	dprintk("RPC: %s: frmr %p invalidating rkey %08x\n",
		__func__, r, r->r.frmr.fr_mr->rkey);

	/* ri_qplock is held for reading across the access to ri_id->qp */
	read_lock(&ia->ri_qplock);
	rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
	read_unlock(&ia->ri_qplock);
	if (rc) {
		/* Force rpcrdma_buffer_get() to retry */
		r->r.frmr.fr_state = FRMR_IS_STALE;
		dprintk("RPC: %s: ib_post_send failed, %i\n",
			__func__, rc);
	}
}
static void
rpcrdma_retry_flushed_linv(struct list_head *stale,
struct rpcrdma_buffer *buf)
{
struct rpcrdma_ia *ia = rdmab_to_ia(buf);
struct list_head *pos;
struct rpcrdma_mw *r;
unsigned long flags;
list_for_each(pos, stale) {
r = list_entry(pos, struct rpcrdma_mw, mw_list);
rpcrdma_retry_local_inv(r, ia);
}
spin_lock_irqsave(&buf->rb_lock, flags);
list_splice_tail(stale, &buf->rb_mws);
spin_unlock_irqrestore(&buf->rb_lock, flags);
}
/*
 * Attach one MW from buf->rb_mws to each of @req's RPCRDMA_MAX_SEGS
 * segments, filling from the highest segment index down to 0.
 *
 * MWs whose FRMR state is FRMR_IS_STALE are not used; they are moved
 * onto the caller's @stale list instead.
 *
 * Returns @req once segment 0 has been filled. If rb_mws runs dry
 * first, the send buffer and any MWs already attached are put back and
 * NULL is returned.
 */
static struct rpcrdma_req *
rpcrdma_buffer_get_frmrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf,
			 struct list_head *stale)
{
	int slot = RPCRDMA_MAX_SEGS - 1;

	while (!list_empty(&buf->rb_mws)) {
		struct rpcrdma_mw *mw;

		mw = list_first_entry(&buf->rb_mws,
				      struct rpcrdma_mw, mw_list);
		list_del(&mw->mw_list);

		if (mw->r.frmr.fr_state == FRMR_IS_STALE) {
			list_add(&mw->mw_list, stale);
		} else {
			req->rl_segments[slot].rl_mw = mw;
			if (unlikely(slot == 0))
				return req;	/* Success */
			slot--;
		}
	}

	/* Not enough entries on rb_mws for this req */
	rpcrdma_buffer_put_sendbuf(req, buf);
	rpcrdma_buffer_put_mrs(req, buf);
	return NULL;
}
/*
* Get a set of request/reply buffers.
*
* Reply buffer (if needed) is attached to send buffer upon return.
* Rule:
* rb_send_index and rb_recv_index MUST always be pointing to the
* *next* available buffer (non-NULL). They are incremented after
* removing buffers, and decremented *before* returning them.
*/
struct rpcrdma_req *
rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
{
struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
struct list_head stale;
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
struct rpcrdma_req *req;
unsigned long flags;
spin_lock_irqsave(&buffers->rb_lock, flags);
if (buffers->rb_send_index == buffers->rb_max_requests) {
spin_unlock_irqrestore(&buffers->rb_lock, flags);
dprintk("RPC: %s: out of request buffers\n", __func__);
return ((struct rpcrdma_req *)NULL);
}
req = buffers->rb_send_bufs[buffers->rb_send_index];
if (buffers->rb_send_index < buffers->rb_recv_index) {
dprintk("RPC: %s: %d extra receives outstanding (ok)\n",
__func__,
buffers->rb_recv_index - buffers->rb_send_index);
req->rl_reply = NULL;
} else {
req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
}
buffers->rb_send_bufs[buffers->rb_send_index++] = NULL;
INIT_LIST_HEAD(&stale);
switch (ia->ri_memreg_strategy) {
case RPCRDMA_FRMR:
req = rpcrdma_buffer_get_frmrs(req, buffers, &stale);
break;
default:
break;
}
spin_unlock_irqrestore(&buffers->rb_lock, flags);
if (!list_empty(&stale))
rpcrdma_retry_flushed_linv(&stale, buffers);
return req;
}
/*
* Put request/reply buffers back into pool.
* Pre-decrement counter/array index.
*/
void
rpcrdma_buffer_put(struct rpcrdma_req *req)
{
struct rpcrdma_buffer *buffers = req->rl_buffer;
struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
unsigned long flags;
spin_lock_irqsave(&buffers->rb_lock, flags);
rpcrdma_buffer_put_sendbuf(req, buffers);
switch (ia->ri_memreg_strategy) {
rpcrdma_buffer_put_mrs(req, buffers);
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
break;
default:
break;
}
spin_unlock_irqrestore(&buffers->rb_lock, flags);
}
/*
 * Recover reply buffers from pool.
 * This happens when recovering from error conditions.
 *
 * If a reply buffer is available it is attached to @req, its slot is
 * cleared, and the recv index is advanced afterwards (post-increment).
 * When none is available @req->rl_reply is left untouched.
 */
void
rpcrdma_recv_buffer_get(struct rpcrdma_req *req)
{
	struct rpcrdma_buffer *pool = req->rl_buffer;
	unsigned long irqflags;

	spin_lock_irqsave(&pool->rb_lock, irqflags);
	if (pool->rb_recv_index < pool->rb_max_requests) {
		req->rl_reply = pool->rb_recv_bufs[pool->rb_recv_index];
		pool->rb_recv_bufs[pool->rb_recv_index] = NULL;
		pool->rb_recv_index++;
	}
	spin_unlock_irqrestore(&pool->rb_lock, irqflags);
}
/*
 * Put reply buffers back into pool when not attached to
 * request. This happens in error conditions.
 *
 * The recv index is decremented first and @rep is stored in the slot
 * it then designates, all under rb_lock.
 */
void
rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
{
	struct rpcrdma_buffer *pool = &rep->rr_rxprt->rx_buf;
	unsigned long irqflags;

	spin_lock_irqsave(&pool->rb_lock, irqflags);
	pool->rb_recv_index--;
	pool->rb_recv_bufs[pool->rb_recv_index] = rep;
	spin_unlock_irqrestore(&pool->rb_lock, irqflags);
}
/*
* Wrappers for internal-use kmalloc memory registration, used by buffer code.
*/
/*
 * Emit a debug message describing @seg: its mr_offset, its DMA address
 * (mr_dma) and its DMA length (mr_dmalen). Nothing else is done.
 */
void
rpcrdma_mapping_error(struct rpcrdma_mr_seg *seg)
{
	dprintk("RPC: map_one: offset %p iova %llx len %zu\n",
		seg->mr_offset, (unsigned long long)seg->mr_dma,
		seg->mr_dmalen);
}
/*
 * DMA-map and, if necessary, register a kmalloc'ed buffer for local
 * access.
 *
 * @ia:  interface adapter supplying the device, PD and any global lkey
 * @va:  start of the buffer
 * @len: length of the buffer in bytes
 * @mrp: on return, the MR created for this buffer, or NULL when a
 *       device-wide lkey was used (or on failure)
 * @iov: filled in with the DMA address, length and lkey
 *
 * Returns 0 on success, -ENOMEM if the DMA mapping fails, or the error
 * from ib_reg_phys_mr(). On failure no DMA mapping is left behind.
 */
int
rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
				struct ib_mr **mrp, struct ib_sge *iov)
{
	struct ib_phys_buf ipb;
	struct ib_mr *mr;
	int rc;

	/*
	 * All memory passed here was kmalloc'ed, therefore phys-contiguous.
	 */
	iov->addr = ib_dma_map_single(ia->ri_device,
				va, len, DMA_BIDIRECTIONAL);
	if (ib_dma_mapping_error(ia->ri_device, iov->addr))
		return -ENOMEM;

	iov->length = len;

	/* Prefer an lkey that already covers all memory, if one exists */
	if (ia->ri_have_dma_lkey) {
		*mrp = NULL;
		iov->lkey = ia->ri_dma_lkey;
		return 0;
	} else if (ia->ri_bind_mem != NULL) {
		*mrp = NULL;
		iov->lkey = ia->ri_bind_mem->lkey;
		return 0;
	}

	/* ipb keeps the mapped address; iov->addr is also passed by
	 * pointer to ib_reg_phys_mr() below.
	 */
	ipb.addr = iov->addr;
	ipb.size = iov->length;
	mr = ib_reg_phys_mr(ia->ri_pd, &ipb, 1,
				IB_ACCESS_LOCAL_WRITE, &iov->addr);

	dprintk("RPC: %s: phys convert: 0x%llx "
			"registered 0x%llx length %d\n",
			__func__, (unsigned long long)ipb.addr,
			(unsigned long long)iov->addr, len);

	if (IS_ERR(mr)) {
		*mrp = NULL;
		rc = PTR_ERR(mr);
		dprintk("RPC: %s: failed with %i\n", __func__, rc);
		/* Undo the mapping made above; callers only free the
		 * buffer when registration fails.
		 */
		ib_dma_unmap_single(ia->ri_device, ipb.addr, len,
				    DMA_BIDIRECTIONAL);
	} else {
		*mrp = mr;
		iov->lkey = mr->lkey;
		rc = 0;
	}

	return rc;
}
/*
 * Undo rpcrdma_register_internal(): unmap the buffer described by @iov
 * and, when a private MR was created for it (@mr is non-NULL),
 * deregister that MR.
 *
 * Returns 0 on success or when there is no MR to release, otherwise
 * the error from ib_dereg_mr().
 */
int
rpcrdma_deregister_internal(struct rpcrdma_ia *ia,
				struct ib_mr *mr, struct ib_sge *iov)
{
	int rc;

	ib_dma_unmap_single(ia->ri_device,
			iov->addr, iov->length, DMA_BIDIRECTIONAL);

	/* A device-wide lkey was used; nothing was registered */
	if (mr == NULL)
		return 0;

	rc = ib_dereg_mr(mr);
	if (rc)
		dprintk("RPC: %s: ib_dereg_mr failed %i\n", __func__, rc);
	return rc;
}
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
/**
 * rpcrdma_alloc_regbuf - kmalloc and register memory for SEND/RECV buffers
 * @ia: controlling rpcrdma_ia
 * @size: size of buffer to be allocated, in bytes
 * @flags: GFP flags
 *
 * Returns pointer to private header of an area of internally
 * registered memory, or an ERR_PTR. The registered buffer follows
 * the end of the private header.
 *
 * The ERR_PTR carries -ENOMEM when the kmalloc fails, otherwise the
 * error returned by rpcrdma_register_internal(); in the latter case
 * the allocation is freed before returning.
 *
 * xprtrdma uses a regbuf for posting an outgoing RDMA SEND, or for
 * receiving the payload of RDMA RECV operations. regbufs are not
 * used for RDMA READ/WRITE operations, thus are registered only for
 * LOCAL access.
 */
struct rpcrdma_regbuf *
rpcrdma_alloc_regbuf(struct rpcrdma_ia *ia, size_t size, gfp_t flags)
{
	struct rpcrdma_regbuf *regbuf;
	int err;

	regbuf = kmalloc(sizeof(*regbuf) + size, flags);
	if (!regbuf)
		return ERR_PTR(-ENOMEM);

	regbuf->rg_size = size;
	regbuf->rg_owner = NULL;

	err = rpcrdma_register_internal(ia, regbuf->rg_base, size,
					&regbuf->rg_mr, &regbuf->rg_iov);
	if (err) {
		kfree(regbuf);
		return ERR_PTR(err);
	}

	return regbuf;
}
/**
* rpcrdma_free_regbuf - deregister and free registered buffer
* @ia: controlling rpcrdma_ia
* @rb: regbuf to be deregistered and freed
*/
void
rpcrdma_free_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb)
{
if (rb) {
rpcrdma_deregister_internal(ia, rb->rg_mr, &rb->rg_iov);
kfree(rb);
}
}
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
/*
 * Prepost any receive buffer, then post send.
 *
 * Receive buffer is donated to hardware, reclaimed upon recv completion.
 *
 * If @req carries a reply buffer it is posted first and detached from
 * the request; a failure there aborts before anything is sent. The
 * send iovs are then synced for the device and the SEND is posted.
 * The SEND is signaled only when the endpoint's completion countdown
 * (DECR_CQCOUNT) runs out, at which point the countdown is reset.
 *
 * Returns 0 on success, or the error from rpcrdma_ep_post_recv() or
 * ib_post_send().
 */
int
rpcrdma_ep_post(struct rpcrdma_ia *ia,
		struct rpcrdma_ep *ep,
		struct rpcrdma_req *req)
{
	struct ib_send_wr send_wr, *send_wr_fail;
	struct rpcrdma_rep *rep = req->rl_reply;
	int rc;

	if (rep) {
		rc = rpcrdma_ep_post_recv(ia, ep, rep);
		if (rc)
			goto out;
		req->rl_reply = NULL;
	}

	send_wr.next = NULL;
	send_wr.wr_id = RPCRDMA_IGNORE_COMPLETION;
	send_wr.sg_list = req->rl_send_iov;
	send_wr.num_sge = req->rl_niovs;
	send_wr.opcode = IB_WR_SEND;

	/* Only iov[3] is conditional; iov[1] and iov[0] are always synced */
	if (send_wr.num_sge == 4) {	/* no need to sync any pad (constant) */
		ib_dma_sync_single_for_device(ia->ri_device,
					      req->rl_send_iov[3].addr,
					      req->rl_send_iov[3].length,
					      DMA_TO_DEVICE);
	}
	ib_dma_sync_single_for_device(ia->ri_device,
				      req->rl_send_iov[1].addr,
				      req->rl_send_iov[1].length,
				      DMA_TO_DEVICE);
	ib_dma_sync_single_for_device(ia->ri_device,
				      req->rl_send_iov[0].addr,
				      req->rl_send_iov[0].length,
				      DMA_TO_DEVICE);

	if (DECR_CQCOUNT(ep) > 0)
		send_wr.send_flags = 0;
	else { /* Provider must take a send completion every now and then */
		INIT_CQCOUNT(ep);
		send_wr.send_flags = IB_SEND_SIGNALED;
	}

	rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail);
	if (rc)
		dprintk("RPC: %s: ib_post_send returned %i\n", __func__,
			rc);
out:
	return rc;
}
/*
* (Re)post a receive buffer.
*/
int
rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
struct rpcrdma_ep *ep,
struct rpcrdma_rep *rep)
{
struct ib_recv_wr recv_wr, *recv_wr_fail;
int rc;
recv_wr.next = NULL;
recv_wr.wr_id = (u64) (unsigned long) rep;
recv_wr.sg_list = &rep->rr_rdmabuf->rg_iov;
recv_wr.num_sge = 1;
ib_dma_sync_single_for_cpu(ia->ri_device,
rdmab_addr(rep->rr_rdmabuf),
rdmab_length(rep->rr_rdmabuf),
DMA_BIDIRECTIONAL);
rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail);
if (rc)
dprintk("RPC: %s: ib_post_recv returned %i\n", __func__,
rc);
return rc;
}
/* How many chunk list items fit within our inline buffers? */
unsigned int
rpcrdma_max_segments(struct rpcrdma_xprt *r_xprt)
{
struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
int bytes, segments;
bytes = min_t(unsigned int, cdata->inline_wsize, cdata->inline_rsize);
bytes -= RPCRDMA_HDRLEN_MIN;
if (bytes < sizeof(struct rpcrdma_segment) * 2) {
pr_warn("RPC: %s: inline threshold too small\n",
__func__);
return 0;
segments = 1 << (fls(bytes / sizeof(struct rpcrdma_segment)) - 1);
dprintk("RPC: %s: max chunk list size = %d segments\n",
__func__, segments);
return segments;