From 1c69fc2a9012e160c8d459f63df74a6b01db8322 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Wed, 6 Feb 2008 21:07:54 -0800 Subject: IB/mlx4: Consolidate code to get an entry from a struct mlx4_buf We use struct mlx4_buf for kernel QP, CQ and SRQ buffers, and the code to look up an entry is duplicated in get_cqe_from_buf() and the QP and SRQ versions of get_wqe(). Factor this out into mlx4_buf_offset(). This will also make it easier to switch over to using vmap() for buffers. Signed-off-by: Roland Dreier --- include/linux/mlx4/device.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 222815d91c40..a0afa7511a30 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -308,6 +308,14 @@ struct mlx4_init_port_param { int mlx4_buf_alloc(struct mlx4_dev *dev, int size, int max_direct, struct mlx4_buf *buf); void mlx4_buf_free(struct mlx4_dev *dev, int size, struct mlx4_buf *buf); +static inline void *mlx4_buf_offset(struct mlx4_buf *buf, int offset) +{ + if (buf->nbufs == 1) + return buf->u.direct.buf + offset; + else + return buf->u.page_list[offset >> PAGE_SHIFT].buf + + (offset & (PAGE_SIZE - 1)); +} int mlx4_pd_alloc(struct mlx4_dev *dev, u32 *pdn); void mlx4_pd_free(struct mlx4_dev *dev, u32 pdn); -- cgit v1.2.3 From 313abe55a87bc10e55d00f337d609e17ad5f8c9a Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Mon, 28 Jan 2008 10:40:51 +0200 Subject: mlx4_core: For 64-bit systems, vmap() kernel queue buffers Since kernel virtual memory is not a problem on 64-bit systems, there is no reason to use our own 2-layer page mapping scheme for large kernel queue buffers on such systems. Instead, map the page list to a single virtually contiguous buffer with vmap(), so that can we access buffer memory via direct indexing. Signed-off-by: Michael S. Tsirkin Signed-off-by: Jack Morgenstein Signed-off-by: Roland Dreier --- drivers/net/mlx4/alloc.c | 16 ++++++++++++++++ include/linux/mlx4/device.h | 4 ++-- 2 files changed, 18 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/mlx4/alloc.c b/drivers/net/mlx4/alloc.c index b226e019bc8b..2da2c2ec1f22 100644 --- a/drivers/net/mlx4/alloc.c +++ b/drivers/net/mlx4/alloc.c @@ -151,6 +151,19 @@ int mlx4_buf_alloc(struct mlx4_dev *dev, int size, int max_direct, memset(buf->u.page_list[i].buf, 0, PAGE_SIZE); } + + if (BITS_PER_LONG == 64) { + struct page **pages; + pages = kmalloc(sizeof *pages * buf->nbufs, GFP_KERNEL); + if (!pages) + goto err_free; + for (i = 0; i < buf->nbufs; ++i) + pages[i] = virt_to_page(buf->u.page_list[i].buf); + buf->u.direct.buf = vmap(pages, buf->nbufs, VM_MAP, PAGE_KERNEL); + kfree(pages); + if (!buf->u.direct.buf) + goto err_free; + } } return 0; @@ -170,6 +183,9 @@ void mlx4_buf_free(struct mlx4_dev *dev, int size, struct mlx4_buf *buf) dma_free_coherent(&dev->pdev->dev, size, buf->u.direct.buf, buf->u.direct.map); else { + if (BITS_PER_LONG == 64) + vunmap(buf->u.direct.buf); + for (i = 0; i < buf->nbufs; ++i) if (buf->u.page_list[i].buf) dma_free_coherent(&dev->pdev->dev, PAGE_SIZE, diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index a0afa7511a30..631607788f83 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -189,7 +189,7 @@ struct mlx4_buf_list { }; struct mlx4_buf { - union { + struct { struct mlx4_buf_list direct; struct mlx4_buf_list *page_list; } u; @@ -310,7 +310,7 @@ int mlx4_buf_alloc(struct mlx4_dev *dev, int size, int max_direct, void mlx4_buf_free(struct mlx4_dev *dev, int size, struct mlx4_buf *buf); static inline void *mlx4_buf_offset(struct mlx4_buf *buf, int offset) { - if (buf->nbufs == 1) + if (BITS_PER_LONG == 64 || buf->nbufs == 1) return buf->u.direct.buf + offset; else return buf->u.page_list[offset >> PAGE_SHIFT].buf + -- cgit v1.2.3 From b57aacfa7a95328f469d0360e49289b023c47e9e Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Wed, 6 Feb 2008 21:17:59 -0800 Subject: mlx4_core: Clean up struct mlx4_buf Now that struct mlx4_buf.u is a struct instead of a union because of the vmap() changes, there's no point in having a struct at all. So move .direct and .page_list directly into struct mlx4_buf and get rid of a bunch of unnecessary ".u"s. Signed-off-by: Roland Dreier --- drivers/net/mlx4/alloc.c | 40 ++++++++++++++++++++-------------------- drivers/net/mlx4/mr.c | 4 ++-- include/linux/mlx4/device.h | 10 ++++------ 3 files changed, 26 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/mlx4/alloc.c b/drivers/net/mlx4/alloc.c index 2da2c2ec1f22..521dc0322ee4 100644 --- a/drivers/net/mlx4/alloc.c +++ b/drivers/net/mlx4/alloc.c @@ -116,40 +116,40 @@ int mlx4_buf_alloc(struct mlx4_dev *dev, int size, int max_direct, buf->nbufs = 1; buf->npages = 1; buf->page_shift = get_order(size) + PAGE_SHIFT; - buf->u.direct.buf = dma_alloc_coherent(&dev->pdev->dev, + buf->direct.buf = dma_alloc_coherent(&dev->pdev->dev, size, &t, GFP_KERNEL); - if (!buf->u.direct.buf) + if (!buf->direct.buf) return -ENOMEM; - buf->u.direct.map = t; + buf->direct.map = t; while (t & ((1 << buf->page_shift) - 1)) { --buf->page_shift; buf->npages *= 2; } - memset(buf->u.direct.buf, 0, size); + memset(buf->direct.buf, 0, size); } else { int i; buf->nbufs = (size + PAGE_SIZE - 1) / PAGE_SIZE; buf->npages = buf->nbufs; buf->page_shift = PAGE_SHIFT; - buf->u.page_list = kzalloc(buf->nbufs * sizeof *buf->u.page_list, + buf->page_list = kzalloc(buf->nbufs * sizeof *buf->page_list, GFP_KERNEL); - if (!buf->u.page_list) + if (!buf->page_list) return -ENOMEM; for (i = 0; i < buf->nbufs; ++i) { - buf->u.page_list[i].buf = + buf->page_list[i].buf = dma_alloc_coherent(&dev->pdev->dev, PAGE_SIZE, &t, GFP_KERNEL); - if (!buf->u.page_list[i].buf) + if (!buf->page_list[i].buf) goto err_free; - buf->u.page_list[i].map = t; + buf->page_list[i].map = t; - memset(buf->u.page_list[i].buf, 0, PAGE_SIZE); + memset(buf->page_list[i].buf, 0, PAGE_SIZE); } if (BITS_PER_LONG == 64) { @@ -158,10 +158,10 @@ int mlx4_buf_alloc(struct mlx4_dev *dev, int size, int max_direct, if (!pages) goto err_free; for (i = 0; i < buf->nbufs; ++i) - pages[i] = virt_to_page(buf->u.page_list[i].buf); - buf->u.direct.buf = vmap(pages, buf->nbufs, VM_MAP, PAGE_KERNEL); + pages[i] = virt_to_page(buf->page_list[i].buf); + buf->direct.buf = vmap(pages, buf->nbufs, VM_MAP, PAGE_KERNEL); kfree(pages); - if (!buf->u.direct.buf) + if (!buf->direct.buf) goto err_free; } } @@ -180,18 +180,18 @@ void mlx4_buf_free(struct mlx4_dev *dev, int size, struct mlx4_buf *buf) int i; if (buf->nbufs == 1) - dma_free_coherent(&dev->pdev->dev, size, buf->u.direct.buf, - buf->u.direct.map); + dma_free_coherent(&dev->pdev->dev, size, buf->direct.buf, + buf->direct.map); else { if (BITS_PER_LONG == 64) - vunmap(buf->u.direct.buf); + vunmap(buf->direct.buf); for (i = 0; i < buf->nbufs; ++i) - if (buf->u.page_list[i].buf) + if (buf->page_list[i].buf) dma_free_coherent(&dev->pdev->dev, PAGE_SIZE, - buf->u.page_list[i].buf, - buf->u.page_list[i].map); - kfree(buf->u.page_list); + buf->page_list[i].buf, + buf->page_list[i].map); + kfree(buf->page_list); } } EXPORT_SYMBOL_GPL(mlx4_buf_free); diff --git a/drivers/net/mlx4/mr.c b/drivers/net/mlx4/mr.c index 9c9e308d0917..679dfdb6807f 100644 --- a/drivers/net/mlx4/mr.c +++ b/drivers/net/mlx4/mr.c @@ -419,9 +419,9 @@ int mlx4_buf_write_mtt(struct mlx4_dev *dev, struct mlx4_mtt *mtt, for (i = 0; i < buf->npages; ++i) if (buf->nbufs == 1) - page_list[i] = buf->u.direct.map + (i << buf->page_shift); + page_list[i] = buf->direct.map + (i << buf->page_shift); else - page_list[i] = buf->u.page_list[i].map; + page_list[i] = buf->page_list[i].map; err = mlx4_write_mtt(dev, mtt, 0, buf->npages, page_list); diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 631607788f83..4210ac4a8bcd 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -189,10 +189,8 @@ struct mlx4_buf_list { }; struct mlx4_buf { - struct { - struct mlx4_buf_list direct; - struct mlx4_buf_list *page_list; - } u; + struct mlx4_buf_list direct; + struct mlx4_buf_list *page_list; int nbufs; int npages; int page_shift; @@ -311,9 +309,9 @@ void mlx4_buf_free(struct mlx4_dev *dev, int size, struct mlx4_buf *buf); static inline void *mlx4_buf_offset(struct mlx4_buf *buf, int offset) { if (BITS_PER_LONG == 64 || buf->nbufs == 1) - return buf->u.direct.buf + offset; + return buf->direct.buf + offset; else - return buf->u.page_list[offset >> PAGE_SHIFT].buf + + return buf->page_list[offset >> PAGE_SHIFT].buf + (offset & (PAGE_SIZE - 1)); } -- cgit v1.2.3 From ea54b10c7773007e173da31fe7adcc049da33331 Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Mon, 28 Jan 2008 10:40:59 +0200 Subject: IB/mlx4: Use multiple WQ blocks to post smaller send WQEs ConnectX HCA supports shrinking WQEs, so that a single work request can be made of multiple units of wqe_shift. This way, WRs can differ in size, and do not have to be a power of 2 in size, saving memory and speeding up send WR posting. Unfortunately, if we do this then the wqe_index field in CQEs can't be used to look up the WR ID anymore, so our implementation does this only if selective signaling is off. Further, on 32-bit platforms, we can't use vmap() to make the QP buffer virtually contigious. Thus we have to use constant-sized WRs to make sure a WR is always fully within a single page-sized chunk. Finally, we use WRs with the NOP opcode to avoid wrapping around the queue buffer in the middle of posting a WR, and we set the NoErrorCompletion bit to avoid getting completions with error for NOP WRs. However, NEC is only supported starting with firmware 2.2.232, so we use constant-sized WRs for older firmware. And, since MLX QPs only support SEND, we use constant-sized WRs in this case. When stamping during NOP posting, do stamping following setting of the NOP WQE valid bit. Signed-off-by: Michael S. Tsirkin Signed-off-by: Jack Morgenstein Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mlx4/cq.c | 12 +- drivers/infiniband/hw/mlx4/mlx4_ib.h | 2 + drivers/infiniband/hw/mlx4/qp.c | 210 +++++++++++++++++++++++++++++------ include/linux/mlx4/device.h | 5 + include/linux/mlx4/qp.h | 4 + 5 files changed, 197 insertions(+), 36 deletions(-) (limited to 'include/linux') diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c index 8ac7b973f870..7360bbafbe84 100644 --- a/drivers/infiniband/hw/mlx4/cq.c +++ b/drivers/infiniband/hw/mlx4/cq.c @@ -326,6 +326,12 @@ static int mlx4_ib_poll_one(struct mlx4_ib_cq *cq, is_error = (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) == MLX4_CQE_OPCODE_ERROR; + if (unlikely((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) == MLX4_OPCODE_NOP && + is_send)) { + printk(KERN_WARNING "Completion for NOP opcode detected!\n"); + return -EINVAL; + } + if (!*cur_qp || (be32_to_cpu(cqe->my_qpn) & 0xffffff) != (*cur_qp)->mqp.qpn) { /* @@ -348,8 +354,10 @@ static int mlx4_ib_poll_one(struct mlx4_ib_cq *cq, if (is_send) { wq = &(*cur_qp)->sq; - wqe_ctr = be16_to_cpu(cqe->wqe_index); - wq->tail += (u16) (wqe_ctr - (u16) wq->tail); + if (!(*cur_qp)->sq_signal_bits) { + wqe_ctr = be16_to_cpu(cqe->wqe_index); + wq->tail += (u16) (wqe_ctr - (u16) wq->tail); + } wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)]; ++wq->tail; } else if ((*cur_qp)->ibqp.srq) { diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h index 28697653a370..3726e451a327 100644 --- a/drivers/infiniband/hw/mlx4/mlx4_ib.h +++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h @@ -120,6 +120,8 @@ struct mlx4_ib_qp { u32 doorbell_qpn; __be32 sq_signal_bits; + unsigned sq_next_wqe; + int sq_max_wqes_per_wr; int sq_spare_wqes; struct mlx4_ib_wq sq; diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index 376db730bc75..958e205b6d7c 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -30,6 +30,8 @@ * SOFTWARE. */ +#include + #include #include @@ -111,16 +113,87 @@ static void *get_send_wqe(struct mlx4_ib_qp *qp, int n) /* * Stamp a SQ WQE so that it is invalid if prefetched by marking the - * first four bytes of every 64 byte chunk with 0xffffffff, except for - * the very first chunk of the WQE. + * first four bytes of every 64 byte chunk with + * 0x7FFFFFF | (invalid_ownership_value << 31). + * + * When the max work request size is less than or equal to the WQE + * basic block size, as an optimization, we can stamp all WQEs with + * 0xffffffff, and skip the very first chunk of each WQE. */ -static void stamp_send_wqe(struct mlx4_ib_qp *qp, int n) +static void stamp_send_wqe(struct mlx4_ib_qp *qp, int n, int size) { - u32 *wqe = get_send_wqe(qp, n); + u32 *wqe; int i; + int s; + int ind; + void *buf; + __be32 stamp; + + s = roundup(size, 1U << qp->sq.wqe_shift); + if (qp->sq_max_wqes_per_wr > 1) { + for (i = 0; i < s; i += 64) { + ind = (i >> qp->sq.wqe_shift) + n; + stamp = ind & qp->sq.wqe_cnt ? cpu_to_be32(0x7fffffff) : + cpu_to_be32(0xffffffff); + buf = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1)); + wqe = buf + (i & ((1 << qp->sq.wqe_shift) - 1)); + *wqe = stamp; + } + } else { + buf = get_send_wqe(qp, n & (qp->sq.wqe_cnt - 1)); + for (i = 64; i < s; i += 64) { + wqe = buf + i; + *wqe = 0xffffffff; + } + } +} + +static void post_nop_wqe(struct mlx4_ib_qp *qp, int n, int size) +{ + struct mlx4_wqe_ctrl_seg *ctrl; + struct mlx4_wqe_inline_seg *inl; + void *wqe; + int s; + + ctrl = wqe = get_send_wqe(qp, n & (qp->sq.wqe_cnt - 1)); + s = sizeof(struct mlx4_wqe_ctrl_seg); + + if (qp->ibqp.qp_type == IB_QPT_UD) { + struct mlx4_wqe_datagram_seg *dgram = wqe + sizeof *ctrl; + struct mlx4_av *av = (struct mlx4_av *)dgram->av; + memset(dgram, 0, sizeof *dgram); + av->port_pd = cpu_to_be32((qp->port << 24) | to_mpd(qp->ibqp.pd)->pdn); + s += sizeof(struct mlx4_wqe_datagram_seg); + } + + /* Pad the remainder of the WQE with an inline data segment. */ + if (size > s) { + inl = wqe + s; + inl->byte_count = cpu_to_be32(1 << 31 | (size - s - sizeof *inl)); + } + ctrl->srcrb_flags = 0; + ctrl->fence_size = size / 16; + /* + * Make sure descriptor is fully written before setting ownership bit + * (because HW can start executing as soon as we do). + */ + wmb(); + + ctrl->owner_opcode = cpu_to_be32(MLX4_OPCODE_NOP | MLX4_WQE_CTRL_NEC) | + (n & qp->sq.wqe_cnt ? cpu_to_be32(1 << 31) : 0); - for (i = 16; i < 1 << (qp->sq.wqe_shift - 2); i += 16) - wqe[i] = 0xffffffff; + stamp_send_wqe(qp, n + qp->sq_spare_wqes, size); +} + +/* Post NOP WQE to prevent wrap-around in the middle of WR */ +static inline unsigned pad_wraparound(struct mlx4_ib_qp *qp, int ind) +{ + unsigned s = qp->sq.wqe_cnt - (ind & (qp->sq.wqe_cnt - 1)); + if (unlikely(s < qp->sq_max_wqes_per_wr)) { + post_nop_wqe(qp, ind, s << qp->sq.wqe_shift); + ind += s; + } + return ind; } static void mlx4_ib_qp_event(struct mlx4_qp *qp, enum mlx4_event type) @@ -237,6 +310,8 @@ static int set_rq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap, static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap, enum ib_qp_type type, struct mlx4_ib_qp *qp) { + int s; + /* Sanity check SQ size before proceeding */ if (cap->max_send_wr > dev->dev->caps.max_wqes || cap->max_send_sge > dev->dev->caps.max_sq_sg || @@ -252,20 +327,74 @@ static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap, cap->max_send_sge + 2 > dev->dev->caps.max_sq_sg) return -EINVAL; - qp->sq.wqe_shift = ilog2(roundup_pow_of_two(max(cap->max_send_sge * - sizeof (struct mlx4_wqe_data_seg), - cap->max_inline_data + - sizeof (struct mlx4_wqe_inline_seg)) + - send_wqe_overhead(type))); - qp->sq.max_gs = ((1 << qp->sq.wqe_shift) - send_wqe_overhead(type)) / - sizeof (struct mlx4_wqe_data_seg); + s = max(cap->max_send_sge * sizeof (struct mlx4_wqe_data_seg), + cap->max_inline_data + sizeof (struct mlx4_wqe_inline_seg)) + + send_wqe_overhead(type); /* - * We need to leave 2 KB + 1 WQE of headroom in the SQ to - * allow HW to prefetch. + * Hermon supports shrinking WQEs, such that a single work + * request can include multiple units of 1 << wqe_shift. This + * way, work requests can differ in size, and do not have to + * be a power of 2 in size, saving memory and speeding up send + * WR posting. Unfortunately, if we do this then the + * wqe_index field in CQEs can't be used to look up the WR ID + * anymore, so we do this only if selective signaling is off. + * + * Further, on 32-bit platforms, we can't use vmap() to make + * the QP buffer virtually contigious. Thus we have to use + * constant-sized WRs to make sure a WR is always fully within + * a single page-sized chunk. + * + * Finally, we use NOP work requests to pad the end of the + * work queue, to avoid wrap-around in the middle of WR. We + * set NEC bit to avoid getting completions with error for + * these NOP WRs, but since NEC is only supported starting + * with firmware 2.2.232, we use constant-sized WRs for older + * firmware. + * + * And, since MLX QPs only support SEND, we use constant-sized + * WRs in this case. + * + * We look for the smallest value of wqe_shift such that the + * resulting number of wqes does not exceed device + * capabilities. + * + * We set WQE size to at least 64 bytes, this way stamping + * invalidates each WQE. */ - qp->sq_spare_wqes = (2048 >> qp->sq.wqe_shift) + 1; - qp->sq.wqe_cnt = roundup_pow_of_two(cap->max_send_wr + qp->sq_spare_wqes); + if (dev->dev->caps.fw_ver >= MLX4_FW_VER_WQE_CTRL_NEC && + qp->sq_signal_bits && BITS_PER_LONG == 64 && + type != IB_QPT_SMI && type != IB_QPT_GSI) + qp->sq.wqe_shift = ilog2(64); + else + qp->sq.wqe_shift = ilog2(roundup_pow_of_two(s)); + + for (;;) { + if (1 << qp->sq.wqe_shift > dev->dev->caps.max_sq_desc_sz) + return -EINVAL; + + qp->sq_max_wqes_per_wr = DIV_ROUND_UP(s, 1U << qp->sq.wqe_shift); + + /* + * We need to leave 2 KB + 1 WR of headroom in the SQ to + * allow HW to prefetch. + */ + qp->sq_spare_wqes = (2048 >> qp->sq.wqe_shift) + qp->sq_max_wqes_per_wr; + qp->sq.wqe_cnt = roundup_pow_of_two(cap->max_send_wr * + qp->sq_max_wqes_per_wr + + qp->sq_spare_wqes); + + if (qp->sq.wqe_cnt <= dev->dev->caps.max_wqes) + break; + + if (qp->sq_max_wqes_per_wr <= 1) + return -EINVAL; + + ++qp->sq.wqe_shift; + } + + qp->sq.max_gs = ((qp->sq_max_wqes_per_wr << qp->sq.wqe_shift) - + send_wqe_overhead(type)) / sizeof (struct mlx4_wqe_data_seg); qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) + (qp->sq.wqe_cnt << qp->sq.wqe_shift); @@ -277,7 +406,8 @@ static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap, qp->sq.offset = 0; } - cap->max_send_wr = qp->sq.max_post = qp->sq.wqe_cnt - qp->sq_spare_wqes; + cap->max_send_wr = qp->sq.max_post = + (qp->sq.wqe_cnt - qp->sq_spare_wqes) / qp->sq_max_wqes_per_wr; cap->max_send_sge = qp->sq.max_gs; /* We don't support inline sends for kernel QPs (yet) */ cap->max_inline_data = 0; @@ -323,6 +453,12 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, qp->rq.tail = 0; qp->sq.head = 0; qp->sq.tail = 0; + qp->sq_next_wqe = 0; + + if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR) + qp->sq_signal_bits = cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE); + else + qp->sq_signal_bits = 0; err = set_rq_size(dev, &init_attr->cap, !!pd->uobject, !!init_attr->srq, qp); if (err) @@ -413,11 +549,6 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, */ qp->doorbell_qpn = swab32(qp->mqp.qpn << 8); - if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR) - qp->sq_signal_bits = cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE); - else - qp->sq_signal_bits = 0; - qp->mqp.event = mlx4_ib_qp_event; return 0; @@ -912,7 +1043,7 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, ctrl = get_send_wqe(qp, i); ctrl->owner_opcode = cpu_to_be32(1 << 31); - stamp_send_wqe(qp, i); + stamp_send_wqe(qp, i, 1 << qp->sq.wqe_shift); } } @@ -965,6 +1096,7 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, qp->rq.tail = 0; qp->sq.head = 0; qp->sq.tail = 0; + qp->sq_next_wqe = 0; if (!ibqp->srq) *qp->db.db = 0; } @@ -1274,13 +1406,14 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, unsigned long flags; int nreq; int err = 0; - int ind; - int size; + unsigned ind; + int uninitialized_var(stamp); + int uninitialized_var(size); int i; spin_lock_irqsave(&qp->sq.lock, flags); - ind = qp->sq.head; + ind = qp->sq_next_wqe; for (nreq = 0; wr; ++nreq, wr = wr->next) { if (mlx4_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq)) { @@ -1296,7 +1429,7 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, } ctrl = wqe = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1)); - qp->sq.wrid[ind & (qp->sq.wqe_cnt - 1)] = wr->wr_id; + qp->sq.wrid[(qp->sq.head + nreq) & (qp->sq.wqe_cnt - 1)] = wr->wr_id; ctrl->srcrb_flags = (wr->send_flags & IB_SEND_SIGNALED ? @@ -1409,16 +1542,23 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, ctrl->owner_opcode = mlx4_ib_opcode[wr->opcode] | (ind & qp->sq.wqe_cnt ? cpu_to_be32(1 << 31) : 0); + stamp = ind + qp->sq_spare_wqes; + ind += DIV_ROUND_UP(size * 16, 1U << qp->sq.wqe_shift); + /* * We can improve latency by not stamping the last * send queue WQE until after ringing the doorbell, so * only stamp here if there are still more WQEs to post. + * + * Same optimization applies to padding with NOP wqe + * in case of WQE shrinking (used to prevent wrap-around + * in the middle of WR). */ - if (wr->next) - stamp_send_wqe(qp, (ind + qp->sq_spare_wqes) & - (qp->sq.wqe_cnt - 1)); + if (wr->next) { + stamp_send_wqe(qp, stamp, size * 16); + ind = pad_wraparound(qp, ind); + } - ++ind; } out: @@ -1440,8 +1580,10 @@ out: */ mmiowb(); - stamp_send_wqe(qp, (ind + qp->sq_spare_wqes - 1) & - (qp->sq.wqe_cnt - 1)); + stamp_send_wqe(qp, stamp, size * 16); + + ind = pad_wraparound(qp, ind); + qp->sq_next_wqe = ind; } spin_unlock_irqrestore(&qp->sq.lock, flags); diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 4210ac4a8bcd..6cdf813cd478 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -133,6 +133,11 @@ enum { MLX4_STAT_RATE_OFFSET = 5 }; +static inline u64 mlx4_fw_ver(u64 major, u64 minor, u64 subminor) +{ + return (major << 32) | (minor << 16) | subminor; +} + struct mlx4_caps { u64 fw_ver; int num_ports; diff --git a/include/linux/mlx4/qp.h b/include/linux/mlx4/qp.h index 3968b943259a..09a2230923f2 100644 --- a/include/linux/mlx4/qp.h +++ b/include/linux/mlx4/qp.h @@ -154,7 +154,11 @@ struct mlx4_qp_context { u32 reserved5[10]; }; +/* Which firmware version adds support for NEC (NoErrorCompletion) bit */ +#define MLX4_FW_VER_WQE_CTRL_NEC mlx4_fw_ver(2, 2, 232) + enum { + MLX4_WQE_CTRL_NEC = 1 << 29, MLX4_WQE_CTRL_FENCE = 1 << 6, MLX4_WQE_CTRL_CQ_UPDATE = 3 << 2, MLX4_WQE_CTRL_SOLICITED = 1 << 1, -- cgit v1.2.3