Merge branches 'core', 'cxgb4', 'ipoib', 'iser', 'mlx4', 'ocrdma', 'odp' and 'srp' into for-next

This commit is contained in:
Roland Dreier
2014-12-15 18:19:20 -08:00
50 changed files with 3500 additions and 452 deletions
+11
View File
@@ -38,6 +38,17 @@ config INFINIBAND_USER_MEM
depends on INFINIBAND_USER_ACCESS != n
default y
config INFINIBAND_ON_DEMAND_PAGING
bool "InfiniBand on-demand paging support"
depends on INFINIBAND_USER_MEM
select MMU_NOTIFIER
default y
---help---
On demand paging support for the InfiniBand subsystem.
Together with driver support this allows registration of
memory regions without pinning their pages, fetching the
pages on demand instead.
config INFINIBAND_ADDR_TRANS
bool
depends on INFINIBAND
+1
View File
@@ -11,6 +11,7 @@ obj-$(CONFIG_INFINIBAND_USER_ACCESS) += ib_uverbs.o ib_ucm.o \
ib_core-y := packer.o ud_header.o verbs.o sysfs.o \
device.o fmr_pool.o cache.o netlink.o
ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o
ib_core-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o umem_rbtree.o
ib_mad-y := mad.o smi.o agent.o mad_rmpp.o
+66 -6
View File
@@ -39,6 +39,7 @@
#include <linux/hugetlb.h>
#include <linux/dma-attrs.h>
#include <linux/slab.h>
#include <rdma/ib_umem_odp.h>
#include "uverbs.h"
@@ -69,6 +70,10 @@ static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int d
/**
* ib_umem_get - Pin and DMA map userspace memory.
*
* If access flags indicate ODP memory, avoid pinning. Instead, stores
* the mm for future page fault handling in conjunction with MMU notifiers.
*
* @context: userspace context to pin memory for
* @addr: userspace virtual address to start at
* @size: length of region to pin
@@ -103,17 +108,30 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
umem->context = context;
umem->length = size;
umem->offset = addr & ~PAGE_MASK;
umem->address = addr;
umem->page_size = PAGE_SIZE;
umem->pid = get_task_pid(current, PIDTYPE_PID);
/*
* We ask for writable memory if any access flags other than
* "remote read" are set. "Local write" and "remote write"
* We ask for writable memory if any of the following
* access flags are set. "Local write" and "remote write"
* obviously require write access. "Remote atomic" can do
* things like fetch and add, which will modify memory, and
* "MW bind" can change permissions by binding a window.
*/
umem->writable = !!(access & ~IB_ACCESS_REMOTE_READ);
umem->writable = !!(access &
(IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE |
IB_ACCESS_REMOTE_ATOMIC | IB_ACCESS_MW_BIND));
if (access & IB_ACCESS_ON_DEMAND) {
ret = ib_umem_odp_get(context, umem);
if (ret) {
kfree(umem);
return ERR_PTR(ret);
}
return umem;
}
umem->odp_data = NULL;
/* We assume the memory is from hugetlb until proved otherwise */
umem->hugetlb = 1;
@@ -132,7 +150,7 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
if (!vma_list)
umem->hugetlb = 0;
npages = PAGE_ALIGN(size + umem->offset) >> PAGE_SHIFT;
npages = ib_umem_num_pages(umem);
down_write(&current->mm->mmap_sem);
@@ -235,6 +253,11 @@ void ib_umem_release(struct ib_umem *umem)
struct task_struct *task;
unsigned long diff;
if (umem->odp_data) {
ib_umem_odp_release(umem);
return;
}
__ib_umem_release(umem->context->device, umem, 1);
task = get_pid_task(umem->pid, PIDTYPE_PID);
@@ -246,7 +269,7 @@ void ib_umem_release(struct ib_umem *umem)
if (!mm)
goto out;
diff = PAGE_ALIGN(umem->length + umem->offset) >> PAGE_SHIFT;
diff = ib_umem_num_pages(umem);
/*
* We may be called with the mm's mmap_sem already held. This
@@ -283,6 +306,9 @@ int ib_umem_page_count(struct ib_umem *umem)
int n;
struct scatterlist *sg;
if (umem->odp_data)
return ib_umem_num_pages(umem);
shift = ilog2(umem->page_size);
n = 0;
@@ -292,3 +318,37 @@ int ib_umem_page_count(struct ib_umem *umem)
return n;
}
EXPORT_SYMBOL(ib_umem_page_count);
/*
 * Copy from the given ib_umem's pages to the given buffer.
 *
 * dst - destination buffer
 * umem - the umem to copy from
 * offset - offset to start copying from
 * length - buffer length
 *
 * Returns 0 on success, or -EINVAL if the requested range is not inside
 * the umem or if fewer than length bytes could be copied.
 */
int ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offset,
		      size_t length)
{
	size_t copied;

	/* Phrased so that the range check itself cannot wrap around. */
	if (offset > umem->length || length > umem->length - offset) {
		pr_err("ib_umem_copy_from not in range. offset: %zu umem length: %zu end: %zu\n",
		       offset, umem->length, offset + length);
		return -EINVAL;
	}

	/*
	 * sg_pcopy_to_buffer() reports the number of bytes copied as a
	 * size_t and never returns a negative value, so keep the result in
	 * a size_t instead of truncating it into an int.
	 */
	copied = sg_pcopy_to_buffer(umem->sg_head.sgl, umem->nmap, dst, length,
				    offset + ib_umem_offset(umem));

	return copied == length ? 0 : -EINVAL;
}
EXPORT_SYMBOL(ib_umem_copy_from);
File diff suppressed because it is too large Load Diff
+94
View File
@@ -0,0 +1,94 @@
/*
* Copyright (c) 2014 Mellanox Technologies. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
* General Public License (GPL) Version 2, available from the file
* COPYING in the main directory of this source tree, or the
* OpenIB.org BSD license below:
*
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
*
* - Redistributions of source code must retain the above
* copyright notice, this list of conditions and the following
* disclaimer.
*
* - Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials
* provided with the distribution.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/interval_tree_generic.h>
#include <linux/sched.h>
#include <linux/gfp.h>
#include <rdma/ib_umem_odp.h>
/*
 * The ib_umem list keeps track of memory regions for which the HW
 * device requests to receive notifications when the related memory
 * mapping is changed.
 *
 * ib_umem_lock protects the list.
 */
/* First address covered by the umem that embeds interval-tree node @n. */
static inline u64 node_start(struct umem_odp_node *n)
{
	return ib_umem_start(container_of(n, struct ib_umem_odp,
					  interval_tree)->umem);
}
/*
 * Last address covered by the umem that embeds interval-tree node @n.
 *
 * The interval tree treats the end point of an interval as part of the
 * interval, whereas ib_umem_end() returns the first address that is NOT
 * part of the umem; hence the "- 1".
 */
static inline u64 node_last(struct umem_odp_node *n)
{
	return ib_umem_end(container_of(n, struct ib_umem_odp,
					interval_tree)->umem) - 1;
}
INTERVAL_TREE_DEFINE(struct umem_odp_node, rb, u64, __subtree_last,
node_start, node_last, , rbt_ib_umem)
/*
 * Invoke @cb on every umem in @root that overlaps [start, last).
 *
 * @last is not a part of the interval, while the tree stores inclusive
 * end points (see the comment for function node_last), which is why
 * "last - 1" is handed to the tree iterators.
 *
 * Every overlapping umem is visited even after a callback has returned
 * non-zero. Returns 1 if any callback returned non-zero, 0 otherwise.
 */
int rbt_ib_umem_for_each_in_range(struct rb_root *root,
				  u64 start, u64 last,
				  umem_call_back cb,
				  void *cookie)
{
	const u64 incl_last = last - 1;
	struct umem_odp_node *cur;
	int any_nonzero = 0;

	/* An empty range overlaps nothing. */
	if (unlikely(start == last))
		return 0;

	cur = rbt_ib_umem_iter_first(root, start, incl_last);
	while (cur) {
		struct ib_umem_odp *odp =
			container_of(cur, struct ib_umem_odp, interval_tree);

		if (cb(odp->umem, start, last, cookie))
			any_nonzero = 1;

		cur = rbt_ib_umem_iter_next(cur, start, incl_last);
	}

	return any_nonzero;
}
+1
View File
@@ -258,5 +258,6 @@ IB_UVERBS_DECLARE_CMD(close_xrcd);
IB_UVERBS_DECLARE_EX_CMD(create_flow);
IB_UVERBS_DECLARE_EX_CMD(destroy_flow);
IB_UVERBS_DECLARE_EX_CMD(query_device);
#endif /* UVERBS_H */
+130 -41
View File
@@ -36,6 +36,7 @@
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <asm/uaccess.h>
@@ -288,6 +289,9 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
struct ib_uverbs_get_context_resp resp;
struct ib_udata udata;
struct ib_device *ibdev = file->device->ib_dev;
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
struct ib_device_attr dev_attr;
#endif
struct ib_ucontext *ucontext;
struct file *filp;
int ret;
@@ -325,8 +329,25 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
INIT_LIST_HEAD(&ucontext->ah_list);
INIT_LIST_HEAD(&ucontext->xrcd_list);
INIT_LIST_HEAD(&ucontext->rule_list);
rcu_read_lock();
ucontext->tgid = get_task_pid(current->group_leader, PIDTYPE_PID);
rcu_read_unlock();
ucontext->closing = 0;
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
ucontext->umem_tree = RB_ROOT;
init_rwsem(&ucontext->umem_rwsem);
ucontext->odp_mrs_count = 0;
INIT_LIST_HEAD(&ucontext->no_private_counters);
ret = ib_query_device(ibdev, &dev_attr);
if (ret)
goto err_free;
if (!(dev_attr.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING))
ucontext->invalidate_range = NULL;
#endif
resp.num_comp_vectors = file->device->num_comp_vectors;
ret = get_unused_fd_flags(O_CLOEXEC);
@@ -371,6 +392,7 @@ err_fd:
put_unused_fd(resp.async_fd);
err_free:
put_pid(ucontext->tgid);
ibdev->dealloc_ucontext(ucontext);
err:
@@ -378,6 +400,52 @@ err:
return ret;
}
static void copy_query_dev_fields(struct ib_uverbs_file *file,
struct ib_uverbs_query_device_resp *resp,
struct ib_device_attr *attr)
{
resp->fw_ver = attr->fw_ver;
resp->node_guid = file->device->ib_dev->node_guid;
resp->sys_image_guid = attr->sys_image_guid;
resp->max_mr_size = attr->max_mr_size;
resp->page_size_cap = attr->page_size_cap;
resp->vendor_id = attr->vendor_id;
resp->vendor_part_id = attr->vendor_part_id;
resp->hw_ver = attr->hw_ver;
resp->max_qp = attr->max_qp;
resp->max_qp_wr = attr->max_qp_wr;
resp->device_cap_flags = attr->device_cap_flags;
resp->max_sge = attr->max_sge;
resp->max_sge_rd = attr->max_sge_rd;
resp->max_cq = attr->max_cq;
resp->max_cqe = attr->max_cqe;
resp->max_mr = attr->max_mr;
resp->max_pd = attr->max_pd;
resp->max_qp_rd_atom = attr->max_qp_rd_atom;
resp->max_ee_rd_atom = attr->max_ee_rd_atom;
resp->max_res_rd_atom = attr->max_res_rd_atom;
resp->max_qp_init_rd_atom = attr->max_qp_init_rd_atom;
resp->max_ee_init_rd_atom = attr->max_ee_init_rd_atom;
resp->atomic_cap = attr->atomic_cap;
resp->max_ee = attr->max_ee;
resp->max_rdd = attr->max_rdd;
resp->max_mw = attr->max_mw;
resp->max_raw_ipv6_qp = attr->max_raw_ipv6_qp;
resp->max_raw_ethy_qp = attr->max_raw_ethy_qp;
resp->max_mcast_grp = attr->max_mcast_grp;
resp->max_mcast_qp_attach = attr->max_mcast_qp_attach;
resp->max_total_mcast_qp_attach = attr->max_total_mcast_qp_attach;
resp->max_ah = attr->max_ah;
resp->max_fmr = attr->max_fmr;
resp->max_map_per_fmr = attr->max_map_per_fmr;
resp->max_srq = attr->max_srq;
resp->max_srq_wr = attr->max_srq_wr;
resp->max_srq_sge = attr->max_srq_sge;
resp->max_pkeys = attr->max_pkeys;
resp->local_ca_ack_delay = attr->local_ca_ack_delay;
resp->phys_port_cnt = file->device->ib_dev->phys_port_cnt;
}
ssize_t ib_uverbs_query_device(struct ib_uverbs_file *file,
const char __user *buf,
int in_len, int out_len)
@@ -398,47 +466,7 @@ ssize_t ib_uverbs_query_device(struct ib_uverbs_file *file,
return ret;
memset(&resp, 0, sizeof resp);
resp.fw_ver = attr.fw_ver;
resp.node_guid = file->device->ib_dev->node_guid;
resp.sys_image_guid = attr.sys_image_guid;
resp.max_mr_size = attr.max_mr_size;
resp.page_size_cap = attr.page_size_cap;
resp.vendor_id = attr.vendor_id;
resp.vendor_part_id = attr.vendor_part_id;
resp.hw_ver = attr.hw_ver;
resp.max_qp = attr.max_qp;
resp.max_qp_wr = attr.max_qp_wr;
resp.device_cap_flags = attr.device_cap_flags;
resp.max_sge = attr.max_sge;
resp.max_sge_rd = attr.max_sge_rd;
resp.max_cq = attr.max_cq;
resp.max_cqe = attr.max_cqe;
resp.max_mr = attr.max_mr;
resp.max_pd = attr.max_pd;
resp.max_qp_rd_atom = attr.max_qp_rd_atom;
resp.max_ee_rd_atom = attr.max_ee_rd_atom;
resp.max_res_rd_atom = attr.max_res_rd_atom;
resp.max_qp_init_rd_atom = attr.max_qp_init_rd_atom;
resp.max_ee_init_rd_atom = attr.max_ee_init_rd_atom;
resp.atomic_cap = attr.atomic_cap;
resp.max_ee = attr.max_ee;
resp.max_rdd = attr.max_rdd;
resp.max_mw = attr.max_mw;
resp.max_raw_ipv6_qp = attr.max_raw_ipv6_qp;
resp.max_raw_ethy_qp = attr.max_raw_ethy_qp;
resp.max_mcast_grp = attr.max_mcast_grp;
resp.max_mcast_qp_attach = attr.max_mcast_qp_attach;
resp.max_total_mcast_qp_attach = attr.max_total_mcast_qp_attach;
resp.max_ah = attr.max_ah;
resp.max_fmr = attr.max_fmr;
resp.max_map_per_fmr = attr.max_map_per_fmr;
resp.max_srq = attr.max_srq;
resp.max_srq_wr = attr.max_srq_wr;
resp.max_srq_sge = attr.max_srq_sge;
resp.max_pkeys = attr.max_pkeys;
resp.local_ca_ack_delay = attr.local_ca_ack_delay;
resp.phys_port_cnt = file->device->ib_dev->phys_port_cnt;
copy_query_dev_fields(file, &resp, &attr);
if (copy_to_user((void __user *) (unsigned long) cmd.response,
&resp, sizeof resp))
@@ -947,6 +975,18 @@ ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file,
goto err_free;
}
if (cmd.access_flags & IB_ACCESS_ON_DEMAND) {
struct ib_device_attr attr;
ret = ib_query_device(pd->device, &attr);
if (ret || !(attr.device_cap_flags &
IB_DEVICE_ON_DEMAND_PAGING)) {
pr_debug("ODP support not available\n");
ret = -EINVAL;
goto err_put;
}
}
mr = pd->device->reg_user_mr(pd, cmd.start, cmd.length, cmd.hca_va,
cmd.access_flags, &udata);
if (IS_ERR(mr)) {
@@ -3253,3 +3293,52 @@ ssize_t ib_uverbs_destroy_srq(struct ib_uverbs_file *file,
return ret ? ret : in_len;
}
int ib_uverbs_ex_query_device(struct ib_uverbs_file *file,
struct ib_udata *ucore,
struct ib_udata *uhw)
{
struct ib_uverbs_ex_query_device_resp resp;
struct ib_uverbs_ex_query_device cmd;
struct ib_device_attr attr;
struct ib_device *device;
int err;
device = file->device->ib_dev;
if (ucore->inlen < sizeof(cmd))
return -EINVAL;
err = ib_copy_from_udata(&cmd, ucore, sizeof(cmd));
if (err)
return err;
if (cmd.reserved)
return -EINVAL;
err = device->query_device(device, &attr);
if (err)
return err;
memset(&resp, 0, sizeof(resp));
copy_query_dev_fields(file, &resp.base, &attr);
resp.comp_mask = 0;
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
if (cmd.comp_mask & IB_USER_VERBS_EX_QUERY_DEVICE_ODP) {
resp.odp_caps.general_caps = attr.odp_caps.general_caps;
resp.odp_caps.per_transport_caps.rc_odp_caps =
attr.odp_caps.per_transport_caps.rc_odp_caps;
resp.odp_caps.per_transport_caps.uc_odp_caps =
attr.odp_caps.per_transport_caps.uc_odp_caps;
resp.odp_caps.per_transport_caps.ud_odp_caps =
attr.odp_caps.per_transport_caps.ud_odp_caps;
resp.comp_mask |= IB_USER_VERBS_EX_QUERY_DEVICE_ODP;
}
#endif
err = ib_copy_to_udata(ucore, &resp, sizeof(resp));
if (err)
return err;
return 0;
}
+4 -1
View File
@@ -122,7 +122,8 @@ static int (*uverbs_ex_cmd_table[])(struct ib_uverbs_file *file,
struct ib_udata *ucore,
struct ib_udata *uhw) = {
[IB_USER_VERBS_EX_CMD_CREATE_FLOW] = ib_uverbs_ex_create_flow,
[IB_USER_VERBS_EX_CMD_DESTROY_FLOW] = ib_uverbs_ex_destroy_flow
[IB_USER_VERBS_EX_CMD_DESTROY_FLOW] = ib_uverbs_ex_destroy_flow,
[IB_USER_VERBS_EX_CMD_QUERY_DEVICE] = ib_uverbs_ex_query_device
};
static void ib_uverbs_add_one(struct ib_device *device);
@@ -296,6 +297,8 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
kfree(uobj);
}
put_pid(context->tgid);
return context->device->dealloc_ucontext(context);
}
+1 -1
View File
@@ -476,7 +476,7 @@ static struct ib_mr *c2_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
c2mr->umem->page_size,
i,
length,
c2mr->umem->offset,
ib_umem_offset(c2mr->umem),
&kva,
c2_convert_access(acc),
c2mr);
+6 -1
View File
@@ -1640,7 +1640,8 @@ static void process_mpa_request(struct c4iw_ep *ep, struct sk_buff *skb)
__state_set(&ep->com, MPA_REQ_RCVD);
/* drive upcall */
mutex_lock(&ep->parent_ep->com.mutex);
mutex_lock_nested(&ep->parent_ep->com.mutex,
SINGLE_DEPTH_NESTING);
if (ep->parent_ep->com.state != DEAD) {
if (connect_request_upcall(ep))
abort_connection(ep, skb, GFP_KERNEL);
@@ -3126,6 +3127,8 @@ static int create_server6(struct c4iw_dev *dev, struct c4iw_listen_ep *ep)
err = c4iw_wait_for_reply(&ep->com.dev->rdev,
&ep->com.wr_wait,
0, 0, __func__);
else if (err > 0)
err = net_xmit_errno(err);
if (err)
pr_err("cxgb4_create_server6/filter failed err %d stid %d laddr %pI6 lport %d\n",
err, ep->stid,
@@ -3159,6 +3162,8 @@ static int create_server4(struct c4iw_dev *dev, struct c4iw_listen_ep *ep)
err = c4iw_wait_for_reply(&ep->com.dev->rdev,
&ep->com.wr_wait,
0, 0, __func__);
else if (err > 0)
err = net_xmit_errno(err);
}
if (err)
pr_err("cxgb4_create_server/filter failed err %d stid %d laddr %pI4 lport %d\n"
+1 -1
View File
@@ -670,7 +670,7 @@ static int ep_open(struct inode *inode, struct file *file)
idr_for_each(&epd->devp->stid_idr, count_idrs, &count);
spin_unlock_irq(&epd->devp->lock);
epd->bufsize = count * 160;
epd->bufsize = count * 240;
epd->buf = vmalloc(epd->bufsize);
if (!epd->buf) {
ret = -ENOMEM;
+26 -2
View File
@@ -50,6 +50,13 @@ static int inline_threshold = C4IW_INLINE_THRESHOLD;
module_param(inline_threshold, int, 0644);
MODULE_PARM_DESC(inline_threshold, "inline vs dsgl threshold (default=128)");
/*
 * Return non-zero when @dev is a T4 or T5 adapter and @length is 8 GiB
 * or more; other adapter types are never reported as exceeding the limit.
 */
static int mr_exceeds_hw_limits(struct c4iw_dev *dev, u64 length)
{
	if (!is_t4(dev->rdev.lldi.adapter_type) &&
	    !is_t5(dev->rdev.lldi.adapter_type))
		return 0;

	return length >= 8*1024*1024*1024ULL;
}
static int _c4iw_write_mem_dma_aligned(struct c4iw_rdev *rdev, u32 addr,
u32 len, dma_addr_t data, int wait)
{
@@ -369,9 +376,11 @@ static int register_mem(struct c4iw_dev *rhp, struct c4iw_pd *php,
int ret;
ret = write_tpt_entry(&rhp->rdev, 0, &stag, 1, mhp->attr.pdid,
FW_RI_STAG_NSMR, mhp->attr.perms,
FW_RI_STAG_NSMR, mhp->attr.len ?
mhp->attr.perms : 0,
mhp->attr.mw_bind_enable, mhp->attr.zbva,
mhp->attr.va_fbo, mhp->attr.len, shift - 12,
mhp->attr.va_fbo, mhp->attr.len ?
mhp->attr.len : -1, shift - 12,
mhp->attr.pbl_size, mhp->attr.pbl_addr);
if (ret)
return ret;
@@ -536,6 +545,11 @@ int c4iw_reregister_phys_mem(struct ib_mr *mr, int mr_rereg_mask,
return ret;
}
if (mr_exceeds_hw_limits(rhp, total_size)) {
kfree(page_list);
return -EINVAL;
}
ret = reregister_mem(rhp, php, &mh, shift, npages);
kfree(page_list);
if (ret)
@@ -596,6 +610,12 @@ struct ib_mr *c4iw_register_phys_mem(struct ib_pd *pd,
if (ret)
goto err;
if (mr_exceeds_hw_limits(rhp, total_size)) {
kfree(page_list);
ret = -EINVAL;
goto err;
}
ret = alloc_pbl(mhp, npages);
if (ret) {
kfree(page_list);
@@ -699,6 +719,10 @@ struct ib_mr *c4iw_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
php = to_c4iw_pd(pd);
rhp = php->rhp;
if (mr_exceeds_hw_limits(rhp, length))
return ERR_PTR(-EINVAL);
mhp = kzalloc(sizeof(*mhp), GFP_KERNEL);
if (!mhp)
return ERR_PTR(-ENOMEM);
+1 -1
View File
@@ -1538,9 +1538,9 @@ err:
set_state(qhp, C4IW_QP_STATE_ERROR);
free = 1;
abort = 1;
wake_up(&qhp->wait);
BUG_ON(!ep);
flush_qp(qhp);
wake_up(&qhp->wait);
out:
mutex_unlock(&qhp->mutex);
+1 -1
View File
@@ -399,7 +399,7 @@ reg_user_mr_fallback:
pginfo.num_kpages = num_kpages;
pginfo.num_hwpages = num_hwpages;
pginfo.u.usr.region = e_mr->umem;
pginfo.next_hwpage = e_mr->umem->offset / hwpage_size;
pginfo.next_hwpage = ib_umem_offset(e_mr->umem) / hwpage_size;
pginfo.u.usr.next_sg = pginfo.u.usr.region->sg_head.sgl;
ret = ehca_reg_mr(shca, e_mr, (u64 *)virt, length, mr_access_flags,
e_pd, &pginfo, &e_mr->ib.ib_mr.lkey,
+1 -1
View File
@@ -214,7 +214,7 @@ struct ib_mr *ipath_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
mr->mr.user_base = start;
mr->mr.iova = virt_addr;
mr->mr.length = length;
mr->mr.offset = umem->offset;
mr->mr.offset = ib_umem_offset(umem);
mr->mr.access_flags = mr_access_flags;
mr->mr.max_segs = n;
mr->umem = umem;
-1
View File
@@ -223,7 +223,6 @@ int mlx4_ib_rereg_user_mr(struct ib_mr *mr, int flags,
if (flags & IB_MR_REREG_TRANS) {
int shift;
int err;
int n;
mlx4_mr_rereg_mem_cleanup(dev->dev, &mmr->mmr);
+1
View File
@@ -1,3 +1,4 @@
obj-$(CONFIG_MLX5_INFINIBAND) += mlx5_ib.o
mlx5_ib-y := main.o cq.o doorbell.o qp.o mem.o srq.o mr.o ah.o mad.o
mlx5_ib-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += odp.o
+41 -4
View File
@@ -244,6 +244,12 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
props->max_mcast_grp;
props->max_map_per_fmr = INT_MAX; /* no limit in ConnectIB */
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
if (dev->mdev->caps.gen.flags & MLX5_DEV_CAP_FLAG_ON_DMND_PG)
props->device_cap_flags |= IB_DEVICE_ON_DEMAND_PAGING;
props->odp_caps = dev->odp_caps;
#endif
out:
kfree(in_mad);
kfree(out_mad);
@@ -568,6 +574,10 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
goto out_count;
}
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
context->ibucontext.invalidate_range = &mlx5_ib_invalidate_range;
#endif
INIT_LIST_HEAD(&context->db_page_list);
mutex_init(&context->db_page_mutex);
@@ -858,7 +868,7 @@ static ssize_t show_reg_pages(struct device *device,
struct mlx5_ib_dev *dev =
container_of(device, struct mlx5_ib_dev, ib_dev.dev);
return sprintf(buf, "%d\n", dev->mdev->priv.reg_pages);
return sprintf(buf, "%d\n", atomic_read(&dev->mdev->priv.reg_pages));
}
static ssize_t show_hca(struct device *device, struct device_attribute *attr,
@@ -1321,6 +1331,8 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
(1ull << IB_USER_VERBS_CMD_DESTROY_SRQ) |
(1ull << IB_USER_VERBS_CMD_CREATE_XSRQ) |
(1ull << IB_USER_VERBS_CMD_OPEN_QP);
dev->ib_dev.uverbs_ex_cmd_mask =
(1ull << IB_USER_VERBS_EX_CMD_QUERY_DEVICE);
dev->ib_dev.query_device = mlx5_ib_query_device;
dev->ib_dev.query_port = mlx5_ib_query_port;
@@ -1366,6 +1378,8 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
dev->ib_dev.free_fast_reg_page_list = mlx5_ib_free_fast_reg_page_list;
dev->ib_dev.check_mr_status = mlx5_ib_check_mr_status;
mlx5_ib_internal_query_odp_caps(dev);
if (mdev->caps.gen.flags & MLX5_DEV_CAP_FLAG_XRC) {
dev->ib_dev.alloc_xrcd = mlx5_ib_alloc_xrcd;
dev->ib_dev.dealloc_xrcd = mlx5_ib_dealloc_xrcd;
@@ -1379,16 +1393,19 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
goto err_eqs;
mutex_init(&dev->cap_mask_mutex);
spin_lock_init(&dev->mr_lock);
err = create_dev_resources(&dev->devr);
if (err)
goto err_eqs;
err = ib_register_device(&dev->ib_dev, NULL);
err = mlx5_ib_odp_init_one(dev);
if (err)
goto err_rsrc;
err = ib_register_device(&dev->ib_dev, NULL);
if (err)
goto err_odp;
err = create_umr_res(dev);
if (err)
goto err_dev;
@@ -1410,6 +1427,9 @@ err_umrc:
err_dev:
ib_unregister_device(&dev->ib_dev);
err_odp:
mlx5_ib_odp_remove_one(dev);
err_rsrc:
destroy_dev_resources(&dev->devr);
@@ -1425,8 +1445,10 @@ err_dealloc:
static void mlx5_ib_remove(struct mlx5_core_dev *mdev, void *context)
{
struct mlx5_ib_dev *dev = context;
ib_unregister_device(&dev->ib_dev);
destroy_umrc_res(dev);
mlx5_ib_odp_remove_one(dev);
destroy_dev_resources(&dev->devr);
free_comp_eqs(dev);
ib_dealloc_device(&dev->ib_dev);
@@ -1440,15 +1462,30 @@ static struct mlx5_interface mlx5_ib_interface = {
/*
 * Module init: set up the ODP support first, then register the mlx5_ib
 * interface. If registration fails the ODP setup is undone again.
 *
 * Returns 0 on success or the error from mlx5_ib_odp_init() /
 * mlx5_register_interface().
 */
static int __init mlx5_ib_init(void)
{
	int err;

	if (deprecated_prof_sel != 2)
		pr_warn("prof_sel is deprecated for mlx5_ib, set it for mlx5_core\n");

	err = mlx5_ib_odp_init();
	if (err)
		return err;

	err = mlx5_register_interface(&mlx5_ib_interface);
	if (err)
		mlx5_ib_odp_cleanup();	/* unwind the ODP setup */

	return err;
}
/*
 * Module exit: unregister the mlx5_ib interface, then release the ODP
 * support via mlx5_ib_odp_cleanup().
 */
static void __exit mlx5_ib_cleanup(void)
{
mlx5_unregister_interface(&mlx5_ib_interface);
mlx5_ib_odp_cleanup();
}
module_init(mlx5_ib_init);
+65 -4
View File
@@ -32,6 +32,7 @@
#include <linux/module.h>
#include <rdma/ib_umem.h>
#include <rdma/ib_umem_odp.h>
#include "mlx5_ib.h"
/* @umem: umem object to scan
@@ -57,6 +58,17 @@ void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr, int *count, int *shift,
int entry;
unsigned long page_shift = ilog2(umem->page_size);
/* With ODP we must always match OS page size. */
if (umem->odp_data) {
*count = ib_umem_page_count(umem);
*shift = PAGE_SHIFT;
*ncont = *count;
if (order)
*order = ilog2(roundup_pow_of_two(*count));
return;
}
addr = addr >> page_shift;
tmp = (unsigned long)addr;
m = find_first_bit(&tmp, sizeof(tmp));
@@ -108,8 +120,36 @@ void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr, int *count, int *shift,
*count = i;
}
void mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem,
int page_shift, __be64 *pas, int umr)
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
/*
 * Convert an ODP dma_list entry into an MTT entry: keep the address bits
 * selected by ODP_DMA_ADDR_MASK and translate the ODP read/write
 * permission bits into the matching MLX5_IB_MTT_* bits.
 */
static u64 umem_dma_to_mtt(dma_addr_t umem_dma)
{
	u64 access = 0;

	if (umem_dma & ODP_READ_ALLOWED_BIT)
		access |= MLX5_IB_MTT_READ;
	if (umem_dma & ODP_WRITE_ALLOWED_BIT)
		access |= MLX5_IB_MTT_WRITE;

	return (umem_dma & ODP_DMA_ADDR_MASK) | access;
}
#endif
/*
* Populate the given array with bus addresses from the umem.
*
* dev - mlx5_ib device
* umem - umem to use to fill the pages
* page_shift - determines the page size used in the resulting array
* offset - offset into the umem to start from,
* only implemented for ODP umems
* num_pages - total number of pages to fill
* pas - bus addresses array to fill
 * access_flags - access flags to set on all present pages.
 *                Use enum mlx5_ib_mtt_access_flags for this.
*/
void __mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem,
int page_shift, size_t offset, size_t num_pages,
__be64 *pas, int access_flags)
{
unsigned long umem_page_shift = ilog2(umem->page_size);
int shift = page_shift - umem_page_shift;
@@ -120,6 +160,21 @@ void mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem,
int len;
struct scatterlist *sg;
int entry;
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
const bool odp = umem->odp_data != NULL;
if (odp) {
WARN_ON(shift != 0);
WARN_ON(access_flags != (MLX5_IB_MTT_READ | MLX5_IB_MTT_WRITE));
for (i = 0; i < num_pages; ++i) {
dma_addr_t pa = umem->odp_data->dma_list[offset + i];
pas[i] = cpu_to_be64(umem_dma_to_mtt(pa));
}
return;
}
#endif
i = 0;
for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
@@ -128,8 +183,7 @@ void mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem,
for (k = 0; k < len; k++) {
if (!(i & mask)) {
cur = base + (k << umem_page_shift);
if (umr)
cur |= 3;
cur |= access_flags;
pas[i >> shift] = cpu_to_be64(cur);
mlx5_ib_dbg(dev, "pas[%d] 0x%llx\n",
@@ -142,6 +196,13 @@ void mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem,
}
}
/*
 * Populate @pas with the bus addresses of the whole umem.
 *
 * Convenience wrapper around __mlx5_ib_populate_pas() that starts at
 * offset 0 and covers ib_umem_num_pages(umem) pages.
 */
void mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem,
			  int page_shift, __be64 *pas, int access_flags)
{
	/* No "return <expr>" here: both functions return void. */
	__mlx5_ib_populate_pas(dev, umem, page_shift, 0,
			       ib_umem_num_pages(umem), pas,
			       access_flags);
}
int mlx5_ib_get_buf_offset(u64 addr, int page_shift, u32 *offset)
{
u64 page_size;
+112 -4
View File
@@ -111,6 +111,8 @@ struct mlx5_ib_pd {
*/
#define MLX5_IB_SEND_UMR_UNREG IB_SEND_RESERVED_START
#define MLX5_IB_SEND_UMR_FAIL_IF_FREE (IB_SEND_RESERVED_START << 1)
#define MLX5_IB_SEND_UMR_UPDATE_MTT (IB_SEND_RESERVED_START << 2)
#define MLX5_IB_QPT_REG_UMR IB_QPT_RESERVED1
#define MLX5_IB_WR_UMR IB_WR_RESERVED1
@@ -147,6 +149,29 @@ enum {
MLX5_QP_EMPTY
};
/*
* Connect-IB can trigger up to four concurrent pagefaults
* per-QP.
*/
enum mlx5_ib_pagefault_context {
MLX5_IB_PAGEFAULT_RESPONDER_READ,
MLX5_IB_PAGEFAULT_REQUESTOR_READ,
MLX5_IB_PAGEFAULT_RESPONDER_WRITE,
MLX5_IB_PAGEFAULT_REQUESTOR_WRITE,
MLX5_IB_PAGEFAULT_CONTEXTS
};
/*
 * Derive the page-fault context of @pagefault from the REQUESTOR and
 * WRITE bits of its flags.
 * NOTE(review): presumably those two flag bits line up with the values of
 * enum mlx5_ib_pagefault_context - confirm against the flag definitions.
 */
static inline enum mlx5_ib_pagefault_context
	mlx5_ib_get_pagefault_context(struct mlx5_pagefault *pagefault)
{
	return pagefault->flags &
	       (MLX5_PFAULT_WRITE | MLX5_PFAULT_REQUESTOR);
}
struct mlx5_ib_pfault {
struct work_struct work;
struct mlx5_pagefault mpfault;
};
struct mlx5_ib_qp {
struct ib_qp ibqp;
struct mlx5_core_qp mqp;
@@ -192,6 +217,21 @@ struct mlx5_ib_qp {
/* Store signature errors */
bool signature_en;
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
/*
* A flag that is true for QP's that are in a state that doesn't
* allow page faults, and shouldn't schedule any more faults.
*/
int disable_page_faults;
/*
* The disable_page_faults_lock protects a QP's disable_page_faults
* field, allowing for a thread to atomically check whether the QP
* allows page faults, and if so schedule a page fault.
*/
spinlock_t disable_page_faults_lock;
struct mlx5_ib_pfault pagefaults[MLX5_IB_PAGEFAULT_CONTEXTS];
#endif
};
struct mlx5_ib_cq_buf {
@@ -206,6 +246,19 @@ enum mlx5_ib_qp_flags {
MLX5_IB_QP_SIGNATURE_HANDLING = 1 << 1,
};
struct mlx5_umr_wr {
union {
u64 virt_addr;
u64 offset;
} target;
struct ib_pd *pd;
unsigned int page_shift;
unsigned int npages;
u32 length;
int access_flags;
u32 mkey;
};
struct mlx5_shared_mr_info {
int mr_id;
struct ib_umem *umem;
@@ -253,6 +306,13 @@ struct mlx5_ib_xrcd {
u32 xrcdn;
};
/*
 * Access permission bits that are OR-ed into an MTT entry together with
 * the page's bus address.
 */
enum mlx5_ib_mtt_access_flags {
MLX5_IB_MTT_READ = (1 << 0),
MLX5_IB_MTT_WRITE = (1 << 1),
};
#define MLX5_IB_MTT_PRESENT (MLX5_IB_MTT_READ | MLX5_IB_MTT_WRITE)
struct mlx5_ib_mr {
struct ib_mr ibmr;
struct mlx5_core_mr mmr;
@@ -261,12 +321,11 @@ struct mlx5_ib_mr {
struct list_head list;
int order;
int umred;
__be64 *pas;
dma_addr_t dma;
int npages;
struct mlx5_ib_dev *dev;
struct mlx5_create_mkey_mbox_out out;
struct mlx5_core_sig_ctx *sig;
int live;
};
struct mlx5_ib_fast_reg_page_list {
@@ -372,11 +431,18 @@ struct mlx5_ib_dev {
struct umr_common umrc;
/* sync used page count stats
*/
spinlock_t mr_lock;
struct mlx5_ib_resources devr;
struct mlx5_mr_cache cache;
struct timer_list delay_timer;
int fill_delay;
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
struct ib_odp_caps odp_caps;
/*
* Sleepable RCU that prevents destruction of MRs while they are still
* being used by a page fault handler.
*/
struct srcu_struct mr_srcu;
#endif
};
static inline struct mlx5_ib_cq *to_mibcq(struct mlx5_core_cq *mcq)
@@ -490,6 +556,8 @@ int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
int mlx5_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr,
struct ib_recv_wr **bad_wr);
void *mlx5_get_send_wqe(struct mlx5_ib_qp *qp, int n);
int mlx5_ib_read_user_wqe(struct mlx5_ib_qp *qp, int send, int wqe_index,
void *buffer, u32 length);
struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev, int entries,
int vector, struct ib_ucontext *context,
struct ib_udata *udata);
@@ -502,6 +570,8 @@ struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc);
struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
u64 virt_addr, int access_flags,
struct ib_udata *udata);
int mlx5_ib_update_mtt(struct mlx5_ib_mr *mr, u64 start_page_index,
int npages, int zap);
int mlx5_ib_dereg_mr(struct ib_mr *ibmr);
int mlx5_ib_destroy_mr(struct ib_mr *ibmr);
struct ib_mr *mlx5_ib_create_mr(struct ib_pd *pd,
@@ -533,8 +603,11 @@ int mlx5_ib_init_fmr(struct mlx5_ib_dev *dev);
void mlx5_ib_cleanup_fmr(struct mlx5_ib_dev *dev);
void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr, int *count, int *shift,
int *ncont, int *order);
void __mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem,
int page_shift, size_t offset, size_t num_pages,
__be64 *pas, int access_flags);
void mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem,
int page_shift, __be64 *pas, int umr);
int page_shift, __be64 *pas, int access_flags);
void mlx5_ib_copy_pas(u64 *old, u64 *new, int step, int num);
int mlx5_ib_get_cqe_size(struct mlx5_ib_dev *dev, struct ib_cq *ibcq);
int mlx5_mr_cache_init(struct mlx5_ib_dev *dev);
@@ -544,6 +617,38 @@ void mlx5_umr_cq_handler(struct ib_cq *cq, void *cq_context);
int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask,
struct ib_mr_status *mr_status);
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
extern struct workqueue_struct *mlx5_ib_page_fault_wq;
int mlx5_ib_internal_query_odp_caps(struct mlx5_ib_dev *dev);
void mlx5_ib_mr_pfault_handler(struct mlx5_ib_qp *qp,
struct mlx5_ib_pfault *pfault);
void mlx5_ib_odp_create_qp(struct mlx5_ib_qp *qp);
int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev);
void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *ibdev);
int __init mlx5_ib_odp_init(void);
void mlx5_ib_odp_cleanup(void);
void mlx5_ib_qp_disable_pagefaults(struct mlx5_ib_qp *qp);
void mlx5_ib_qp_enable_pagefaults(struct mlx5_ib_qp *qp);
void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start,
unsigned long end);
#else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
/*
 * Stubs used when CONFIG_INFINIBAND_ON_DEMAND_PAGING is not set: the
 * int-returning ones report success (0) and the void ones do nothing.
 */
static inline int mlx5_ib_internal_query_odp_caps(struct mlx5_ib_dev *dev)
{
return 0;
}
static inline void mlx5_ib_odp_create_qp(struct mlx5_ib_qp *qp) {}
static inline int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev) { return 0; }
static inline void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *ibdev) {}
static inline int mlx5_ib_odp_init(void) { return 0; }
static inline void mlx5_ib_odp_cleanup(void) {}
static inline void mlx5_ib_qp_disable_pagefaults(struct mlx5_ib_qp *qp) {}
static inline void mlx5_ib_qp_enable_pagefaults(struct mlx5_ib_qp *qp) {}
#endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
static inline void init_query_mad(struct ib_smp *mad)
{
mad->base_version = 1;
@@ -561,4 +666,7 @@ static inline u8 convert_access(int acc)
MLX5_PERM_LOCAL_READ;
}
#define MLX5_MAX_UMR_SHIFT 16
#define MLX5_MAX_UMR_PAGES (1 << MLX5_MAX_UMR_SHIFT)
#endif /* MLX5_IB_H */

Some files were not shown because too many files have changed in this diff Show More