IB/mlx4: Add support for IBoE

Add support for IBoE to mlx4_ib.  The bulk of the code is handling the
new address vector fields; mlx4 needs the MAC address of a remote node
to include it in a WQE (for datagrams) or in the QP context (for
connected QPs).  Address resolution is done by assuming all unicast
GIDs are either link-local IPv6 addresses.

Multicast group attach/detach needs to update the NIC's multicast
filters; but since attaching a QP to a multicast group can be done
before the QP is bound to a port, for IBoE we need to keep track of
all multicast groups that a QP is attached too before it transitions
from INIT to RTR (since it does not have a port in the INIT state).

Signed-off-by: Eli Cohen <eli@mellanox.co.il>

[ Many things cleaned up and otherwise monkeyed with; hope I didn't
  introduce too many bugs.  - Roland ]

Signed-off-by: Roland Dreier <rolandd@cisco.com>
This commit is contained in:
Eli Cohen
2010-10-24 21:08:52 -07:00
committed by Roland Dreier
parent 7ac870ed7d
commit fa417f7b52
6 changed files with 716 additions and 110 deletions
+105 -25
View File
@@ -33,6 +33,7 @@
#include <linux/log2.h>
#include <linux/slab.h>
#include <linux/netdevice.h>
#include <rdma/ib_cache.h>
#include <rdma/ib_pack.h>
@@ -48,17 +49,25 @@ enum {
enum {
MLX4_IB_DEFAULT_SCHED_QUEUE = 0x83,
MLX4_IB_DEFAULT_QP0_SCHED_QUEUE = 0x3f
MLX4_IB_DEFAULT_QP0_SCHED_QUEUE = 0x3f,
MLX4_IB_LINK_TYPE_IB = 0,
MLX4_IB_LINK_TYPE_ETH = 1
};
enum {
/*
* Largest possible UD header: send with GRH and immediate data.
* Largest possible UD header: send with GRH and immediate
* data plus 14 bytes for an Ethernet header. (LRH would only
* use 8 bytes, so Ethernet is the biggest case)
*/
MLX4_IB_UD_HEADER_SIZE = 72,
MLX4_IB_UD_HEADER_SIZE = 78,
MLX4_IB_LSO_HEADER_SPARE = 128,
};
enum {
MLX4_IB_IBOE_ETHERTYPE = 0x8915
};
struct mlx4_ib_sqp {
struct mlx4_ib_qp qp;
int pkey_index;
@@ -462,6 +471,7 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
mutex_init(&qp->mutex);
spin_lock_init(&qp->sq.lock);
spin_lock_init(&qp->rq.lock);
INIT_LIST_HEAD(&qp->gid_list);
qp->state = IB_QPS_RESET;
if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR)
@@ -649,6 +659,16 @@ static void mlx4_ib_unlock_cqs(struct mlx4_ib_cq *send_cq, struct mlx4_ib_cq *re
}
}
static void del_gid_entries(struct mlx4_ib_qp *qp)
{
struct mlx4_ib_gid_entry *ge, *tmp;
list_for_each_entry_safe(ge, tmp, &qp->gid_list, list) {
list_del(&ge->list);
kfree(ge);
}
}
static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp,
int is_user)
{
@@ -695,6 +715,8 @@ static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp,
if (!qp->ibqp.srq)
mlx4_db_free(dev->dev, &qp->db);
}
del_gid_entries(qp);
}
struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd,
@@ -852,6 +874,12 @@ static void mlx4_set_sched(struct mlx4_qp_path *path, u8 port)
static int mlx4_set_path(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah,
struct mlx4_qp_path *path, u8 port)
{
int err;
int is_eth = rdma_port_get_link_layer(&dev->ib_dev, port) ==
IB_LINK_LAYER_ETHERNET;
u8 mac[6];
int is_mcast;
path->grh_mylmc = ah->src_path_bits & 0x7f;
path->rlid = cpu_to_be16(ah->dlid);
if (ah->static_rate) {
@@ -882,9 +910,35 @@ static int mlx4_set_path(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah,
path->sched_queue = MLX4_IB_DEFAULT_SCHED_QUEUE |
((port - 1) << 6) | ((ah->sl & 0xf) << 2);
if (is_eth) {
if (!(ah->ah_flags & IB_AH_GRH))
return -1;
err = mlx4_ib_resolve_grh(dev, ah, mac, &is_mcast, port);
if (err)
return err;
memcpy(path->dmac, mac, 6);
path->ackto = MLX4_IB_LINK_TYPE_ETH;
/* use index 0 into MAC table for IBoE */
path->grh_mylmc &= 0x80;
}
return 0;
}
static void update_mcg_macs(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp)
{
struct mlx4_ib_gid_entry *ge, *tmp;
list_for_each_entry_safe(ge, tmp, &qp->gid_list, list) {
if (!ge->added && mlx4_ib_add_mc(dev, qp, &ge->gid)) {
ge->added = 1;
ge->port = qp->port;
}
}
}
static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
const struct ib_qp_attr *attr, int attr_mask,
enum ib_qp_state cur_state, enum ib_qp_state new_state)
@@ -980,7 +1034,7 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
}
if (attr_mask & IB_QP_TIMEOUT) {
context->pri_path.ackto = attr->timeout << 3;
context->pri_path.ackto |= attr->timeout << 3;
optpar |= MLX4_QP_OPTPAR_ACK_TIMEOUT;
}
@@ -1118,8 +1172,10 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
qp->atomic_rd_en = attr->qp_access_flags;
if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
qp->resp_depth = attr->max_dest_rd_atomic;
if (attr_mask & IB_QP_PORT)
if (attr_mask & IB_QP_PORT) {
qp->port = attr->port_num;
update_mcg_macs(dev, qp);
}
if (attr_mask & IB_QP_ALT_PATH)
qp->alt_port = attr->alt_port_num;
@@ -1226,35 +1282,45 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr,
int header_size;
int spc;
int i;
int is_eth;
int is_grh;
send_size = 0;
for (i = 0; i < wr->num_sge; ++i)
send_size += wr->sg_list[i].length;
ib_ud_header_init(send_size, 1, 0, mlx4_ib_ah_grh_present(ah), 0, &sqp->ud_header);
is_eth = rdma_port_get_link_layer(sqp->qp.ibqp.device, sqp->qp.port) == IB_LINK_LAYER_ETHERNET;
is_grh = mlx4_ib_ah_grh_present(ah);
ib_ud_header_init(send_size, !is_eth, is_eth, is_grh, 0, &sqp->ud_header);
sqp->ud_header.lrh.service_level =
be32_to_cpu(ah->av.sl_tclass_flowlabel) >> 28;
sqp->ud_header.lrh.destination_lid = ah->av.dlid;
sqp->ud_header.lrh.source_lid = cpu_to_be16(ah->av.g_slid & 0x7f);
if (mlx4_ib_ah_grh_present(ah)) {
if (!is_eth) {
sqp->ud_header.lrh.service_level =
be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 28;
sqp->ud_header.lrh.destination_lid = ah->av.ib.dlid;
sqp->ud_header.lrh.source_lid = cpu_to_be16(ah->av.ib.g_slid & 0x7f);
}
if (is_grh) {
sqp->ud_header.grh.traffic_class =
(be32_to_cpu(ah->av.sl_tclass_flowlabel) >> 20) & 0xff;
(be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 20) & 0xff;
sqp->ud_header.grh.flow_label =
ah->av.sl_tclass_flowlabel & cpu_to_be32(0xfffff);
sqp->ud_header.grh.hop_limit = ah->av.hop_limit;
ib_get_cached_gid(ib_dev, be32_to_cpu(ah->av.port_pd) >> 24,
ah->av.gid_index, &sqp->ud_header.grh.source_gid);
ah->av.ib.sl_tclass_flowlabel & cpu_to_be32(0xfffff);
sqp->ud_header.grh.hop_limit = ah->av.ib.hop_limit;
ib_get_cached_gid(ib_dev, be32_to_cpu(ah->av.ib.port_pd) >> 24,
ah->av.ib.gid_index, &sqp->ud_header.grh.source_gid);
memcpy(sqp->ud_header.grh.destination_gid.raw,
ah->av.dgid, 16);
ah->av.ib.dgid, 16);
}
mlx->flags &= cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE);
mlx->flags |= cpu_to_be32((!sqp->qp.ibqp.qp_num ? MLX4_WQE_MLX_VL15 : 0) |
(sqp->ud_header.lrh.destination_lid ==
IB_LID_PERMISSIVE ? MLX4_WQE_MLX_SLR : 0) |
(sqp->ud_header.lrh.service_level << 8));
mlx->rlid = sqp->ud_header.lrh.destination_lid;
if (!is_eth) {
mlx->flags |= cpu_to_be32((!sqp->qp.ibqp.qp_num ? MLX4_WQE_MLX_VL15 : 0) |
(sqp->ud_header.lrh.destination_lid ==
IB_LID_PERMISSIVE ? MLX4_WQE_MLX_SLR : 0) |
(sqp->ud_header.lrh.service_level << 8));
mlx->rlid = sqp->ud_header.lrh.destination_lid;
}
switch (wr->opcode) {
case IB_WR_SEND:
@@ -1270,9 +1336,21 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr,
return -EINVAL;
}
sqp->ud_header.lrh.virtual_lane = !sqp->qp.ibqp.qp_num ? 15 : 0;
if (sqp->ud_header.lrh.destination_lid == IB_LID_PERMISSIVE)
sqp->ud_header.lrh.source_lid = IB_LID_PERMISSIVE;
if (is_eth) {
u8 *smac;
memcpy(sqp->ud_header.eth.dmac_h, ah->av.eth.mac, 6);
/* FIXME: cache smac value? */
smac = to_mdev(sqp->qp.ibqp.device)->iboe.netdevs[sqp->qp.port - 1]->dev_addr;
memcpy(sqp->ud_header.eth.smac_h, smac, 6);
if (!memcmp(sqp->ud_header.eth.smac_h, sqp->ud_header.eth.dmac_h, 6))
mlx->flags |= cpu_to_be32(MLX4_WQE_CTRL_FORCE_LOOPBACK);
sqp->ud_header.eth.type = cpu_to_be16(MLX4_IB_IBOE_ETHERTYPE);
} else {
sqp->ud_header.lrh.virtual_lane = !sqp->qp.ibqp.qp_num ? 15 : 0;
if (sqp->ud_header.lrh.destination_lid == IB_LID_PERMISSIVE)
sqp->ud_header.lrh.source_lid = IB_LID_PERMISSIVE;
}
sqp->ud_header.bth.solicited_event = !!(wr->send_flags & IB_SEND_SOLICITED);
if (!sqp->qp.ibqp.qp_num)
ib_get_cached_pkey(ib_dev, sqp->qp.port, sqp->pkey_index, &pkey);
@@ -1434,6 +1512,8 @@ static void set_datagram_seg(struct mlx4_wqe_datagram_seg *dseg,
memcpy(dseg->av, &to_mah(wr->wr.ud.ah)->av, sizeof (struct mlx4_av));
dseg->dqpn = cpu_to_be32(wr->wr.ud.remote_qpn);
dseg->qkey = cpu_to_be32(wr->wr.ud.remote_qkey);
dseg->vlan = to_mah(wr->wr.ud.ah)->av.eth.vlan;
memcpy(dseg->mac, to_mah(wr->wr.ud.ah)->av.eth.mac, 6);
}
static void set_mlx_icrc_seg(void *dseg)