Commit c5cfcfcb authored by Jason Gunthorpe

Merge branch 'siw' into rdma.git for-next

Bernard Metzler says:

====================
This patch set contributes the SoftiWarp driver, rebased to the latest
rdma-next. SoftiWarp (siw) implements the iWarp RDMA protocol over kernel
TCP sockets. The driver integrates with the linux-rdma framework.

A matching user-level driver is available as a PR at
https://github.com/linux-rdma/rdma-core/pull/536

Many thanks for reviewing and testing the driver, especially to Leon,
Jason, Steve, Doug, Olga, Dennis, Gal. You all helped to significantly
improve the driver over the last year.

Please find below a list of changes and comments, compared to older
versions of the siw driver.

Many thanks!
Bernard.

CHANGES:
========

v3 (this version)
-----------------

- Rebased to rdma-next

- Removed unnecessary initialization of enums in siw-abi.h

- Added comment on sizing of all work queues to a power of two.

v2
-----------------

- Changed receive path CRC calculation to compute CRC32c not
  on the target buffer after placement, but on the original skb.
  This change severely hurts performance if CRC is switched on,
  since the skb must now be walked twice. It is planned to extend
  skb_copy_bits() to fold in the CRC computation (a sketch of the
  idea follows this change list).

- Moved debugging to using ibdev_dbg().

- Dropped detailed packet debug printing.

- Removed siw_debug.[ch] files.

- Removed resource tracking, code now relies on restrack of
  RDMA midlayer. Only object counting to enforce reported
  device limits is left in place.

- Removed all nested switch-case statements.

- Cleaned up header file #include's

- Moved CQ create/destroy to the new semantics, where the
  midlayer creates/destroys the containing object.

- Set siw's ABI version to 1 (was 0 before)

- Removed all enum initialization where not needed.

- Fixed MAINTAINERS entry for the siw driver

- This version stays with the current siw-specific management
  of user memory (siw_umem_get() vs. ib_umem_get(), etc.), since
  the current ib_umem implementation is less efficient for user
  page lookup on the fast path, where efficiency is important
  for a software RDMA driver (a second sketch follows this
  change list). It is planned to contribute enhancements to the
  ib_umem framework which make it suitable for software drivers
  as well.
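
For illustration, a minimal sketch of the receive-path CRC situation
described above. It is not the driver's code; siw_crc_over_skb() is a
hypothetical stand-in for whatever routine walks the skb fragments and
feeds them to crypto_shash_update():

#include <linux/skbuff.h>
#include <crypto/hash.h>

/* hypothetical helper: CRC32c over 'len' bytes of the skb starting at 'off' */
int siw_crc_over_skb(struct sk_buff *skb, int off, int len,
                     struct shash_desc *crc);

static int rx_crc_then_place(struct sk_buff *skb, int off, void *dst,
                             int len, struct shash_desc *crc)
{
        int rv;

        /* pass 1: CRC32c over the payload while it still sits in the skb */
        rv = siw_crc_over_skb(skb, off, len, crc);
        if (rv)
                return rv;

        /* pass 2: place the payload into the target buffer */
        return skb_copy_bits(skb, off, dst, len);
}

/*
 * An skb_copy_bits() variant that folds in the CRC update would collapse
 * both passes into a single walk of the skb.
 */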
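
Similarly, a short, hedged sketch of the fast-path page lookup concern
behind keeping siw_umem_get(). It leans on siw_get_upage() (defined in
siw_mem.h below), which resolves a user virtual address to a page by plain
chunk/index arithmetic; payload_kmap() itself is illustrative only:

#include <linux/highmem.h>
#include "siw.h"

/* map the page backing a user VA for a payload copy; caller kunmap()s */
static void *payload_kmap(struct siw_umem *umem, u64 uva, unsigned int *off)
{
        struct page *p = siw_get_upage(umem, uva);      /* O(1) lookup */

        if (!p)
                return NULL;
        *off = uva & ~PAGE_MASK;        /* offset within the page */
        return kmap(p);
}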

v1 (first version after v9 of siw RFC)
--------------------------------------

- Rebased to 5.2-rc1

- All IDR code got removed.

- Both MR and QP deallocation verbs now synchronously
  free the resources referenced by the RDMA mid-layer.

- IPv6 support was added.

- For compatibility with Chelsio iWarp hardware, the RX
  path was slightly reworked. It now allows packets of tagged
  and untagged RDMAP operations to be interleaved. While not
  a behavior defined by IETF RFC 5040/5041, some RDMA hardware
  may interleave an ongoing outbound (large) tagged message,
  such as a multi-segment RDMA Read Response, with an untagged
  message, such as an RDMA Send frame. This behavior was only
  observed in an NVMe-oF setup, with siw used at the target side
  and RDMA hardware at the client side (during file write). siw
  now implements separate input paths for tagged and untagged
  messages and allows interleaved placement of both (see the
  sketch after this list).

- The siw kernel abi file got renamed from siw_user.h to siw-abi.h.
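
A minimal sketch of the separate tagged/untagged input paths mentioned
above. It is not the driver's receive code; rx_tagged_frame() and
rx_untagged_frame() are hypothetical per-path handlers, and only the
DDP_FLAG_TAGGED bit from iwarp.h (below) is used for dispatch:

#include "iwarp.h"

struct siw_qp;

int rx_tagged_frame(struct siw_qp *qp, union iwarp_hdr *hdr);   /* hypothetical */
int rx_untagged_frame(struct siw_qp *qp, union iwarp_hdr *hdr); /* hypothetical */

static int rx_dispatch(struct siw_qp *qp, union iwarp_hdr *hdr)
{
        /*
         * Tagged frames (RDMA Write, Read Response) and untagged frames
         * (Send, Read Request, Terminate) keep independent receive state,
         * so both may arrive interleaved on the same TCP stream.
         */
        if (hdr->ctrl.ddp_rdmap_ctrl & DDP_FLAG_TAGGED)
                return rx_tagged_frame(qp, hdr);
        return rx_untagged_frame(qp, hdr);
}
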
====================

* branch 'siw':
  SIW addition to kernel build environment
  SIW completion queue methods
  SIW receive path
  SIW transmit path
  SIW queue pair methods
  SIW application buffer management
  SIW application interface
  SIW connection management
  SIW network and RDMA core interface
  SIW main include file
  iWarp wire packet format
parents 09fbca8e c0cf5bdd
@@ -14558,6 +14558,13 @@ M: Chris Boot <bootc@bootc.net>
S: Maintained
F: drivers/leds/leds-net48xx.c
SOFT-IWARP DRIVER (siw)
M: Bernard Metzler <bmt@zurich.ibm.com>
L: linux-rdma@vger.kernel.org
S: Supported
F: drivers/infiniband/sw/siw/
F: include/uapi/rdma/siw-abi.h
SOFT-ROCE DRIVER (rxe)
M: Moni Shoua <monis@mellanox.com>
L: linux-rdma@vger.kernel.org
@@ -96,6 +96,7 @@ source "drivers/infiniband/hw/hfi1/Kconfig"
source "drivers/infiniband/hw/qedr/Kconfig"
source "drivers/infiniband/sw/rdmavt/Kconfig"
source "drivers/infiniband/sw/rxe/Kconfig"
source "drivers/infiniband/sw/siw/Kconfig"
endif
source "drivers/infiniband/ulp/ipoib/Kconfig"
# SPDX-License-Identifier: GPL-2.0-only
obj-$(CONFIG_INFINIBAND_RDMAVT) += rdmavt/
obj-$(CONFIG_RDMA_RXE) += rxe/
obj-$(CONFIG_RDMA_SIW) += siw/
config RDMA_SIW
tristate "Software RDMA over TCP/IP (iWARP) driver"
depends on INET && INFINIBAND && CRYPTO_CRC32
help
This driver implements the iWARP RDMA transport over
the Linux TCP/IP network stack. It enables a system with a
standard Ethernet adapter to interoperate with an iWARP
adapter or with another system running the SIW driver.
(See also RXE which is a similar software driver for RoCE.)
The driver interfaces with the Linux RDMA stack and
implements both a kernel and user space RDMA verbs API.
The user space verbs API requires a support
library named libsiw which is loaded by the generic user
space verbs API, libibverbs. To implement RDMA over
TCP/IP, the driver further interfaces with the Linux
in-kernel TCP socket layer.
obj-$(CONFIG_RDMA_SIW) += siw.o
siw-y := \
siw_cm.o \
siw_cq.o \
siw_main.o \
siw_mem.o \
siw_qp.o \
siw_qp_tx.o \
siw_qp_rx.o \
siw_verbs.o
/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */
/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
/* Copyright (c) 2008-2019, IBM Corporation */
#ifndef _IWARP_H
#define _IWARP_H
#include <rdma/rdma_user_cm.h> /* RDMA_MAX_PRIVATE_DATA */
#include <linux/types.h>
#include <asm/byteorder.h>
#define RDMAP_VERSION 1
#define DDP_VERSION 1
#define MPA_REVISION_1 1
#define MPA_REVISION_2 2
#define MPA_MAX_PRIVDATA RDMA_MAX_PRIVATE_DATA
#define MPA_KEY_REQ "MPA ID Req Frame"
#define MPA_KEY_REP "MPA ID Rep Frame"
#define MPA_IRD_ORD_MASK 0x3fff
struct mpa_rr_params {
__be16 bits;
__be16 pd_len;
};
/*
* MPA request/response header bits & fields
*/
enum {
MPA_RR_FLAG_MARKERS = cpu_to_be16(0x8000),
MPA_RR_FLAG_CRC = cpu_to_be16(0x4000),
MPA_RR_FLAG_REJECT = cpu_to_be16(0x2000),
MPA_RR_FLAG_ENHANCED = cpu_to_be16(0x1000),
MPA_RR_FLAG_GSO_EXP = cpu_to_be16(0x0800),
MPA_RR_MASK_REVISION = cpu_to_be16(0x00ff)
};
/*
* MPA request/reply header
*/
struct mpa_rr {
__u8 key[16];
struct mpa_rr_params params;
};
static inline void __mpa_rr_set_revision(__be16 *bits, u8 rev)
{
*bits = (*bits & ~MPA_RR_MASK_REVISION) |
(cpu_to_be16(rev) & MPA_RR_MASK_REVISION);
}
static inline u8 __mpa_rr_revision(__be16 mpa_rr_bits)
{
__be16 rev = mpa_rr_bits & MPA_RR_MASK_REVISION;
return be16_to_cpu(rev);
}
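/*
 * Illustrative only, not part of the wire-format definitions: composing an
 * outgoing MPA request from the constants and helpers above. Assumes
 * <linux/string.h> for memset()/memcpy(); 'use_crc' and 'pd_len' are
 * caller-chosen MPA options.
 */
static inline void example_mpa_build_req(struct mpa_rr *rr, bool use_crc,
                                         u16 pd_len)
{
        memset(rr, 0, sizeof(*rr));
        memcpy(rr->key, MPA_KEY_REQ, 16);       /* 16-byte key, not NUL terminated */
        __mpa_rr_set_revision(&rr->params.bits, MPA_REVISION_2);
        if (use_crc)
                rr->params.bits |= MPA_RR_FLAG_CRC;     /* request CRC coverage */
        rr->params.pd_len = cpu_to_be16(pd_len);        /* private data length */
}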
enum mpa_v2_ctrl {
MPA_V2_PEER_TO_PEER = cpu_to_be16(0x8000),
MPA_V2_ZERO_LENGTH_RTR = cpu_to_be16(0x4000),
MPA_V2_RDMA_WRITE_RTR = cpu_to_be16(0x8000),
MPA_V2_RDMA_READ_RTR = cpu_to_be16(0x4000),
MPA_V2_RDMA_NO_RTR = cpu_to_be16(0x0000),
MPA_V2_MASK_IRD_ORD = cpu_to_be16(0x3fff)
};
struct mpa_v2_data {
__be16 ird;
__be16 ord;
};
struct mpa_marker {
__be16 rsvd;
__be16 fpdu_hmd; /* FPDU header-marker distance (= MPA's FPDUPTR) */
};
/*
* maximum MPA trailer
*/
struct mpa_trailer {
__u8 pad[4];
__be32 crc;
};
#define MPA_HDR_SIZE 2
#define MPA_CRC_SIZE 4
/*
* Common portion of iWARP headers (MPA, DDP, RDMAP)
* for any FPDU
*/
struct iwarp_ctrl {
__be16 mpa_len;
__be16 ddp_rdmap_ctrl;
};
/*
* DDP/RDMAP Hdr bits & fields
*/
enum {
DDP_FLAG_TAGGED = cpu_to_be16(0x8000),
DDP_FLAG_LAST = cpu_to_be16(0x4000),
DDP_MASK_RESERVED = cpu_to_be16(0x3C00),
DDP_MASK_VERSION = cpu_to_be16(0x0300),
RDMAP_MASK_VERSION = cpu_to_be16(0x00C0),
RDMAP_MASK_RESERVED = cpu_to_be16(0x0030),
RDMAP_MASK_OPCODE = cpu_to_be16(0x000f)
};
static inline u8 __ddp_get_version(struct iwarp_ctrl *ctrl)
{
return be16_to_cpu(ctrl->ddp_rdmap_ctrl & DDP_MASK_VERSION) >> 8;
}
static inline void __ddp_set_version(struct iwarp_ctrl *ctrl, u8 version)
{
ctrl->ddp_rdmap_ctrl =
(ctrl->ddp_rdmap_ctrl & ~DDP_MASK_VERSION) |
(cpu_to_be16((u16)version << 8) & DDP_MASK_VERSION);
}
static inline u8 __rdmap_get_version(struct iwarp_ctrl *ctrl)
{
__be16 ver = ctrl->ddp_rdmap_ctrl & RDMAP_MASK_VERSION;
return be16_to_cpu(ver) >> 6;
}
static inline void __rdmap_set_version(struct iwarp_ctrl *ctrl, u8 version)
{
ctrl->ddp_rdmap_ctrl = (ctrl->ddp_rdmap_ctrl & ~RDMAP_MASK_VERSION) |
(cpu_to_be16(version << 6) & RDMAP_MASK_VERSION);
}
static inline u8 __rdmap_get_opcode(struct iwarp_ctrl *ctrl)
{
return be16_to_cpu(ctrl->ddp_rdmap_ctrl & RDMAP_MASK_OPCODE);
}
static inline void __rdmap_set_opcode(struct iwarp_ctrl *ctrl, u8 opcode)
{
ctrl->ddp_rdmap_ctrl = (ctrl->ddp_rdmap_ctrl & ~RDMAP_MASK_OPCODE) |
(cpu_to_be16(opcode) & RDMAP_MASK_OPCODE);
}
struct iwarp_rdma_write {
struct iwarp_ctrl ctrl;
__be32 sink_stag;
__be64 sink_to;
};
struct iwarp_rdma_rreq {
struct iwarp_ctrl ctrl;
__be32 rsvd;
__be32 ddp_qn;
__be32 ddp_msn;
__be32 ddp_mo;
__be32 sink_stag;
__be64 sink_to;
__be32 read_size;
__be32 source_stag;
__be64 source_to;
};
struct iwarp_rdma_rresp {
struct iwarp_ctrl ctrl;
__be32 sink_stag;
__be64 sink_to;
};
struct iwarp_send {
struct iwarp_ctrl ctrl;
__be32 rsvd;
__be32 ddp_qn;
__be32 ddp_msn;
__be32 ddp_mo;
};
struct iwarp_send_inv {
struct iwarp_ctrl ctrl;
__be32 inval_stag;
__be32 ddp_qn;
__be32 ddp_msn;
__be32 ddp_mo;
};
struct iwarp_terminate {
struct iwarp_ctrl ctrl;
__be32 rsvd;
__be32 ddp_qn;
__be32 ddp_msn;
__be32 ddp_mo;
#if defined(__LITTLE_ENDIAN_BITFIELD)
__be32 layer : 4;
__be32 etype : 4;
__be32 ecode : 8;
__be32 flag_m : 1;
__be32 flag_d : 1;
__be32 flag_r : 1;
__be32 reserved : 13;
#elif defined(__BIG_ENDIAN_BITFIELD)
__be32 reserved : 13;
__be32 flag_r : 1;
__be32 flag_d : 1;
__be32 flag_m : 1;
__be32 ecode : 8;
__be32 etype : 4;
__be32 layer : 4;
#else
#error "undefined byte order"
#endif
};
/*
* Terminate Hdr bits & fields
*/
enum {
TERM_MASK_LAYER = cpu_to_be32(0xf0000000),
TERM_MASK_ETYPE = cpu_to_be32(0x0f000000),
TERM_MASK_ECODE = cpu_to_be32(0x00ff0000),
TERM_FLAG_M = cpu_to_be32(0x00008000),
TERM_FLAG_D = cpu_to_be32(0x00004000),
TERM_FLAG_R = cpu_to_be32(0x00002000),
TERM_MASK_RESVD = cpu_to_be32(0x00001fff)
};
static inline u8 __rdmap_term_layer(struct iwarp_terminate *term)
{
return term->layer;
}
static inline void __rdmap_term_set_layer(struct iwarp_terminate *term,
u8 layer)
{
term->layer = layer & 0xf;
}
static inline u8 __rdmap_term_etype(struct iwarp_terminate *term)
{
return term->etype;
}
static inline void __rdmap_term_set_etype(struct iwarp_terminate *term,
u8 etype)
{
term->etype = etype & 0xf;
}
static inline u8 __rdmap_term_ecode(struct iwarp_terminate *term)
{
return term->ecode;
}
static inline void __rdmap_term_set_ecode(struct iwarp_terminate *term,
u8 ecode)
{
term->ecode = ecode;
}
/*
* Common portion of iWARP headers (MPA, DDP, RDMAP)
* for an FPDU carrying an untagged DDP segment
*/
struct iwarp_ctrl_untagged {
struct iwarp_ctrl ctrl;
__be32 rsvd;
__be32 ddp_qn;
__be32 ddp_msn;
__be32 ddp_mo;
};
/*
* Common portion of iWARP headers (MPA, DDP, RDMAP)
* for an FPDU carrying a tagged DDP segment
*/
struct iwarp_ctrl_tagged {
struct iwarp_ctrl ctrl;
__be32 ddp_stag;
__be64 ddp_to;
};
union iwarp_hdr {
struct iwarp_ctrl ctrl;
struct iwarp_ctrl_untagged c_untagged;
struct iwarp_ctrl_tagged c_tagged;
struct iwarp_rdma_write rwrite;
struct iwarp_rdma_rreq rreq;
struct iwarp_rdma_rresp rresp;
struct iwarp_terminate terminate;
struct iwarp_send send;
struct iwarp_send_inv send_inv;
};
enum term_elayer {
TERM_ERROR_LAYER_RDMAP = 0x00,
TERM_ERROR_LAYER_DDP = 0x01,
TERM_ERROR_LAYER_LLP = 0x02 /* eg., MPA */
};
enum ddp_etype {
DDP_ETYPE_CATASTROPHIC = 0x0,
DDP_ETYPE_TAGGED_BUF = 0x1,
DDP_ETYPE_UNTAGGED_BUF = 0x2,
DDP_ETYPE_RSVD = 0x3
};
enum ddp_ecode {
/* unspecified, set to zero */
DDP_ECODE_CATASTROPHIC = 0x00,
/* Tagged Buffer Errors */
DDP_ECODE_T_INVALID_STAG = 0x00,
DDP_ECODE_T_BASE_BOUNDS = 0x01,
DDP_ECODE_T_STAG_NOT_ASSOC = 0x02,
DDP_ECODE_T_TO_WRAP = 0x03,
DDP_ECODE_T_VERSION = 0x04,
/* Untagged Buffer Errors */
DDP_ECODE_UT_INVALID_QN = 0x01,
DDP_ECODE_UT_INVALID_MSN_NOBUF = 0x02,
DDP_ECODE_UT_INVALID_MSN_RANGE = 0x03,
DDP_ECODE_UT_INVALID_MO = 0x04,
DDP_ECODE_UT_MSG_TOOLONG = 0x05,
DDP_ECODE_UT_VERSION = 0x06
};
enum rdmap_untagged_qn {
RDMAP_UNTAGGED_QN_SEND = 0,
RDMAP_UNTAGGED_QN_RDMA_READ = 1,
RDMAP_UNTAGGED_QN_TERMINATE = 2,
RDMAP_UNTAGGED_QN_COUNT = 3
};
enum rdmap_etype {
RDMAP_ETYPE_CATASTROPHIC = 0x0,
RDMAP_ETYPE_REMOTE_PROTECTION = 0x1,
RDMAP_ETYPE_REMOTE_OPERATION = 0x2
};
enum rdmap_ecode {
RDMAP_ECODE_INVALID_STAG = 0x00,
RDMAP_ECODE_BASE_BOUNDS = 0x01,
RDMAP_ECODE_ACCESS_RIGHTS = 0x02,
RDMAP_ECODE_STAG_NOT_ASSOC = 0x03,
RDMAP_ECODE_TO_WRAP = 0x04,
RDMAP_ECODE_VERSION = 0x05,
RDMAP_ECODE_OPCODE = 0x06,
RDMAP_ECODE_CATASTROPHIC_STREAM = 0x07,
RDMAP_ECODE_CATASTROPHIC_GLOBAL = 0x08,
RDMAP_ECODE_CANNOT_INVALIDATE = 0x09,
RDMAP_ECODE_UNSPECIFIED = 0xff
};
enum llp_ecode {
LLP_ECODE_TCP_STREAM_LOST = 0x01, /* How to transfer this ?? */
LLP_ECODE_RECEIVED_CRC = 0x02,
LLP_ECODE_FPDU_START = 0x03,
LLP_ECODE_INVALID_REQ_RESP = 0x04,
/* Errors for Enhanced Connection Establishment only */
LLP_ECODE_LOCAL_CATASTROPHIC = 0x05,
LLP_ECODE_INSUFFICIENT_IRD = 0x06,
LLP_ECODE_NO_MATCHING_RTR = 0x07
};
enum llp_etype { LLP_ETYPE_MPA = 0x00 };
enum rdma_opcode {
RDMAP_RDMA_WRITE = 0x0,
RDMAP_RDMA_READ_REQ = 0x1,
RDMAP_RDMA_READ_RESP = 0x2,
RDMAP_SEND = 0x3,
RDMAP_SEND_INVAL = 0x4,
RDMAP_SEND_SE = 0x5,
RDMAP_SEND_SE_INVAL = 0x6,
RDMAP_TERMINATE = 0x7,
RDMAP_NOT_SUPPORTED = RDMAP_TERMINATE + 1
};
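/*
 * Illustrative only (the transmit path that builds real headers is not part
 * of this excerpt): initializing the DDP/RDMAP control word of a last RDMA
 * Write frame using the flags, versions and accessors defined above.
 */
static inline void example_init_write_ctrl(struct iwarp_rdma_write *hdr)
{
        hdr->ctrl.ddp_rdmap_ctrl = DDP_FLAG_TAGGED | DDP_FLAG_LAST;
        __ddp_set_version(&hdr->ctrl, DDP_VERSION);
        __rdmap_set_version(&hdr->ctrl, RDMAP_VERSION);
        __rdmap_set_opcode(&hdr->ctrl, RDMAP_RDMA_WRITE);
}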
#endif
/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */
/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
/* Greg Joyce <greg@opengridcomputing.com> */
/* Copyright (c) 2008-2019, IBM Corporation */
/* Copyright (c) 2017, Open Grid Computing, Inc. */
#ifndef _SIW_CM_H
#define _SIW_CM_H
#include <net/sock.h>
#include <linux/tcp.h>
#include <rdma/iw_cm.h>
enum siw_cep_state {
SIW_EPSTATE_IDLE = 1,
SIW_EPSTATE_LISTENING,
SIW_EPSTATE_CONNECTING,
SIW_EPSTATE_AWAIT_MPAREQ,
SIW_EPSTATE_RECVD_MPAREQ,
SIW_EPSTATE_AWAIT_MPAREP,
SIW_EPSTATE_RDMA_MODE,
SIW_EPSTATE_CLOSED
};
struct siw_mpa_info {
struct mpa_rr hdr; /* peer mpa hdr in host byte order */
struct mpa_v2_data v2_ctrl;
struct mpa_v2_data v2_ctrl_req;
char *pdata;
int bytes_rcvd;
};
struct siw_device;
struct siw_cep {
struct iw_cm_id *cm_id;
struct siw_device *sdev;
struct list_head devq;
spinlock_t lock;
struct kref ref;
int in_use;
wait_queue_head_t waitq;
enum siw_cep_state state;
struct list_head listenq;
struct siw_cep *listen_cep;
struct siw_qp *qp;
struct socket *sock;
struct siw_cm_work *mpa_timer;
struct list_head work_freelist;
struct siw_mpa_info mpa;
int ord;
int ird;
bool enhanced_rdma_conn_est;
/* Saved upcalls of socket */
void (*sk_state_change)(struct sock *sk);
void (*sk_data_ready)(struct sock *sk);
void (*sk_write_space)(struct sock *sk);
void (*sk_error_report)(struct sock *sk);
};
/*
* Connection initiator waits 10 seconds to receive an
* MPA reply after sending out an MPA request. The responder waits
* 5 seconds for the MPA request to arrive if a new TCP connection
* was set up.
*/
#define MPAREQ_TIMEOUT (HZ * 10)
#define MPAREP_TIMEOUT (HZ * 5)
enum siw_work_type {
SIW_CM_WORK_ACCEPT = 1,
SIW_CM_WORK_READ_MPAHDR,
SIW_CM_WORK_CLOSE_LLP, /* close socket */
SIW_CM_WORK_PEER_CLOSE, /* socket indicated peer close */
SIW_CM_WORK_MPATIMEOUT
};
struct siw_cm_work {
struct delayed_work work;
struct list_head list;
enum siw_work_type type;
struct siw_cep *cep;
};
#define to_sockaddr_in(a) (*(struct sockaddr_in *)(&(a)))
#define to_sockaddr_in6(a) (*(struct sockaddr_in6 *)(&(a)))
static inline int getname_peer(struct socket *s, struct sockaddr_storage *a)
{
return s->ops->getname(s, (struct sockaddr *)a, 1);
}
static inline int getname_local(struct socket *s, struct sockaddr_storage *a)
{
return s->ops->getname(s, (struct sockaddr *)a, 0);
}
static inline int ksock_recv(struct socket *sock, char *buf, size_t size,
int flags)
{
struct kvec iov = { buf, size };
struct msghdr msg = { .msg_name = NULL, .msg_flags = flags };
return kernel_recvmsg(sock, &msg, &iov, 1, size, flags);
}
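/*
 * Hedged usage sketch for ksock_recv(), not the connection manager's actual
 * code: non-blocking, incremental read of the peer's MPA request/reply
 * header into the endpoint's siw_mpa_info state.
 */
static inline int example_read_mpa_hdr(struct siw_cep *cep)
{
        char *hdr = (char *)&cep->mpa.hdr;
        int want = sizeof(struct mpa_rr) - cep->mpa.bytes_rcvd;
        int rv = ksock_recv(cep->sock, hdr + cep->mpa.bytes_rcvd, want,
                            MSG_DONTWAIT);

        if (rv > 0)
                cep->mpa.bytes_rcvd += rv;      /* caller retries until complete */
        return rv;
}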
int siw_connect(struct iw_cm_id *id, struct iw_cm_conn_param *parm);
int siw_accept(struct iw_cm_id *id, struct iw_cm_conn_param *param);
int siw_reject(struct iw_cm_id *id, const void *data, u8 len);
int siw_create_listen(struct iw_cm_id *id, int backlog);
int siw_destroy_listen(struct iw_cm_id *id);
void siw_cep_get(struct siw_cep *cep);
void siw_cep_put(struct siw_cep *cep);
int siw_cm_queue_work(struct siw_cep *cep, enum siw_work_type type);
int siw_cm_init(void);
void siw_cm_exit(void);
/*
* TCP socket interface
*/
#define sk_to_qp(sk) (((struct siw_cep *)((sk)->sk_user_data))->qp)
#define sk_to_cep(sk) ((struct siw_cep *)((sk)->sk_user_data))
#endif
// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
/* Copyright (c) 2008-2019, IBM Corporation */
#include <linux/errno.h>
#include <linux/types.h>
#include <rdma/ib_verbs.h>
#include "siw.h"
static int map_wc_opcode[SIW_NUM_OPCODES] = {
[SIW_OP_WRITE] = IB_WC_RDMA_WRITE,
[SIW_OP_SEND] = IB_WC_SEND,
[SIW_OP_SEND_WITH_IMM] = IB_WC_SEND,
[SIW_OP_READ] = IB_WC_RDMA_READ,
[SIW_OP_READ_LOCAL_INV] = IB_WC_RDMA_READ,
[SIW_OP_COMP_AND_SWAP] = IB_WC_COMP_SWAP,
[SIW_OP_FETCH_AND_ADD] = IB_WC_FETCH_ADD,
[SIW_OP_INVAL_STAG] = IB_WC_LOCAL_INV,
[SIW_OP_REG_MR] = IB_WC_REG_MR,
[SIW_OP_RECEIVE] = IB_WC_RECV,
[SIW_OP_READ_RESPONSE] = -1 /* not used */
};
static struct {
enum siw_opcode siw;
enum ib_wc_status ib;
} map_cqe_status[SIW_NUM_WC_STATUS] = {
{ SIW_WC_SUCCESS, IB_WC_SUCCESS },
{ SIW_WC_LOC_LEN_ERR, IB_WC_LOC_LEN_ERR },
{ SIW_WC_LOC_PROT_ERR, IB_WC_LOC_PROT_ERR },
{ SIW_WC_LOC_QP_OP_ERR, IB_WC_LOC_QP_OP_ERR },
{ SIW_WC_WR_FLUSH_ERR, IB_WC_WR_FLUSH_ERR },
{ SIW_WC_BAD_RESP_ERR, IB_WC_BAD_RESP_ERR },
{ SIW_WC_LOC_ACCESS_ERR, IB_WC_LOC_ACCESS_ERR },
{ SIW_WC_REM_ACCESS_ERR, IB_WC_REM_ACCESS_ERR },
{ SIW_WC_REM_INV_REQ_ERR, IB_WC_REM_INV_REQ_ERR },
{ SIW_WC_GENERAL_ERR, IB_WC_GENERAL_ERR }
};
/*
* Reap one CQE from the CQ. Only used by kernel clients
* during CQ normal operation. Might be called during CQ
* flush for user mapped CQE array as well.
*/
int siw_reap_cqe(struct siw_cq *cq, struct ib_wc *wc)
{
struct siw_cqe *cqe;
unsigned long flags;
spin_lock_irqsave(&cq->lock, flags);
cqe = &cq->queue[cq->cq_get % cq->num_cqe];
if (READ_ONCE(cqe->flags) & SIW_WQE_VALID) {
memset(wc, 0, sizeof(*wc));
wc->wr_id = cqe->id;
wc->status = map_cqe_status[cqe->status].ib;
wc->opcode = map_wc_opcode[cqe->opcode];
wc->byte_len = cqe->bytes;
/*
* During CQ flush, user-land CQEs may also get reaped here;
* those do not hold a QP reference and do not qualify for
* memory extension verbs.
*/
if (likely(cq->kernel_verbs)) {
if (cqe->flags & SIW_WQE_REM_INVAL) {
wc->ex.invalidate_rkey = cqe->inval_stag;
wc->wc_flags = IB_WC_WITH_INVALIDATE;
}
wc->qp = cqe->base_qp;
siw_dbg_cq(cq, "idx %u, type %d, flags %2x, id 0x%p\n",
cq->cq_get % cq->num_cqe, cqe->opcode,
cqe->flags, (void *)cqe->id);
}
WRITE_ONCE(cqe->flags, 0);
cq->cq_get++;
spin_unlock_irqrestore(&cq->lock, flags);
return 1;
}
spin_unlock_irqrestore(&cq->lock, flags);
return 0;
}
/*
* siw_cq_flush()
*
* Flush all CQ elements.
*/
void siw_cq_flush(struct siw_cq *cq)
{
struct ib_wc wc;
while (siw_reap_cqe(cq, &wc))
;
}
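/*
 * Hedged sketch of how a poll entry point can drain completions with
 * siw_reap_cqe(). The driver's siw_poll_cq() (declared in siw_verbs.h) is
 * not part of this excerpt; this is only an illustration.
 */
static int example_poll_cq(struct siw_cq *cq, int num_entries, struct ib_wc *wc)
{
        int polled = 0;

        while (polled < num_entries && siw_reap_cqe(cq, wc + polled))
                polled++;

        return polled;  /* number of work completions written to wc[] */
}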
/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */
/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
/* Copyright (c) 2008-2019, IBM Corporation */
#ifndef _SIW_MEM_H
#define _SIW_MEM_H
struct siw_umem *siw_umem_get(u64 start, u64 len, bool writable);
void siw_umem_release(struct siw_umem *umem, bool dirty);
struct siw_pbl *siw_pbl_alloc(u32 num_buf);
u64 siw_pbl_get_buffer(struct siw_pbl *pbl, u64 off, int *len, int *idx);
struct siw_mem *siw_mem_id2obj(struct siw_device *sdev, int stag_index);
int siw_mem_add(struct siw_device *sdev, struct siw_mem *m);
int siw_invalidate_stag(struct ib_pd *pd, u32 stag);
int siw_check_mem(struct ib_pd *pd, struct siw_mem *mem, u64 addr,
enum ib_access_flags perms, int len);
int siw_check_sge(struct ib_pd *pd, struct siw_sge *sge,
struct siw_mem *mem[], enum ib_access_flags perms,
u32 off, int len);
void siw_wqe_put_mem(struct siw_wqe *wqe, enum siw_opcode op);
int siw_mr_add_mem(struct siw_mr *mr, struct ib_pd *pd, void *mem_obj,
u64 start, u64 len, int rights);
void siw_mr_drop_mem(struct siw_mr *mr);
void siw_free_mem(struct kref *ref);
static inline void siw_mem_put(struct siw_mem *mem)
{
kref_put(&mem->ref, siw_free_mem);
}
static inline struct siw_mr *siw_mem2mr(struct siw_mem *m)
{
return container_of(m, struct siw_mr, mem);
}
static inline void siw_unref_mem_sgl(struct siw_mem **mem, unsigned int num_sge)
{
while (num_sge) {
if (*mem == NULL)
break;
siw_mem_put(*mem);
*mem = NULL;
mem++;
num_sge--;
}
}
#define CHUNK_SHIFT 9 /* sets number of pages per chunk */
#define PAGES_PER_CHUNK (_AC(1, UL) << CHUNK_SHIFT)
#define CHUNK_MASK (~(PAGES_PER_CHUNK - 1))
#define PAGE_CHUNK_SIZE (PAGES_PER_CHUNK * sizeof(struct page *))
/*
* siw_get_upage()
*
* Get page pointer for address on given umem.
*
* @umem: two dimensional list of page pointers
* @addr: user virtual address
*/
static inline struct page *siw_get_upage(struct siw_umem *umem, u64 addr)
{
unsigned int page_idx = (addr - umem->fp_addr) >> PAGE_SHIFT,
chunk_idx = page_idx >> CHUNK_SHIFT,
page_in_chunk = page_idx & ~CHUNK_MASK;
if (likely(page_idx < umem->num_pages))
return umem->page_chunk[chunk_idx].plist[page_in_chunk];
return NULL;
}
#endif
/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */
/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
/* Copyright (c) 2008-2019, IBM Corporation */
#ifndef _SIW_VERBS_H
#define _SIW_VERBS_H
#include <linux/errno.h>
#include <rdma/iw_cm.h>
#include <rdma/ib_verbs.h>
#include <rdma/ib_user_verbs.h>
#include "siw.h"
#include "siw_cm.h"
/*
* siw_copy_sgl()
*
* Copy SGL from RDMA core representation to local
* representation.
*/
static inline void siw_copy_sgl(struct ib_sge *sge, struct siw_sge *siw_sge,
int num_sge)
{
while (num_sge--) {
siw_sge->laddr = sge->addr;
siw_sge->length = sge->length;
siw_sge->lkey = sge->lkey;
siw_sge++;
sge++;
}
}
int siw_alloc_ucontext(struct ib_ucontext *base_ctx, struct ib_udata *udata);
void siw_dealloc_ucontext(struct ib_ucontext *base_ctx);
int siw_query_port(struct ib_device *base_dev, u8 port,
struct ib_port_attr *attr);
int siw_get_port_immutable(struct ib_device *base_dev, u8 port,
struct ib_port_immutable *port_immutable);
int siw_query_device(struct ib_device *base_dev, struct ib_device_attr *attr,
struct ib_udata *udata);
int siw_create_cq(struct ib_cq *base_cq, const struct ib_cq_init_attr *attr,
struct ib_udata *udata);
int siw_query_port(struct ib_device *base_dev, u8 port,
struct ib_port_attr *attr);
int siw_query_pkey(struct ib_device *base_dev, u8 port, u16 idx, u16 *pkey);
int siw_query_gid(struct ib_device *base_dev, u8 port, int idx,
union ib_gid *gid);
int siw_alloc_pd(struct ib_pd *base_pd, struct ib_udata *udata);
void siw_dealloc_pd(struct ib_pd *base_pd, struct ib_udata *udata);
struct ib_qp *siw_create_qp(struct ib_pd *base_pd,
struct ib_qp_init_attr *attr,
struct ib_udata *udata);
int siw_query_qp(struct ib_qp *base_qp, struct ib_qp_attr *qp_attr,
int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr);
int siw_verbs_modify_qp(struct ib_qp *base_qp, struct ib_qp_attr *attr,
int attr_mask, struct ib_udata *udata);
int siw_destroy_qp(struct ib_qp *base_qp, struct ib_udata *udata);
int siw_post_send(struct ib_qp *base_qp, const struct ib_send_wr *wr,
const struct ib_send_wr **bad_wr);
int siw_post_receive(struct ib_qp *base_qp, const struct ib_recv_wr *wr,
const struct ib_recv_wr **bad_wr);
void siw_destroy_cq(struct ib_cq *base_cq, struct ib_udata *udata);
int siw_poll_cq(struct ib_cq *base_cq, int num_entries, struct ib_wc *wc);
int siw_req_notify_cq(struct ib_cq *base_cq, enum ib_cq_notify_flags flags);
struct ib_mr *siw_reg_user_mr(struct ib_pd *base_pd, u64 start, u64 len,
u64 rnic_va, int rights, struct ib_udata *udata);
struct ib_mr *siw_alloc_mr(struct ib_pd *base_pd, enum ib_mr_type mr_type,
u32 max_sge, struct ib_udata *udata);
struct ib_mr *siw_get_dma_mr(struct ib_pd *base_pd, int rights);
int siw_map_mr_sg(struct ib_mr *base_mr, struct scatterlist *sl, int num_sle,
unsigned int *sg_off);
int siw_dereg_mr(struct ib_mr *base_mr, struct ib_udata *udata);
int siw_create_srq(struct ib_srq *base_srq, struct ib_srq_init_attr *attr,
struct ib_udata *udata);
int siw_modify_srq(struct ib_srq *base_srq, struct ib_srq_attr *attr,
enum ib_srq_attr_mask mask, struct ib_udata *udata);
int siw_query_srq(struct ib_srq *base_srq, struct ib_srq_attr *attr);
void siw_destroy_srq(struct ib_srq *base_srq, struct ib_udata *udata);
int siw_post_srq_recv(struct ib_srq *base_srq, const struct ib_recv_wr *wr,
const struct ib_recv_wr **bad_wr);
int siw_mmap(struct ib_ucontext *ctx, struct vm_area_struct *vma);
void siw_qp_event(struct siw_qp *qp, enum ib_event_type type);
void siw_cq_event(struct siw_cq *cq, enum ib_event_type type);
void siw_srq_event(struct siw_srq *srq, enum ib_event_type type);
void siw_port_event(struct siw_device *dev, u8 port, enum ib_event_type type);
#endif
@@ -103,6 +103,7 @@ enum rdma_driver_id {
RDMA_DRIVER_HFI1,
RDMA_DRIVER_QIB,
RDMA_DRIVER_EFA,
RDMA_DRIVER_SIW,
};
#endif
/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */
/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
/* Copyright (c) 2008-2019, IBM Corporation */
#ifndef _SIW_USER_H
#define _SIW_USER_H
#include <linux/types.h>
#define SIW_NODE_DESC_COMMON "Software iWARP stack"
#define SIW_ABI_VERSION 1
#define SIW_MAX_SGE 6
#define SIW_UOBJ_MAX_KEY 0x08FFFF
#define SIW_INVAL_UOBJ_KEY (SIW_UOBJ_MAX_KEY + 1)
struct siw_uresp_create_cq {
__u32 cq_id;
__u32 num_cqe;
__aligned_u64 cq_key;
};
struct siw_uresp_create_qp {
__u32 qp_id;
__u32 num_sqe;
__u32 num_rqe;
__u32 pad;
__aligned_u64 sq_key;
__aligned_u64 rq_key;
};
struct siw_ureq_reg_mr {
__u8 stag_key;
__u8 reserved[3];
__u32 pad;
};
struct siw_uresp_reg_mr {
__u32 stag;
__u32 pad;
};
struct siw_uresp_create_srq {
__u32 num_rqe;
__u32 pad;
__aligned_u64 srq_key;
};
struct siw_uresp_alloc_ctx {
__u32 dev_id;
__u32 pad;
};
enum siw_opcode {
SIW_OP_WRITE,
SIW_OP_READ,
SIW_OP_READ_LOCAL_INV,
SIW_OP_SEND,
SIW_OP_SEND_WITH_IMM,
SIW_OP_SEND_REMOTE_INV,
/* Unsupported */
SIW_OP_FETCH_AND_ADD,
SIW_OP_COMP_AND_SWAP,
SIW_OP_RECEIVE,
/* provider internal SQE */
SIW_OP_READ_RESPONSE,
/*
* below opcodes valid for
* in-kernel clients only
*/
SIW_OP_INVAL_STAG,
SIW_OP_REG_MR,
SIW_NUM_OPCODES
};
/* Keep it same as ibv_sge to allow for memcpy */
struct siw_sge {
__aligned_u64 laddr;
__u32 length;
__u32 lkey;
};
/*
* Inline data are kept within the work request itself occupying
* the space of sge[1] .. sge[n]. Therefore, inline data cannot be
* supported if SIW_MAX_SGE is below 2 elements.
*/
#define SIW_MAX_INLINE (sizeof(struct siw_sge) * (SIW_MAX_SGE - 1))
#if SIW_MAX_SGE < 2
#error "SIW_MAX_SGE must be at least 2"
#endif
enum siw_wqe_flags {
SIW_WQE_VALID = 1,
SIW_WQE_INLINE = (1 << 1),
SIW_WQE_SIGNALLED = (1 << 2),
SIW_WQE_SOLICITED = (1 << 3),
SIW_WQE_READ_FENCE = (1 << 4),
SIW_WQE_REM_INVAL = (1 << 5),
SIW_WQE_COMPLETED = (1 << 6)
};
/* Send Queue Element */
struct siw_sqe {
__aligned_u64 id;
__u16 flags;
__u8 num_sge;
/* Contains enum siw_opcode values */
__u8 opcode;
__u32 rkey;
union {
__aligned_u64 raddr;
__aligned_u64 base_mr;
};
union {
struct siw_sge sge[SIW_MAX_SGE];
__aligned_u64 access;
};
};
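/*
 * Illustrative accessor, not part of the ABI: with SIW_WQE_INLINE set, the
 * inline payload is assumed to start where sge[1] would begin (see the
 * SIW_MAX_INLINE comment above), with sge[0].length giving the number of
 * inline bytes.
 */
static inline void *example_sqe_inline_data(struct siw_sqe *sqe)
{
        return (void *)&sqe->sge[1];
}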
/* Receive Queue Element */
struct siw_rqe {
__aligned_u64 id;
__u16 flags;
__u8 num_sge;
/*
* only used by kernel driver,
* ignored if set by user
*/
__u8 opcode;
__u32 unused;
struct siw_sge sge[SIW_MAX_SGE];
};
enum siw_notify_flags {
SIW_NOTIFY_NOT = (0),
SIW_NOTIFY_SOLICITED = (1 << 0),
SIW_NOTIFY_NEXT_COMPLETION = (1 << 1),
SIW_NOTIFY_MISSED_EVENTS = (1 << 2),
SIW_NOTIFY_ALL = SIW_NOTIFY_SOLICITED | SIW_NOTIFY_NEXT_COMPLETION |
SIW_NOTIFY_MISSED_EVENTS
};
enum siw_wc_status {
SIW_WC_SUCCESS,
SIW_WC_LOC_LEN_ERR,
SIW_WC_LOC_PROT_ERR,
SIW_WC_LOC_QP_OP_ERR,
SIW_WC_WR_FLUSH_ERR,
SIW_WC_BAD_RESP_ERR,
SIW_WC_LOC_ACCESS_ERR,
SIW_WC_REM_ACCESS_ERR,
SIW_WC_REM_INV_REQ_ERR,
SIW_WC_GENERAL_ERR,
SIW_NUM_WC_STATUS
};
struct siw_cqe {
__aligned_u64 id;
__u8 flags;
__u8 opcode;
__u16 status;
__u32 bytes;
union {
__aligned_u64 imm_data;
__u32 inval_stag;
};
/* QP number or QP pointer */
union {
struct ib_qp *base_qp;
__aligned_u64 qp_id;
};
};
/*
* Shared structure between user and kernel
* to control CQ arming.
*/
struct siw_cq_ctrl {
__aligned_u64 notify;
};
#endif