@@ -383,6 +383,11 @@ CONFIG_RTE_LIBRTE_VMXNET3_DEBUG_TX_FREE=n
CONFIG_RTE_LIBRTE_PMD_AF_PACKET=n
#
+# Compile software PMD backed by AF_XDP sockets (Linux only)
+#
+CONFIG_RTE_LIBRTE_PMD_AF_XDP=n
+
+#
# Compile link bonding PMD library
#
CONFIG_RTE_LIBRTE_PMD_BOND=y
@@ -18,6 +18,7 @@ CONFIG_RTE_LIBRTE_PMD_VHOST=y
CONFIG_RTE_LIBRTE_IFC_PMD=y
CONFIG_RTE_LIBRTE_PMD_AF_PACKET=y
CONFIG_RTE_LIBRTE_PMD_SOFTNIC=y
+CONFIG_RTE_LIBRTE_PMD_AF_XDP=y
CONFIG_RTE_LIBRTE_PMD_TAP=y
CONFIG_RTE_LIBRTE_AVP_PMD=y
CONFIG_RTE_LIBRTE_VDEV_NETVSC_PMD=y
@@ -9,6 +9,7 @@ ifeq ($(CONFIG_RTE_LIBRTE_THUNDERX_NICVF_PMD),d)
endif
DIRS-$(CONFIG_RTE_LIBRTE_PMD_AF_PACKET) += af_packet
+DIRS-$(CONFIG_RTE_LIBRTE_PMD_AF_XDP) += af_xdp
DIRS-$(CONFIG_RTE_LIBRTE_ARK_PMD) += ark
DIRS-$(CONFIG_RTE_LIBRTE_AVF_PMD) += avf
DIRS-$(CONFIG_RTE_LIBRTE_AVP_PMD) += avp
new file mode 100644
@@ -0,0 +1,30 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(c) 2018 Intel Corporation
+
+include $(RTE_SDK)/mk/rte.vars.mk
+
+#
+# library name
+#
+LIB = librte_pmd_af_xdp.a
+
+EXPORT_MAP := rte_pmd_af_xdp_version.map
+
+LIBABIVER := 1
+
+CFLAGS += -O3
+# TODO: remove this developer-local include path before merge
+CFLAGS += -I/home/qzhan15/bpf/usr/include
+
+CFLAGS += $(WERROR_FLAGS)
+LDLIBS += -lrte_eal -lrte_mbuf -lrte_mempool -lrte_ring
+LDLIBS += -lrte_ethdev -lrte_net -lrte_kvargs
+LDLIBS += -lrte_bus_vdev
+
+#
+# all source are stored in SRCS-y
+#
+SRCS-$(CONFIG_RTE_LIBRTE_PMD_AF_XDP) += rte_eth_af_xdp.c
+
+include $(RTE_SDK)/mk/rte.lib.mk
new file mode 100644
@@ -0,0 +1,7 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(c) 2018 Intel Corporation
+
+if host_machine.system() != 'linux'
+ build = false
+endif
+sources = files('rte_eth_af_xdp.c')
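+# NOTE: the make build links this PMD against libbpf/libelf via
+# mk/rte.app.mk; the meson build still needs an equivalent dependency.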
new file mode 100644
@@ -0,0 +1,1247 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation.
+ */
+
+#include <rte_mbuf.h>
+#include <rte_ethdev_driver.h>
+#include <rte_ethdev_vdev.h>
+#include <rte_malloc.h>
+#include <rte_kvargs.h>
+#include <rte_bus_vdev.h>
+
+#include <linux/if_ether.h>
+#include <linux/if_xdp.h>
+#include <linux/if_link.h>
+#include <arpa/inet.h>
+#include <net/if.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <unistd.h>
+#include <poll.h>
+#include <bpf/bpf.h>
+
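+/*
+ * Fallback definitions for build environments whose kernel headers do not
+ * yet export the AF_XDP socket constants (values match the upstream
+ * kernel's at the time of writing).
+ */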
+#ifndef SOL_XDP
+#define SOL_XDP 283
+#endif
+
+#ifndef AF_XDP
+#define AF_XDP 44
+#endif
+
+#ifndef PF_XDP
+#define PF_XDP AF_XDP
+#endif
+
+#define ETH_AF_XDP_IFACE_ARG "iface"
+#define ETH_AF_XDP_QUEUE_IDX_ARG "queue"
+#define ETH_AF_XDP_XSK_MAP_ID_ARG "xsk_map_id"
+#define ETH_AF_XDP_XSK_MAP_KEY_START_ARG "xsk_map_key_start"
+#define ETH_AF_XDP_XSK_MAP_KEY_COUNT_ARG "xsk_map_key_count"
+
+#define ETH_AF_XDP_FRAME_SIZE 2048
+#define ETH_AF_XDP_NUM_BUFFERS 4096
+#define ETH_AF_XDP_DATA_HEADROOM 0
+#define ETH_AF_XDP_DFLT_NUM_DESCS 1024
+#define ETH_AF_XDP_FQ_NUM_DESCS 1024
+#define ETH_AF_XDP_CQ_NUM_DESCS 1024
+#define ETH_AF_XDP_DFLT_QUEUE_IDX 0
+
+#define ETH_AF_XDP_RX_BATCH_SIZE 16
+#define ETH_AF_XDP_TX_BATCH_SIZE 16
+
+#define ETH_AF_XDP_MAX_QUEUE_PAIRS 16
+
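+/*
+ * Local mirror of one of the kernel's single-producer/single-consumer
+ * rings: cached_prod/cached_cons shadow the shared producer/consumer
+ * indices so the hot path only reloads the shared counters when the
+ * cached view runs out.
+ */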
+struct xdp_umem_uqueue {
+ uint32_t cached_prod;
+ uint32_t cached_cons;
+ uint32_t mask;
+ uint32_t size;
+ uint32_t *producer;
+ uint32_t *consumer;
+ uint64_t *ring;
+ void *map;
+};
+
+struct xdp_umem {
+ char *frames;
+ struct xdp_umem_uqueue fq;
+ struct xdp_umem_uqueue cq;
+	struct rte_ring *buf_ring; /* manages free umem frame addresses */
+ int fd;
+};
+
+struct xdp_uqueue {
+ uint32_t cached_prod;
+ uint32_t cached_cons;
+ uint32_t mask;
+ uint32_t size;
+ uint32_t *producer;
+ uint32_t *consumer;
+ struct xdp_desc *ring;
+ void *map;
+};
+
+static inline uint32_t xq_nb_avail(struct xdp_uqueue *q, uint32_t ndescs)
+{
+ uint32_t entries = q->cached_prod - q->cached_cons;
+
+ if (entries == 0) {
+ q->cached_prod = *q->producer;
+ entries = q->cached_prod - q->cached_cons;
+ }
+
+ return (entries > ndescs) ? ndescs : entries;
+}
+
+static inline uint32_t xq_nb_free(struct xdp_uqueue *q, uint32_t ndescs)
+{
+ uint32_t free_entries = q->cached_cons - q->cached_prod;
+
+ if (free_entries >= ndescs)
+ return free_entries;
+
+ /* Refresh the local tail pointer */
+ q->cached_cons = *q->consumer + q->size;
+ return q->cached_cons - q->cached_prod;
+}
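+/*
+ * Example of the free-entry math above: with size = 1024, *consumer = 100
+ * and cached_prod = 612, cached_cons becomes 1124 and 1124 - 612 = 512
+ * slots are free; biasing cached_cons by +size lets the unsigned
+ * subtraction yield the free count directly.
+ */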
+
+static inline uint32_t umem_nb_avail(struct xdp_umem_uqueue *q, uint32_t nb)
+{
+ uint32_t entries = q->cached_prod - q->cached_cons;
+
+ if (entries == 0) {
+ q->cached_prod = *q->producer;
+ entries = q->cached_prod - q->cached_cons;
+ }
+ return (entries > nb) ? nb : entries;
+}
+
+static inline uint32_t umem_nb_free(struct xdp_umem_uqueue *q, uint32_t nb)
+{
+ uint32_t free_entries = q->cached_cons - q->cached_prod;
+
+ if (free_entries >= nb)
+ return free_entries;
+
+ /* Refresh the local tail pointer */
+ q->cached_cons = *q->consumer + q->size;
+
+ return q->cached_cons - q->cached_prod;
+}
+
+static inline int umem_fill_to_kernel_ex(struct xdp_umem_uqueue *fq,
+ struct xdp_desc *d,
+ size_t nb)
+{
+ uint32_t i;
+
+ if (umem_nb_free(fq, nb) < nb)
+ return -ENOSPC;
+
+ for (i = 0; i < nb; i++) {
+ uint32_t idx = fq->cached_prod++ & fq->mask;
+
+ fq->ring[idx] = d[i].addr;
+ }
+
+ rte_smp_wmb();
+
+ *fq->producer = fq->cached_prod;
+
+ return 0;
+}
+
+static inline int umem_fill_to_kernel(struct xdp_umem_uqueue *fq,
+ uint64_t *d,
+ size_t nb)
+{
+ uint32_t i;
+
+ if (umem_nb_free(fq, nb) < nb)
+ return -ENOSPC;
+
+ for (i = 0; i < nb; i++) {
+ uint32_t idx = fq->cached_prod++ & fq->mask;
+
+ fq->ring[idx] = d[i];
+ }
+
+ rte_smp_wmb();
+ *fq->producer = fq->cached_prod;
+
+ return 0;
+}
+
+static inline size_t umem_complete_from_kernel(struct xdp_umem_uqueue *cq,
+ uint64_t *d, size_t nb)
+{
+ uint32_t idx, i, entries = umem_nb_avail(cq, nb);
+
+ rte_smp_rmb();
+
+ for (i = 0; i < entries; i++) {
+ idx = cq->cached_cons++ & cq->mask;
+ d[i] = cq->ring[idx];
+ }
+
+ if (entries > 0) {
+ rte_smp_wmb();
+ *cq->consumer = cq->cached_cons;
+ }
+
+ return entries;
+}
+
+static inline int xq_enq(struct xdp_uqueue *uq,
+ const struct xdp_desc *descs,
+ unsigned int ndescs)
+{
+ struct xdp_desc *r = uq->ring;
+ unsigned int i;
+
+ if (xq_nb_free(uq, ndescs) < ndescs)
+ return -ENOSPC;
+
+ for (i = 0; i < ndescs; i++) {
+ uint32_t idx = uq->cached_prod++ & uq->mask;
+
+ r[idx].addr = descs[i].addr;
+ r[idx].len = descs[i].len;
+ }
+
+ rte_smp_wmb();
+
+ *uq->producer = uq->cached_prod;
+ return 0;
+}
+
+static inline int xq_deq(struct xdp_uqueue *uq,
+ struct xdp_desc *descs,
+ int ndescs)
+{
+ struct xdp_desc *r = uq->ring;
+ unsigned int idx;
+ int i, entries;
+
+ entries = xq_nb_avail(uq, ndescs);
+ rte_smp_rmb();
+
+ for (i = 0; i < entries; i++) {
+ idx = uq->cached_cons++ & uq->mask;
+ descs[i] = r[idx];
+ }
+
+ if (entries > 0) {
+ rte_smp_wmb();
+
+ *uq->consumer = uq->cached_cons;
+ }
+
+ return entries;
+}
+
+struct pkt_rx_queue {
+ int xsk_fd;
+ uint16_t queue_idx;
+ struct xdp_uqueue rx;
+ struct xdp_umem *umem;
+ struct rte_mempool *mb_pool;
+
+ unsigned long rx_pkts;
+ unsigned long rx_bytes;
+ unsigned long rx_dropped;
+
+ struct pkt_tx_queue *pair;
+};
+
+struct pkt_tx_queue {
+ uint16_t queue_idx;
+ struct xdp_uqueue tx;
+
+ unsigned long tx_pkts;
+ unsigned long err_pkts;
+ unsigned long tx_bytes;
+
+ struct pkt_rx_queue *pair;
+};
+
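+/*
+ * Per-device state.  The first Rx queue creates the umem; subsequent
+ * queues attach to it via the XDP_SHARED_UMEM bind flag, tracked by
+ * umem_share/umem_share_count.
+ */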
+struct pmd_internals {
+ int if_index;
+ char if_name[IFNAMSIZ];
+ uint16_t queue_idx;
+ struct ether_addr eth_addr;
+ struct xdp_umem *umem_share;
+ int umem_share_count;
+ struct rte_mempool *mb_pool_share;
+ int xsk_map_id;
+ int xsk_map_key_start;
+ int xsk_map_key_count;
+
+ struct pkt_rx_queue rx_queues[ETH_AF_XDP_MAX_QUEUE_PAIRS];
+ struct pkt_tx_queue tx_queues[ETH_AF_XDP_MAX_QUEUE_PAIRS];
+};
+
+static const char * const valid_arguments[] = {
+ ETH_AF_XDP_IFACE_ARG,
+ ETH_AF_XDP_QUEUE_IDX_ARG,
+ ETH_AF_XDP_XSK_MAP_ID_ARG,
+ ETH_AF_XDP_XSK_MAP_KEY_START_ARG,
+ ETH_AF_XDP_XSK_MAP_KEY_COUNT_ARG,
+ NULL
+};
+
+static struct rte_eth_link pmd_link = {
+ .link_speed = ETH_SPEED_NUM_10G,
+ .link_duplex = ETH_LINK_FULL_DUPLEX,
+ .link_status = ETH_LINK_DOWN,
+ .link_autoneg = ETH_LINK_AUTONEG
+};
+
+static char *get_pkt_data(struct xdp_umem *umem, uint64_t addr)
+{
+ return &umem->frames[addr];
+}
+
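+/*
+ * Rx burst: replenish the kernel fill queue from buf_ring once at least
+ * half of it is free, dequeue up to nb_pkts descriptors from the Rx ring,
+ * copy each frame into a freshly allocated mbuf, then recycle the umem
+ * buffers back onto buf_ring.
+ */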
+static uint16_t
+eth_af_xdp_rx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
+{
+ struct xdp_desc descs[ETH_AF_XDP_RX_BATCH_SIZE];
+ void *addrs[ETH_AF_XDP_RX_BATCH_SIZE];
+ struct pkt_rx_queue *rxq = queue;
+ struct xdp_uqueue *uq = &rxq->rx;
+ struct xdp_umem_uqueue *fq = &rxq->umem->fq;
+ uint32_t free_thresh = fq->size >> 1;
+ struct rte_mbuf *mbuf;
+ unsigned long dropped = 0;
+ unsigned long rx_bytes = 0;
+ uint16_t count = 0;
+ int rcvd, i;
+
+ nb_pkts = nb_pkts < ETH_AF_XDP_RX_BATCH_SIZE ?
+ nb_pkts : ETH_AF_XDP_RX_BATCH_SIZE;
+
+ if (umem_nb_free(fq, free_thresh) >= free_thresh) {
+ int n = rte_ring_dequeue_bulk(rxq->umem->buf_ring,
+ addrs,
+ ETH_AF_XDP_RX_BATCH_SIZE,
+ NULL);
+		if (n == 0)
+			return 0; /* a burst function must return a count, not -errno */
+
+ if (umem_fill_to_kernel(fq, (uint64_t *)&addrs[0],
+ ETH_AF_XDP_RX_BATCH_SIZE)) {
+ rte_ring_enqueue_bulk(rxq->umem->buf_ring,
+ addrs,
+ ETH_AF_XDP_RX_BATCH_SIZE,
+ NULL);
+ }
+ }
+
+ /* read data */
+ rcvd = xq_deq(uq, descs, nb_pkts);
+ if (rcvd == 0)
+ return 0;
+
+ for (i = 0; i < rcvd; i++) {
+ char *pkt;
+ uint64_t addr = descs[i].addr;
+
+		mbuf = rte_pktmbuf_alloc(rxq->mb_pool);
+		if (mbuf) {
+			rte_pktmbuf_pkt_len(mbuf) =
+				rte_pktmbuf_data_len(mbuf) =
+				descs[i].len;
+			pkt = get_pkt_data(rxq->umem, addr);
+			memcpy(rte_pktmbuf_mtod(mbuf, void *),
+				pkt, descs[i].len);
+			rx_bytes += descs[i].len;
+			bufs[count++] = mbuf;
+		} else {
+			dropped++;
+		}
+ addrs[i] = (void *)addr;
+ }
+
+ rte_ring_enqueue_bulk(rxq->umem->buf_ring, addrs, rcvd, NULL);
+
+ rxq->rx_pkts += (rcvd - dropped);
+ rxq->rx_bytes += rx_bytes;
+ rxq->rx_dropped += dropped;
+
+ return count;
+}
+
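+/*
+ * Nudge the kernel to transmit with a zero-length sendto(); on EAGAIN,
+ * drain the completion queue first so the kernel has room to report
+ * finished transmissions.
+ */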
+static void kick_tx(struct pkt_tx_queue *txq)
+{
+ void *addrs[ETH_AF_XDP_TX_BATCH_SIZE];
+ struct rte_ring *buf_ring = txq->pair->umem->buf_ring;
+ struct xdp_umem_uqueue *cq = &txq->pair->umem->cq;
+ int fd = txq->pair->xsk_fd;
+ int ret, n;
+
+ while (1) {
+
+ ret = sendto(fd, NULL, 0, MSG_DONTWAIT, NULL, 0);
+
+ /* everything is ok */
+ if (ret >= 0)
+ break;
+
+		/* something unexpected happened */
+ if (errno != EBUSY && errno != EAGAIN)
+ break;
+
+		/* drain the completion queue to make room */
+ if (errno == EAGAIN) {
+ n = umem_complete_from_kernel(cq,
+ (uint64_t *)&addrs[0],
+ ETH_AF_XDP_TX_BATCH_SIZE);
+ if (n > 0)
+ rte_ring_enqueue_bulk(buf_ring,
+ addrs, n, NULL);
+ }
+ }
+}
+
+static uint16_t
+eth_af_xdp_tx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
+{
+ struct pkt_tx_queue *txq = queue;
+ struct xdp_uqueue *uq = &txq->tx;
+ struct xdp_umem_uqueue *cq = &txq->pair->umem->cq;
+ struct rte_mbuf *mbuf;
+ struct xdp_desc descs[ETH_AF_XDP_TX_BATCH_SIZE];
+ void *addrs[ETH_AF_XDP_TX_BATCH_SIZE];
+ uint16_t i, valid;
+	unsigned long tx_bytes = 0;
+	int n;
+
+ nb_pkts = nb_pkts < ETH_AF_XDP_TX_BATCH_SIZE ?
+ nb_pkts : ETH_AF_XDP_TX_BATCH_SIZE;
+
+	n = umem_complete_from_kernel(cq, (uint64_t *)&addrs[0],
+				      ETH_AF_XDP_TX_BATCH_SIZE);
+ if (n > 0)
+ rte_ring_enqueue_bulk(txq->pair->umem->buf_ring,
+ addrs, n, NULL);
+
+ nb_pkts = rte_ring_dequeue_bulk(txq->pair->umem->buf_ring, addrs,
+ nb_pkts, NULL);
+ if (!nb_pkts)
+ return 0;
+
+ valid = 0;
+ for (i = 0; i < nb_pkts; i++) {
+ char *pkt;
+ unsigned int buf_len =
+ ETH_AF_XDP_FRAME_SIZE - ETH_AF_XDP_DATA_HEADROOM;
+ mbuf = bufs[i];
+ if (mbuf->pkt_len <= buf_len) {
+ descs[valid].addr = (uint64_t)addrs[valid];
+ descs[valid].len = mbuf->pkt_len;
+ descs[valid].options = 0;
+ pkt = get_pkt_data(txq->pair->umem, descs[valid].addr);
+			memcpy(pkt, rte_pktmbuf_mtod(mbuf, void *),
+				descs[valid].len);
+ valid++;
+ tx_bytes += mbuf->pkt_len;
+ }
+ rte_pktmbuf_free(mbuf);
+ }
+
+ if (xq_enq(uq, descs, valid)) {
+ valid = 0;
+ tx_bytes = 0;
+ } else {
+ kick_tx(txq);
+ }
+
+ if (valid < nb_pkts)
+ rte_ring_enqueue_bulk(txq->pair->umem->buf_ring, &addrs[valid],
+ nb_pkts - valid, NULL);
+
+ txq->err_pkts += (nb_pkts - valid);
+ txq->tx_pkts += valid;
+ txq->tx_bytes += tx_bytes;
+
+ return nb_pkts;
+}
+
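+/* Pre-populate half of the kernel fill queue so the first Rx burst has
+ * frames to land in.
+ */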
+static void
+fill_rx_desc(struct xdp_umem *umem)
+{
+ struct xdp_umem_uqueue *fq = &umem->fq;
+ void *p = NULL;
+ uint32_t i;
+
+ for (i = 0; i < fq->size / 2; i++) {
+ rte_ring_dequeue(umem->buf_ring, &p);
+ if (umem_fill_to_kernel(fq, (uint64_t *)&p, 1)) {
+ rte_ring_enqueue(umem->buf_ring, p);
+ break;
+ }
+ }
+}
+
+static int
+eth_dev_start(struct rte_eth_dev *dev)
+{
+ dev->data->dev_link.link_status = ETH_LINK_UP;
+
+ return 0;
+}
+
+/* This function gets called when the current port gets stopped. */
+static void
+eth_dev_stop(struct rte_eth_dev *dev)
+{
+ dev->data->dev_link.link_status = ETH_LINK_DOWN;
+}
+
+static int
+eth_dev_configure(struct rte_eth_dev *dev)
+{
+ /* rx/tx must be paired */
+ if (dev->data->nb_rx_queues != dev->data->nb_tx_queues)
+ return -EINVAL;
+
+ return 0;
+}
+
+static void
+eth_dev_info(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
+{
+ struct pmd_internals *internals = dev->data->dev_private;
+
+ dev_info->if_index = internals->if_index;
+ dev_info->max_mac_addrs = 1;
+ dev_info->max_rx_pktlen = (uint32_t)ETH_FRAME_LEN;
+ dev_info->max_rx_queues = internals->xsk_map_key_count;
+ dev_info->max_tx_queues = internals->xsk_map_key_count;
+ dev_info->min_rx_bufsize = 0;
+
+ dev_info->default_rxportconf.nb_queues = 1;
+ dev_info->default_txportconf.nb_queues = 1;
+ dev_info->default_rxportconf.ring_size = ETH_AF_XDP_DFLT_NUM_DESCS;
+ dev_info->default_txportconf.ring_size = ETH_AF_XDP_DFLT_NUM_DESCS;
+}
+
+static int
+eth_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats)
+{
+ struct pmd_internals *internals = dev->data->dev_private;
+ struct xdp_statistics xdp_stats;
+ struct pkt_rx_queue *rxq;
+ socklen_t optlen;
+ int i;
+
+ optlen = sizeof(struct xdp_statistics);
+ for (i = 0; i < dev->data->nb_rx_queues; i++) {
+ rxq = &internals->rx_queues[i];
+ stats->q_ipackets[i] = internals->rx_queues[i].rx_pkts;
+ stats->q_ibytes[i] = internals->rx_queues[i].rx_bytes;
+
+ stats->q_opackets[i] = internals->tx_queues[i].tx_pkts;
+ stats->q_errors[i] = internals->tx_queues[i].err_pkts;
+ stats->q_obytes[i] = internals->tx_queues[i].tx_bytes;
+
+ stats->ipackets += stats->q_ipackets[i];
+ stats->ibytes += stats->q_ibytes[i];
+ stats->imissed += internals->rx_queues[i].rx_dropped;
+ getsockopt(rxq->xsk_fd, SOL_XDP, XDP_STATISTICS,
+ &xdp_stats, &optlen);
+ stats->imissed += xdp_stats.rx_dropped;
+
+ stats->opackets += stats->q_opackets[i];
+ stats->oerrors += stats->q_errors[i];
+ stats->obytes += stats->q_obytes[i];
+ }
+
+ return 0;
+}
+
+static void
+eth_stats_reset(struct rte_eth_dev *dev)
+{
+ struct pmd_internals *internals = dev->data->dev_private;
+ int i;
+
+ for (i = 0; i < ETH_AF_XDP_MAX_QUEUE_PAIRS; i++) {
+ internals->rx_queues[i].rx_pkts = 0;
+ internals->rx_queues[i].rx_bytes = 0;
+ internals->rx_queues[i].rx_dropped = 0;
+
+ internals->tx_queues[i].tx_pkts = 0;
+ internals->tx_queues[i].err_pkts = 0;
+ internals->tx_queues[i].tx_bytes = 0;
+ }
+}
+
+static void
+eth_dev_close(struct rte_eth_dev *dev __rte_unused)
+{
+}
+
+static void
+eth_queue_release(void *q __rte_unused)
+{
+}
+
+static int
+eth_link_update(struct rte_eth_dev *dev __rte_unused,
+ int wait_to_complete __rte_unused)
+{
+ return 0;
+}
+
+static void xdp_umem_destroy(struct xdp_umem *umem)
+{
+ if (umem->frames)
+ free(umem->frames);
+ if (umem->buf_ring)
+ rte_ring_free(umem->buf_ring);
+
+ free(umem);
+}
+
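+/*
+ * Umem setup sequence: carve out a page-aligned frame area, register it
+ * via XDP_UMEM_REG, size the fill/completion rings, then mmap() both
+ * rings at the offsets reported by XDP_MMAP_OFFSETS.  Free frame
+ * addresses are tracked in buf_ring, which acts as the buffer allocator.
+ */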
+static struct xdp_umem *xdp_umem_configure(int sfd)
+{
+ int fq_size = ETH_AF_XDP_FQ_NUM_DESCS;
+ int cq_size = ETH_AF_XDP_CQ_NUM_DESCS;
+ struct xdp_mmap_offsets off;
+ struct xdp_umem_reg mr;
+ struct xdp_umem *umem;
+ char ring_name[0x100];
+ socklen_t optlen;
+ void *bufs = NULL;
+ uint64_t i;
+
+ umem = calloc(1, sizeof(*umem));
+ if (!umem)
+ return NULL;
+
+	snprintf(ring_name, sizeof(ring_name), "%s_%d", "af_xdp_ring", sfd);
+ umem->buf_ring = rte_ring_create(ring_name,
+ ETH_AF_XDP_NUM_BUFFERS,
+ SOCKET_ID_ANY,
+ 0x0);
+ if (!umem->buf_ring) {
+ RTE_LOG(ERR, PMD,
+ "Failed to create rte_ring\n");
+ goto err;
+ }
+
+ for (i = 0; i < ETH_AF_XDP_NUM_BUFFERS; i++)
+ rte_ring_enqueue(umem->buf_ring,
+ (void *)(i * ETH_AF_XDP_FRAME_SIZE +
+ ETH_AF_XDP_DATA_HEADROOM));
+
+ if (posix_memalign(&bufs, getpagesize(), /* PAGE_SIZE aligned */
+ ETH_AF_XDP_NUM_BUFFERS * ETH_AF_XDP_FRAME_SIZE)) {
+ RTE_LOG(ERR, PMD,
+ "Failed to allocate memory pool.\n");
+ goto err;
+ }
+
+ mr.addr = (uint64_t)bufs;
+ mr.len = ETH_AF_XDP_NUM_BUFFERS * ETH_AF_XDP_FRAME_SIZE;
+ mr.chunk_size = ETH_AF_XDP_FRAME_SIZE;
+ mr.headroom = ETH_AF_XDP_DATA_HEADROOM;
+
+ if (setsockopt(sfd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr))) {
+ RTE_LOG(ERR, PMD,
+ "Failed to register memory pool.\n");
+ goto err;
+ }
+
+ if (setsockopt(sfd, SOL_XDP, XDP_UMEM_FILL_RING, &fq_size,
+ sizeof(int))) {
+ RTE_LOG(ERR, PMD,
+ "Failed to setup fill ring.\n");
+ goto err;
+ }
+
+ if (setsockopt(sfd, SOL_XDP, XDP_UMEM_COMPLETION_RING, &cq_size,
+ sizeof(int))) {
+ RTE_LOG(ERR, PMD,
+ "Failed to setup complete ring.\n");
+ goto err;
+ }
+
+ optlen = sizeof(off);
+ if (getsockopt(sfd, SOL_XDP, XDP_MMAP_OFFSETS, &off, &optlen)) {
+ RTE_LOG(ERR, PMD,
+ "Failed to get map fr/cr offset.\n");
+ goto err;
+ }
+
+ umem->fq.map = mmap(0, off.fr.desc +
+ fq_size * sizeof(uint64_t),
+ PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_POPULATE, sfd,
+ XDP_UMEM_PGOFF_FILL_RING);
+
+	if (umem->fq.map == MAP_FAILED) {
+ RTE_LOG(ERR, PMD,
+ "Failed to allocate memory for fq.\n");
+ goto err;
+ }
+
+ umem->fq.mask = fq_size - 1;
+ umem->fq.size = fq_size;
+ umem->fq.producer =
+ (uint32_t *)((uint64_t)umem->fq.map + off.fr.producer);
+ umem->fq.consumer =
+ (uint32_t *)((uint64_t)umem->fq.map + off.fr.consumer);
+ umem->fq.ring = (uint64_t *)((uint64_t)umem->fq.map + off.fr.desc);
+ umem->fq.cached_cons = fq_size;
+
+ umem->cq.map = mmap(0, off.cr.desc +
+ cq_size * sizeof(uint64_t),
+ PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_POPULATE, sfd,
+ XDP_UMEM_PGOFF_COMPLETION_RING);
+
+	if (umem->cq.map == MAP_FAILED) {
+		RTE_LOG(ERR, PMD,
+			"Failed to allocate memory for cq\n");
+ goto err;
+ }
+
+ umem->cq.mask = cq_size - 1;
+ umem->cq.size = cq_size;
+ umem->cq.producer =
+ (uint32_t *)((uint64_t)umem->cq.map + off.cr.producer);
+ umem->cq.consumer =
+ (uint32_t *)((uint64_t)umem->cq.map + off.cr.consumer);
+ umem->cq.ring = (uint64_t *)((uint64_t)umem->cq.map + off.cr.desc);
+
+ umem->frames = bufs;
+ umem->fd = sfd;
+
+ return umem;
+
+err:
+ xdp_umem_destroy(umem);
+ return NULL;
+
+}
+
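+/*
+ * Per-queue socket setup: create the AF_XDP socket, attach a new or
+ * shared umem, size the Rx/Tx rings and mmap() them, mirroring the umem
+ * ring setup above.
+ */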
+static int
+xsk_configure(struct pkt_rx_queue *rxq, int ring_size, struct xdp_umem *umem)
+{
+ struct pkt_tx_queue *txq = rxq->pair;
+ struct xdp_mmap_offsets off;
+ int new_umem = 0;
+ socklen_t optlen;
+
+ rxq->xsk_fd = socket(PF_XDP, SOCK_RAW, 0);
+ if (rxq->xsk_fd < 0)
+ return -1;
+
+ if (!umem) {
+ rxq->umem = xdp_umem_configure(rxq->xsk_fd);
+ if (!rxq->umem)
+ goto err;
+ new_umem = 1;
+ } else {
+ rxq->umem = umem;
+ }
+
+ if (setsockopt(rxq->xsk_fd, SOL_XDP, XDP_RX_RING,
+ &ring_size, sizeof(int))) {
+ RTE_LOG(ERR, PMD, "Failed to setup Rx ring.\n");
+ goto err;
+ }
+
+ if (setsockopt(rxq->xsk_fd, SOL_XDP, XDP_TX_RING,
+ &ring_size, sizeof(int))) {
+ RTE_LOG(ERR, PMD, "Failed to setup Tx ring.\n");
+ goto err;
+ }
+
+ optlen = sizeof(off);
+ if (getsockopt(rxq->xsk_fd, SOL_XDP, XDP_MMAP_OFFSETS,
+ &off, &optlen)) {
+ RTE_LOG(ERR, PMD, "Failed to get map rx/tx offsets.\n");
+ goto err;
+ }
+
+ /* Rx */
+ rxq->rx.map = mmap(NULL,
+ off.rx.desc +
+ ring_size * sizeof(struct xdp_desc),
+ PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_POPULATE, rxq->xsk_fd,
+ XDP_PGOFF_RX_RING);
+
+	if (rxq->rx.map == MAP_FAILED) {
+ RTE_LOG(ERR, PMD, "Failed to map Rx ring memory.\n");
+ goto err;
+ }
+
+ fill_rx_desc(rxq->umem);
+ /* Tx */
+ txq->tx.map = mmap(NULL,
+ off.tx.desc +
+ ring_size * sizeof(struct xdp_desc),
+ PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_POPULATE, rxq->xsk_fd,
+ XDP_PGOFF_TX_RING);
+
+	if (txq->tx.map == MAP_FAILED) {
+ RTE_LOG(ERR, PMD, "Failed to map Tx ring memory\n");
+ goto err;
+ }
+
+ rxq->rx.mask = ring_size - 1;
+ rxq->rx.size = ring_size;
+ rxq->rx.producer =
+ (uint32_t *)((uint64_t)rxq->rx.map + off.rx.producer);
+ rxq->rx.consumer =
+ (uint32_t *)((uint64_t)rxq->rx.map + off.rx.consumer);
+ rxq->rx.ring = (struct xdp_desc *)((uint64_t)rxq->rx.map + off.rx.desc);
+
+ txq->tx.mask = ring_size - 1;
+ txq->tx.size = ring_size;
+ txq->tx.producer =
+ (uint32_t *)((uint64_t)txq->tx.map + off.tx.producer);
+ txq->tx.consumer =
+ (uint32_t *)((uint64_t)txq->tx.map + off.tx.consumer);
+ txq->tx.ring = (struct xdp_desc *)((uint64_t)txq->tx.map + off.tx.desc);
+ txq->tx.cached_cons = ring_size;
+
+ return 0;
+
+err:
+ if (new_umem)
+ xdp_umem_destroy(rxq->umem);
+ close(rxq->xsk_fd);
+ rxq->xsk_fd = 0;
+
+ return -1;
+}
+
+static void
+queue_reset(struct pmd_internals *internals, uint16_t queue_idx)
+{
+ struct pkt_rx_queue *rxq = &internals->rx_queues[queue_idx];
+ struct pkt_tx_queue *txq = rxq->pair;
+
+ if (rxq->xsk_fd) {
+ close(rxq->xsk_fd);
+ if (internals->umem_share_count > 0) {
+ internals->umem_share_count--;
+ if (internals->umem_share_count == 0 &&
+ internals->umem_share) {
+ xdp_umem_destroy(internals->umem_share);
+ internals->umem_share = NULL;
+ }
+ }
+ }
+ memset(rxq, 0, sizeof(*rxq));
+ memset(txq, 0, sizeof(*txq));
+ rxq->pair = txq;
+ txq->pair = rxq;
+ rxq->queue_idx = queue_idx;
+ txq->queue_idx = queue_idx;
+}
+
+static int
+eth_rx_queue_setup(struct rte_eth_dev *dev,
+ uint16_t rx_queue_id,
+ uint16_t nb_rx_desc,
+ unsigned int socket_id __rte_unused,
+ const struct rte_eth_rxconf *rx_conf __rte_unused,
+ struct rte_mempool *mb_pool)
+{
+ struct pmd_internals *internals = dev->data->dev_private;
+ unsigned int buf_size, data_size;
+ struct pkt_rx_queue *rxq;
+ struct sockaddr_xdp sxdp = {0};
+ int xsk_key;
+ int map_fd;
+
+ if (dev->data->nb_rx_queues <= rx_queue_id) {
+ RTE_LOG(ERR, PMD,
+ "Invalid rx queue id: %d\n", rx_queue_id);
+ return -EINVAL;
+ }
+
+ rxq = &internals->rx_queues[rx_queue_id];
+ queue_reset(internals, rx_queue_id);
+
+ /* Now get the space available for data in the mbuf */
+ buf_size = rte_pktmbuf_data_room_size(mb_pool) -
+ RTE_PKTMBUF_HEADROOM;
+ data_size = ETH_AF_XDP_FRAME_SIZE - ETH_AF_XDP_DATA_HEADROOM;
+
+ if (data_size > buf_size) {
+ RTE_LOG(ERR, PMD,
+ "%s: %d bytes will not fit in mbuf (%d bytes)\n",
+ dev->device->name, data_size, buf_size);
+ return -ENOMEM;
+ }
+
+ rxq->mb_pool = mb_pool;
+
+ if (xsk_configure(rxq, nb_rx_desc, internals->umem_share)) {
+ RTE_LOG(ERR, PMD,
+ "Failed to configure xdp socket\n");
+ return -EINVAL;
+ }
+
+ sxdp.sxdp_family = PF_XDP;
+ sxdp.sxdp_ifindex = internals->if_index;
+ sxdp.sxdp_queue_id = internals->queue_idx;
+ sxdp.sxdp_flags = 0;
+ if (internals->umem_share) {
+ RTE_LOG(INFO, PMD,
+ "use share umem at queue id %d\n", rx_queue_id);
+ sxdp.sxdp_flags = XDP_SHARED_UMEM;
+ sxdp.sxdp_shared_umem_fd = internals->umem_share->fd;
+ }
+
+ if (bind(rxq->xsk_fd, (struct sockaddr *)&sxdp, sizeof(sxdp))) {
+ RTE_LOG(ERR, PMD, "Failed to bind xdp socket\n");
+ if (!internals->umem_share)
+ xdp_umem_destroy(rxq->umem);
+ goto err;
+ }
+
+ if (!internals->umem_share)
+ internals->umem_share = rxq->umem;
+
+ internals->umem_share_count++;
+	map_fd = bpf_map_get_fd_by_id(internals->xsk_map_id);
+	if (map_fd < 0) {
+		RTE_LOG(ERR, PMD, "Failed to get xsk map fd\n");
+		goto err;
+	}
+
+ xsk_key = internals->xsk_map_key_start + rx_queue_id;
+ if (bpf_map_update_elem(map_fd, &xsk_key, &rxq->xsk_fd, 0)) {
+ RTE_LOG(ERR, PMD,
+ "Failed to update xsk map\n");
+ goto err;
+ }
+
+ dev->data->rx_queues[rx_queue_id] = rxq;
+ return 0;
+
+err:
+ queue_reset(internals, rx_queue_id);
+ return -EINVAL;
+}
+
+static int
+eth_tx_queue_setup(struct rte_eth_dev *dev,
+ uint16_t tx_queue_id,
+ uint16_t nb_tx_desc,
+ unsigned int socket_id __rte_unused,
+ const struct rte_eth_txconf *tx_conf __rte_unused)
+{
+ struct pmd_internals *internals = dev->data->dev_private;
+ struct pkt_tx_queue *txq;
+
+ if (dev->data->nb_tx_queues <= tx_queue_id) {
+ RTE_LOG(ERR, PMD, "Invalid tx queue id: %d\n", tx_queue_id);
+ return -EINVAL;
+ }
+
+	RTE_LOG(WARNING, PMD,
+		"nb_tx_desc (%d) is ignored; the Tx ring is sized at Rx queue setup\n",
+		nb_tx_desc);
+ txq = &internals->tx_queues[tx_queue_id];
+
+ dev->data->tx_queues[tx_queue_id] = txq;
+ return 0;
+}
+
+static int
+eth_dev_mtu_set(struct rte_eth_dev *dev, uint16_t mtu)
+{
+ struct pmd_internals *internals = dev->data->dev_private;
+ struct ifreq ifr = { .ifr_mtu = mtu };
+ int ret;
+ int s;
+
+ s = socket(PF_INET, SOCK_DGRAM, 0);
+ if (s < 0)
+ return -EINVAL;
+
+ snprintf(ifr.ifr_name, IFNAMSIZ, "%s", internals->if_name);
+ ret = ioctl(s, SIOCSIFMTU, &ifr);
+ close(s);
+
+ if (ret < 0)
+ return -EINVAL;
+
+ return 0;
+}
+
+static void
+eth_dev_change_flags(char *if_name, uint32_t flags, uint32_t mask)
+{
+ struct ifreq ifr;
+ int s;
+
+ s = socket(PF_INET, SOCK_DGRAM, 0);
+ if (s < 0)
+ return;
+
+ snprintf(ifr.ifr_name, IFNAMSIZ, "%s", if_name);
+ if (ioctl(s, SIOCGIFFLAGS, &ifr) < 0)
+ goto out;
+ ifr.ifr_flags &= mask;
+ ifr.ifr_flags |= flags;
+ if (ioctl(s, SIOCSIFFLAGS, &ifr) < 0)
+ goto out;
+out:
+ close(s);
+}
+
+static void
+eth_dev_promiscuous_enable(struct rte_eth_dev *dev)
+{
+ struct pmd_internals *internals = dev->data->dev_private;
+
+ eth_dev_change_flags(internals->if_name, IFF_PROMISC, ~0);
+}
+
+static void
+eth_dev_promiscuous_disable(struct rte_eth_dev *dev)
+{
+ struct pmd_internals *internals = dev->data->dev_private;
+
+ eth_dev_change_flags(internals->if_name, 0, ~IFF_PROMISC);
+}
+
+static const struct eth_dev_ops ops = {
+ .dev_start = eth_dev_start,
+ .dev_stop = eth_dev_stop,
+ .dev_close = eth_dev_close,
+ .dev_configure = eth_dev_configure,
+ .dev_infos_get = eth_dev_info,
+ .mtu_set = eth_dev_mtu_set,
+ .promiscuous_enable = eth_dev_promiscuous_enable,
+ .promiscuous_disable = eth_dev_promiscuous_disable,
+ .rx_queue_setup = eth_rx_queue_setup,
+ .tx_queue_setup = eth_tx_queue_setup,
+ .rx_queue_release = eth_queue_release,
+ .tx_queue_release = eth_queue_release,
+ .link_update = eth_link_update,
+ .stats_get = eth_stats_get,
+ .stats_reset = eth_stats_reset,
+};
+
+static struct rte_vdev_driver pmd_af_xdp_drv;
+
+static void
+parse_parameters(struct rte_kvargs *kvlist,
+ char **if_name,
+ int *queue_idx,
+ int *xsk_map_id,
+ int *xsk_map_key_start,
+ int *xsk_map_key_count)
+{
+ struct rte_kvargs_pair *pair = NULL;
+ unsigned int k_idx;
+
+ for (k_idx = 0; k_idx < kvlist->count; k_idx++) {
+ pair = &kvlist->pairs[k_idx];
+ if (strstr(pair->key, ETH_AF_XDP_IFACE_ARG))
+ *if_name = pair->value;
+ else if (strstr(pair->key, ETH_AF_XDP_QUEUE_IDX_ARG))
+ *queue_idx = atoi(pair->value);
+ else if (strstr(pair->key, ETH_AF_XDP_XSK_MAP_ID_ARG))
+ *xsk_map_id = atoi(pair->value);
+		else if (strstr(pair->key, ETH_AF_XDP_XSK_MAP_KEY_START_ARG))
+ *xsk_map_key_start = atoi(pair->value);
+ else if (strstr(pair->key, ETH_AF_XDP_XSK_MAP_KEY_COUNT_ARG))
+ *xsk_map_key_count = atoi(pair->value);
+ }
+}
+
+static int
+get_iface_info(const char *if_name,
+ struct ether_addr *eth_addr,
+ int *if_index)
+{
+ struct ifreq ifr;
+ int sock = socket(AF_INET, SOCK_DGRAM, IPPROTO_IP);
+
+ if (sock < 0)
+ return -1;
+
+	snprintf(ifr.ifr_name, IFNAMSIZ, "%s", if_name);
+ if (ioctl(sock, SIOCGIFINDEX, &ifr))
+ goto error;
+
+ if (ioctl(sock, SIOCGIFHWADDR, &ifr))
+ goto error;
+
+	memcpy(eth_addr, ifr.ifr_hwaddr.sa_data, ETHER_ADDR_LEN);
+
+ close(sock);
+ *if_index = if_nametoindex(if_name);
+ return 0;
+
+error:
+ close(sock);
+ return -1;
+}
+
+static int
+init_internals(struct rte_vdev_device *dev,
+ const char *if_name,
+ int queue_idx,
+ int xsk_map_id,
+ int xsk_map_key_start,
+ int xsk_map_key_count)
+{
+ const char *name = rte_vdev_device_name(dev);
+ struct rte_eth_dev *eth_dev = NULL;
+ const unsigned int numa_node = dev->device.numa_node;
+ struct pmd_internals *internals = NULL;
+ int ret;
+ int i;
+
+ internals = rte_zmalloc_socket(name, sizeof(*internals), 0, numa_node);
+ if (!internals)
+ return -ENOMEM;
+
+ internals->queue_idx = queue_idx;
+ internals->xsk_map_id = xsk_map_id;
+ internals->xsk_map_key_start = xsk_map_key_start;
+ internals->xsk_map_key_count = xsk_map_key_count;
+	snprintf(internals->if_name, IFNAMSIZ, "%s", if_name);
+
+ for (i = 0; i < ETH_AF_XDP_MAX_QUEUE_PAIRS; i++) {
+ internals->tx_queues[i].pair = &internals->rx_queues[i];
+ internals->rx_queues[i].pair = &internals->tx_queues[i];
+ }
+
+ ret = get_iface_info(if_name, &internals->eth_addr,
+ &internals->if_index);
+ if (ret)
+ goto err;
+
+ eth_dev = rte_eth_vdev_allocate(dev, 0);
+ if (!eth_dev)
+ goto err;
+
+ eth_dev->data->dev_private = internals;
+ eth_dev->data->dev_link = pmd_link;
+ eth_dev->data->mac_addrs = &internals->eth_addr;
+ eth_dev->dev_ops = &ops;
+ eth_dev->rx_pkt_burst = eth_af_xdp_rx;
+ eth_dev->tx_pkt_burst = eth_af_xdp_tx;
+
+ rte_eth_dev_probing_finish(eth_dev);
+ return 0;
+
+err:
+ rte_free(internals);
+ return -1;
+}
+
+static int
+rte_pmd_af_xdp_probe(struct rte_vdev_device *dev)
+{
+ struct rte_kvargs *kvlist;
+ char *if_name = NULL;
+ int queue_idx = ETH_AF_XDP_DFLT_QUEUE_IDX;
+ struct rte_eth_dev *eth_dev;
+ int xsk_map_id = -1;
+ int xsk_map_key_start = 0;
+ int xsk_map_key_count = 1;
+ const char *name;
+ int ret;
+
+ RTE_LOG(INFO, PMD, "Initializing pmd_af_packet for %s\n",
+ rte_vdev_device_name(dev));
+
+ name = rte_vdev_device_name(dev);
+ if (rte_eal_process_type() == RTE_PROC_SECONDARY &&
+ strlen(rte_vdev_device_args(dev)) == 0) {
+ eth_dev = rte_eth_dev_attach_secondary(name);
+ if (!eth_dev) {
+ RTE_LOG(ERR, PMD, "Failed to probe %s\n", name);
+ return -EINVAL;
+ }
+		eth_dev->dev_ops = &ops;
+		rte_eth_dev_probing_finish(eth_dev);
+		return 0;
+	}
+
+ kvlist = rte_kvargs_parse(rte_vdev_device_args(dev), valid_arguments);
+ if (!kvlist) {
+ RTE_LOG(ERR, PMD,
+ "Invalid kvargs\n");
+ return -EINVAL;
+ }
+
+ if (dev->device.numa_node == SOCKET_ID_ANY)
+ dev->device.numa_node = rte_socket_id();
+
+ parse_parameters(kvlist, &if_name,
+ &queue_idx,
+ &xsk_map_id,
+ &xsk_map_key_start,
+ &xsk_map_key_count);
+
+	if (xsk_map_id < 0) {
+		RTE_LOG(ERR, PMD,
+			"Invalid map id\n");
+		rte_kvargs_free(kvlist);
+		return -EINVAL;
+	}
+ ret = init_internals(dev, if_name, queue_idx, xsk_map_id,
+ xsk_map_key_start, xsk_map_key_count);
+
+ rte_kvargs_free(kvlist);
+
+ return ret;
+}
+
+static int
+rte_pmd_af_xdp_remove(struct rte_vdev_device *dev)
+{
+ struct rte_eth_dev *eth_dev = NULL;
+ struct pmd_internals *internals;
+ int i;
+
+ RTE_LOG(INFO, PMD, "Closing AF_XDP ethdev on numa socket %u\n",
+ rte_socket_id());
+
+ if (!dev)
+ return -1;
+
+ /* find the ethdev entry */
+ eth_dev = rte_eth_dev_allocated(rte_vdev_device_name(dev));
+ if (!eth_dev)
+ return -1;
+
+ internals = eth_dev->data->dev_private;
+
+ for (i = 0; i < internals->xsk_map_key_count; i++)
+ queue_reset(internals, i);
+
+	/* queue_reset() may already have destroyed the shared umem; frames
+	 * and the umem itself come from posix_memalign()/calloc(), so any
+	 * remainder must be released with free(), not rte_free().
+	 */
+	if (internals->umem_share) {
+		rte_ring_free(internals->umem_share->buf_ring);
+		free(internals->umem_share->frames);
+		free(internals->umem_share);
+	}
+ rte_free(internals);
+
+ rte_eth_dev_release_port(eth_dev);
+
+ return 0;
+}
+
+static struct rte_vdev_driver pmd_af_xdp_drv = {
+ .probe = rte_pmd_af_xdp_probe,
+ .remove = rte_pmd_af_xdp_remove,
+};
+
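+/*
+ * Illustrative invocation (the map id is hypothetical; it must name an
+ * XSKMAP already created by the XDP program loaded on the interface):
+ *
+ *   --vdev=net_af_xdp,iface=eth0,queue=0,xsk_map_id=53,\
+ *       xsk_map_key_start=0,xsk_map_key_count=4
+ */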
+RTE_PMD_REGISTER_VDEV(net_af_xdp, pmd_af_xdp_drv);
+RTE_PMD_REGISTER_ALIAS(net_af_xdp, eth_af_xdp);
+RTE_PMD_REGISTER_PARAM_STRING(net_af_xdp,
+ "iface=<string> "
+ "queue=<int> "
+ "xsk_map_id=<int> "
+ "xsk_map_key_start=<int> "
+ "xsk_map_key_count=<ind> ");
new file mode 100644
@@ -0,0 +1,4 @@
+DPDK_2.0 {
+
+ local: *;
+};
@@ -118,6 +118,7 @@ _LDLIBS-$(CONFIG_RTE_LIBRTE_DPAA2_MEMPOOL) += -lrte_mempool_dpaa2
endif
_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_AF_PACKET) += -lrte_pmd_af_packet
+_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_AF_XDP) += -lrte_pmd_af_xdp -lelf -lbpf
_LDLIBS-$(CONFIG_RTE_LIBRTE_ARK_PMD) += -lrte_pmd_ark
_LDLIBS-$(CONFIG_RTE_LIBRTE_AVF_PMD) += -lrte_pmd_avf
_LDLIBS-$(CONFIG_RTE_LIBRTE_AVP_PMD) += -lrte_pmd_avp