diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c index bf5baef5c3e065..4948b4906584e0 100644 --- a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c +++ b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c @@ -2281,7 +2281,7 @@ static int dpaa_a050385_wa_xdpf(struct dpaa_priv *priv, new_xdpf->len = xdpf->len; new_xdpf->headroom = priv->tx_headroom; new_xdpf->frame_sz = DPAA_BP_RAW_SIZE; - new_xdpf->mem.type = MEM_TYPE_PAGE_ORDER0; + new_xdpf->mem_type = MEM_TYPE_PAGE_ORDER0; /* Release the initial buffer */ xdp_return_frame_rx_napi(xdpf); diff --git a/drivers/net/ethernet/intel/Kconfig b/drivers/net/ethernet/intel/Kconfig index 20bc40eec487a3..63a7ee55f0d7e9 100644 --- a/drivers/net/ethernet/intel/Kconfig +++ b/drivers/net/ethernet/intel/Kconfig @@ -291,6 +291,7 @@ config ICE select AUXILIARY_BUS select DIMLIB select LIBIE + select LIBETH_XDP select NET_DEVLINK select PLDMFW select DPLL diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c index 4e885df789ef4d..e28f1905a4a0f8 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c +++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c @@ -395,32 +395,6 @@ static void i40e_handle_xdp_result_zc(struct i40e_ring *rx_ring, WARN_ON_ONCE(1); } -static int -i40e_add_xsk_frag(struct i40e_ring *rx_ring, struct xdp_buff *first, - struct xdp_buff *xdp, const unsigned int size) -{ - struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(first); - - if (!xdp_buff_has_frags(first)) { - sinfo->nr_frags = 0; - sinfo->xdp_frags_size = 0; - xdp_buff_set_frags_flag(first); - } - - if (unlikely(sinfo->nr_frags == MAX_SKB_FRAGS)) { - xsk_buff_free(first); - return -ENOMEM; - } - - __skb_fill_page_desc_noacc(sinfo, sinfo->nr_frags++, - virt_to_page(xdp->data_hard_start), - XDP_PACKET_HEADROOM, size); - sinfo->xdp_frags_size += size; - xsk_buff_add_frag(xdp); - - return 0; -} - /** * i40e_clean_rx_irq_zc - Consumes Rx packets from the hardware ring * @rx_ring: Rx ring @@ -486,8 +460,10 @@ int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget) if (!first) first = bi; - else if (i40e_add_xsk_frag(rx_ring, first, bi, size)) + else if (!xsk_buff_add_frag(first, bi)) { + xsk_buff_free(first); break; + } if (++next_to_process == count) next_to_process = 0; diff --git a/drivers/net/ethernet/intel/ice/ice.h b/drivers/net/ethernet/intel/ice/ice.h index 680a81961ba191..aa54eba3694f43 100644 --- a/drivers/net/ethernet/intel/ice/ice.h +++ b/drivers/net/ethernet/intel/ice/ice.h @@ -372,6 +372,8 @@ struct ice_vsi { spinlock_t arfs_lock; /* protects aRFS hash table and filter state */ atomic_t *arfs_last_fltr_id; + u16 max_frame; + struct ice_aqc_vsi_props info; /* VSI properties */ struct ice_vsi_vlan_info vlan_info; /* vlan config to be restored */ @@ -511,7 +513,6 @@ enum ice_pf_flags { ICE_FLAG_MOD_POWER_UNSUPPORTED, ICE_FLAG_PHY_FW_LOAD_FAILED, ICE_FLAG_ETHTOOL_CTXT, /* set when ethtool holds RTNL lock */ - ICE_FLAG_LEGACY_RX, ICE_FLAG_VF_TRUE_PROMISC_ENA, ICE_FLAG_MDD_AUTO_RESET_VF, ICE_FLAG_VF_VLAN_PRUNING, diff --git a/drivers/net/ethernet/intel/ice/ice_base.c b/drivers/net/ethernet/intel/ice/ice_base.c index 3a8e156d7d86cf..f6cb65c6921d51 100644 --- a/drivers/net/ethernet/intel/ice/ice_base.c +++ b/drivers/net/ethernet/intel/ice/ice_base.c @@ -359,19 +359,6 @@ ice_setup_tx_ctx(struct ice_tx_ring *ring, struct ice_tlan_ctx *tlan_ctx, u16 pf tlan_ctx->legacy_int = ICE_TX_LEGACY; } -/** - * ice_rx_offset - Return expected offset into page to access data - * @rx_ring: Ring we are requesting offset of - * - * Returns the offset value for ring into the data buffer. - */ -static unsigned int ice_rx_offset(struct ice_rx_ring *rx_ring) -{ - if (ice_ring_uses_build_skb(rx_ring)) - return ICE_SKB_PAD; - return 0; -} - /** * ice_setup_rx_ctx - Configure a receive ring context * @ring: The Rx ring to configure @@ -406,7 +393,7 @@ static int ice_setup_rx_ctx(struct ice_rx_ring *ring) /* Receive Packet Data Buffer Size. * The Packet Data Buffer Size is defined in 128 byte units. */ - rlan_ctx.dbuf = DIV_ROUND_UP(ring->rx_buf_len, + rlan_ctx.dbuf = DIV_ROUND_UP(ICE_RXBUF_3072, BIT_ULL(ICE_RLAN_CTX_DBUF_S)); /* use 32 byte descriptors */ @@ -447,8 +434,8 @@ static int ice_setup_rx_ctx(struct ice_rx_ring *ring) /* Max packet size for this queue - must not be set to a larger value * than 5 x DBUF */ - rlan_ctx.rxmax = min_t(u32, ring->max_frame, - ICE_MAX_CHAINED_RX_BUFS * ring->rx_buf_len); + rlan_ctx.rxmax = min_t(u32, vsi->max_frame, + ICE_MAX_CHAINED_RX_BUFS * ICE_RXBUF_3072); /* Rx queue threshold in units of 64 */ rlan_ctx.lrxqthresh = 1; @@ -484,13 +471,7 @@ static int ice_setup_rx_ctx(struct ice_rx_ring *ring) if (vsi->type == ICE_VSI_VF) return 0; - /* configure Rx buffer alignment */ - if (!vsi->netdev || test_bit(ICE_FLAG_LEGACY_RX, vsi->back->flags)) - ice_clear_ring_build_skb_ena(ring); - else - ice_set_ring_build_skb_ena(ring); - - ring->rx_offset = ice_rx_offset(ring); + ring->rx_offset = ICE_SKB_PAD; /* init queue specific tail register */ ring->tail = hw->hw_addr + QRX_TAIL(pf_q); @@ -525,7 +506,7 @@ static unsigned int ice_get_frame_sz(struct ice_rx_ring *rx_ring) #if (PAGE_SIZE >= 8192) frame_sz = rx_ring->rx_buf_len; #else - frame_sz = ice_rx_pg_size(rx_ring) / 2; + frame_sz = PAGE_SIZE / 2; #endif return frame_sz; @@ -541,6 +522,7 @@ static int ice_vsi_cfg_rxq(struct ice_rx_ring *ring) { struct device *dev = ice_pf_to_dev(ring->vsi->back); u32 num_bufs = ICE_RX_DESC_UNUSED(ring); + u32 rx_buf_len; int err; if (ring->vsi->type == ICE_VSI_PF || ring->vsi->type == ICE_VSI_SF) { @@ -557,12 +539,12 @@ static int ice_vsi_cfg_rxq(struct ice_rx_ring *ring) if (ring->xsk_pool) { xdp_rxq_info_unreg(&ring->xdp_rxq); - ring->rx_buf_len = + rx_buf_len = xsk_pool_get_rx_frame_size(ring->xsk_pool); err = __xdp_rxq_info_reg(&ring->xdp_rxq, ring->netdev, ring->q_index, ring->q_vector->napi.napi_id, - ring->rx_buf_len); + rx_buf_len); if (err) return err; err = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq, @@ -585,15 +567,16 @@ static int ice_vsi_cfg_rxq(struct ice_rx_ring *ring) return err; } - err = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq, - MEM_TYPE_PAGE_SHARED, - NULL); - if (err) - return err; + //err = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq, + //MEM_TYPE_PAGE_SHARED, + //NULL); + // MEM_TYPE_PAGE_POOL, + // ring->pp); + xdp_rxq_info_attach_page_pool(&ring->xdp_rxq, + ring->pp); } } - xdp_init_buff(&ring->xdp, ice_get_frame_sz(ring), &ring->xdp_rxq); ring->xdp.data = NULL; ring->xdp_ext.pkt_ctx = &ring->pkt_ctx; err = ice_setup_rx_ctx(ring); @@ -625,7 +608,7 @@ static int ice_vsi_cfg_rxq(struct ice_rx_ring *ring) return 0; } - ice_alloc_rx_bufs(ring, num_bufs); + err = ice_alloc_rx_bufs(ring, num_bufs); return 0; } @@ -648,18 +631,15 @@ int ice_vsi_cfg_single_rxq(struct ice_vsi *vsi, u16 q_idx) */ static void ice_vsi_cfg_frame_size(struct ice_vsi *vsi, struct ice_rx_ring *ring) { - if (!vsi->netdev || test_bit(ICE_FLAG_LEGACY_RX, vsi->back->flags)) { - ring->max_frame = ICE_MAX_FRAME_LEGACY_RX; - ring->rx_buf_len = ICE_RXBUF_1664; + if (!vsi->netdev) { + vsi->max_frame = ICE_MAX_FRAME_LEGACY_RX; #if (PAGE_SIZE < 8192) } else if (!ICE_2K_TOO_SMALL_WITH_PADDING && (vsi->netdev->mtu <= ETH_DATA_LEN)) { - ring->max_frame = ICE_RXBUF_1536 - NET_IP_ALIGN; - ring->rx_buf_len = ICE_RXBUF_1536 - NET_IP_ALIGN; + vsi->max_frame = ICE_RXBUF_1536 - NET_IP_ALIGN; #endif } else { - ring->max_frame = ICE_AQ_SET_MAC_FRAME_SIZE_MAX; - ring->rx_buf_len = ICE_RXBUF_3072; + vsi->max_frame = ICE_AQ_SET_MAC_FRAME_SIZE_MAX; } } diff --git a/drivers/net/ethernet/intel/ice/ice_ethtool.c b/drivers/net/ethernet/intel/ice/ice_ethtool.c index 2924ac61300dab..1595fd5d4e85f6 100644 --- a/drivers/net/ethernet/intel/ice/ice_ethtool.c +++ b/drivers/net/ethernet/intel/ice/ice_ethtool.c @@ -10,6 +10,7 @@ #include "ice_lib.h" #include "ice_dcb_lib.h" #include +#include struct ice_stats { char stat_string[ETH_GSTRING_LEN]; @@ -340,7 +341,6 @@ static const struct ice_priv_flag ice_gstrings_priv_flags[] = { ICE_FLAG_VF_TRUE_PROMISC_ENA), ICE_PRIV_FLAG("mdd-auto-reset-vf", ICE_FLAG_MDD_AUTO_RESET_VF), ICE_PRIV_FLAG("vf-vlan-pruning", ICE_FLAG_VF_VLAN_PRUNING), - ICE_PRIV_FLAG("legacy-rx", ICE_FLAG_LEGACY_RX), }; #define ICE_PRIV_FLAG_ARRAY_SIZE ARRAY_SIZE(ice_gstrings_priv_flags) @@ -1252,7 +1252,7 @@ static int ice_diag_send(struct ice_tx_ring *tx_ring, u8 *data, u16 size) */ static int ice_lbtest_receive_frames(struct ice_rx_ring *rx_ring) { - struct ice_rx_buf *rx_buf; + struct libeth_fqe *rx_buf; int valid_frames, i; u8 *received_buf; @@ -1268,8 +1268,8 @@ static int ice_lbtest_receive_frames(struct ice_rx_ring *rx_ring) cpu_to_le16(BIT(ICE_RX_FLEX_DESC_STATUS0_EOF_S))))) continue; - rx_buf = &rx_ring->rx_buf[i]; - received_buf = page_address(rx_buf->page) + rx_buf->page_offset; + rx_buf = &rx_ring->rx_fqes[i]; + received_buf = page_address(rx_buf->page) + rx_buf->offset; if (ice_lbtest_check_frame(received_buf)) valid_frames++; @@ -1882,10 +1882,6 @@ static int ice_set_priv_flags(struct net_device *netdev, u32 flags) ice_nway_reset(netdev); } } - if (test_bit(ICE_FLAG_LEGACY_RX, change_flags)) { - /* down and up VSI so that changes of Rx cfg are reflected. */ - ice_down_up(vsi); - } /* don't allow modification of this flag when a single VF is in * promiscuous mode because it's not supported */ @@ -3355,7 +3351,6 @@ ice_set_ringparam(struct net_device *netdev, struct ethtool_ringparam *ring, rx_rings[i].count = new_rx_cnt; rx_rings[i].cached_phctime = pf->ptp.cached_phc_time; rx_rings[i].desc = NULL; - rx_rings[i].rx_buf = NULL; /* this is to allow wr32 to have something to write to * during early allocation of Rx buffers */ diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index a6f586f9bfd170..e01c785d960f2e 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -37,6 +37,8 @@ static const char ice_copyright[] = "Copyright (c) 2018, Intel Corporation."; #define ICE_DDP_PKG_FILE ICE_DDP_PKG_PATH "ice.pkg" MODULE_DESCRIPTION(DRV_SUMMARY); +MODULE_IMPORT_NS(LIBETH_XDP); +MODULE_IMPORT_NS(LIBETH); MODULE_IMPORT_NS(LIBIE); MODULE_LICENSE("GPL v2"); MODULE_FIRMWARE(ICE_DDP_PKG_FILE); @@ -2983,10 +2985,7 @@ int ice_vsi_determine_xdp_res(struct ice_vsi *vsi) */ static int ice_max_xdp_frame_size(struct ice_vsi *vsi) { - if (test_bit(ICE_FLAG_LEGACY_RX, vsi->back->flags)) - return ICE_RXBUF_1664; - else - return ICE_RXBUF_3072; + return ICE_RXBUF_3072; } /** @@ -4859,8 +4858,8 @@ static void ice_init_features(struct ice_pf *pf) ice_dpll_init(pf); /* Note: Flow director init failure is non-fatal to load */ - if (ice_init_fdir(pf)) - dev_err(dev, "could not initialize flow director\n"); + //if (ice_init_fdir(pf)) + // dev_err(dev, "could not initialize flow director\n"); /* Note: DCB init failure is non-fatal to load */ if (ice_init_pf_dcb(pf, false)) { @@ -4884,7 +4883,7 @@ static void ice_deinit_features(struct ice_pf *pf) ice_deinit_lag(pf); if (test_bit(ICE_FLAG_DCB_CAPABLE, pf->flags)) ice_cfg_lldp_mib_change(&pf->hw, false); - ice_deinit_fdir(pf); + //ice_deinit_fdir(pf); if (ice_is_feature_supported(pf, ICE_F_GNSS)) ice_gnss_exit(pf); if (test_bit(ICE_FLAG_PTP_SUPPORTED, pf->flags)) @@ -7806,12 +7805,6 @@ int ice_change_mtu(struct net_device *netdev, int new_mtu) frame_size - ICE_ETH_PKT_HDR_PAD); return -EINVAL; } - } else if (test_bit(ICE_FLAG_LEGACY_RX, pf->flags)) { - if (new_mtu + ICE_ETH_PKT_HDR_PAD > ICE_MAX_FRAME_LEGACY_RX) { - netdev_err(netdev, "Too big MTU for legacy-rx; Max is %d\n", - ICE_MAX_FRAME_LEGACY_RX - ICE_ETH_PKT_HDR_PAD); - return -EINVAL; - } } /* if a reset is in progress, wait for some time for it to complete */ diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c index 5d2d7736fd5f12..ad1e56f6e1fcf9 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx.c +++ b/drivers/net/ethernet/intel/ice/ice_txrx.c @@ -7,7 +7,9 @@ #include #include #include +#include #include +#include #include #include #include "ice_txrx_lib.h" @@ -112,7 +114,9 @@ ice_prgm_fdir_fltr(struct ice_vsi *vsi, struct ice_fltr_desc *fdir_desc, static void ice_unmap_and_free_tx_buf(struct ice_tx_ring *ring, struct ice_tx_buf *tx_buf) { - if (dma_unmap_len(tx_buf, len)) + struct page *page; + + if (tx_buf->type != ICE_TX_BUF_XDP_TX && dma_unmap_len(tx_buf, len)) dma_unmap_page(ring->dev, dma_unmap_addr(tx_buf, dma), dma_unmap_len(tx_buf, len), @@ -126,7 +130,8 @@ ice_unmap_and_free_tx_buf(struct ice_tx_ring *ring, struct ice_tx_buf *tx_buf) dev_kfree_skb_any(tx_buf->skb); break; case ICE_TX_BUF_XDP_TX: - page_frag_free(tx_buf->raw_buf); + page = virt_to_page(tx_buf->raw_buf); + page_pool_put_full_page(page->pp, page, false); break; case ICE_TX_BUF_XDP_XMIT: xdp_return_frame(tx_buf->xdpf); @@ -382,13 +387,10 @@ int ice_setup_tx_ring(struct ice_tx_ring *tx_ring) */ void ice_clean_rx_ring(struct ice_rx_ring *rx_ring) { - struct xdp_buff *xdp = &rx_ring->xdp; - struct device *dev = rx_ring->dev; u32 size; - u16 i; /* ring already cleared, nothing to do */ - if (!rx_ring->rx_buf) + if (!rx_ring->rx_fqes) return; if (rx_ring->xsk_pool) { @@ -396,40 +398,27 @@ void ice_clean_rx_ring(struct ice_rx_ring *rx_ring) goto rx_skip_free; } - if (xdp->data) { - xdp_return_buff(xdp); - xdp->data = NULL; - } + libeth_xdp_return_stash(&rx_ring->xdp); /* Free all the Rx ring sk_buffs */ - for (i = 0; i < rx_ring->count; i++) { - struct ice_rx_buf *rx_buf = &rx_ring->rx_buf[i]; - - if (!rx_buf->page) - continue; + //printk("queue_idx = %d, ntc = %d, ntu = %d\n", rx_ring->q_index, rx_ring->next_to_clean, rx_ring->next_to_use); + for (u32 i = rx_ring->next_to_clean; i != rx_ring->next_to_use; ) { +// for (u32 i = 0; i < rx_ring->count; i++) { + struct libeth_fqe *rx_fqes = &rx_ring->rx_fqes[i]; + + if (rx_fqes->page) { + page_pool_put_full_page(rx_ring->pp, rx_fqes->page, false); + rx_fqes->page = NULL; + rx_fqes->offset = 0; + } - /* Invalidate cache lines that may have been written to by - * device so that we avoid corrupting memory. - */ - dma_sync_single_range_for_cpu(dev, rx_buf->dma, - rx_buf->page_offset, - rx_ring->rx_buf_len, - DMA_FROM_DEVICE); - - /* free resources associated with mapping */ - dma_unmap_page_attrs(dev, rx_buf->dma, ice_rx_pg_size(rx_ring), - DMA_FROM_DEVICE, ICE_RX_DMA_ATTR); - __page_frag_cache_drain(rx_buf->page, rx_buf->pagecnt_bias); - - rx_buf->page = NULL; - rx_buf->page_offset = 0; + if (unlikely(++i == rx_ring->count)) + i = 0; } rx_skip_free: if (rx_ring->xsk_pool) memset(rx_ring->xdp_buf, 0, array_size(rx_ring->count, sizeof(*rx_ring->xdp_buf))); - else - memset(rx_ring->rx_buf, 0, array_size(rx_ring->count, sizeof(*rx_ring->rx_buf))); /* Zero out the descriptor ring */ size = ALIGN(rx_ring->count * sizeof(union ice_32byte_rx_desc), @@ -450,19 +439,24 @@ void ice_clean_rx_ring(struct ice_rx_ring *rx_ring) */ void ice_free_rx_ring(struct ice_rx_ring *rx_ring) { + struct libeth_fq fq = { + .fqes = rx_ring->rx_fqes, + .pp = rx_ring->pp, + }; u32 size; + netmem_ref netmem = rx_ring->pp->frag_page; + atomic_long_t *pp_ref_count = netmem_get_pp_ref_count_ref(netmem); ice_clean_rx_ring(rx_ring); if (rx_ring->vsi->type == ICE_VSI_PF) - if (xdp_rxq_info_is_reg(&rx_ring->xdp_rxq)) + if (xdp_rxq_info_is_reg(&rx_ring->xdp_rxq)) { + xdp_rxq_info_detach_mem_model(&rx_ring->xdp_rxq); xdp_rxq_info_unreg(&rx_ring->xdp_rxq); + } WRITE_ONCE(rx_ring->xdp_prog, NULL); if (rx_ring->xsk_pool) { kfree(rx_ring->xdp_buf); rx_ring->xdp_buf = NULL; - } else { - kfree(rx_ring->rx_buf); - rx_ring->rx_buf = NULL; } if (rx_ring->desc) { @@ -472,6 +466,14 @@ void ice_free_rx_ring(struct ice_rx_ring *rx_ring) rx_ring->desc, rx_ring->dma); rx_ring->desc = NULL; } + + //printk("pp_ref_count = %ld\n", atomic_long_read(pp_ref_count)); + //printk("destroy pp pool = %p, frag_users = %ld, frag_page = %lx\n", rx_ring->pp, + // rx_ring->pp->frag_users, + // rx_ring->pp->frag_page); + libeth_rx_fq_destroy(&fq); + rx_ring->rx_fqes = NULL; + rx_ring->pp = NULL; } /** @@ -483,17 +485,23 @@ void ice_free_rx_ring(struct ice_rx_ring *rx_ring) int ice_setup_rx_ring(struct ice_rx_ring *rx_ring) { struct device *dev = rx_ring->dev; + struct libeth_fq fq = { + .count = rx_ring->count, + .buf_len = LIBIE_MAX_RX_BUF_LEN, + .nid = NUMA_NO_NODE, + }; u32 size; + int ret; - if (!dev) - return -ENOMEM; + ret = libeth_rx_fq_create(&fq, &rx_ring->q_vector->napi); + if (ret) + return ret; + + rx_ring->pp = fq.pp; + rx_ring->rx_fqes = fq.fqes; + rx_ring->truesize = fq.truesize; + rx_ring->rx_buf_len = fq.buf_len; - /* warn if we are about to overwrite the pointer */ - WARN_ON(rx_ring->rx_buf); - rx_ring->rx_buf = - kcalloc(rx_ring->count, sizeof(*rx_ring->rx_buf), GFP_KERNEL); - if (!rx_ring->rx_buf) - return -ENOMEM; /* round up to nearest page */ size = ALIGN(rx_ring->count * sizeof(union ice_32byte_rx_desc), @@ -516,8 +524,9 @@ int ice_setup_rx_ring(struct ice_rx_ring *rx_ring) return 0; err: - kfree(rx_ring->rx_buf); - rx_ring->rx_buf = NULL; + libeth_rx_fq_destroy(&fq); + rx_ring->rx_fqes = NULL; + rx_ring->pp = NULL; return -ENOMEM; } @@ -527,39 +536,39 @@ int ice_setup_rx_ring(struct ice_rx_ring *rx_ring) * @xdp: xdp_buff used as input to the XDP program * @xdp_prog: XDP program to run * @xdp_ring: ring to be used for XDP_TX action - * @rx_buf: Rx buffer to store the XDP action * @eop_desc: Last descriptor in packet to read metadata from * * Returns any of ICE_XDP_{PASS, CONSUMED, TX, REDIR} */ -static void -ice_run_xdp(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp, +static unsigned int +ice_run_xdp(struct ice_rx_ring *rx_ring, struct libeth_xdp_buff *xdp, struct bpf_prog *xdp_prog, struct ice_tx_ring *xdp_ring, - struct ice_rx_buf *rx_buf, union ice_32b_rx_flex_desc *eop_desc) + union ice_32b_rx_flex_desc *eop_desc) { unsigned int ret = ICE_XDP_PASS; + struct page *page; u32 act; if (!xdp_prog) goto exit; - ice_xdp_meta_set_desc(xdp, eop_desc); + ice_xdp_meta_set_desc(&xdp->base, eop_desc); - act = bpf_prog_run_xdp(xdp_prog, xdp); + act = bpf_prog_run_xdp(xdp_prog, &xdp->base); switch (act) { case XDP_PASS: break; case XDP_TX: if (static_branch_unlikely(&ice_xdp_locking_key)) spin_lock(&xdp_ring->tx_lock); - ret = __ice_xmit_xdp_ring(xdp, xdp_ring, false); + ret = __ice_xmit_xdp_ring(&xdp->base, xdp_ring, false); if (static_branch_unlikely(&ice_xdp_locking_key)) spin_unlock(&xdp_ring->tx_lock); if (ret == ICE_XDP_CONSUMED) goto out_failure; break; case XDP_REDIRECT: - if (xdp_do_redirect(rx_ring->netdev, xdp, xdp_prog)) + if (xdp_do_redirect(rx_ring->netdev, &xdp->base, xdp_prog)) goto out_failure; ret = ICE_XDP_REDIR; break; @@ -571,10 +580,12 @@ ice_run_xdp(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp, trace_xdp_exception(rx_ring->netdev, xdp_prog, act); fallthrough; case XDP_DROP: + libeth_xdp_return_buff(xdp); ret = ICE_XDP_CONSUMED; } + exit: - ice_set_rx_bufs_act(xdp, rx_ring, ret); + return ret; } /** @@ -660,53 +671,6 @@ ice_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames, return nxmit; } -/** - * ice_alloc_mapped_page - recycle or make a new page - * @rx_ring: ring to use - * @bi: rx_buf struct to modify - * - * Returns true if the page was successfully allocated or - * reused. - */ -static bool -ice_alloc_mapped_page(struct ice_rx_ring *rx_ring, struct ice_rx_buf *bi) -{ - struct page *page = bi->page; - dma_addr_t dma; - - /* since we are recycling buffers we should seldom need to alloc */ - if (likely(page)) - return true; - - /* alloc new page for storage */ - page = dev_alloc_pages(ice_rx_pg_order(rx_ring)); - if (unlikely(!page)) { - rx_ring->ring_stats->rx_stats.alloc_page_failed++; - return false; - } - - /* map page for use */ - dma = dma_map_page_attrs(rx_ring->dev, page, 0, ice_rx_pg_size(rx_ring), - DMA_FROM_DEVICE, ICE_RX_DMA_ATTR); - - /* if mapping failed free memory back to system since - * there isn't much point in holding memory we can't use - */ - if (dma_mapping_error(rx_ring->dev, dma)) { - __free_pages(page, ice_rx_pg_order(rx_ring)); - rx_ring->ring_stats->rx_stats.alloc_page_failed++; - return false; - } - - bi->dma = dma; - bi->page = page; - bi->page_offset = rx_ring->rx_offset; - page_ref_add(page, USHRT_MAX - 1); - bi->pagecnt_bias = USHRT_MAX; - - return true; -} - /** * ice_alloc_rx_bufs - Replace used receive buffers * @rx_ring: ring to place buffers on @@ -722,9 +686,14 @@ ice_alloc_mapped_page(struct ice_rx_ring *rx_ring, struct ice_rx_buf *bi) */ bool ice_alloc_rx_bufs(struct ice_rx_ring *rx_ring, unsigned int cleaned_count) { + const struct libeth_fq_fp fq = { + .pp = rx_ring->pp, + .fqes = rx_ring->rx_fqes, + .truesize = rx_ring->truesize, + .count = rx_ring->count, + }; union ice_32b_rx_flex_desc *rx_desc; u16 ntu = rx_ring->next_to_use; - struct ice_rx_buf *bi; /* do nothing if no valid netdev defined */ if ((!rx_ring->netdev && rx_ring->vsi->type != ICE_VSI_CTRL) || @@ -733,30 +702,28 @@ bool ice_alloc_rx_bufs(struct ice_rx_ring *rx_ring, unsigned int cleaned_count) /* get the Rx descriptor and buffer based on next_to_use */ rx_desc = ICE_RX_DESC(rx_ring, ntu); - bi = &rx_ring->rx_buf[ntu]; + + //printk("alloc_rx_bufs ----- ntu = %d, cleaned_count = %d\n", ntu, cleaned_count); do { - /* if we fail here, we have work remaining */ - if (!ice_alloc_mapped_page(rx_ring, bi)) - break; + dma_addr_t addr; + + addr = libeth_rx_alloc(&fq, ntu); - /* sync the buffer for use by the device */ - dma_sync_single_range_for_device(rx_ring->dev, bi->dma, - bi->page_offset, - rx_ring->rx_buf_len, - DMA_FROM_DEVICE); + if (addr == DMA_MAPPING_ERROR) { + rx_ring->ring_stats->rx_stats.alloc_page_failed++; + break; + } /* Refresh the desc even if buffer_addrs didn't change * because each write-back erases this info. */ - rx_desc->read.pkt_addr = cpu_to_le64(bi->dma + bi->page_offset); + rx_desc->read.pkt_addr = cpu_to_le64(addr); rx_desc++; - bi++; ntu++; if (unlikely(ntu == rx_ring->count)) { rx_desc = ICE_RX_DESC(rx_ring, 0); - bi = rx_ring->rx_buf; ntu = 0; } @@ -765,6 +732,8 @@ bool ice_alloc_rx_bufs(struct ice_rx_ring *rx_ring, unsigned int cleaned_count) cleaned_count--; } while (cleaned_count); + + //printk("AFTER alloc_rx_bufs ----- ntu = %d, cleaned_count = %d\n", ntu, cleaned_count); if (rx_ring->next_to_use != ntu) ice_release_rx_desc(rx_ring, ntu); @@ -772,69 +741,6 @@ bool ice_alloc_rx_bufs(struct ice_rx_ring *rx_ring, unsigned int cleaned_count) return !!cleaned_count; } -/** - * ice_rx_buf_adjust_pg_offset - Prepare Rx buffer for reuse - * @rx_buf: Rx buffer to adjust - * @size: Size of adjustment - * - * Update the offset within page so that Rx buf will be ready to be reused. - * For systems with PAGE_SIZE < 8192 this function will flip the page offset - * so the second half of page assigned to Rx buffer will be used, otherwise - * the offset is moved by "size" bytes - */ -static void -ice_rx_buf_adjust_pg_offset(struct ice_rx_buf *rx_buf, unsigned int size) -{ -#if (PAGE_SIZE < 8192) - /* flip page offset to other buffer */ - rx_buf->page_offset ^= size; -#else - /* move offset up to the next cache line */ - rx_buf->page_offset += size; -#endif -} - -/** - * ice_can_reuse_rx_page - Determine if page can be reused for another Rx - * @rx_buf: buffer containing the page - * - * If page is reusable, we have a green light for calling ice_reuse_rx_page, - * which will assign the current buffer to the buffer that next_to_alloc is - * pointing to; otherwise, the DMA mapping needs to be destroyed and - * page freed - */ -static bool -ice_can_reuse_rx_page(struct ice_rx_buf *rx_buf) -{ - unsigned int pagecnt_bias = rx_buf->pagecnt_bias; - struct page *page = rx_buf->page; - - /* avoid re-using remote and pfmemalloc pages */ - if (!dev_page_is_reusable(page)) - return false; - - /* if we are only owner of page we can reuse it */ - if (unlikely(rx_buf->pgcnt - pagecnt_bias > 1)) - return false; -#if (PAGE_SIZE >= 8192) -#define ICE_LAST_OFFSET \ - (SKB_WITH_OVERHEAD(PAGE_SIZE) - ICE_RXBUF_3072) - if (rx_buf->page_offset > ICE_LAST_OFFSET) - return false; -#endif /* PAGE_SIZE >= 8192) */ - - /* If we have drained the page fragment pool we need to update - * the pagecnt_bias and page count so that we fully restock the - * number of references the driver holds. - */ - if (unlikely(pagecnt_bias == 1)) { - page_ref_add(page, USHRT_MAX - 1); - rx_buf->pagecnt_bias = USHRT_MAX; - } - - return true; -} - /** * ice_add_xdp_frag - Add contents of Rx buffer to xdp buf as a frag * @rx_ring: Rx descriptor ring to transact packets on @@ -847,7 +753,7 @@ ice_can_reuse_rx_page(struct ice_rx_buf *rx_buf) */ static int ice_add_xdp_frag(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp, - struct ice_rx_buf *rx_buf, const unsigned int size) + struct libeth_fqe *rx_buf, const unsigned int size) { struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp); @@ -860,13 +766,11 @@ ice_add_xdp_frag(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp, xdp_buff_set_frags_flag(xdp); } - if (unlikely(sinfo->nr_frags == MAX_SKB_FRAGS)) { - ice_set_rx_bufs_act(xdp, rx_ring, ICE_XDP_CONSUMED); + if (unlikely(sinfo->nr_frags == MAX_SKB_FRAGS)) return -ENOMEM; - } __skb_fill_page_desc_noacc(sinfo, sinfo->nr_frags++, rx_buf->page, - rx_buf->page_offset, size); + rx_buf->offset, size); sinfo->xdp_frags_size += size; /* remember frag count before XDP prog execution; bpf_xdp_adjust_tail() * can pop off frags but driver has to handle it on its own @@ -879,67 +783,6 @@ ice_add_xdp_frag(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp, return 0; } -/** - * ice_reuse_rx_page - page flip buffer and store it back on the ring - * @rx_ring: Rx descriptor ring to store buffers on - * @old_buf: donor buffer to have page reused - * - * Synchronizes page for reuse by the adapter - */ -static void -ice_reuse_rx_page(struct ice_rx_ring *rx_ring, struct ice_rx_buf *old_buf) -{ - u16 nta = rx_ring->next_to_alloc; - struct ice_rx_buf *new_buf; - - new_buf = &rx_ring->rx_buf[nta]; - - /* update, and store next to alloc */ - nta++; - rx_ring->next_to_alloc = (nta < rx_ring->count) ? nta : 0; - - /* Transfer page from old buffer to new buffer. - * Move each member individually to avoid possible store - * forwarding stalls and unnecessary copy of skb. - */ - new_buf->dma = old_buf->dma; - new_buf->page = old_buf->page; - new_buf->page_offset = old_buf->page_offset; - new_buf->pagecnt_bias = old_buf->pagecnt_bias; -} - -/** - * ice_get_rx_buf - Fetch Rx buffer and synchronize data for use - * @rx_ring: Rx descriptor ring to transact packets on - * @size: size of buffer to add to skb - * @ntc: index of next to clean element - * - * This function will pull an Rx buffer from the ring and synchronize it - * for use by the CPU. - */ -static struct ice_rx_buf * -ice_get_rx_buf(struct ice_rx_ring *rx_ring, const unsigned int size, - const unsigned int ntc) -{ - struct ice_rx_buf *rx_buf; - - rx_buf = &rx_ring->rx_buf[ntc]; - rx_buf->pgcnt = page_count(rx_buf->page); - prefetchw(rx_buf->page); - - if (!size) - return rx_buf; - /* we are reusing so sync this buffer for CPU use */ - dma_sync_single_range_for_cpu(rx_ring->dev, rx_buf->dma, - rx_buf->page_offset, size, - DMA_FROM_DEVICE); - - /* We have pulled a buffer for use, so decrement pagecnt_bias */ - rx_buf->pagecnt_bias--; - - return rx_buf; -} - /** * ice_build_skb - Build skb around an existing buffer * @rx_ring: Rx descriptor ring to transact packets on @@ -973,6 +816,8 @@ ice_build_skb(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp) if (unlikely(!skb)) return NULL; + skb_mark_for_recycle(skb); + /* must to record Rx queue, otherwise OS features such as * symmetric queue won't work */ @@ -989,120 +834,10 @@ ice_build_skb(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp) sinfo->xdp_frags_size, nr_frags * xdp->frame_sz, xdp_buff_is_frag_pfmemalloc(xdp)); - + return skb; } -/** - * ice_construct_skb - Allocate skb and populate it - * @rx_ring: Rx descriptor ring to transact packets on - * @xdp: xdp_buff pointing to the data - * - * This function allocates an skb. It then populates it with the page - * data from the current receive descriptor, taking care to set up the - * skb correctly. - */ -static struct sk_buff * -ice_construct_skb(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp) -{ - unsigned int size = xdp->data_end - xdp->data; - struct skb_shared_info *sinfo = NULL; - struct ice_rx_buf *rx_buf; - unsigned int nr_frags = 0; - unsigned int headlen; - struct sk_buff *skb; - - /* prefetch first cache line of first page */ - net_prefetch(xdp->data); - - if (unlikely(xdp_buff_has_frags(xdp))) { - sinfo = xdp_get_shared_info_from_buff(xdp); - nr_frags = sinfo->nr_frags; - } - - /* allocate a skb to store the frags */ - skb = napi_alloc_skb(&rx_ring->q_vector->napi, ICE_RX_HDR_SIZE); - if (unlikely(!skb)) - return NULL; - - rx_buf = &rx_ring->rx_buf[rx_ring->first_desc]; - skb_record_rx_queue(skb, rx_ring->q_index); - /* Determine available headroom for copy */ - headlen = size; - if (headlen > ICE_RX_HDR_SIZE) - headlen = eth_get_headlen(skb->dev, xdp->data, ICE_RX_HDR_SIZE); - - /* align pull length to size of long to optimize memcpy performance */ - memcpy(__skb_put(skb, headlen), xdp->data, ALIGN(headlen, - sizeof(long))); - - /* if we exhaust the linear part then add what is left as a frag */ - size -= headlen; - if (size) { - /* besides adding here a partial frag, we are going to add - * frags from xdp_buff, make sure there is enough space for - * them - */ - if (unlikely(nr_frags >= MAX_SKB_FRAGS - 1)) { - dev_kfree_skb(skb); - return NULL; - } - skb_add_rx_frag(skb, 0, rx_buf->page, - rx_buf->page_offset + headlen, size, - xdp->frame_sz); - } else { - /* buffer is unused, change the act that should be taken later - * on; data was copied onto skb's linear part so there's no - * need for adjusting page offset and we can reuse this buffer - * as-is - */ - rx_buf->act = ICE_SKB_CONSUMED; - } - - if (unlikely(xdp_buff_has_frags(xdp))) { - struct skb_shared_info *skinfo = skb_shinfo(skb); - - memcpy(&skinfo->frags[skinfo->nr_frags], &sinfo->frags[0], - sizeof(skb_frag_t) * nr_frags); - - xdp_update_skb_shared_info(skb, skinfo->nr_frags + nr_frags, - sinfo->xdp_frags_size, - nr_frags * xdp->frame_sz, - xdp_buff_is_frag_pfmemalloc(xdp)); - } - - return skb; -} - -/** - * ice_put_rx_buf - Clean up used buffer and either recycle or free - * @rx_ring: Rx descriptor ring to transact packets on - * @rx_buf: Rx buffer to pull data from - * - * This function will clean up the contents of the rx_buf. It will either - * recycle the buffer or unmap it and free the associated resources. - */ -static void -ice_put_rx_buf(struct ice_rx_ring *rx_ring, struct ice_rx_buf *rx_buf) -{ - if (!rx_buf) - return; - - if (ice_can_reuse_rx_page(rx_buf)) { - /* hand second half of page back to the ring */ - ice_reuse_rx_page(rx_ring, rx_buf); - } else { - /* we are not reusing the buffer so unmap it */ - dma_unmap_page_attrs(rx_ring->dev, rx_buf->dma, - ice_rx_pg_size(rx_ring), DMA_FROM_DEVICE, - ICE_RX_DMA_ATTR); - __page_frag_cache_drain(rx_buf->page, rx_buf->pagecnt_bias); - } - - /* clear contents of buffer_info */ - rx_buf->page = NULL; -} - /** * ice_clean_rx_irq - Clean completed descriptors from Rx ring - bounce buf * @rx_ring: Rx descriptor ring to transact packets on @@ -1119,16 +854,17 @@ int ice_clean_rx_irq(struct ice_rx_ring *rx_ring, int budget) { unsigned int total_rx_bytes = 0, total_rx_pkts = 0; unsigned int offset = rx_ring->rx_offset; - struct xdp_buff *xdp = &rx_ring->xdp; - u32 cached_ntc = rx_ring->first_desc; struct ice_tx_ring *xdp_ring = NULL; struct bpf_prog *xdp_prog = NULL; u32 ntc = rx_ring->next_to_clean; + LIBETH_XDP_ONSTACK_BUFF(xdp); u32 cnt = rx_ring->count; + bool failure = false; u32 xdp_xmit = 0; u32 cached_ntu; - bool failure; - u32 first; + u32 act; + + libeth_xdp_init_buff(xdp, &rx_ring->xdp, &rx_ring->xdp_rxq); xdp_prog = READ_ONCE(rx_ring->xdp_prog); if (xdp_prog) { @@ -1139,7 +875,7 @@ int ice_clean_rx_irq(struct ice_rx_ring *rx_ring, int budget) /* start the loop to process Rx packets bounded by 'budget' */ while (likely(total_rx_pkts < (unsigned int)budget)) { union ice_32b_rx_flex_desc *rx_desc; - struct ice_rx_buf *rx_buf; + struct libeth_fqe *rx_buf; struct sk_buff *skb; unsigned int size; u16 stat_err_bits; @@ -1180,16 +916,19 @@ int ice_clean_rx_irq(struct ice_rx_ring *rx_ring, int budget) ICE_RX_FLX_DESC_PKT_LEN_M; /* retrieve a buffer from the ring */ - rx_buf = ice_get_rx_buf(rx_ring, size, ntc); + rx_buf = &rx_ring->rx_fqes[ntc]; + if (!libeth_rx_sync_for_cpu(rx_buf, size)) { + continue; + } if (!xdp->data) { void *hard_start; - hard_start = page_address(rx_buf->page) + rx_buf->page_offset - - offset; - xdp_prepare_buff(xdp, hard_start, offset, size, !!offset); - xdp_buff_clear_frags_flag(xdp); - } else if (ice_add_xdp_frag(rx_ring, xdp, rx_buf, size)) { + //hard_start = page_address(rx_buf->page) + rx_buf->offset; + //xdp_prepare_buff(&xdp->base, hard_start, rx_buf->page->pp->p.offset, size, true); + libeth_xdp_prepare_buff(xdp, rx_buf, size); + xdp_buff_clear_frags_flag(&xdp->base); + } else if (ice_add_xdp_frag(rx_ring, &xdp->base, rx_buf, size)) { break; } if (++ntc == cnt) @@ -1199,10 +938,13 @@ int ice_clean_rx_irq(struct ice_rx_ring *rx_ring, int budget) if (ice_is_non_eop(rx_ring, rx_desc)) continue; - ice_run_xdp(rx_ring, xdp, xdp_prog, xdp_ring, rx_buf, rx_desc); - if (rx_buf->act == ICE_XDP_PASS) + act = ice_run_xdp(rx_ring, xdp, xdp_prog, xdp_ring, rx_desc); + if (act == ICE_XDP_PASS) goto construct_skb; - total_rx_bytes += xdp_get_buff_len(xdp); + + if (act & (ICE_XDP_TX | ICE_XDP_REDIR)) + xdp_xmit |= act; + total_rx_bytes += xdp_get_buff_len(&xdp->base); total_rx_pkts++; xdp->data = NULL; @@ -1210,17 +952,19 @@ int ice_clean_rx_irq(struct ice_rx_ring *rx_ring, int budget) rx_ring->nr_frags = 0; continue; construct_skb: - if (likely(ice_ring_uses_build_skb(rx_ring))) - skb = ice_build_skb(rx_ring, xdp); - else - skb = ice_construct_skb(rx_ring, xdp); + skb = ice_build_skb(rx_ring, &xdp->base); + + /* TODO: Consider using "xdp_build_skb_from_buff". + * The only problem is skb->protocol which is set in "ice_process_skb_fields()". + * So, some refactoring is needed first. + */ + //skb = xdp_build_skb_from_buff(xdp); + /* exit if we failed to retrieve a buffer */ if (!skb) { + printk("SKB is NULL after ice_build_skb\n"); rx_ring->ring_stats->rx_stats.alloc_page_failed++; - rx_buf->act = ICE_XDP_CONSUMED; - if (unlikely(xdp_buff_has_frags(xdp))) - ice_set_rx_bufs_act(xdp, rx_ring, - ICE_XDP_CONSUMED); + act = ICE_XDP_CONSUMED; xdp->data = NULL; rx_ring->first_desc = ntc; rx_ring->nr_frags = 0; @@ -1240,8 +984,9 @@ int ice_clean_rx_irq(struct ice_rx_ring *rx_ring, int budget) vlan_tci = ice_get_vlan_tci(rx_desc); /* pad the skb if needed, to make a valid ethernet frame */ - if (eth_skb_pad(skb)) + if (eth_skb_pad(skb)) { continue; + } /* probably a little skewed due to removing CRC */ total_rx_bytes += skb->len; @@ -1257,27 +1002,12 @@ int ice_clean_rx_irq(struct ice_rx_ring *rx_ring, int budget) total_rx_pkts++; } - first = rx_ring->first_desc; - while (cached_ntc != first) { - struct ice_rx_buf *buf = &rx_ring->rx_buf[cached_ntc]; - - if (buf->act & (ICE_XDP_TX | ICE_XDP_REDIR)) { - ice_rx_buf_adjust_pg_offset(buf, xdp->frame_sz); - xdp_xmit |= buf->act; - } else if (buf->act & ICE_XDP_CONSUMED) { - buf->pagecnt_bias++; - } else if (buf->act == ICE_XDP_PASS) { - ice_rx_buf_adjust_pg_offset(buf, xdp->frame_sz); - } - - ice_put_rx_buf(rx_ring, buf); - if (++cached_ntc >= cnt) - cached_ntc = 0; - } rx_ring->next_to_clean = ntc; /* return up to cleaned_count buffers to hardware */ - failure = ice_alloc_rx_bufs(rx_ring, ICE_RX_DESC_UNUSED(rx_ring)); + //if (ICE_RX_DESC_UNUSED(rx_ring) >= 16) + failure = ice_alloc_rx_bufs(rx_ring, ICE_RX_DESC_UNUSED(rx_ring)); + libeth_xdp_save_buff(&rx_ring->xdp, xdp); if (xdp_xmit) ice_finalize_xdp_rx(xdp_ring, xdp_xmit, cached_ntu); diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.h b/drivers/net/ethernet/intel/ice/ice_txrx.h index cb347c852ba9e8..83ffc34fb7c6c3 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx.h +++ b/drivers/net/ethernet/intel/ice/ice_txrx.h @@ -5,6 +5,9 @@ #define _ICE_TXRX_H_ #include "ice_type.h" +#include +#include +#include #define ICE_DFLT_IRQ_WORK 256 #define ICE_RXBUF_3072 3072 @@ -196,15 +199,6 @@ struct ice_tx_offload_params { u8 header_len; }; -struct ice_rx_buf { - dma_addr_t dma; - struct page *page; - unsigned int page_offset; - unsigned int pgcnt; - unsigned int act; - unsigned int pagecnt_bias; -}; - struct ice_q_stats { u64 pkts; u64 bytes; @@ -324,14 +318,15 @@ struct ice_rx_ring { u16 reg_idx; /* HW register index of the ring */ u16 next_to_alloc; - union { - struct ice_rx_buf *rx_buf; - struct xdp_buff **xdp_buf; - }; + struct page_pool *pp; + + struct libeth_fqe *rx_fqes; + struct xdp_buff **xdp_buf; + /* CL2 - 2nd cacheline starts here */ union { struct ice_xdp_buff xdp_ext; - struct xdp_buff xdp; + struct libeth_xdp_buff_stash xdp; }; /* CL3 - 3rd cacheline starts here */ union { @@ -348,6 +343,7 @@ struct ice_rx_ring { u16 next_to_use; u16 next_to_clean; u16 first_desc; + u32 truesize; /* stats structs */ struct ice_ring_stats *ring_stats; @@ -359,8 +355,6 @@ struct ice_rx_ring { struct ice_rx_ring *next; /* pointer to next ring in q_vector */ struct xsk_buff_pool *xsk_pool; u32 nr_frags; - u16 max_frame; - u16 rx_buf_len; dma_addr_t dma; /* physical address of ring */ u8 dcb_tc; /* Traffic class of ring */ u8 ptp_rx; @@ -370,6 +364,7 @@ struct ice_rx_ring { u8 flags; /* CL5 - 5th cacheline starts here */ struct xdp_rxq_info xdp_rxq; + u32 rx_buf_len; } ____cacheline_internodealigned_in_smp; struct ice_tx_ring { @@ -479,15 +474,9 @@ struct ice_coalesce_stored { static inline unsigned int ice_rx_pg_order(struct ice_rx_ring *ring) { -#if (PAGE_SIZE < 8192) - if (ring->rx_buf_len > (PAGE_SIZE / 2)) - return 1; -#endif return 0; } -#define ice_rx_pg_size(_ring) (PAGE_SIZE << ice_rx_pg_order(_ring)) - union ice_32b_rx_flex_desc; bool ice_alloc_rx_bufs(struct ice_rx_ring *rxr, unsigned int cleaned_count); diff --git a/drivers/net/ethernet/intel/ice/ice_txrx_lib.c b/drivers/net/ethernet/intel/ice/ice_txrx_lib.c index 2719f0e20933f2..c9991ca3f6a34f 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_txrx_lib.c @@ -244,19 +244,22 @@ static void ice_clean_xdp_tx_buf(struct device *dev, struct ice_tx_buf *tx_buf, struct xdp_frame_bulk *bq) { - dma_unmap_single(dev, dma_unmap_addr(tx_buf, dma), - dma_unmap_len(tx_buf, len), DMA_TO_DEVICE); - dma_unmap_len_set(tx_buf, len, 0); + u32 len = dma_unmap_len(tx_buf, len); + struct page *page; switch (tx_buf->type) { case ICE_TX_BUF_XDP_TX: - page_frag_free(tx_buf->raw_buf); + page = virt_to_page(tx_buf->raw_buf); + page_pool_put_page(page->pp, page, len, true); break; case ICE_TX_BUF_XDP_XMIT: + dma_unmap_single(dev, dma_unmap_addr(tx_buf, dma), + dma_unmap_len(tx_buf, len), DMA_TO_DEVICE); xdp_return_frame_bulk(tx_buf->xdpf, bq); break; } + dma_unmap_len_set(tx_buf, len, 0); tx_buf->type = ICE_TX_BUF_EMPTY; } @@ -374,19 +377,22 @@ int __ice_xmit_xdp_ring(struct xdp_buff *xdp, struct ice_tx_ring *xdp_ring, tx_buf = tx_head; for (;;) { + struct page *page = virt_to_page(data); + enum dma_data_direction dma_dir = page_pool_get_dma_dir(page->pp); dma_addr_t dma; - dma = dma_map_single(dev, data, size, DMA_TO_DEVICE); - if (dma_mapping_error(dev, dma)) - goto dma_unmap; - /* record length, and DMA address */ dma_unmap_len_set(tx_buf, len, size); dma_unmap_addr_set(tx_buf, dma, dma); if (frame) { + dma = dma_map_single(dev, data, size, DMA_TO_DEVICE); + if (dma_mapping_error(dev, dma)) + goto dma_unmap; tx_buf->type = ICE_TX_BUF_FRAG; } else { + dma = page_pool_get_dma_addr(page) + xdp->data - xdp->data_hard_start; + dma_sync_single_for_device(dev, dma, size, dma_dir); tx_buf->type = ICE_TX_BUF_XDP_TX; tx_buf->raw_buf = data; } diff --git a/drivers/net/ethernet/intel/ice/ice_txrx_lib.h b/drivers/net/ethernet/intel/ice/ice_txrx_lib.h index 79f960c6680d17..6cf32b40412753 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx_lib.h +++ b/drivers/net/ethernet/intel/ice/ice_txrx_lib.h @@ -5,49 +5,6 @@ #define _ICE_TXRX_LIB_H_ #include "ice.h" -/** - * ice_set_rx_bufs_act - propagate Rx buffer action to frags - * @xdp: XDP buffer representing frame (linear and frags part) - * @rx_ring: Rx ring struct - * act: action to store onto Rx buffers related to XDP buffer parts - * - * Set action that should be taken before putting Rx buffer from first frag - * to the last. - */ -static inline void -ice_set_rx_bufs_act(struct xdp_buff *xdp, const struct ice_rx_ring *rx_ring, - const unsigned int act) -{ - u32 sinfo_frags = xdp_get_shared_info_from_buff(xdp)->nr_frags; - u32 nr_frags = rx_ring->nr_frags + 1; - u32 idx = rx_ring->first_desc; - u32 cnt = rx_ring->count; - struct ice_rx_buf *buf; - - for (int i = 0; i < nr_frags; i++) { - buf = &rx_ring->rx_buf[idx]; - buf->act = act; - - if (++idx == cnt) - idx = 0; - } - - /* adjust pagecnt_bias on frags freed by XDP prog */ - if (sinfo_frags < rx_ring->nr_frags && act == ICE_XDP_CONSUMED) { - u32 delta = rx_ring->nr_frags - sinfo_frags; - - while (delta) { - if (idx == 0) - idx = cnt - 1; - else - idx--; - buf = &rx_ring->rx_buf[idx]; - buf->pagecnt_bias--; - delta--; - } - } -} - /** * ice_test_staterr - tests bits in Rx descriptor status and error fields * @status_err_n: Rx descriptor status_error0 or status_error1 bits diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl.c b/drivers/net/ethernet/intel/ice/ice_virtchnl.c index aa2080747714a5..475c18f43480cb 100644 --- a/drivers/net/ethernet/intel/ice/ice_virtchnl.c +++ b/drivers/net/ethernet/intel/ice/ice_virtchnl.c @@ -2071,18 +2071,17 @@ static int ice_vc_cfg_qs_msg(struct ice_vf *vf, u8 *msg) (qpi->rxq.databuffer_size > ((16 * 1024) - 128) || qpi->rxq.databuffer_size < 1024)) goto error_param; - ring->rx_buf_len = qpi->rxq.databuffer_size; if (qpi->rxq.max_pkt_size > max_frame_size || qpi->rxq.max_pkt_size < 64) goto error_param; - ring->max_frame = qpi->rxq.max_pkt_size; + vsi->max_frame = qpi->rxq.max_pkt_size; /* add space for the port VLAN since the VF driver is * not expected to account for it in the MTU * calculation */ if (ice_vf_is_port_vlan_ena(vf)) - ring->max_frame += VLAN_HLEN; + vsi->max_frame += VLAN_HLEN; if (ice_vsi_cfg_single_rxq(vsi, q_idx)) { dev_warn(ice_pf_to_dev(pf), "VF-%d failed to configure RX queue %d\n", diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c index 334ae945d6404c..c71030fd9fdcaa 100644 --- a/drivers/net/ethernet/intel/ice/ice_xsk.c +++ b/drivers/net/ethernet/intel/ice/ice_xsk.c @@ -316,21 +316,16 @@ ice_xsk_pool_enable(struct ice_vsi *vsi, struct xsk_buff_pool *pool, u16 qid) static int ice_realloc_rx_xdp_bufs(struct ice_rx_ring *rx_ring, bool pool_present) { - size_t elem_size = pool_present ? sizeof(*rx_ring->xdp_buf) : - sizeof(*rx_ring->rx_buf); + size_t elem_size = sizeof(*rx_ring->xdp_buf); void *sw_ring = kcalloc(rx_ring->count, elem_size, GFP_KERNEL); if (!sw_ring) return -ENOMEM; if (pool_present) { - kfree(rx_ring->rx_buf); - rx_ring->rx_buf = NULL; rx_ring->xdp_buf = sw_ring; } else { - kfree(rx_ring->xdp_buf); rx_ring->xdp_buf = NULL; - rx_ring->rx_buf = sw_ring; } return 0; @@ -801,35 +796,6 @@ ice_run_xdp_zc(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp, return result; } -static int -ice_add_xsk_frag(struct ice_rx_ring *rx_ring, struct xdp_buff *first, - struct xdp_buff *xdp, const unsigned int size) -{ - struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(first); - - if (!size) - return 0; - - if (!xdp_buff_has_frags(first)) { - sinfo->nr_frags = 0; - sinfo->xdp_frags_size = 0; - xdp_buff_set_frags_flag(first); - } - - if (unlikely(sinfo->nr_frags == MAX_SKB_FRAGS)) { - xsk_buff_free(first); - return -ENOMEM; - } - - __skb_fill_page_desc_noacc(sinfo, sinfo->nr_frags++, - virt_to_page(xdp->data_hard_start), - XDP_PACKET_HEADROOM, size); - sinfo->xdp_frags_size += size; - xsk_buff_add_frag(xdp); - - return 0; -} - /** * ice_clean_rx_irq_zc - consumes packets from the hardware ring * @rx_ring: AF_XDP Rx ring @@ -895,7 +861,8 @@ int ice_clean_rx_irq_zc(struct ice_rx_ring *rx_ring, if (!first) { first = xdp; - } else if (ice_add_xsk_frag(rx_ring, first, xdp, size)) { + } else if (likely(size) && !xsk_buff_add_frag(first, xdp)) { + xsk_buff_free(first); break; } diff --git a/drivers/net/ethernet/intel/libeth/Kconfig b/drivers/net/ethernet/intel/libeth/Kconfig index 480293b71dbc37..80e99d0445998f 100644 --- a/drivers/net/ethernet/intel/libeth/Kconfig +++ b/drivers/net/ethernet/intel/libeth/Kconfig @@ -7,3 +7,9 @@ config LIBETH help libeth is a common library containing routines shared between several drivers, but not yet promoted to the generic kernel API. + +config LIBETH_XDP + tristate + select LIBETH + help + XDP and XSk helpers based on libeth hotpath management. diff --git a/drivers/net/ethernet/intel/libeth/Makefile b/drivers/net/ethernet/intel/libeth/Makefile index 52492b08113261..8b01b2af54cf74 100644 --- a/drivers/net/ethernet/intel/libeth/Makefile +++ b/drivers/net/ethernet/intel/libeth/Makefile @@ -4,3 +4,9 @@ obj-$(CONFIG_LIBETH) += libeth.o libeth-y := rx.o +libeth-y += tx.o + +obj-$(CONFIG_LIBETH_XDP) += libeth_xdp.o + +libeth_xdp-y += xdp.o +libeth_xdp-y += xsk.o diff --git a/drivers/net/ethernet/intel/libeth/priv.h b/drivers/net/ethernet/intel/libeth/priv.h new file mode 100644 index 00000000000000..b9171461186ef0 --- /dev/null +++ b/drivers/net/ethernet/intel/libeth/priv.h @@ -0,0 +1,37 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright (C) 2024 Intel Corporation */ + +#ifndef __LIBETH_PRIV_H +#define __LIBETH_PRIV_H + +#include + +/* XDP */ + +enum xdp_action; +struct libeth_xdp_buff; +struct libeth_xdp_tx_frame; +struct skb_shared_info; +struct xdp_frame_bulk; + +extern const struct xsk_tx_metadata_ops libeth_xsktmo_slow; + +void libeth_xsk_tx_return_bulk(const struct libeth_xdp_tx_frame *bq, + u32 count); +u32 libeth_xsk_prog_exception(struct libeth_xdp_buff *xdp, enum xdp_action act, + int ret); + +struct libeth_xdp_ops { + void (*bulk)(const struct skb_shared_info *sinfo, + struct xdp_frame_bulk *bq, bool frags); + void (*xsk)(struct libeth_xdp_buff *xdp); +}; + +void libeth_attach_xdp(const struct libeth_xdp_ops *ops); + +static inline void libeth_detach_xdp(void) +{ + libeth_attach_xdp(NULL); +} + +#endif /* __LIBETH_PRIV_H */ diff --git a/drivers/net/ethernet/intel/libeth/rx.c b/drivers/net/ethernet/intel/libeth/rx.c index f209266693185b..1105fd0db4c3e3 100644 --- a/drivers/net/ethernet/intel/libeth/rx.c +++ b/drivers/net/ethernet/intel/libeth/rx.c @@ -68,7 +68,7 @@ static u32 libeth_rx_hw_len_truesize(const struct page_pool_params *pp, static bool libeth_rx_page_pool_params(struct libeth_fq *fq, struct page_pool_params *pp) { - pp->offset = LIBETH_SKB_HEADROOM; + pp->offset = fq->xdp ? LIBETH_XDP_HEADROOM : LIBETH_SKB_HEADROOM; /* HW-writeable / syncable length per one page */ pp->max_len = LIBETH_RX_PAGE_LEN(pp->offset); @@ -155,11 +155,12 @@ int libeth_rx_fq_create(struct libeth_fq *fq, struct napi_struct *napi) .dev = napi->dev->dev.parent, .netdev = napi->dev, .napi = napi, - .dma_dir = DMA_FROM_DEVICE, }; struct libeth_fqe *fqes; struct page_pool *pool; - bool ret; + int ret; + + pp.dma_dir = fq->xdp ? DMA_BIDIRECTIONAL : DMA_FROM_DEVICE; if (!fq->hsplit) ret = libeth_rx_page_pool_params(fq, &pp); @@ -173,18 +174,26 @@ int libeth_rx_fq_create(struct libeth_fq *fq, struct napi_struct *napi) return PTR_ERR(pool); fqes = kvcalloc_node(fq->count, sizeof(*fqes), GFP_KERNEL, fq->nid); - if (!fqes) + if (!fqes) { + ret = -ENOMEM; goto err_buf; + } + + ret = xdp_reg_page_pool(pool); + if (ret) + goto err_mem; fq->fqes = fqes; fq->pp = pool; return 0; +err_mem: + kvfree(fqes); err_buf: page_pool_destroy(pool); - return -ENOMEM; + return ret; } EXPORT_SYMBOL_NS_GPL(libeth_rx_fq_create, LIBETH); @@ -194,6 +203,7 @@ EXPORT_SYMBOL_NS_GPL(libeth_rx_fq_create, LIBETH); */ void libeth_rx_fq_destroy(struct libeth_fq *fq) { + xdp_unreg_page_pool(fq->pp); kvfree(fq->fqes); page_pool_destroy(fq->pp); } @@ -205,7 +215,7 @@ EXPORT_SYMBOL_NS_GPL(libeth_rx_fq_destroy, LIBETH); * * To be used on exceptions or rare cases not requiring fast inline recycling. */ -void libeth_rx_recycle_slow(struct page *page) +void __cold libeth_rx_recycle_slow(struct page *page) { page_pool_recycle_direct(page->pp, page); } diff --git a/drivers/net/ethernet/intel/libeth/tx.c b/drivers/net/ethernet/intel/libeth/tx.c new file mode 100644 index 00000000000000..dc8df216c92286 --- /dev/null +++ b/drivers/net/ethernet/intel/libeth/tx.c @@ -0,0 +1,39 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (C) 2024 Intel Corporation */ + +#include + +#include "priv.h" + +/* Tx buffer completion */ + +DEFINE_STATIC_CALL_NULL(bulk, libeth_xdp_return_buff_bulk); +DEFINE_STATIC_CALL_NULL(xsk, libeth_xsk_buff_free_slow); + +/** + * libeth_tx_complete_any - perform Tx completion for one SQE of any type + * @sqe: Tx buffer to complete + * @cp: polling params + * + * Can be used to complete both regular and XDP SQEs, for example when + * destroying queues. + * When libeth_xdp is not loaded, XDPSQEs won't be handled. + */ +void libeth_tx_complete_any(struct libeth_sqe *sqe, struct libeth_cq_pp *cp) +{ + if (sqe->type >= __LIBETH_SQE_XDP_START) + __libeth_xdp_complete_tx(sqe, cp, static_call(bulk), + static_call(xsk)); + else + libeth_tx_complete(sqe, cp); +} +EXPORT_SYMBOL_NS_GPL(libeth_tx_complete_any, LIBETH); + +/* Module */ + +void libeth_attach_xdp(const struct libeth_xdp_ops *ops) +{ + static_call_update(bulk, ops ? ops->bulk : NULL); + static_call_update(xsk, ops ? ops->xsk : NULL); +} +EXPORT_SYMBOL_NS_GPL(libeth_attach_xdp, LIBETH); diff --git a/drivers/net/ethernet/intel/libeth/xdp.c b/drivers/net/ethernet/intel/libeth/xdp.c new file mode 100644 index 00000000000000..00b9877cc80069 --- /dev/null +++ b/drivers/net/ethernet/intel/libeth/xdp.c @@ -0,0 +1,444 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (C) 2024 Intel Corporation */ + +#include + +#include "priv.h" + +/* XDPSQ sharing */ + +DEFINE_STATIC_KEY_FALSE(libeth_xdpsq_share); +EXPORT_SYMBOL_NS_GPL(libeth_xdpsq_share, LIBETH_XDP); + +void __libeth_xdpsq_get(struct libeth_xdpsq_lock *lock, + const struct net_device *dev) +{ + bool warn; + + spin_lock_init(&lock->lock); + lock->share = true; + + warn = !static_key_enabled(&libeth_xdpsq_share); + static_branch_inc_cpuslocked(&libeth_xdpsq_share); + + if (warn && net_ratelimit()) + netdev_warn(dev, "XDPSQ sharing enabled, possible XDP Tx slowdown\n"); +} +EXPORT_SYMBOL_NS_GPL(__libeth_xdpsq_get, LIBETH_XDP); + +void __libeth_xdpsq_put(struct libeth_xdpsq_lock *lock, + const struct net_device *dev) +{ + static_branch_dec_cpuslocked(&libeth_xdpsq_share); + + if (!static_key_enabled(&libeth_xdpsq_share) && net_ratelimit()) + netdev_notice(dev, "XDPSQ sharing disabled\n"); + + lock->share = false; +} +EXPORT_SYMBOL_NS_GPL(__libeth_xdpsq_put, LIBETH_XDP); + +void __libeth_xdpsq_lock(struct libeth_xdpsq_lock *lock) +{ + spin_lock(&lock->lock); +} +EXPORT_SYMBOL_NS_GPL(__libeth_xdpsq_lock, LIBETH_XDP); + +void __libeth_xdpsq_unlock(struct libeth_xdpsq_lock *lock) +{ + spin_unlock(&lock->lock); +} +EXPORT_SYMBOL_NS_GPL(__libeth_xdpsq_unlock, LIBETH_XDP); + +/* XDPSQ clean-up timers */ + +/** + * libeth_xdpsq_init_timer - initialize an XDPSQ clean-up timer + * @timer: timer to initialize + * @xdpsq: queue this timer belongs to + * @lock: corresponding XDPSQ lock + * @poll: queue polling/completion function + * + * XDPSQ clean-up timers must be set up before using at the queue configuration + * time. Set the required pointers and the cleaning callback. + */ +void libeth_xdpsq_init_timer(struct libeth_xdpsq_timer *timer, void *xdpsq, + struct libeth_xdpsq_lock *lock, + void (*poll)(struct work_struct *work)) +{ + timer->xdpsq = xdpsq; + timer->lock = lock; + + INIT_DELAYED_WORK(&timer->dwork, poll); +} +EXPORT_SYMBOL_NS_GPL(libeth_xdpsq_init_timer, LIBETH_XDP); + +/* ``XDP_TX`` bulking */ + +static void __cold +libeth_xdp_tx_return_one(const struct libeth_xdp_tx_frame *frm) +{ + if (frm->len_fl & LIBETH_XDP_TX_MULTI) + libeth_xdp_return_frags(frm->data + frm->soff, true); + + libeth_xdp_return_va(frm->data, true); +} + +static void __cold +libeth_xdp_tx_return_bulk(const struct libeth_xdp_tx_frame *bq, u32 count) +{ + for (u32 i = 0; i < count; i++) { + const struct libeth_xdp_tx_frame *frm = &bq[i]; + + if (!(frm->len_fl & LIBETH_XDP_TX_FIRST)) + continue; + + libeth_xdp_tx_return_one(frm); + } +} + +static void __cold libeth_trace_xdp_exception(const struct net_device *dev, + const struct bpf_prog *prog, + u32 act) +{ + trace_xdp_exception(dev, prog, act); +} + +/** + * libeth_xdp_tx_exception - handle Tx exceptions of XDP frames + * @bq: XDP Tx frame bulk + * @sent: number of frames sent successfully (from this bulk) + * @flags: internal libeth_xdp flags (XSk, .ndo_xdp_xmit etc.) + * + * Cold helper used by __libeth_xdp_tx_flush_bulk(), do not call directly. + * Reports XDP Tx exceptions, frees the frames that won't be sent or adjust + * the Tx bulk to try again later. + */ +void __cold libeth_xdp_tx_exception(struct libeth_xdp_tx_bulk *bq, u32 sent, + u32 flags) +{ + const struct libeth_xdp_tx_frame *pos = &bq->bulk[sent]; + u32 left = bq->count - sent; + + if (!(flags & LIBETH_XDP_TX_NDO)) + libeth_trace_xdp_exception(bq->dev, bq->prog, XDP_TX); + + if (!(flags & LIBETH_XDP_TX_DROP)) { + memmove(bq->bulk, pos, left * sizeof(*bq->bulk)); + bq->count = left; + + return; + } + + if (flags & LIBETH_XDP_TX_XSK) + libeth_xsk_tx_return_bulk(pos, left); + else if (!(flags & LIBETH_XDP_TX_NDO)) + libeth_xdp_tx_return_bulk(pos, left); + else + libeth_xdp_xmit_return_bulk(pos, left, bq->dev); + + bq->count = 0; +} +EXPORT_SYMBOL_NS_GPL(libeth_xdp_tx_exception, LIBETH_XDP); + +/* .ndo_xdp_xmit() implementation */ + +u32 __cold libeth_xdp_xmit_return_bulk(const struct libeth_xdp_tx_frame *bq, + u32 count, const struct net_device *dev) +{ + u32 n = 0; + + for (u32 i = 0; i < count; i++) { + const struct libeth_xdp_tx_frame *frm = &bq[i]; + dma_addr_t dma; + + if (frm->flags & LIBETH_XDP_TX_FIRST) + dma = *libeth_xdp_xmit_frame_dma(frm->xdpf); + else + dma = dma_unmap_addr(frm, dma); + + dma_unmap_page(dev->dev.parent, dma, dma_unmap_len(frm, len), + DMA_TO_DEVICE); + + /* Actual xdp_frames are freed by the core */ + n += !!(frm->flags & LIBETH_XDP_TX_FIRST); + } + + return n; +} +EXPORT_SYMBOL_NS_GPL(libeth_xdp_xmit_return_bulk, LIBETH_XDP); + +/* Rx polling path */ + +/** + * libeth_xdp_load_stash - recreate an &xdp_buff from a libeth_xdp buffer stash + * @dst: target &libeth_xdp_buff to initialize + * @src: source stash + * + * External helper used by libeth_xdp_init_buff(), do not call directly. + * Recreate an onstack &libeth_xdp_buff using the stash saved earlier. + * The only field untouched (rxq) is initialized later in the + * abovementioned function. + */ +void libeth_xdp_load_stash(struct libeth_xdp_buff *dst, + const struct libeth_xdp_buff_stash *src) +{ + dst->data = src->data; + dst->base.data_end = src->data + src->len; + dst->base.data_meta = src->data; + dst->base.data_hard_start = src->data - src->headroom; + + dst->base.frame_sz = src->frame_sz; + dst->base.flags = src->flags; +} +EXPORT_SYMBOL_NS_GPL(libeth_xdp_load_stash, LIBETH_XDP); + +/** + * libeth_xdp_save_stash - convert an &xdp_buff to a libeth_xdp buffer stash + * @dst: target &libeth_xdp_buff_stash to initialize + * @src: source XDP buffer + * + * External helper used by libeth_xdp_save_buff(), do not call directly. + * Use the fields from the passed XDP buffer to initialize the stash on the + * queue, so that a partially received frame can be finished later during + * the next NAPI poll. + */ +void libeth_xdp_save_stash(struct libeth_xdp_buff_stash *dst, + const struct libeth_xdp_buff *src) +{ + dst->data = src->data; + dst->headroom = src->data - src->base.data_hard_start; + dst->len = src->base.data_end - src->data; + + dst->frame_sz = src->base.frame_sz; + dst->flags = src->base.flags; + + WARN_ON_ONCE(dst->flags != src->base.flags); +} +EXPORT_SYMBOL_NS_GPL(libeth_xdp_save_stash, LIBETH_XDP); + +void __libeth_xdp_return_stash(struct libeth_xdp_buff_stash *stash) +{ + LIBETH_XDP_ONSTACK_BUFF(xdp); + + libeth_xdp_load_stash(xdp, stash); + libeth_xdp_return_buff_slow(xdp); + + stash->data = NULL; +} +EXPORT_SYMBOL_NS_GPL(__libeth_xdp_return_stash, LIBETH_XDP); + +/** + * libeth_xdp_return_buff_slow - free a &libeth_xdp_buff + * @xdp: buffer to free/return + * + * Slowpath version of libeth_xdp_return_buff() to be called on exceptions, + * queue clean-ups etc., without unwanted inlining. + */ +void __cold libeth_xdp_return_buff_slow(struct libeth_xdp_buff *xdp) +{ + libeth_xdp_return_buff(xdp); +} +EXPORT_SYMBOL_NS_GPL(libeth_xdp_return_buff_slow, LIBETH_XDP); + +/** + * libeth_xdp_buff_add_frag - add a frag to an XDP buffer + * @xdp: head XDP buffer + * @fqe: Rx buffer containing the frag + * @len: frag length reported by HW + * + * External helper used by libeth_xdp_process_buff(), do not call directly. + * Frees both head and frag buffers on error. + * + * Return: true success, false on error (no space for a new frag). + */ +bool libeth_xdp_buff_add_frag(struct libeth_xdp_buff *xdp, + const struct libeth_fqe *fqe, + u32 len) +{ + struct page *page = fqe->page; + + if (!xdp_buff_add_frag(&xdp->base, page, + fqe->offset + page->pp->p.offset, + len, fqe->truesize)) + goto recycle; + + return true; + +recycle: + libeth_rx_recycle_slow(page); + libeth_xdp_return_buff_slow(xdp); + + return false; +} +EXPORT_SYMBOL_NS_GPL(libeth_xdp_buff_add_frag, LIBETH_XDP); + +/** + * libeth_xdp_prog_exception - handle XDP prog exceptions + * @bq: XDP Tx bulk + * @xdp: buffer to process + * @act: original XDP prog verdict + * @ret: error code if redirect failed + * + * External helper used by __libeth_xdp_run_prog() and + * __libeth_xsk_run_prog_slow(), do not call directly. + * Reports invalid @act, XDP exception trace event and frees the buffer. + * + * Return: libeth_xdp XDP prog verdict. + */ +u32 __cold libeth_xdp_prog_exception(const struct libeth_xdp_tx_bulk *bq, + struct libeth_xdp_buff *xdp, + enum xdp_action act, int ret) +{ + if (act > XDP_REDIRECT) + bpf_warn_invalid_xdp_action(bq->dev, bq->prog, act); + + libeth_trace_xdp_exception(bq->dev, bq->prog, act); + + if (xdp->base.rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL) + return libeth_xsk_prog_exception(xdp, act, ret); + + libeth_xdp_return_buff_slow(xdp); + + return LIBETH_XDP_DROP; +} +EXPORT_SYMBOL_NS_GPL(libeth_xdp_prog_exception, LIBETH_XDP); + +/* Tx buffer completion */ + +static void libeth_xdp_put_page_bulk(struct page *page, + struct xdp_frame_bulk *bq) +{ + if (unlikely(bq->count == XDP_BULK_QUEUE_SIZE)) + xdp_flush_frame_bulk(bq); + + bq->q[bq->count++] = page; +} + +/** + * libeth_xdp_return_buff_bulk - free &xdp_buff as part of a bulk + * @sinfo: shared info corresponding to the buffer + * @bq: XDP frame bulk to store the buffer + * @frags: whether the buffer has frags + * + * Same as xdp_return_frame_bulk(), but for &libeth_xdp_buff, speeds up Tx + * completion of ``XDP_TX`` buffers and allows to free them in same bulks + * with &xdp_frame buffers. + */ +void libeth_xdp_return_buff_bulk(const struct skb_shared_info *sinfo, + struct xdp_frame_bulk *bq, bool frags) +{ + if (!frags) + goto head; + + for (u32 i = 0; i < sinfo->nr_frags; i++) + libeth_xdp_put_page_bulk(skb_frag_page(&sinfo->frags[i]), bq); + +head: + libeth_xdp_put_page_bulk(virt_to_page(sinfo), bq); +} +EXPORT_SYMBOL_NS_GPL(libeth_xdp_return_buff_bulk, LIBETH_XDP); + +/* Misc */ + +/** + * libeth_xdp_queue_threshold - calculate XDP queue clean/refill threshold + * @count: number of descriptors in the queue + * + * The threshold is the limit at which RQs start to refill (when the number of + * empty buffers exceeds it) and SQs get cleaned up (when the number of free + * descriptors goes below it). To speed up hotpath processing, threshold is + * always pow-2, closest to 1/4 of the queue length. + * Don't call it on hotpath, calculate and cache the threshold during the + * queue initialization. + * + * Return: the calculated threshold. + */ +u32 libeth_xdp_queue_threshold(u32 count) +{ + u32 quarter, low, high; + + if (likely(is_power_of_2(count))) + return count >> 2; + + quarter = DIV_ROUND_CLOSEST(count, 4); + low = rounddown_pow_of_two(quarter); + high = roundup_pow_of_two(quarter); + + return high - quarter <= quarter - low ? high : low; +} +EXPORT_SYMBOL_NS_GPL(libeth_xdp_queue_threshold, LIBETH_XDP); + +/** + * __libeth_xdp_set_features - set XDP features for a netdev + * @dev: &net_device to configure + * @xmo: XDP metadata ops (Rx hints) + * @zc_segs: maximum number of S/G frags the HW can transmit + * @tmo: XSk Tx metadata ops (Tx hints) + * + * Set all the features libeth_xdp supports. Only the first argument is + * necessary; without the third one (zero), XSk support won't be advertised. + * Use the non-underscored versions in drivers instead. + */ +void __libeth_xdp_set_features(struct net_device *dev, + const struct xdp_metadata_ops *xmo, + u32 zc_segs, + const struct xsk_tx_metadata_ops *tmo) +{ + xdp_set_features_flag(dev, + NETDEV_XDP_ACT_BASIC | + NETDEV_XDP_ACT_REDIRECT | + NETDEV_XDP_ACT_NDO_XMIT | + (zc_segs ? NETDEV_XDP_ACT_XSK_ZEROCOPY : 0) | + NETDEV_XDP_ACT_RX_SG | + NETDEV_XDP_ACT_NDO_XMIT_SG); + dev->xdp_metadata_ops = xmo; + + tmo = tmo == libeth_xsktmo ? &libeth_xsktmo_slow : tmo; + + dev->xdp_zc_max_segs = zc_segs ? : 1; + dev->xsk_tx_metadata_ops = zc_segs ? tmo : NULL; +} +EXPORT_SYMBOL_NS_GPL(__libeth_xdp_set_features, LIBETH_XDP); + +/** + * libeth_xdp_set_redirect - toggle the XDP redirect feature + * @dev: &net_device to configure + * @enable: whether XDP is enabled + * + * Use this when XDPSQs are not always available to dynamically enable + * and disable redirect feature. + */ +void libeth_xdp_set_redirect(struct net_device *dev, bool enable) +{ + if (enable) + xdp_features_set_redirect_target(dev, true); + else + xdp_features_clear_redirect_target(dev); +} +EXPORT_SYMBOL_NS_GPL(libeth_xdp_set_redirect, LIBETH_XDP); + +/* Module */ + +static const struct libeth_xdp_ops xdp_ops __initconst = { + .bulk = libeth_xdp_return_buff_bulk, + .xsk = libeth_xsk_buff_free_slow, +}; + +static int __init libeth_xdp_module_init(void) +{ + libeth_attach_xdp(&xdp_ops); + + return 0; +} +module_init(libeth_xdp_module_init); + +static void __exit libeth_xdp_module_exit(void) +{ + libeth_detach_xdp(); +} +module_exit(libeth_xdp_module_exit); + +MODULE_DESCRIPTION("Common Ethernet library - XDP infra"); +MODULE_IMPORT_NS(LIBETH); +MODULE_LICENSE("GPL"); diff --git a/drivers/net/ethernet/intel/libeth/xsk.c b/drivers/net/ethernet/intel/libeth/xsk.c new file mode 100644 index 00000000000000..eccb66a644f083 --- /dev/null +++ b/drivers/net/ethernet/intel/libeth/xsk.c @@ -0,0 +1,264 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (C) 2024 Intel Corporation */ + +#include + +#include "priv.h" + +/* ``XDP_TX`` bulking */ + +void __cold libeth_xsk_tx_return_bulk(const struct libeth_xdp_tx_frame *bq, + u32 count) +{ + for (u32 i = 0; i < count; i++) + libeth_xsk_buff_free_slow(bq[i].xsk); +} + +/* XSk TMO */ + +const struct xsk_tx_metadata_ops libeth_xsktmo_slow = { + .tmo_request_checksum = libeth_xsktmo_req_csum, +}; + +/* Rx polling path */ + +/** + * libeth_xsk_buff_free_slow - free an XSk Rx buffer + * @xdp: buffer to free + * + * Slowpath version of xsk_buff_free() to be used on exceptions, cleanups etc. + * to avoid unwanted inlining. + */ +void libeth_xsk_buff_free_slow(struct libeth_xdp_buff *xdp) +{ + xsk_buff_free(&xdp->base); +} +EXPORT_SYMBOL_NS_GPL(libeth_xsk_buff_free_slow, LIBETH_XDP); + +/** + * libeth_xsk_buff_add_frag - add a frag to an XSk Rx buffer + * @head: head buffer + * @xdp: frag buffer + * + * External helper used by libeth_xsk_process_buff(), do not call directly. + * Frees both main and frag buffers on error. + * + * Return: main buffer with attached frag on success, %NULL on error (no space + * for a new frag). + */ +struct libeth_xdp_buff *libeth_xsk_buff_add_frag(struct libeth_xdp_buff *head, + struct libeth_xdp_buff *xdp) +{ + if (!xsk_buff_add_frag(&head->base, &xdp->base)) + goto free; + + return head; + +free: + libeth_xsk_buff_free_slow(xdp); + libeth_xsk_buff_free_slow(head); + + return NULL; +} +EXPORT_SYMBOL_NS_GPL(libeth_xsk_buff_add_frag, LIBETH_XDP); + +/** + * libeth_xsk_buff_stats_frags - update onstack RQ stats with XSk frags info + * @rs: onstack stats to update + * @xdp: buffer to account + * + * External helper used by __libeth_xsk_run_pass(), do not call directly. + * Adds buffer's frags count and total len to the onstack stats. + */ +void libeth_xsk_buff_stats_frags(struct libeth_rq_napi_stats *rs, + const struct libeth_xdp_buff *xdp) +{ + libeth_xdp_buff_stats_frags(rs, xdp); +} +EXPORT_SYMBOL_NS_GPL(libeth_xsk_buff_stats_frags, LIBETH_XDP); + +/** + * __libeth_xsk_run_prog_slow - process the non-``XDP_REDIRECT`` verdicts + * @xdp: buffer to process + * @bq: Tx bulk for queueing on ``XDP_TX`` + * @act: verdict to process + * @ret: error code if ``XDP_REDIRECT`` failed + * + * External helper used by __libeth_xsk_run_prog(), do not call directly. + * ``XDP_REDIRECT`` is the most common and hottest verdict on XSk, thus + * it is processed inline. The rest goes here for out-of-line processing, + * together with redirect errors. + * + * Return: libeth_xdp XDP prog verdict. + */ +u32 __libeth_xsk_run_prog_slow(struct libeth_xdp_buff *xdp, + const struct libeth_xdp_tx_bulk *bq, + enum xdp_action act, int ret) +{ + switch (act) { + case XDP_DROP: + xsk_buff_free(&xdp->base); + + return LIBETH_XDP_DROP; + case XDP_TX: + return LIBETH_XDP_TX; + case XDP_PASS: + return LIBETH_XDP_PASS; + default: + break; + } + + return libeth_xdp_prog_exception(bq, xdp, act, ret); +} +EXPORT_SYMBOL_NS_GPL(__libeth_xsk_run_prog_slow, LIBETH_XDP); + +/** + * libeth_xsk_prog_exception - handle XDP prog exceptions on XSk + * @xdp: buffer to process + * @act: verdict returned by the prog + * @ret: error code if ``XDP_REDIRECT`` failed + * + * Internal. Frees the buffer and, if the queue uses XSk wakeups, stop the + * current NAPI poll when there are no free buffers left. + * + * Return: libeth_xdp's XDP prog verdict. + */ +u32 __cold libeth_xsk_prog_exception(struct libeth_xdp_buff *xdp, + enum xdp_action act, int ret) +{ + const struct xdp_buff_xsk *xsk; + u32 __ret = LIBETH_XDP_DROP; + + if (act != XDP_REDIRECT) + goto drop; + + xsk = container_of(&xdp->base, typeof(*xsk), xdp); + if (xsk_uses_need_wakeup(xsk->pool) && ret == -ENOBUFS) + __ret = LIBETH_XDP_ABORTED; + +drop: + libeth_xsk_buff_free_slow(xdp); + + return __ret; +} + +/* Refill */ + +/** + * libeth_xskfq_create - create an XSkFQ + * @fq: fill queue to initialize + * + * Allocates the FQEs and initializes the fields used by libeth_xdp: number + * of buffers to refill, refill threshold and buffer len. + * + * Return: %0 on success, -errno otherwise. + */ +int libeth_xskfq_create(struct libeth_xskfq *fq) +{ + fq->fqes = kvcalloc_node(fq->count, sizeof(*fq->fqes), GFP_KERNEL, + fq->nid); + if (!fq->fqes) + return -ENOMEM; + + fq->pending = fq->count; + fq->thresh = libeth_xdp_queue_threshold(fq->count); + fq->buf_len = xsk_pool_get_rx_frame_size(fq->pool); + + return 0; +} +EXPORT_SYMBOL_NS_GPL(libeth_xskfq_create, LIBETH_XDP); + +/** + * libeth_xskfq_destroy - destroy an XSkFQ + * @fq: fill queue to destroy + * + * Zeroes the used fields and frees the FQEs array. + */ +void libeth_xskfq_destroy(struct libeth_xskfq *fq) +{ + fq->buf_len = 0; + fq->thresh = 0; + fq->pending = 0; + + kvfree(fq->fqes); +} +EXPORT_SYMBOL_NS_GPL(libeth_xskfq_destroy, LIBETH_XDP); + +/* .ndo_xsk_wakeup */ + +static void libeth_xsk_napi_sched(void *info) +{ + __napi_schedule_irqoff(info); +} + +/** + * libeth_xsk_init_wakeup - initialize libeth XSk wakeup structure + * @csd: struct to initialize + * @napi: NAPI corresponding to this queue + * + * libeth_xdp uses inter-processor interrupts to perform XSk wakeups. In order + * to do that, the corresponding CSDs must be initialized when creating the + * queues. + */ +void libeth_xsk_init_wakeup(call_single_data_t *csd, struct napi_struct *napi) +{ + INIT_CSD(csd, libeth_xsk_napi_sched, napi); +} +EXPORT_SYMBOL_NS_GPL(libeth_xsk_init_wakeup, LIBETH_XDP); + +/** + * libeth_xsk_wakeup - perform an XSk wakeup + * @csd: CSD corresponding to the queue + * @qid: the stack queue index + * + * Try to mark the NAPI as missed first, so that it could be rescheduled. + * If it's not, schedule it on the corresponding CPU using IPIs (or directly + * if already running on it). + */ +void libeth_xsk_wakeup(call_single_data_t *csd, u32 qid) +{ + struct napi_struct *napi = csd->info; + + if (napi_if_scheduled_mark_missed(napi) || + unlikely(!napi_schedule_prep(napi))) + return; + + if (qid != raw_smp_processor_id()) + smp_call_function_single_async(qid, csd); + else + __napi_schedule(napi); +} +EXPORT_SYMBOL_NS_GPL(libeth_xsk_wakeup, LIBETH_XDP); + +/* Pool setup */ + +#define LIBETH_XSK_DMA_ATTR \ + (DMA_ATTR_WEAK_ORDERING | DMA_ATTR_SKIP_CPU_SYNC) + +/** + * libeth_xsk_setup_pool - setup or destroy an XSk pool for a queue + * @dev: target &net_device + * @qid: stack queue index to configure + * @enable: whether to enable or disable the pool + * + * Check that @qid is valid and then map or unmap the pool. + * + * Return: %0 on success, -errno otherwise. + */ +int libeth_xsk_setup_pool(struct net_device *dev, u32 qid, bool enable) +{ + struct xsk_buff_pool *pool; + + pool = xsk_get_pool_from_qid(dev, qid); + if (!pool) + return -EINVAL; + + if (enable) + return xsk_pool_dma_map(pool, dev->dev.parent, + LIBETH_XSK_DMA_ATTR); + else + xsk_pool_dma_unmap(pool, LIBETH_XSK_DMA_ATTR); + + return 0; +} +EXPORT_SYMBOL_NS_GPL(libeth_xsk_setup_pool, LIBETH_XDP); diff --git a/drivers/net/veth.c b/drivers/net/veth.c index 0d6d0d749d440e..048eb599a95a8d 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -634,7 +634,7 @@ static struct xdp_frame *veth_xdp_rcv_one(struct veth_rq *rq, break; case XDP_TX: orig_frame = *frame; - xdp->rxq->mem = frame->mem; + xdp->rxq->mem.type = frame->mem_type; if (unlikely(veth_xdp_tx(rq, xdp, bq) < 0)) { trace_xdp_exception(rq->dev, xdp_prog, act); frame = &orig_frame; @@ -646,7 +646,7 @@ static struct xdp_frame *veth_xdp_rcv_one(struct veth_rq *rq, goto xdp_xmit; case XDP_REDIRECT: orig_frame = *frame; - xdp->rxq->mem = frame->mem; + xdp->rxq->mem.type = frame->mem_type; if (xdp_do_redirect(rq->dev, xdp, xdp_prog)) { frame = &orig_frame; stats->rx_drops++; diff --git a/include/linux/bpf.h b/include/linux/bpf.h index bdadb0bb6cecd1..0d537d547dced9 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2542,10 +2542,10 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_frame *xdpf, int dev_map_enqueue_multi(struct xdp_frame *xdpf, struct net_device *dev_rx, struct bpf_map *map, bool exclude_ingress); int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, - struct bpf_prog *xdp_prog); + const struct bpf_prog *xdp_prog); int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb, - struct bpf_prog *xdp_prog, struct bpf_map *map, - bool exclude_ingress); + const struct bpf_prog *xdp_prog, + struct bpf_map *map, bool exclude_ingress); void __cpu_map_flush(struct list_head *flush_list); int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf, @@ -2809,15 +2809,15 @@ struct sk_buff; static inline int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, - struct bpf_prog *xdp_prog) + const struct bpf_prog *xdp_prog) { return 0; } static inline int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb, - struct bpf_prog *xdp_prog, struct bpf_map *map, - bool exclude_ingress) + const struct bpf_prog *xdp_prog, + struct bpf_map *map, bool exclude_ingress) { return 0; } diff --git a/include/linux/filter.h b/include/linux/filter.h index 7d7578a8eac10b..ee067ab1327242 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1178,17 +1178,18 @@ static inline int xdp_ok_fwd_dev(const struct net_device *fwd, * This does not appear to be a real limitation for existing software. */ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, - struct xdp_buff *xdp, struct bpf_prog *prog); + struct xdp_buff *xdp, const struct bpf_prog *prog); int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, - struct bpf_prog *prog); + const struct bpf_prog *prog); int xdp_do_redirect_frame(struct net_device *dev, struct xdp_buff *xdp, struct xdp_frame *xdpf, - struct bpf_prog *prog); + const struct bpf_prog *prog); void xdp_do_flush(void); -void bpf_warn_invalid_xdp_action(struct net_device *dev, struct bpf_prog *prog, u32 act); +void bpf_warn_invalid_xdp_action(const struct net_device *dev, + const struct bpf_prog *prog, u32 act); #ifdef CONFIG_INET struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk, diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 0aae346d919e99..87fa2b6b565b3d 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3314,6 +3314,7 @@ struct softnet_data { }; DECLARE_PER_CPU_ALIGNED(struct softnet_data, softnet_data); +DECLARE_PER_CPU(struct page_pool *, system_page_pool); #ifndef CONFIG_PREEMPT_RT static inline int dev_recursion_level(void) @@ -3950,9 +3951,9 @@ static inline void dev_consume_skb_any(struct sk_buff *skb) } u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp, - struct bpf_prog *xdp_prog); -void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog); -int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff **pskb); + const struct bpf_prog *xdp_prog); +void generic_xdp_tx(struct sk_buff *skb, const struct bpf_prog *xdp_prog); +int do_xdp_generic(const struct bpf_prog *xdp_prog, struct sk_buff **pskb); int netif_rx(struct sk_buff *skb); int __netif_rx(struct sk_buff *skb); diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 58009fa661026b..f4fe699248a21b 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -608,11 +608,19 @@ struct skb_shared_info { * Warning : all fields before dataref are cleared in __alloc_skb() */ atomic_t dataref; - unsigned int xdp_frags_size; - /* Intermediate layers must ensure that destructor_arg - * remains valid until skb destructor */ - void * destructor_arg; + union { + struct { + u32 xdp_frags_size; + u32 xdp_frags_truesize; + }; + + /* + * Intermediate layers must ensure that destructor_arg + * remains valid until skb destructor. + */ + void *destructor_arg; + }; /* must be last field, see pskb_expand_head() */ skb_frag_t frags[MAX_SKB_FRAGS]; @@ -3627,7 +3635,7 @@ static inline netmem_ref skb_frag_netmem(const skb_frag_t *frag) int skb_pp_cow_data(struct page_pool *pool, struct sk_buff **pskb, unsigned int headroom); int skb_cow_data_for_xdp(struct page_pool *pool, struct sk_buff **pskb, - struct bpf_prog *prog); + const struct bpf_prog *prog); /** * skb_frag_address - gets the address of the data contained in a paged fragment @@ -3674,7 +3682,7 @@ static inline void skb_frag_page_copy(skb_frag_t *fragto, bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t prio); /** - * skb_frag_dma_map - maps a paged fragment via the DMA API + * __skb_frag_dma_map - maps a paged fragment via the DMA API * @dev: the device to map the fragment to * @frag: the paged fragment to map * @offset: the offset within the fragment (starting at the @@ -3684,15 +3692,36 @@ bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t prio); * * Maps the page associated with @frag to @device. */ -static inline dma_addr_t skb_frag_dma_map(struct device *dev, - const skb_frag_t *frag, - size_t offset, size_t size, - enum dma_data_direction dir) +static inline dma_addr_t __skb_frag_dma_map(struct device *dev, + const skb_frag_t *frag, + size_t offset, size_t size, + enum dma_data_direction dir) { return dma_map_page(dev, skb_frag_page(frag), skb_frag_off(frag) + offset, size, dir); } +#define skb_frag_dma_map(dev, frag, ...) \ + CONCATENATE(_skb_frag_dma_map, \ + COUNT_ARGS(__VA_ARGS__))(dev, frag, ##__VA_ARGS__) + +#define __skb_frag_dma_map1(dev, frag, offset, uf, uo) ({ \ + const skb_frag_t *uf = (frag); \ + size_t uo = (offset); \ + \ + __skb_frag_dma_map(dev, uf, uo, skb_frag_size(uf) - uo, \ + DMA_TO_DEVICE); \ +}) +#define _skb_frag_dma_map1(dev, frag, offset) \ + __skb_frag_dma_map1(dev, frag, offset, __UNIQUE_ID(frag_), \ + __UNIQUE_ID(offset_)) +#define _skb_frag_dma_map0(dev, frag) \ + _skb_frag_dma_map1(dev, frag, 0) +#define _skb_frag_dma_map2(dev, frag, offset, size) \ + __skb_frag_dma_map(dev, frag, offset, size, DMA_TO_DEVICE) +#define _skb_frag_dma_map3(dev, frag, offset, size, dir) \ + __skb_frag_dma_map(dev, frag, offset, size, dir) + static inline struct sk_buff *pskb_copy(struct sk_buff *skb, gfp_t gfp_mask) { diff --git a/include/linux/unroll.h b/include/linux/unroll.h index d42fd6366373ef..2870c89a93f4cb 100644 --- a/include/linux/unroll.h +++ b/include/linux/unroll.h @@ -9,6 +9,49 @@ #include +#ifdef CONFIG_CC_IS_CLANG +#define __pick_unrolled(x, y) _Pragma(#x) +#elif CONFIG_GCC_VERSION >= 80000 +#define __pick_unrolled(x, y) _Pragma(#y) +#else +#define __pick_unrolled(x, y) /* not supported */ +#endif + +/** + * unrolled - loop attributes to ask the compiler to unroll it + * + * Usage: + * + * #define BATCH 4 + * unrolled_count(BATCH) + * for (u32 i = 0; i < BATCH; i++) + * // loop body without cross-iteration dependencies + * + * This is only a hint and the compiler is free to disable unrolling if it + * thinks the count is suboptimal and may hurt performance and/or hugely + * increase object code size. + * Not having any cross-iteration dependencies (i.e. when iter x + 1 depends + * on what iter x will do with variables) is not a strict requirement, but + * provides best performance and object code size. + * Available only on Clang and GCC 8.x onwards. + */ + +/* Ask the compiler to pick an optimal unroll count, Clang only */ +#define unrolled \ + __pick_unrolled(clang loop unroll(enable), /* nothing */) + +/* Unroll each @n iterations of a loop */ +#define unrolled_count(n) \ + __pick_unrolled(clang loop unroll_count(n), GCC unroll n) + +/* Unroll the whole loop */ +#define unrolled_full \ + __pick_unrolled(clang loop unroll(full), GCC unroll 65534) + +/* Never unroll a loop */ +#define unrolled_none \ + __pick_unrolled(clang loop unroll(disable), GCC unroll 1) + #define UNROLL(N, MACRO, args...) CONCATENATE(__UNROLL_, N)(MACRO, args) #define __UNROLL_0(MACRO, args...) diff --git a/include/net/libeth/rx.h b/include/net/libeth/rx.h index 43574bd6612f0e..148be5cd822e92 100644 --- a/include/net/libeth/rx.h +++ b/include/net/libeth/rx.h @@ -13,8 +13,10 @@ /* Space reserved in front of each frame */ #define LIBETH_SKB_HEADROOM (NET_SKB_PAD + NET_IP_ALIGN) +#define LIBETH_XDP_HEADROOM (ALIGN(XDP_PACKET_HEADROOM, NET_SKB_PAD) + \ + NET_IP_ALIGN) /* Maximum headroom for worst-case calculations */ -#define LIBETH_MAX_HEADROOM LIBETH_SKB_HEADROOM +#define LIBETH_MAX_HEADROOM LIBETH_XDP_HEADROOM /* Link layer / L2 overhead: Ethernet, 2 VLAN tags (C + S), FCS */ #define LIBETH_RX_LL_LEN (ETH_HLEN + 2 * VLAN_HLEN + ETH_FCS_LEN) /* Maximum supported L2-L4 header length */ @@ -66,6 +68,7 @@ enum libeth_fqe_type { * @count: number of descriptors/buffers the queue has * @type: type of the buffers this queue has * @hsplit: flag whether header split is enabled + * @xdp: flag indicating whether XDP is enabled * @buf_len: HW-writeable length per each buffer * @nid: ID of the closest NUMA node with memory */ @@ -81,6 +84,7 @@ struct libeth_fq { /* Cold fields */ enum libeth_fqe_type type:2; bool hsplit:1; + bool xdp:1; u32 buf_len; int nid; diff --git a/include/net/libeth/tx.h b/include/net/libeth/tx.h index 35614f9523f600..63a76ab7746d51 100644 --- a/include/net/libeth/tx.h +++ b/include/net/libeth/tx.h @@ -12,11 +12,17 @@ /** * enum libeth_sqe_type - type of &libeth_sqe to act on Tx completion - * @LIBETH_SQE_EMPTY: unused/empty, no action required + * @LIBETH_SQE_EMPTY: unused/empty OR XDP_TX/XSk frame, no action required * @LIBETH_SQE_CTX: context descriptor with empty SQE, no action required * @LIBETH_SQE_SLAB: kmalloc-allocated buffer, unmap and kfree() * @LIBETH_SQE_FRAG: mapped skb frag, only unmap DMA * @LIBETH_SQE_SKB: &sk_buff, unmap and napi_consume_skb(), update stats + * @__LIBETH_SQE_XDP_START: separator between skb and XDP types + * @LIBETH_SQE_XDP_TX: &skb_shared_info, libeth_xdp_return_buff_bulk(), stats + * @LIBETH_SQE_XDP_XMIT: &xdp_frame, unmap and xdp_return_frame_bulk(), stats + * @LIBETH_SQE_XDP_XMIT_FRAG: &xdp_frame frag, only unmap DMA + * @LIBETH_SQE_XSK_TX: &libeth_xdp_buff on XSk queue, xsk_buff_free(), stats + * @LIBETH_SQE_XSK_TX_FRAG: &libeth_xdp_buff frag on XSk queue, xsk_buff_free() */ enum libeth_sqe_type { LIBETH_SQE_EMPTY = 0U, @@ -24,6 +30,13 @@ enum libeth_sqe_type { LIBETH_SQE_SLAB, LIBETH_SQE_FRAG, LIBETH_SQE_SKB, + + __LIBETH_SQE_XDP_START, + LIBETH_SQE_XDP_TX = __LIBETH_SQE_XDP_START, + LIBETH_SQE_XDP_XMIT, + LIBETH_SQE_XDP_XMIT_FRAG, + LIBETH_SQE_XSK_TX, + LIBETH_SQE_XSK_TX_FRAG, }; /** @@ -32,6 +45,9 @@ enum libeth_sqe_type { * @rs_idx: index of the last buffer from the batch this one was sent in * @raw: slab buffer to free via kfree() * @skb: &sk_buff to consume + * @sinfo: skb shared info of an XDP_TX frame + * @xdpf: XDP frame from ::ndo_xdp_xmit() + * @xsk: XSk Rx frame from XDP_TX action * @dma: DMA address to unmap * @len: length of the mapped region to unmap * @nr_frags: number of frags in the frame this buffer belongs to @@ -46,6 +62,9 @@ struct libeth_sqe { union { void *raw; struct sk_buff *skb; + struct skb_shared_info *sinfo; + struct xdp_frame *xdpf; + struct libeth_xdp_buff *xsk; }; DEFINE_DMA_UNMAP_ADDR(dma); @@ -71,7 +90,10 @@ struct libeth_sqe { /** * struct libeth_cq_pp - completion queue poll params * @dev: &device to perform DMA unmapping + * @bq: XDP frame bulk to combine return operations * @ss: onstack NAPI stats to fill + * @xss: onstack XDPSQ NAPI stats to fill + * @xdp_tx: number of XDP-not-XSk frames processed * @napi: whether it's called from the NAPI context * * libeth uses this structure to access objects needed for performing full @@ -80,7 +102,13 @@ struct libeth_sqe { */ struct libeth_cq_pp { struct device *dev; - struct libeth_sq_napi_stats *ss; + struct xdp_frame_bulk *bq; + + union { + struct libeth_sq_napi_stats *ss; + struct libeth_xdpsq_napi_stats *xss; + }; + u32 xdp_tx; bool napi; }; @@ -126,4 +154,6 @@ static inline void libeth_tx_complete(struct libeth_sqe *sqe, sqe->type = LIBETH_SQE_EMPTY; } +void libeth_tx_complete_any(struct libeth_sqe *sqe, struct libeth_cq_pp *cp); + #endif /* __LIBETH_TX_H */ diff --git a/include/net/libeth/types.h b/include/net/libeth/types.h index 603825e451339a..27ded729467aa8 100644 --- a/include/net/libeth/types.h +++ b/include/net/libeth/types.h @@ -4,7 +4,27 @@ #ifndef __LIBETH_TYPES_H #define __LIBETH_TYPES_H -#include +#include + +/* Stats */ + +/** + * struct libeth_rq_napi_stats - "hot" counters to update in Rx polling loop + * @packets: received frames counter + * @bytes: sum of bytes of received frames above + * @fragments: sum of fragments of received S/G frames + * @raw: alias to access all the fields as an array + */ +struct libeth_rq_napi_stats { + union { + struct { + u32 packets; + u32 bytes; + u32 fragments; + }; + DECLARE_FLEX_ARRAY(u32, raw); + }; +}; /** * struct libeth_sq_napi_stats - "hot" counters to update in Tx completion loop @@ -22,4 +42,84 @@ struct libeth_sq_napi_stats { }; }; +/** + * struct libeth_xdpsq_napi_stats - "hot" counters to update in XDP Tx + * completion loop + * @packets: completed frames counter + * @bytes: sum of bytes of completed frames above + * @fragments: sum of fragments of completed S/G frames + * @raw: alias to access all the fields as an array + */ +struct libeth_xdpsq_napi_stats { + union { + struct { + u32 packets; + u32 bytes; + u32 fragments; + }; + DECLARE_FLEX_ARRAY(u32, raw); + }; +}; + +/* XDP */ + +/* + * The following structures should be embedded into driver's queue structure + * and passed to the libeth_xdp helpers, never used directly. + */ + +/* XDPSQ sharing */ + +/** + * struct libeth_xdpsq_lock - locking primitive for sharing XDPSQs + * @lock: spinlock for locking the queue + * @share: whether this particular queue is shared + */ +struct libeth_xdpsq_lock { + spinlock_t lock; + bool share; +}; + +/* XDPSQ clean-up timers */ + +/** + * struct libeth_xdpsq_timer - timer for cleaning up XDPSQs w/o interrupts + * @xdpsq: queue this timer belongs to + * @lock: lock for the queue + * @dwork: work performing cleanups + * + * XDPSQs not using interrupts but lazy cleaning, i.e. only when there's no + * space for sending the current queued frame/bulk, must fire up timers to + * make sure there are no stale buffers to free. + */ +struct libeth_xdpsq_timer { + void *xdpsq; + struct libeth_xdpsq_lock *lock; + + struct delayed_work dwork; +}; + +/* Rx polling path */ + +/** + * struct libeth_xdp_buff_stash - struct for stashing &xdp_buff onto a queue + * @data: pointer to the start of the frame, xdp_buff.data + * @headroom: frame headroom, xdp_buff.data - xdp_buff.data_hard_start + * @len: frame linear space length, xdp_buff.data_end - xdp_buff.data + * @frame_sz: truesize occupied by the frame, xdp_buff.frame_sz + * @flags: xdp_buff.flags + * + * &xdp_buff is 56 bytes long on x64, &libeth_xdp_buff is 64 bytes. This + * structure carries only necessary fields to save/restore a partially built + * frame on the queue structure to finish it during the next NAPI poll. + */ +struct libeth_xdp_buff_stash { + void *data; + u16 headroom; + u16 len; + + u32 frame_sz:24; + u32 flags:8; +} __aligned_largest; + #endif /* __LIBETH_TYPES_H */ diff --git a/include/net/libeth/xdp.h b/include/net/libeth/xdp.h new file mode 100644 index 00000000000000..dfca2680cf755f --- /dev/null +++ b/include/net/libeth/xdp.h @@ -0,0 +1,1864 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright (C) 2024 Intel Corporation */ + +#ifndef __LIBETH_XDP_H +#define __LIBETH_XDP_H + +#include +#include + +#include +#include +#include + +/* Defined as bits to be able to use them as a mask */ +enum { + LIBETH_XDP_PASS = 0U, + LIBETH_XDP_DROP = BIT(0), + LIBETH_XDP_ABORTED = BIT(1), + LIBETH_XDP_TX = BIT(2), + LIBETH_XDP_REDIRECT = BIT(3), +}; + +/* + * &xdp_buff_xsk is the largest structure &libeth_xdp_buff gets casted to, + * pick maximum pointer-compatible alignment. + */ +#define __LIBETH_XDP_BUFF_ALIGN \ + (IS_ALIGNED(sizeof(struct xdp_buff_xsk), 16) ? 16 : \ + IS_ALIGNED(sizeof(struct xdp_buff_xsk), 8) ? 8 : \ + sizeof(long)) + +/** + * struct libeth_xdp_buff - libeth extension over &xdp_buff + * @base: main &xdp_buff + * @data: shortcut for @base.data + * @desc: RQ descriptor containing metadata for this buffer + * @priv: driver-private scratchspace + * + * The main reason for this is to have a pointer to the descriptor to be able + * to quickly get frame metadata from xdpmo and driver buff-to-xdp callbacks + * (as well as bigger alignment). + * Pointer/layout-compatible with &xdp_buff and &xdp_buff_xsk. + */ +struct libeth_xdp_buff { + union { + struct xdp_buff base; + void *data; + }; + + const void *desc; + unsigned long priv[] + __aligned(__LIBETH_XDP_BUFF_ALIGN); +} __aligned(__LIBETH_XDP_BUFF_ALIGN); +static_assert(offsetof(struct libeth_xdp_buff, data) == + offsetof(struct xdp_buff_xsk, xdp.data)); +static_assert(offsetof(struct libeth_xdp_buff, desc) == + offsetof(struct xdp_buff_xsk, cb)); +static_assert(IS_ALIGNED(sizeof(struct xdp_buff_xsk), + __alignof(struct libeth_xdp_buff))); + +/** + * __LIBETH_XDP_ONSTACK_BUFF - declare a &libeth_xdp_buff on the stack + * @name: name of the variable to declare + * @...: sizeof() of the driver-private data + */ +#define __LIBETH_XDP_ONSTACK_BUFF(name, ...) \ + ___LIBETH_XDP_ONSTACK_BUFF(name, ##__VA_ARGS__) +/** + * LIBETH_XDP_ONSTACK_BUFF - declare a &libeth_xdp_buff on the stack + * @name: name of the variable to declare + * @...: type or variable name of the driver-private data + */ +#define LIBETH_XDP_ONSTACK_BUFF(name, ...) \ + __LIBETH_XDP_ONSTACK_BUFF(name, __libeth_xdp_priv_sz(__VA_ARGS__)) + +#define ___LIBETH_XDP_ONSTACK_BUFF(name, ...) \ + _DEFINE_FLEX(struct libeth_xdp_buff, name, priv, \ + LIBETH_XDP_PRIV_SZ(__VA_ARGS__ + 0), \ + /* no init */); \ + LIBETH_XDP_ASSERT_PRIV_SZ(__VA_ARGS__ + 0) + +#define __libeth_xdp_priv_sz(...) \ + CONCATENATE(__libeth_xdp_psz, COUNT_ARGS(__VA_ARGS__))(__VA_ARGS__) + +#define __libeth_xdp_psz0(...) +#define __libeth_xdp_psz1(...) sizeof(__VA_ARGS__) + +#define LIBETH_XDP_PRIV_SZ(sz) \ + (ALIGN(sz, __alignof(struct libeth_xdp_buff)) / sizeof(long)) + +/* Performs XSK_CHECK_PRIV_TYPE() */ +#define LIBETH_XDP_ASSERT_PRIV_SZ(sz) \ + static_assert(offsetofend(struct xdp_buff_xsk, cb) >= \ + struct_size_t(struct libeth_xdp_buff, priv, \ + LIBETH_XDP_PRIV_SZ(sz))) + +/* XDPSQ sharing */ + +DECLARE_STATIC_KEY_FALSE(libeth_xdpsq_share); + +/** + * libeth_xdpsq_num - calculate optimal number of XDPSQs for this device + sys + * @rxq: current number of active Rx queues + * @txq: current number of active Tx queues + * @max: maximum number of Tx queues + * + * Each RQ must have its own XDPSQ for XSk pairs, each CPU must have own XDPSQ + * for lockless sending (``XDP_TX``, .ndo_xdp_xmit()). Cap the maximum of these + * two with the number of SQs the device can have (minus used ones). + * + * Return: number of XDP Tx queues the device needs to use. + */ +static inline u32 libeth_xdpsq_num(u32 rxq, u32 txq, u32 max) +{ + return min(max(nr_cpu_ids, rxq), max - txq); +} + +/** + * libeth_xdpsq_shared - whether XDPSQs can be shared between several CPUs + * @num: number of active XDPSQs + * + * Return: true if there's no 1:1 XDPSQ/CPU association, false otherwise. + */ +static inline bool libeth_xdpsq_shared(u32 num) +{ + return num < nr_cpu_ids; +} + +/** + * libeth_xdpsq_id - get XDPSQ index corresponding to this CPU + * @qid: number of active XDPSQs + * + * Helper for libeth_xdp routines, do not use in drivers directly. + * + * Return: XDPSQ index needs to be used on this CPU. + */ +static inline u32 libeth_xdpsq_id(u32 qid) +{ + u32 ret = raw_smp_processor_id(); + + if (static_branch_unlikely(&libeth_xdpsq_share) && + libeth_xdpsq_shared(qid)) + ret %= qid; + + return ret; +} + +void __libeth_xdpsq_get(struct libeth_xdpsq_lock *lock, + const struct net_device *dev); +void __libeth_xdpsq_put(struct libeth_xdpsq_lock *lock, + const struct net_device *dev); + +#define libeth_xdpsq_get_start cpus_read_lock +#define libeth_xdpsq_get_end cpus_read_unlock + +/** + * libeth_xdpsq_get - initialize a &libeth_xdpsq_lock + * @lock: lock to initialize + * @dev: netdev which this lock belongs to + * @share: whether XDPSQs can be shared + * + * Must be called only inside a libeth_xdpsq_get_{start,put}() block. + * Tracks the current XDPSQ association and enables the static lock + * if needed. + */ +static inline void libeth_xdpsq_get(struct libeth_xdpsq_lock *lock, + const struct net_device *dev, + bool share) +{ + if (unlikely(share)) + __libeth_xdpsq_get(lock, dev); +} + +/** + * libeth_xdpsq_put - deinitialize a &libeth_xdpsq_lock + * @lock: lock to deinitialize + * @dev: netdev which this lock belongs to + * + * Must be called only inside a libeth_xdpsq_get_{start,put}() block. + * Tracks the current XDPSQ association and disables the static lock + * if needed. + */ +static inline void libeth_xdpsq_put(struct libeth_xdpsq_lock *lock, + const struct net_device *dev) +{ + if (static_branch_unlikely(&libeth_xdpsq_share) && lock->share) + __libeth_xdpsq_put(lock, dev); +} + +void __acquires(&lock->lock) +__libeth_xdpsq_lock(struct libeth_xdpsq_lock *lock); +void __releases(&lock->lock) +__libeth_xdpsq_unlock(struct libeth_xdpsq_lock *lock); + +/** + * libeth_xdpsq_lock - grab a &libeth_xdpsq_lock if needed + * @lock: lock to take + * + * Touches the underlying spinlock only if the static key is enabled + * and the queue itself is marked as shareable. + */ +static inline void libeth_xdpsq_lock(struct libeth_xdpsq_lock *lock) +{ + if (static_branch_unlikely(&libeth_xdpsq_share) && lock->share) + __libeth_xdpsq_lock(lock); +} + +/** + * libeth_xdpsq_unlock - free a &libeth_xdpsq_lock if needed + * @lock: lock to free + * + * Touches the underlying spinlock only if the static key is enabled + * and the queue itself is marked as shareable. + */ +static inline void libeth_xdpsq_unlock(struct libeth_xdpsq_lock *lock) +{ + if (static_branch_unlikely(&libeth_xdpsq_share) && lock->share) + __libeth_xdpsq_unlock(lock); +} + +/* XDPSQ clean-up timers */ + +void libeth_xdpsq_init_timer(struct libeth_xdpsq_timer *timer, void *xdpsq, + struct libeth_xdpsq_lock *lock, + void (*poll)(struct work_struct *work)); + +/** + * libeth_xdpsq_deinit_timer - deinitialize a &libeth_xdpsq_timer + * @timer: timer to deinitialize + * + * Flush and disable the underlying workqueue. + */ +static inline void libeth_xdpsq_deinit_timer(struct libeth_xdpsq_timer *timer) +{ + cancel_delayed_work_sync(&timer->dwork); +} + +/** + * libeth_xdpsq_queue_timer - run a &libeth_xdpsq_timer + * @timer: timer to queue + * + * Should be called after the queue was filled and the transmission was run + * to complete the pending buffers if no further sending will be done in a + * second (-> lazy cleaning won't happen). + * If the timer was already run, it will be requeued back to one second + * timeout again. + */ +static inline void libeth_xdpsq_queue_timer(struct libeth_xdpsq_timer *timer) +{ + mod_delayed_work_on(raw_smp_processor_id(), system_bh_highpri_wq, + &timer->dwork, HZ); +} + +/** + * libeth_xdpsq_run_timer - wrapper to run a queue clean-up on a timer event + * @work: workqueue belonging to the corresponding timer + * @poll: driver-specific completion queue poll function + * + * Run the polling function on the locked queue and requeue the timer if + * there's more work to do. + * Designed to be used via LIBETH_XDP_DEFINE_TIMER() below. + */ +static __always_inline void +libeth_xdpsq_run_timer(struct work_struct *work, + u32 (*poll)(void *xdpsq, u32 budget)) +{ + struct libeth_xdpsq_timer *timer = container_of(work, typeof(*timer), + dwork.work); + + libeth_xdpsq_lock(timer->lock); + + if (poll(timer->xdpsq, U32_MAX)) + libeth_xdpsq_queue_timer(timer); + + libeth_xdpsq_unlock(timer->lock); +} + +/* Common Tx bits */ + +/** + * enum - libeth_xdp internal Tx flags + * @LIBETH_XDP_TX_BULK: one bulk size at which it will be flushed to the queue + * @LIBETH_XDP_TX_BATCH: batch size for which the queue fill loop is unrolled + * @LIBETH_XDP_TX_DROP: indicates the send function must drop frames not sent + * @LIBETH_XDP_TX_NDO: whether the send function is called from .ndo_xdp_xmit() + * @LIBETH_XDP_TX_XSK: whether the function is called for ``XDP_TX`` for XSk + */ +enum { + LIBETH_XDP_TX_BULK = DEV_MAP_BULK_SIZE, + LIBETH_XDP_TX_BATCH = 8, + + LIBETH_XDP_TX_DROP = BIT(0), + LIBETH_XDP_TX_NDO = BIT(1), + LIBETH_XDP_TX_XSK = BIT(2), +}; + +/** + * enum - &libeth_xdp_tx_frame and &libeth_xdp_tx_desc flags + * @LIBETH_XDP_TX_LEN: only for ``XDP_TX``, [15:0] of ::len_fl is actual length + * @LIBETH_XDP_TX_CSUM: for XSk xmit, enable checksum offload + * @LIBETH_XDP_TX_XSKMD: for XSk xmit, mask of the metadata bits + * @LIBETH_XDP_TX_FIRST: indicates the frag is the first one of the frame + * @LIBETH_XDP_TX_LAST: whether the frag is the last one of the frame + * @LIBETH_XDP_TX_MULTI: whether the frame contains several frags + * @LIBETH_XDP_TX_FLAGS: only for ``XDP_TX``, [31:16] of ::len_fl is flags + */ +enum { + LIBETH_XDP_TX_LEN = GENMASK(15, 0), + + LIBETH_XDP_TX_CSUM = XDP_TXMD_FLAGS_CHECKSUM, + LIBETH_XDP_TX_XSKMD = LIBETH_XDP_TX_LEN, + + LIBETH_XDP_TX_FIRST = BIT(16), + LIBETH_XDP_TX_LAST = BIT(17), + LIBETH_XDP_TX_MULTI = BIT(18), + + LIBETH_XDP_TX_FLAGS = GENMASK(31, 16), +}; + +/** + * struct libeth_xdp_tx_frame - represents one XDP Tx element + * @data: frame start pointer for ``XDP_TX`` + * @len_fl: ``XDP_TX``, combined flags [31:16] and len [15:0] field for speed + * @soff: ``XDP_TX``, offset from @data to the start of &skb_shared_info + * @frag: one (non-head) frag for ``XDP_TX`` + * @xdpf: &xdp_frame for the head frag for .ndo_xdp_xmit() + * @dma: DMA address of the non-head frag for .ndo_xdp_xmit() + * @xsk: ``XDP_TX`` for XSk, XDP buffer for any frag + * @len: frag length for XSk ``XDP_TX`` and .ndo_xdp_xmit() + * @flags: Tx flags for the above + * @opts: combined @len + @flags for the above for speed + * @desc: XSk xmit descriptor for direct casting + */ +struct libeth_xdp_tx_frame { + union { + /* ``XDP_TX`` */ + struct { + void *data; + u32 len_fl; + u32 soff; + }; + + /* ``XDP_TX`` frag */ + skb_frag_t frag; + + /* .ndo_xdp_xmit(), XSk ``XDP_TX`` */ + struct { + union { + struct xdp_frame *xdpf; + dma_addr_t dma; + + struct libeth_xdp_buff *xsk; + }; + union { + struct { + u32 len; + u32 flags; + }; + aligned_u64 opts; + }; + }; + + /* XSk xmit */ + struct xdp_desc desc; + }; +} __aligned(sizeof(struct xdp_desc)); +static_assert(offsetof(struct libeth_xdp_tx_frame, frag.len) == + offsetof(struct libeth_xdp_tx_frame, len_fl)); +static_assert(sizeof(struct libeth_xdp_tx_frame) == sizeof(struct xdp_desc)); + +/** + * struct libeth_xdp_tx_bulk - XDP Tx frame bulk for bulk sending + * @prog: corresponding active XDP program, %NULL for .ndo_xdp_xmit() + * @dev: &net_device which the frames are transmitted on + * @xdpsq: shortcut to the corresponding driver-specific XDPSQ structure + * @act_mask: Rx only, mask of all the XDP prog verdicts for that NAPI session + * @count: current number of frames in @bulk + * @bulk: array of queued frames for bulk Tx + * + * All XDP Tx operations except XSk xmit queue each frame to the bulk first + * and flush it when @count reaches the array end. Bulk is always placed on + * the stack for performance. One bulk element contains all the data necessary + * for sending a frame and then freeing it on completion. + * For XSk xmit, Tx descriptor array from &xsk_buff_pool is casted directly + * to &libeth_xdp_tx_frame as they are compatible and the bulk structure is + * not used. + */ +struct libeth_xdp_tx_bulk { + const struct bpf_prog *prog; + struct net_device *dev; + void *xdpsq; + + u32 act_mask; + u32 count; + struct libeth_xdp_tx_frame bulk[LIBETH_XDP_TX_BULK]; +} __aligned(sizeof(struct libeth_xdp_tx_frame)); + +/** + * struct libeth_xdpsq - abstraction for an XDPSQ + * @pool: XSk buffer pool for XSk ``XDP_TX`` and xmit + * @sqes: array of Tx buffers from the actual queue struct + * @descs: opaque pointer to the HW descriptor array + * @ntu: pointer to the next free descriptor index + * @count: number of descriptors on that queue + * @pending: pointer to the number of sent-not-completed descs on that queue + * @xdp_tx: pointer to the above, but only for non-XSk-xmit frames + * @lock: corresponding XDPSQ lock + * + * Abstraction for driver-independent implementation of Tx. Placed on the stack + * and filled by the driver before the transmission, so that the generic + * functions can access and modify driver-specific resources. + */ +struct libeth_xdpsq { + struct xsk_buff_pool *pool; + struct libeth_sqe *sqes; + void *descs; + + u32 *ntu; + u32 count; + + u32 *pending; + u32 *xdp_tx; + struct libeth_xdpsq_lock *lock; +}; + +/** + * struct libeth_xdp_tx_desc - abstraction for an XDP Tx descriptor + * @addr: DMA address of the frame + * @len: length of the frame + * @flags: XDP Tx flags + * @opts: combined @len + @flags for speed + * + * Filled by the generic functions and then passed to driver-specific functions + * to fill a HW Tx descriptor, always placed on the [function] stack. + */ +struct libeth_xdp_tx_desc { + dma_addr_t addr; + union { + struct { + u32 len; + u32 flags; + }; + aligned_u64 opts; + }; +} __aligned_largest; + +/** + * libeth_xdp_ptr_to_priv - convert a pointer to a libeth_xdp u64 priv + * @ptr: pointer to convert + * + * The main sending function passes private data as the largest scalar, u64. + * Use this helper when you want to pass a pointer there. + */ +#define libeth_xdp_ptr_to_priv(ptr) ({ \ + typecheck_pointer(ptr); \ + ((u64)(uintptr_t)(ptr)); \ +}) +/** + * libeth_xdp_priv_to_ptr - convert a libeth_xdp u64 priv to a pointer + * @priv: private data to convert + * + * The main sending function passes private data as the largest scalar, u64. + * Use this helper when your callback takes this u64 and you want to convert + * it back to a pointer. + */ +#define libeth_xdp_priv_to_ptr(priv) ({ \ + static_assert(__same_type(priv, u64)); \ + ((const void *)(uintptr_t)(priv)); \ +}) + +/* + * On 64-bit systems, assigning one u64 is faster than two u32s. When ::len + * occupies lowest 32 bits (LE), whole ::opts can be assigned directly instead. + */ +#ifdef __LITTLE_ENDIAN +#define __LIBETH_WORD_ACCESS 1 +#endif +#ifdef __LIBETH_WORD_ACCESS +#define __libeth_xdp_tx_len(flen, ...) \ + .opts = ((flen) | FIELD_PREP(GENMASK_ULL(63, 32), (__VA_ARGS__ + 0))) +#else +#define __libeth_xdp_tx_len(flen, ...) \ + .len = (flen), .flags = (__VA_ARGS__ + 0) +#endif + +/** + * libeth_xdp_tx_xmit_bulk - main XDP Tx function + * @bulk: array of frames to send + * @xdpsq: pointer to the driver-specific XDPSQ struct + * @n: number of frames to send + * @unroll: whether to unroll the queue filling loop for speed + * @priv: driver-specific private data + * @prep: callback for cleaning the queue and filling abstract &libeth_xdpsq + * @fill: internal callback for filling &libeth_sqe and &libeth_xdp_tx_desc + * @xmit: callback for filling a HW descriptor with the frame info + * + * Internal abstraction for placing @n XDP Tx frames on the HW XDPSQ. Used for + * all types of frames: ``XDP_TX``, .ndo_xdp_xmit(), XSk ``XDP_TX`` and XSk + * xmit. + * @prep must lock the queue as this function releases it at the end. @unroll + * greatly increases the object code size, but also greatly increases XSk xmit + * performance; for other types of frames, it's not enabled. + * The compilers inline all those onstack abstractions to direct data accesses. + * + * Return: number of frames actually placed on the queue, <= @n. The function + * can't fail, but can send less frames if there's no enough free descriptors + * available. The actual free space is returned by @prep from the driver. + */ +static __always_inline u32 +libeth_xdp_tx_xmit_bulk(const struct libeth_xdp_tx_frame *bulk, void *xdpsq, + u32 n, bool unroll, u64 priv, + u32 (*prep)(void *xdpsq, struct libeth_xdpsq *sq), + struct libeth_xdp_tx_desc + (*fill)(struct libeth_xdp_tx_frame frm, u32 i, + const struct libeth_xdpsq *sq, u64 priv), + void (*xmit)(struct libeth_xdp_tx_desc desc, u32 i, + const struct libeth_xdpsq *sq, u64 priv)) +__releases(sq.lock) +{ + u32 this, batched, off = 0; + struct libeth_xdpsq sq; + u32 ntu, i = 0; + + n = min(n, prep(xdpsq, &sq)); + if (unlikely(!n)) + goto unlock; + + ntu = *sq.ntu; + + this = sq.count - ntu; + if (likely(this > n)) + this = n; + +again: + if (!unroll) + goto linear; + + batched = ALIGN_DOWN(this, LIBETH_XDP_TX_BATCH); + + for ( ; i < off + batched; i += LIBETH_XDP_TX_BATCH) { + u32 base = ntu + i - off; + + unrolled_count(LIBETH_XDP_TX_BATCH) + for (u32 j = 0; j < LIBETH_XDP_TX_BATCH; j++) + xmit(fill(bulk[i + j], base + j, &sq, priv), + base + j, &sq, priv); + } + + if (batched < this) { +linear: + for ( ; i < off + this; i++) + xmit(fill(bulk[i], ntu + i - off, &sq, priv), + ntu + i - off, &sq, priv); + } + + ntu += this; + if (likely(ntu < sq.count)) + goto out; + + ntu = 0; + + if (i < n) { + this = n - i; + off = i; + + goto again; + } + +out: + *sq.ntu = ntu; + *sq.pending += n; + if (sq.xdp_tx) + *sq.xdp_tx += n; + +unlock: + libeth_xdpsq_unlock(sq.lock); + + return n; +} + +/* ``XDP_TX`` bulking */ + +void libeth_xdp_return_buff_slow(struct libeth_xdp_buff *xdp); + +/** + * libeth_xdp_tx_queue_head - internal helper for queueing one ``XDP_TX`` head + * @bq: XDP Tx bulk to queue the head frag to + * @xdp: XDP buffer with the head to queue + * + * Return: false if it's the only frag of the frame, true if it's an S/G frame. + */ +static inline bool libeth_xdp_tx_queue_head(struct libeth_xdp_tx_bulk *bq, + const struct libeth_xdp_buff *xdp) +{ + const struct xdp_buff *base = &xdp->base; + + bq->bulk[bq->count++] = (typeof(*bq->bulk)){ + .data = xdp->data, + .len_fl = (base->data_end - xdp->data) | LIBETH_XDP_TX_FIRST, + .soff = xdp_data_hard_end(base) - xdp->data, + }; + + if (!xdp_buff_has_frags(base)) + return false; + + bq->bulk[bq->count - 1].len_fl |= LIBETH_XDP_TX_MULTI; + + return true; +} + +/** + * libeth_xdp_tx_queue_frag - internal helper for queueing one ``XDP_TX`` frag + * @bq: XDP Tx bulk to queue the frag to + * @frag: frag to queue + */ +static inline void libeth_xdp_tx_queue_frag(struct libeth_xdp_tx_bulk *bq, + const skb_frag_t *frag) +{ + bq->bulk[bq->count++].frag = *frag; +} + +/** + * libeth_xdp_tx_queue_bulk - internal helper for queueing one ``XDP_TX`` frame + * @bq: XDP Tx bulk to queue the frame to + * @xdp: XDP buffer to queue + * @flush_bulk: driver callback to flush the bulk to the HW queue + * + * Return: true on success, false on flush error. + */ +static __always_inline bool +libeth_xdp_tx_queue_bulk(struct libeth_xdp_tx_bulk *bq, + struct libeth_xdp_buff *xdp, + bool (*flush_bulk)(struct libeth_xdp_tx_bulk *bq, + u32 flags)) +{ + const struct skb_shared_info *sinfo; + bool ret = true; + u32 nr_frags; + + if (unlikely(bq->count == LIBETH_XDP_TX_BULK) && + unlikely(!flush_bulk(bq, 0))) { + libeth_xdp_return_buff_slow(xdp); + return false; + } + + if (!libeth_xdp_tx_queue_head(bq, xdp)) + goto out; + + sinfo = xdp_get_shared_info_from_buff(&xdp->base); + nr_frags = sinfo->nr_frags; + + for (u32 i = 0; i < nr_frags; i++) { + if (unlikely(bq->count == LIBETH_XDP_TX_BULK) && + unlikely(!flush_bulk(bq, 0))) { + ret = false; + break; + } + + libeth_xdp_tx_queue_frag(bq, &sinfo->frags[i]); + } + +out: + bq->bulk[bq->count - 1].len_fl |= LIBETH_XDP_TX_LAST; + xdp->data = NULL; + + return ret; +} + +/** + * libeth_xdp_tx_fill_stats - fill a &libeth_sqe with ``XDP_TX`` frame stats + * @sqe: SQ element to fill + * @desc: libeth_xdp Tx descriptor + * @sinfo: &skb_shared_info for this frame + * + * Internal helper for filling an SQE with the frame stats, do not use in + * drivers. Fills the number of frags and bytes for this frame. + */ +#define libeth_xdp_tx_fill_stats(sqe, desc, sinfo) \ + __libeth_xdp_tx_fill_stats(sqe, desc, sinfo, __UNIQUE_ID(sqe_), \ + __UNIQUE_ID(desc_), __UNIQUE_ID(sinfo_)) + +#define __libeth_xdp_tx_fill_stats(sqe, desc, sinfo, ue, ud, us) do { \ + const struct libeth_xdp_tx_desc *ud = (desc); \ + const struct skb_shared_info *us; \ + struct libeth_sqe *ue = (sqe); \ + \ + ue->nr_frags = 1; \ + ue->bytes = ud->len; \ + \ + if (ud->flags & LIBETH_XDP_TX_MULTI) { \ + us = (sinfo); \ + ue->nr_frags += us->nr_frags; \ + ue->bytes += us->xdp_frags_size; \ + } \ +} while (0) + +/** + * libeth_xdp_tx_fill_buf - internal helper to fill one ``XDP_TX`` &libeth_sqe + * @frm: XDP Tx frame from the bulk + * @i: index on the HW queue + * @sq: XDPSQ abstraction for the queue + * @priv: private data + * + * Return: XDP Tx descriptor with the synced DMA and other info to pass to + * the driver callback. + */ +static inline struct libeth_xdp_tx_desc +libeth_xdp_tx_fill_buf(struct libeth_xdp_tx_frame frm, u32 i, + const struct libeth_xdpsq *sq, u64 priv) +{ + struct libeth_xdp_tx_desc desc; + struct skb_shared_info *sinfo; + skb_frag_t *frag = &frm.frag; + struct libeth_sqe *sqe; + + if (frm.len_fl & LIBETH_XDP_TX_FIRST) { + sinfo = frm.data + frm.soff; + skb_frag_fill_page_desc(frag, virt_to_page(frm.data), + offset_in_page(frm.data), + frm.len_fl); + } else { + sinfo = NULL; + } + + desc = (typeof(desc)){ + .addr = page_pool_get_dma_addr(skb_frag_page(frag)) + + skb_frag_off(frag), + .len = skb_frag_size(frag) & LIBETH_XDP_TX_LEN, + .flags = skb_frag_size(frag) & LIBETH_XDP_TX_FLAGS, + }; + + dma_sync_single_for_device(skb_frag_page(frag)->pp->p.dev, desc.addr, + desc.len, DMA_BIDIRECTIONAL); + + if (!sinfo) + return desc; + + sqe = &sq->sqes[i]; + sqe->type = LIBETH_SQE_XDP_TX; + sqe->sinfo = sinfo; + libeth_xdp_tx_fill_stats(sqe, &desc, sinfo); + + return desc; +} + +void libeth_xdp_tx_exception(struct libeth_xdp_tx_bulk *bq, u32 sent, + u32 flags); + +/** + * __libeth_xdp_tx_flush_bulk - internal helper to flush one XDP Tx bulk + * @bq: bulk to flush + * @flags: XDP TX flags (.ndo_xdp_xmit(), XSk etc.) + * @prep: driver-specific callback to prepare the queue for sending + * @fill: libeth_xdp callback to fill &libeth_sqe and &libeth_xdp_tx_desc + * @xmit: driver callback to fill a HW descriptor + * + * Internal abstraction to create bulk flush functions for drivers. Used for + * everything except XSk xmit. + * + * Return: true if anything was sent, false otherwise. + */ +static __always_inline bool +__libeth_xdp_tx_flush_bulk(struct libeth_xdp_tx_bulk *bq, u32 flags, + u32 (*prep)(void *xdpsq, struct libeth_xdpsq *sq), + struct libeth_xdp_tx_desc + (*fill)(struct libeth_xdp_tx_frame frm, u32 i, + const struct libeth_xdpsq *sq, u64 priv), + void (*xmit)(struct libeth_xdp_tx_desc desc, u32 i, + const struct libeth_xdpsq *sq, + u64 priv)) +{ + u32 sent, drops; + int err = 0; + + sent = libeth_xdp_tx_xmit_bulk(bq->bulk, bq->xdpsq, + min(bq->count, LIBETH_XDP_TX_BULK), + false, 0, prep, fill, xmit); + drops = bq->count - sent; + + if (unlikely(drops)) { + libeth_xdp_tx_exception(bq, sent, flags); + err = -ENXIO; + } else { + bq->count = 0; + } + + trace_xdp_bulk_tx(bq->dev, sent, drops, err); + + return likely(sent); +} + +/** + * libeth_xdp_tx_flush_bulk - wrapper to define flush of one ``XDP_TX`` bulk + * @bq: bulk to flush + * @flags: Tx flags, see above + * @prep: driver callback to prepare the queue + * @xmit: driver callback to fill a HW descriptor + * + * Use via LIBETH_XDP_DEFINE_FLUSH_TX() to define an ``XDP_TX`` driver + * callback. + */ +#define libeth_xdp_tx_flush_bulk(bq, flags, prep, xmit) \ + __libeth_xdp_tx_flush_bulk(bq, flags, prep, libeth_xdp_tx_fill_buf, \ + xmit) + +/* .ndo_xdp_xmit() implementation */ + +/** + * libeth_xdp_xmit_init_bulk - internal helper to initialize bulk for XDP xmit + * @bq: bulk to initialize + * @dev: target &net_device + * @xdpsqs: array of driver-specific XDPSQ structs + * @num: number of active XDPSQs (the above array length) + */ +#define libeth_xdp_xmit_init_bulk(bq, dev, xdpsqs, num) \ + __libeth_xdp_xmit_init_bulk(bq, dev, (xdpsqs)[libeth_xdpsq_id(num)]) + +static inline void __libeth_xdp_xmit_init_bulk(struct libeth_xdp_tx_bulk *bq, + struct net_device *dev, + void *xdpsq) +{ + bq->dev = dev; + bq->xdpsq = xdpsq; + bq->count = 0; +} + +/** + * libeth_xdp_xmit_frame_dma - internal helper to access DMA of an &xdp_frame + * @xf: pointer to the XDP frame + * + * There's no place in &libeth_xdp_tx_frame to store DMA address for an + * &xdp_frame head. The headroom is used then, the address is placed right + * after the frame struct, naturally aligned. + * + * Return: pointer to the DMA address to use. + */ +#define libeth_xdp_xmit_frame_dma(xf) \ + _Generic((xf), \ + const struct xdp_frame *: \ + (const dma_addr_t *)__libeth_xdp_xmit_frame_dma(xf), \ + struct xdp_frame *: \ + (dma_addr_t *)__libeth_xdp_xmit_frame_dma(xf) \ + ) + +static inline void *__libeth_xdp_xmit_frame_dma(const struct xdp_frame *xdpf) +{ + void *addr = (void *)(xdpf + 1); + + if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && + __alignof(*xdpf) < sizeof(dma_addr_t)) + addr = PTR_ALIGN(addr, sizeof(dma_addr_t)); + + return addr; +} + +/** + * libeth_xdp_xmit_queue_head - internal helper for queueing one XDP xmit head + * @bq: XDP Tx bulk to queue the head frag to + * @xdpf: XDP frame with the head to queue + * @dev: device to perform DMA mapping + * + * Return: ``LIBETH_XDP_DROP`` on DMA mapping error, + * ``LIBETH_XDP_PASS`` if it's the only frag in the frame, + * ``LIBETH_XDP_TX`` if it's an S/G frame. + */ +static inline u32 libeth_xdp_xmit_queue_head(struct libeth_xdp_tx_bulk *bq, + struct xdp_frame *xdpf, + struct device *dev) +{ + dma_addr_t dma; + + dma = dma_map_single(dev, xdpf->data, xdpf->len, DMA_TO_DEVICE); + if (dma_mapping_error(dev, dma)) + return LIBETH_XDP_DROP; + + *libeth_xdp_xmit_frame_dma(xdpf) = dma; + + bq->bulk[bq->count++] = (typeof(*bq->bulk)){ + .xdpf = xdpf, + __libeth_xdp_tx_len(xdpf->len, LIBETH_XDP_TX_FIRST), + }; + + if (!xdp_frame_has_frags(xdpf)) + return LIBETH_XDP_PASS; + + bq->bulk[bq->count - 1].flags |= LIBETH_XDP_TX_MULTI; + + return LIBETH_XDP_TX; +} + +/** + * libeth_xdp_xmit_queue_frag - internal helper for queueing one XDP xmit frag + * @bq: XDP Tx bulk to queue the frag to + * @frag: frag to queue + * @dev: device to perform DMA mapping + * + * Return: true on success, false on DMA mapping error. + */ +static inline bool libeth_xdp_xmit_queue_frag(struct libeth_xdp_tx_bulk *bq, + const skb_frag_t *frag, + struct device *dev) +{ + dma_addr_t dma; + + dma = skb_frag_dma_map(dev, frag); + if (dma_mapping_error(dev, dma)) + return false; + + bq->bulk[bq->count++] = (typeof(*bq->bulk)){ + .dma = dma, + __libeth_xdp_tx_len(skb_frag_size(frag)), + }; + + return true; +} + +/** + * libeth_xdp_xmit_queue_bulk - internal helper for queueing one XDP xmit frame + * @bq: XDP Tx bulk to queue the frame to + * @xdpf: XDP frame to queue + * @flush_bulk: driver callback to flush the bulk to the HW queue + * + * Return: ``LIBETH_XDP_TX`` on success, + * ``LIBETH_XDP_DROP`` if the frame should be dropped by the stack, + * ``LIBETH_XDP_ABORTED`` if the frame will be dropped by libeth_xdp. + */ +static __always_inline u32 +libeth_xdp_xmit_queue_bulk(struct libeth_xdp_tx_bulk *bq, + struct xdp_frame *xdpf, + bool (*flush_bulk)(struct libeth_xdp_tx_bulk *bq, + u32 flags)) +{ + u32 head, nr_frags, i, ret = LIBETH_XDP_TX; + struct device *dev = bq->dev->dev.parent; + const struct skb_shared_info *sinfo; + + if (unlikely(bq->count == LIBETH_XDP_TX_BULK) && + unlikely(!flush_bulk(bq, LIBETH_XDP_TX_NDO))) + return LIBETH_XDP_DROP; + + head = libeth_xdp_xmit_queue_head(bq, xdpf, dev); + if (head == LIBETH_XDP_PASS) + goto out; + else if (head == LIBETH_XDP_DROP) + return LIBETH_XDP_DROP; + + sinfo = xdp_get_shared_info_from_frame(xdpf); + nr_frags = sinfo->nr_frags; + + for (i = 0; i < nr_frags; i++) { + if (unlikely(bq->count == LIBETH_XDP_TX_BULK) && + unlikely(!flush_bulk(bq, LIBETH_XDP_TX_NDO))) + break; + + if (!libeth_xdp_xmit_queue_frag(bq, &sinfo->frags[i], dev)) + break; + } + + if (unlikely(i < nr_frags)) + ret = LIBETH_XDP_ABORTED; + +out: + bq->bulk[bq->count - 1].flags |= LIBETH_XDP_TX_LAST; + + return ret; +} + +/** + * libeth_xdp_xmit_fill_buf - internal helper to fill one XDP xmit &libeth_sqe + * @frm: XDP Tx frame from the bulk + * @i: index on the HW queue + * @sq: XDPSQ abstraction for the queue + * @priv: private data + * + * Return: XDP Tx descriptor with the mapped DMA and other info to pass to + * the driver callback. + */ +static inline struct libeth_xdp_tx_desc +libeth_xdp_xmit_fill_buf(struct libeth_xdp_tx_frame frm, u32 i, + const struct libeth_xdpsq *sq, u64 priv) +{ + struct libeth_xdp_tx_desc desc; + struct libeth_sqe *sqe; + struct xdp_frame *xdpf; + + if (frm.flags & LIBETH_XDP_TX_FIRST) { + xdpf = frm.xdpf; + desc.addr = *libeth_xdp_xmit_frame_dma(xdpf); + } else { + xdpf = NULL; + desc.addr = frm.dma; + } + desc.opts = frm.opts; + + sqe = &sq->sqes[i]; + dma_unmap_addr_set(sqe, dma, desc.addr); + dma_unmap_len_set(sqe, len, desc.len); + + if (!xdpf) { + sqe->type = LIBETH_SQE_XDP_XMIT_FRAG; + return desc; + } + + sqe->type = LIBETH_SQE_XDP_XMIT; + sqe->xdpf = xdpf; + libeth_xdp_tx_fill_stats(sqe, &desc, + xdp_get_shared_info_from_frame(xdpf)); + + return desc; +} + +/** + * libeth_xdp_xmit_flush_bulk - wrapper to define flush of one XDP xmit bulk + * @bq: bulk to flush + * @flags: Tx flags, see __libeth_xdp_tx_flush_bulk() + * @prep: driver callback to prepare the queue + * @xmit: driver callback to fill a HW descriptor + * + * Use via LIBETH_XDP_DEFINE_FLUSH_XMIT() to define an XDP xmit driver + * callback. + */ +#define libeth_xdp_xmit_flush_bulk(bq, flags, prep, xmit) \ + __libeth_xdp_tx_flush_bulk(bq, (flags) | LIBETH_XDP_TX_NDO, prep, \ + libeth_xdp_xmit_fill_buf, xmit) + +u32 libeth_xdp_xmit_return_bulk(const struct libeth_xdp_tx_frame *bq, + u32 count, const struct net_device *dev); + +/** + * __libeth_xdp_xmit_do_bulk - internal function to implement .ndo_xdp_xmit() + * @bq: XDP Tx bulk to queue frames to + * @frames: XDP frames passed by the stack + * @n: number of frames + * @flags: flags passed by the stack + * @flush_bulk: driver callback to flush an XDP xmit bulk + * @finalize: driver callback to finalize sending XDP Tx frames on the queue + * + * Perform common checks, map the frags and queue them to the bulk, then flush + * the bulk to the XDPSQ. If requested by the stack, finalize the queue. + * + * Return: number of frames send or -errno on error. + */ +static __always_inline int +__libeth_xdp_xmit_do_bulk(struct libeth_xdp_tx_bulk *bq, + struct xdp_frame **frames, u32 n, u32 flags, + bool (*flush_bulk)(struct libeth_xdp_tx_bulk *bq, + u32 flags), + void (*finalize)(void *xdpsq, bool sent, bool flush)) +{ + u32 nxmit = 0; + + if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK)) + return -EINVAL; + + for (u32 i = 0; likely(i < n); i++) { + u32 ret; + + ret = libeth_xdp_xmit_queue_bulk(bq, frames[i], flush_bulk); + if (unlikely(ret != LIBETH_XDP_TX)) { + nxmit += ret == LIBETH_XDP_ABORTED; + break; + } + + nxmit++; + } + + if (bq->count) { + flush_bulk(bq, LIBETH_XDP_TX_NDO); + if (unlikely(bq->count)) + nxmit -= libeth_xdp_xmit_return_bulk(bq->bulk, + bq->count, + bq->dev); + } + + finalize(bq->xdpsq, nxmit, flags & XDP_XMIT_FLUSH); + + return nxmit; +} + +/** + * libeth_xdp_xmit_do_bulk - implement full .ndo_xdp_xmit() in driver + * @dev: target &net_device + * @n: number of frames to send + * @fr: XDP frames to send + * @f: flags passed by the stack + * @xqs: array of XDPSQs driver structs + * @nqs: number of active XDPSQs, the above array length + * @fl: driver callback to flush an XDP xmit bulk + * @fin: driver cabback to finalize the queue + * + * If the driver has active XDPSQs, perform common checks and send the frames. + * Finalize the queue, if requested. + * + * Return: number of frames sent or -errno on error. + */ +#define libeth_xdp_xmit_do_bulk(dev, n, fr, f, xqs, nqs, fl, fin) \ + _libeth_xdp_xmit_do_bulk(dev, n, fr, f, xqs, nqs, fl, fin, \ + __UNIQUE_ID(bq_), __UNIQUE_ID(ret_), \ + __UNIQUE_ID(nqs_)) + +#define _libeth_xdp_xmit_do_bulk(d, n, fr, f, xqs, nqs, fl, fin, ub, ur, un) \ +({ \ + u32 un = (nqs); \ + int ur; \ + \ + if (likely(un)) { \ + struct libeth_xdp_tx_bulk ub; \ + \ + libeth_xdp_xmit_init_bulk(&ub, d, xqs, un); \ + ur = __libeth_xdp_xmit_do_bulk(&ub, fr, n, f, fl, fin); \ + } else { \ + ur = -ENXIO; \ + } \ + \ + ur; \ +}) + +/* Rx polling path */ + +/** + * libeth_xdp_tx_init_bulk - initialize an XDP Tx bulk for Rx NAPI poll + * @bq: bulk to initialize + * @prog: RCU pointer to the XDP program (can be %NULL) + * @dev: target &net_device + * @xdpsqs: array of driver XDPSQ structs + * @num: number of active XDPSQs, the above array length + * + * Should be called on an onstack XDP Tx bulk before the NAPI polling loop. + * Initializes all the needed fields to run libeth_xdp functions. If @num == 0, + * assumes XDP is not enabled. + * Do not use for XSk, it has its own optimized helper. + */ +#define libeth_xdp_tx_init_bulk(bq, prog, dev, xdpsqs, num) \ + __libeth_xdp_tx_init_bulk(bq, prog, dev, xdpsqs, num, false, \ + __UNIQUE_ID(bq_), __UNIQUE_ID(nqs_)) + +#define __libeth_xdp_tx_init_bulk(bq, pr, d, xdpsqs, num, xsk, ub, un) do { \ + typeof(bq) ub = (bq); \ + u32 un = (num); \ + \ + if (un || (xsk)) { \ + ub->prog = rcu_dereference(pr); \ + ub->dev = (d); \ + ub->xdpsq = (xdpsqs)[libeth_xdpsq_id(un)]; \ + } else { \ + ub->prog = NULL; \ + } \ + \ + ub->act_mask = 0; \ + ub->count = 0; \ +} while (0) + +void libeth_xdp_load_stash(struct libeth_xdp_buff *dst, + const struct libeth_xdp_buff_stash *src); +void libeth_xdp_save_stash(struct libeth_xdp_buff_stash *dst, + const struct libeth_xdp_buff *src); +void __libeth_xdp_return_stash(struct libeth_xdp_buff_stash *stash); + +/** + * libeth_xdp_init_buff - initialize a &libeth_xdp_buff for Rx NAPI poll + * @dst: onstack buffer to initialize + * @src: XDP buffer stash placed on the queue + * @rxq: registered &xdp_rxq_info corresponding to this queue + * + * Should be called before the main NAPI polling loop. Loads the content of + * the previously saved stash or initializes the buffer from scratch. + * Do not use for XSk. + */ +static inline void +libeth_xdp_init_buff(struct libeth_xdp_buff *dst, + const struct libeth_xdp_buff_stash *src, + struct xdp_rxq_info *rxq) +{ + if (likely(!src->data)) + dst->data = NULL; + else + libeth_xdp_load_stash(dst, src); + + dst->base.rxq = rxq; +} + +/** + * libeth_xdp_save_buff - save a partially built buffer on a queue + * @dst: XDP buffer stash placed on the queue + * @src: onstack buffer to save + * + * Should be called after the main NAPI polling loop. If the loop exited before + * the buffer was finished, saves its content on the queue, so that it can be + * completed during the next poll. Otherwise, clears the stash. + */ +static inline void libeth_xdp_save_buff(struct libeth_xdp_buff_stash *dst, + const struct libeth_xdp_buff *src) +{ + if (likely(!src->data)) + dst->data = NULL; + else + libeth_xdp_save_stash(dst, src); +} + +/** + * libeth_xdp_return_stash - free an XDP buffer stash from a queue + * @stash: stash to free + * + * If the queue is about to be destroyed, but it still has an incompleted + * buffer stash, this helper should be called to free it. + */ +static inline void libeth_xdp_return_stash(struct libeth_xdp_buff_stash *stash) +{ + if (stash->data) + __libeth_xdp_return_stash(stash); +} + +static inline void libeth_xdp_return_va(const void *data, bool napi) +{ + struct page *page = virt_to_page(data); + + page_pool_put_full_page(page->pp, page, napi); +} + +static inline void libeth_xdp_return_frags(const struct skb_shared_info *sinfo, + bool napi) +{ + for (u32 i = 0; i < sinfo->nr_frags; i++) { + struct page *page = skb_frag_page(&sinfo->frags[i]); + + page_pool_put_full_page(page->pp, page, napi); + } +} + +/** + * libeth_xdp_return_buff - free/recycle a &libeth_xdp_buff + * @xdp: buffer to free + * + * Hotpath helper to free a &libeth_xdp_buff. Comparing to xdp_return_buff(), + * it's faster as it gets inlined and always assumes order-0 pages and safe + * direct recycling. Zeroes @xdp->data to avoid UAFs. + */ +static inline void libeth_xdp_return_buff(struct libeth_xdp_buff *xdp) +{ + if (!xdp_buff_has_frags(&xdp->base)) + goto out; + + libeth_xdp_return_frags(xdp_get_shared_info_from_buff(&xdp->base), + true); + +out: + libeth_xdp_return_va(xdp->data, true); + xdp->data = NULL; +} + +bool libeth_xdp_buff_add_frag(struct libeth_xdp_buff *xdp, + const struct libeth_fqe *fqe, + u32 len); + +/** + * libeth_xdp_prepare_buff - fill a &libeth_xdp_buff with a head FQE data + * @xdp: XDP buffer to attach the head to + * @fqe: FQE containing the head buffer + * @len: buffer len passed from HW + * + * Internal, use libeth_xdp_process_buff() instead. Initializes XDP buffer + * head with the Rx buffer data: data pointer, length, headroom, and + * truesize/tailroom. Zeroes the flags. + * Uses faster single u64 write instead of per-field access. + */ +static inline void libeth_xdp_prepare_buff(struct libeth_xdp_buff *xdp, + const struct libeth_fqe *fqe, + u32 len) +{ + const struct page *page = fqe->page; + +#ifdef __LIBETH_WORD_ACCESS + static_assert(offsetofend(typeof(xdp->base), flags) - + offsetof(typeof(xdp->base), frame_sz) == + sizeof(u64)); + + *(u64 *)&xdp->base.frame_sz = fqe->truesize; +#else + xdp_init_buff(&xdp->base, fqe->truesize, xdp->base.rxq); +#endif + xdp_prepare_buff(&xdp->base, page_address(page) + fqe->offset, + page->pp->p.offset, len, true); +} + +/** + * libeth_xdp_process_buff - attach an Rx buffer to a &libeth_xdp_buff + * @xdp: XDP buffer to attach the Rx buffer to + * @fqe: Rx buffer to process + * @len: received data length from the descriptor + * + * If the XDP buffer is empty, attaches the Rx buffer as head and initializes + * the required fields. Otherwise, attaches the buffer as a frag. + * Already performs DMA sync-for-CPU and frame start prefetch + * (for head buffers only). + * + * Return: true on success, false if the descriptor must be skipped (empty or + * no space for a new frag). + */ +static inline bool libeth_xdp_process_buff(struct libeth_xdp_buff *xdp, + const struct libeth_fqe *fqe, + u32 len) +{ + if (!libeth_rx_sync_for_cpu(fqe, len)) + return false; + + if (xdp->data) + return libeth_xdp_buff_add_frag(xdp, fqe, len); + + libeth_xdp_prepare_buff(xdp, fqe, len); + + prefetch(xdp->data); + + return true; +} + +/** + * libeth_xdp_buff_stats_frags - update onstack RQ stats with XDP frags info + * @ss: onstack stats to update + * @xdp: buffer to account + * + * Internal helper used by __libeth_xdp_run_pass(), do not call directly. + * Adds buffer's frags count and total len to the onstack stats. + */ +static inline void +libeth_xdp_buff_stats_frags(struct libeth_rq_napi_stats *ss, + const struct libeth_xdp_buff *xdp) +{ + const struct skb_shared_info *sinfo; + + sinfo = xdp_get_shared_info_from_buff(&xdp->base); + ss->bytes += sinfo->xdp_frags_size; + ss->fragments += sinfo->nr_frags + 1; +} + +u32 libeth_xdp_prog_exception(const struct libeth_xdp_tx_bulk *bq, + struct libeth_xdp_buff *xdp, + enum xdp_action act, int ret); + +/** + * __libeth_xdp_run_prog - run XDP program on an XDP buffer + * @xdp: XDP buffer to run the prog on + * @bq: buffer bulk for ``XDP_TX`` queueing + * + * Internal inline abstraction to run XDP program. Handles ``XDP_DROP`` + * and ``XDP_REDIRECT`` only, the rest is processed levels up. + * Reports an XDP prog exception on errors. + * + * Return: libeth_xdp prog verdict depending on the prog's verdict. + */ +static __always_inline u32 +__libeth_xdp_run_prog(struct libeth_xdp_buff *xdp, + const struct libeth_xdp_tx_bulk *bq) +{ + enum xdp_action act; + + act = bpf_prog_run_xdp(bq->prog, &xdp->base); + if (unlikely(act < XDP_DROP || act > XDP_REDIRECT)) + goto out; + + switch (act) { + case XDP_PASS: + return LIBETH_XDP_PASS; + case XDP_DROP: + libeth_xdp_return_buff(xdp); + + return LIBETH_XDP_DROP; + case XDP_TX: + return LIBETH_XDP_TX; + case XDP_REDIRECT: + if (unlikely(xdp_do_redirect(bq->dev, &xdp->base, bq->prog))) + break; + + xdp->data = NULL; + + return LIBETH_XDP_REDIRECT; + default: + break; + } + +out: + return libeth_xdp_prog_exception(bq, xdp, act, 0); +} + +/** + * __libeth_xdp_run_flush - run XDP program and handle ``XDP_TX`` verdict + * @xdp: XDP buffer to run the prog on + * @bq: buffer bulk for ``XDP_TX`` queueing + * @run: internal callback for running XDP program + * @queue: internal callback for queuing ``XDP_TX`` frame + * @flush_bulk: driver callback for flushing a bulk + * + * Internal inline abstraction to run XDP program and additionally handle + * ``XDP_TX`` verdict. Used by both XDP and XSk, hence @run and @queue. + * Do not use directly. + * + * Return: libeth_xdp prog verdict depending on the prog's verdict. + */ +static __always_inline u32 +__libeth_xdp_run_flush(struct libeth_xdp_buff *xdp, + struct libeth_xdp_tx_bulk *bq, + u32 (*run)(struct libeth_xdp_buff *xdp, + const struct libeth_xdp_tx_bulk *bq), + bool (*queue)(struct libeth_xdp_tx_bulk *bq, + struct libeth_xdp_buff *xdp, + bool (*flush_bulk) + (struct libeth_xdp_tx_bulk *bq, + u32 flags)), + bool (*flush_bulk)(struct libeth_xdp_tx_bulk *bq, + u32 flags)) +{ + u32 act; + + act = run(xdp, bq); + if (act == LIBETH_XDP_TX && unlikely(!queue(bq, xdp, flush_bulk))) + act = LIBETH_XDP_DROP; + + bq->act_mask |= act; + + return act; +} + +/** + * libeth_xdp_run_prog - run XDP program (non-XSk path) and handle all verdicts + * @xdp: XDP buffer to process + * @bq: XDP Tx bulk to queue ``XDP_TX`` buffers + * @fl: driver ``XDP_TX`` bulk flush callback + * + * Run the attached XDP program and handle all possible verdicts. XSk has its + * own version. + * Prefer using it via LIBETH_XDP_DEFINE_RUN{,_PASS,_PROG}(). + * + * Return: true if the buffer should be passed up the stack, false if the poll + * should go to the next buffer. + */ +#define libeth_xdp_run_prog(xdp, bq, fl) \ + (__libeth_xdp_run_flush(xdp, bq, __libeth_xdp_run_prog, \ + libeth_xdp_tx_queue_bulk, \ + fl) == LIBETH_XDP_PASS) + +/** + * __libeth_xdp_run_pass - helper to run XDP program and handle the result + * @xdp: XDP buffer to process + * @bq: XDP Tx bulk to queue ``XDP_TX`` frames + * @napi: NAPI to build an skb and pass it up the stack + * @rs: onstack libeth RQ stats + * @md: metadata that should be filled to the XDP buffer + * @prep: callback for filling the metadata + * @run: driver wrapper to run XDP program + * @populate: driver callback to populate an skb with the HW descriptor data + * + * Inline abstraction that does the following (non-XSk path): + * 1) adds frame size and frag number (if needed) to the onstack stats; + * 2) fills the descriptor metadata to the onstack &libeth_xdp_buff + * 3) runs XDP program if present; + * 4) handles all possible verdicts; + * 5) on ``XDP_PASS`, builds an skb from the buffer; + * 6) populates it with the descriptor metadata; + * 7) passes it up the stack. + * + * In most cases, number 2 means just writing the pointer to the HW descriptor + * to the XDP buffer. If so, please use LIBETH_XDP_DEFINE_RUN{,_PASS}() + * wrappers to build a driver function. + */ +static __always_inline void +__libeth_xdp_run_pass(struct libeth_xdp_buff *xdp, + struct libeth_xdp_tx_bulk *bq, struct napi_struct *napi, + struct libeth_rq_napi_stats *rs, const void *md, + void (*prep)(struct libeth_xdp_buff *xdp, + const void *md), + bool (*run)(struct libeth_xdp_buff *xdp, + struct libeth_xdp_tx_bulk *bq), + bool (*populate)(struct sk_buff *skb, + const struct libeth_xdp_buff *xdp, + struct libeth_rq_napi_stats *rs)) +{ + struct sk_buff *skb; + + rs->bytes += xdp->base.data_end - xdp->data; + rs->packets++; + + if (xdp_buff_has_frags(&xdp->base)) + libeth_xdp_buff_stats_frags(rs, xdp); + + if (prep && (!__builtin_constant_p(!!md) || md)) + prep(xdp, md); + + if (!bq || !run || !bq->prog) + goto build; + + if (!run(xdp, bq)) + return; + +build: + skb = xdp_build_skb_from_buff(&xdp->base); + if (unlikely(!skb)) { + libeth_xdp_return_buff_slow(xdp); + return; + } + + xdp->data = NULL; + + if (unlikely(!populate(skb, xdp, rs))) { + napi_consume_skb(skb, true); + return; + } + + napi_gro_receive(napi, skb); +} + +static inline void libeth_xdp_prep_desc(struct libeth_xdp_buff *xdp, + const void *desc) +{ + xdp->desc = desc; +} + +/** + * libeth_xdp_run_pass - helper to run XDP program and handle the result + * @xdp: XDP buffer to process + * @bq: XDP Tx bulk to queue ``XDP_TX`` frames + * @napi: NAPI to build an skb and pass it up the stack + * @ss: onstack libeth RQ stats + * @desc: pointer to the HW descriptor for that frame + * @run: driver wrapper to run XDP program + * @populate: driver callback to populate an skb with the HW descriptor data + * + * Wrapper around the underscored version when "fill the descriptor metadata" + * means just writing the pointer to the HW descriptor as @xdp->desc. + */ +#define libeth_xdp_run_pass(xdp, bq, napi, ss, desc, run, populate) \ + __libeth_xdp_run_pass(xdp, bq, napi, ss, desc, libeth_xdp_prep_desc, \ + run, populate) + +/** + * libeth_xdp_finalize_rx - finalize XDPSQ after a NAPI polling loop (non-XSk) + * @bq: ``XDP_TX`` frame bulk + * @flush: driver callback to flush the bulk + * @finalize: driver callback to start sending the frames and run the timer + * + * Flush the bulk if there are frames left to send, kick the queue and flush + * the XDP maps. + */ +#define libeth_xdp_finalize_rx(bq, flush, finalize) \ + __libeth_xdp_finalize_rx(bq, 0, flush, finalize) + +static __always_inline void +__libeth_xdp_finalize_rx(struct libeth_xdp_tx_bulk *bq, u32 flags, + bool (*flush_bulk)(struct libeth_xdp_tx_bulk *bq, + u32 flags), + void (*finalize)(void *xdpsq, bool sent, bool flush)) +{ + if (bq->act_mask & LIBETH_XDP_TX) { + if (bq->count) + flush_bulk(bq, flags | LIBETH_XDP_TX_DROP); + finalize(bq->xdpsq, true, true); + } + if (bq->act_mask & LIBETH_XDP_REDIRECT) + xdp_do_flush(); +} + +/* + * Helpers to reduce boilerplate code in drivers. + * + * Typical driver Rx flow would be (excl. bulk and buff init, frag attach): + * + * LIBETH_XDP_DEFINE_START(); + * LIBETH_XDP_DEFINE_FLUSH_TX(static driver_xdp_flush_tx, driver_xdp_tx_prep, + * driver_xdp_xmit); + * LIBETH_XDP_DEFINE_RUN(static driver_xdp_run, driver_xdp_run_prog, + * driver_xdp_flush_tx, driver_populate_skb); + * LIBETH_XDP_DEFINE_FINALIZE(static driver_xdp_finalize_rx, + * driver_xdp_flush_tx, driver_xdp_finalize_sq); + * LIBETH_XDP_DEFINE_END(); + * + * This will build a set of 4 static functions. The compiler is free to decide + * whether to inline them. + * Then, in the NAPI polling function: + * + * while (packets < budget) { + * // ... + * driver_xdp_run(xdp, &bq, napi, &rs, desc); + * } + * driver_xdp_finalize_rx(&bq); + */ + +#define LIBETH_XDP_DEFINE_START() \ + __diag_push(); \ + __diag_ignore(GCC, 8, "-Wold-style-declaration", \ + "Allow specifying \'static\' after the return type") + +/** + * LIBETH_XDP_DEFINE_TIMER - define a driver XDPSQ cleanup timer callback + * @name: name of the function to define + * @poll: Tx polling/completion function + */ +#define LIBETH_XDP_DEFINE_TIMER(name, poll) \ +void name(struct work_struct *work) \ +{ \ + libeth_xdpsq_run_timer(work, poll); \ +} + +/** + * LIBETH_XDP_DEFINE_FLUSH_TX - define a driver ``XDP_TX`` bulk flush function + * @name: name of the function to define + * @prep: driver callback to clean an XDPSQ + * @xmit: driver callback to write a HW Tx descriptor + */ +#define LIBETH_XDP_DEFINE_FLUSH_TX(name, prep, xmit) \ + __LIBETH_XDP_DEFINE_FLUSH_TX(name, prep, xmit, xdp) + +#define __LIBETH_XDP_DEFINE_FLUSH_TX(name, prep, xmit, pfx) \ +bool name(struct libeth_xdp_tx_bulk *bq, u32 flags) \ +{ \ + return libeth_##pfx##_tx_flush_bulk(bq, flags, prep, xmit); \ +} + +/** + * LIBETH_XDP_DEFINE_FLUSH_XMIT - define a driver XDP xmit bulk flush function + * @name: name of the function to define + * @prep: driver callback to clean an XDPSQ + * @xmit: driver callback to write a HW Tx descriptor + */ +#define LIBETH_XDP_DEFINE_FLUSH_XMIT(name, prep, xmit) \ +bool name(struct libeth_xdp_tx_bulk *bq, u32 flags) \ +{ \ + return libeth_xdp_xmit_flush_bulk(bq, flags, prep, xmit); \ +} + +/** + * LIBETH_XDP_DEFINE_RUN_PROG - define a driver XDP program run function + * @name: name of the function to define + * @flush: driver callback to flush an ``XDP_TX`` bulk + */ +#define LIBETH_XDP_DEFINE_RUN_PROG(name, flush) \ + bool __LIBETH_XDP_DEFINE_RUN_PROG(name, flush, xdp) + +#define __LIBETH_XDP_DEFINE_RUN_PROG(name, flush, pfx) \ +name(struct libeth_xdp_buff *xdp, struct libeth_xdp_tx_bulk *bq) \ +{ \ + return libeth_##pfx##_run_prog(xdp, bq, flush); \ +} + +/** + * LIBETH_XDP_DEFINE_RUN_PASS - define a driver buffer process + pass function + * @name: name of the function to define + * @run: driver callback to run XDP program (above) + * @populate: driver callback to fill an skb with HW descriptor info + */ +#define LIBETH_XDP_DEFINE_RUN_PASS(name, run, populate) \ + void __LIBETH_XDP_DEFINE_RUN_PASS(name, run, populate, xdp) + +#define __LIBETH_XDP_DEFINE_RUN_PASS(name, run, populate, pfx) \ +name(struct libeth_xdp_buff *xdp, struct libeth_xdp_tx_bulk *bq, \ + struct napi_struct *napi, struct libeth_rq_napi_stats *ss, \ + const void *desc) \ +{ \ + return libeth_##pfx##_run_pass(xdp, bq, napi, ss, desc, run, \ + populate); \ +} + +/** + * LIBETH_XDP_DEFINE_RUN - define a driver buffer process, run + pass function + * @name: name of the function to define + * @run: name of the XDP prog run function to define + * @flush: driver callback to flush an ``XDP_TX`` bulk + * @populate: driver callback to fill an skb with HW descriptor info + */ +#define LIBETH_XDP_DEFINE_RUN(name, run, flush, populate) \ + __LIBETH_XDP_DEFINE_RUN(name, run, flush, populate, XDP) + +#define __LIBETH_XDP_DEFINE_RUN(name, run, flush, populate, pfx) \ + LIBETH_##pfx##_DEFINE_RUN_PROG(static run, flush); \ + LIBETH_##pfx##_DEFINE_RUN_PASS(name, run, populate) + +/** + * LIBETH_XDP_DEFINE_FINALIZE - define a driver Rx NAPI poll finalize function + * @name: name of the function to define + * @flush: driver callback to flush an ``XDP_TX`` bulk + * @finalize: driver callback to finalize an XDPSQ and run the timer + */ +#define LIBETH_XDP_DEFINE_FINALIZE(name, flush, finalize) \ + __LIBETH_XDP_DEFINE_FINALIZE(name, flush, finalize, xdp) + +#define __LIBETH_XDP_DEFINE_FINALIZE(name, flush, finalize, pfx) \ +void name(struct libeth_xdp_tx_bulk *bq) \ +{ \ + libeth_##pfx##_finalize_rx(bq, flush, finalize); \ +} + +#define LIBETH_XDP_DEFINE_END() __diag_pop() + +/* XMO */ + +/** + * libeth_xdp_buff_to_rq - get RQ pointer from an XDP buffer pointer + * @xdp: &libeth_xdp_buff corresponding to the queue + * @type: typeof() of the driver Rx queue structure + * @member: name of &xdp_rxq_info inside @type + * + * Often times, pointer to the RQ is needed when reading/filling metadata from + * HW descriptors. The helper can be used to quickly jump from an XDP buffer + * to the queue corresponding to its &xdp_rxq_info without introducing + * additional fields (&libeth_xdp_buff is precisely 1 cacheline long on x64). + */ +#define libeth_xdp_buff_to_rq(xdp, type, member) \ + container_of_const((xdp)->base.rxq, type, member) + +/** + * libeth_xdpmo_rx_hash - convert &libeth_rx_pt to an XDP RSS hash metadata + * @hash: pointer to the variable to write the hash to + * @rss_type: pointer to the variable to write the hash type to + * @val: hash value from the HW descriptor + * @pt: libeth parsed packet type + * + * Handle zeroed/non-available hash and convert libeth parsed packet type to + * the corresponding XDP RSS hash type. To be called at the end of + * xdp_metadata_ops idpf_xdpmo::xmo_rx_hash() implementation. + * Note that if the driver doesn't use a constant packet type lookup table but + * generates it at runtime, it must call libeth_rx_pt_gen_hash_type(pt) to + * generate XDP RSS hash type for each packet type. + * + * Return: 0 on success, -ENODATA when the hash is not available. + */ +static inline int libeth_xdpmo_rx_hash(u32 *hash, + enum xdp_rss_hash_type *rss_type, + u32 val, struct libeth_rx_pt pt) +{ + if (unlikely(!val)) + return -ENODATA; + + *hash = val; + *rss_type = pt.hash_type; + + return 0; +} + +/* Tx buffer completion */ + +void libeth_xdp_return_buff_bulk(const struct skb_shared_info *sinfo, + struct xdp_frame_bulk *bq, bool frags); +void libeth_xsk_buff_free_slow(struct libeth_xdp_buff *xdp); + +/** + * __libeth_xdp_complete_tx - complete a sent XDPSQE + * @sqe: SQ element / Tx buffer to complete + * @cp: Tx polling/completion params + * @bulk: internal callback to bulk-free ``XDP_TX`` buffers + * @xsk: internal callback to free XSk ``XDP_TX`` buffers + * + * Use the non-underscored version in drivers instead. This one is shared + * internally with libeth_tx_complete_any(). + * Complete an XDPSQE of any type of XDP frame. This includes DMA unmapping + * when needed, buffer freeing, stats update, and SQE invalidating. + */ +static __always_inline void +__libeth_xdp_complete_tx(struct libeth_sqe *sqe, struct libeth_cq_pp *cp, + typeof(libeth_xdp_return_buff_bulk) bulk, + typeof(libeth_xsk_buff_free_slow) xsk) +{ + enum libeth_sqe_type type = sqe->type; + + switch (type) { + case LIBETH_SQE_EMPTY: + return; + case LIBETH_SQE_XDP_XMIT: + case LIBETH_SQE_XDP_XMIT_FRAG: + dma_unmap_page(cp->dev, dma_unmap_addr(sqe, dma), + dma_unmap_len(sqe, len), DMA_TO_DEVICE); + break; + default: + break; + } + + switch (type) { + case LIBETH_SQE_XDP_TX: + bulk(sqe->sinfo, cp->bq, sqe->nr_frags != 1); + break; + case LIBETH_SQE_XDP_XMIT: + xdp_return_frame_bulk(sqe->xdpf, cp->bq); + break; + case LIBETH_SQE_XSK_TX: + case LIBETH_SQE_XSK_TX_FRAG: + xsk(sqe->xsk); + break; + default: + break; + } + + switch (type) { + case LIBETH_SQE_XDP_TX: + case LIBETH_SQE_XDP_XMIT: + case LIBETH_SQE_XSK_TX: + cp->xdp_tx -= sqe->nr_frags; + + cp->xss->packets++; + cp->xss->bytes += sqe->bytes; + break; + default: + break; + } + + sqe->type = LIBETH_SQE_EMPTY; +} + +static inline void libeth_xdp_complete_tx(struct libeth_sqe *sqe, + struct libeth_cq_pp *cp) +{ + __libeth_xdp_complete_tx(sqe, cp, libeth_xdp_return_buff_bulk, + libeth_xsk_buff_free_slow); +} + +/* Misc */ + +u32 libeth_xdp_queue_threshold(u32 count); + +void __libeth_xdp_set_features(struct net_device *dev, + const struct xdp_metadata_ops *xmo, + u32 zc_segs, + const struct xsk_tx_metadata_ops *tmo); +void libeth_xdp_set_redirect(struct net_device *dev, bool enable); + +/** + * libeth_xdp_set_features - set XDP features for a netdev + * @dev: &net_device to configure + * @...: optional params, see __libeth_xdp_set_features() + * + * Set all the features libeth_xdp supports, including .ndo_xdp_xmit(). That + * said, it should be used only when XDPSQs are always available regardless + * of whether an XDP prog is attached to @dev. + */ +#define libeth_xdp_set_features(dev, ...) \ + CONCATENATE(__libeth_xdp_feat, \ + COUNT_ARGS(__VA_ARGS__))(dev, ##__VA_ARGS__) + +#define __libeth_xdp_feat0(dev) \ + __libeth_xdp_set_features(dev, NULL, 0, NULL) +#define __libeth_xdp_feat1(dev, xmo) \ + __libeth_xdp_set_features(dev, xmo, 0, NULL) +#define __libeth_xdp_feat2(dev, xmo, zc_segs) \ + __libeth_xdp_set_features(dev, xmo, zc_segs, NULL) +#define __libeth_xdp_feat3(dev, xmo, zc_segs, tmo) \ + __libeth_xdp_set_features(dev, xmo, zc_segs, tmo) + +/** + * libeth_xdp_set_features_noredir - enable all libeth_xdp features w/o redir + * @dev: target &net_device + * @...: optional params, see __libeth_xdp_set_features() + * + * Enable everything except the .ndo_xdp_xmit() feature, use when XDPSQs are + * not available right after netdev registration. + */ +#define libeth_xdp_set_features_noredir(dev, ...) \ + __libeth_xdp_set_features_noredir(dev, __UNIQUE_ID(dev_), \ + ##__VA_ARGS__) + +#define __libeth_xdp_set_features_noredir(dev, ud, ...) do { \ + struct net_device *ud = (dev); \ + \ + libeth_xdp_set_features(ud, ##__VA_ARGS__); \ + libeth_xdp_set_redirect(ud, false); \ +} while (0) + +#define libeth_xsktmo ((const void *)true) + +#endif /* __LIBETH_XDP_H */ diff --git a/include/net/libeth/xsk.h b/include/net/libeth/xsk.h new file mode 100644 index 00000000000000..3d5d20a6bdde1e --- /dev/null +++ b/include/net/libeth/xsk.h @@ -0,0 +1,684 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright (C) 2024 Intel Corporation */ + +#ifndef __LIBETH_XSK_H +#define __LIBETH_XSK_H + +#include +#include + +/* ``XDP_TXMD_FLAGS_VALID`` is defined only under ``CONFIG_XDP_SOCKETS`` */ +#ifdef XDP_TXMD_FLAGS_VALID +static_assert(XDP_TXMD_FLAGS_VALID <= LIBETH_XDP_TX_XSKMD); +#endif + +/* ``XDP_TX`` bulking */ + +/** + * libeth_xsk_tx_queue_head - internal helper for queueing XSk ``XDP_TX`` head + * @bq: XDP Tx bulk to queue the head frag to + * @xdp: XSk buffer with the head to queue + * + * Return: false if it's the only frag of the frame, true if it's an S/G frame. + */ +static inline bool libeth_xsk_tx_queue_head(struct libeth_xdp_tx_bulk *bq, + struct libeth_xdp_buff *xdp) +{ + bq->bulk[bq->count++] = (typeof(*bq->bulk)){ + .xsk = xdp, + __libeth_xdp_tx_len(xdp->base.data_end - xdp->data, + LIBETH_XDP_TX_FIRST), + }; + + if (likely(!xdp_buff_has_frags(&xdp->base))) + return false; + + bq->bulk[bq->count - 1].flags |= LIBETH_XDP_TX_MULTI; + + return true; +} + +/** + * libeth_xsk_tx_queue_frag - internal helper for queueing XSk ``XDP_TX`` frag + * @bq: XDP Tx bulk to queue the frag to + * @frag: XSk frag to queue + */ +static inline void libeth_xsk_tx_queue_frag(struct libeth_xdp_tx_bulk *bq, + struct libeth_xdp_buff *frag) +{ + bq->bulk[bq->count++] = (typeof(*bq->bulk)){ + .xsk = frag, + __libeth_xdp_tx_len(frag->base.data_end - frag->data), + }; +} + +/** + * libeth_xsk_tx_queue_bulk - internal helper for queueing XSk ``XDP_TX`` frame + * @bq: XDP Tx bulk to queue the frame to + * @xdp: XSk buffer to queue + * @flush_bulk: driver callback to flush the bulk to the HW queue + * + * Return: true on success, false on flush error. + */ +static __always_inline bool +libeth_xsk_tx_queue_bulk(struct libeth_xdp_tx_bulk *bq, + struct libeth_xdp_buff *xdp, + bool (*flush_bulk)(struct libeth_xdp_tx_bulk *bq, + u32 flags)) +{ + bool ret = true; + + if (unlikely(bq->count == LIBETH_XDP_TX_BULK) && + unlikely(!flush_bulk(bq, LIBETH_XDP_TX_XSK))) { + libeth_xsk_buff_free_slow(xdp); + return false; + } + + if (!libeth_xsk_tx_queue_head(bq, xdp)) + goto out; + + for (const struct libeth_xdp_buff *head = xdp; ; ) { + xdp = container_of(xsk_buff_get_frag(&head->base), + typeof(*xdp), base); + if (!xdp) + break; + + if (unlikely(bq->count == LIBETH_XDP_TX_BULK) && + unlikely(!flush_bulk(bq, LIBETH_XDP_TX_XSK))) { + ret = false; + break; + } + + libeth_xsk_tx_queue_frag(bq, xdp); + } + +out: + bq->bulk[bq->count - 1].flags |= LIBETH_XDP_TX_LAST; + + return ret; +} + +/** + * libeth_xsk_tx_fill_buf - internal helper to fill XSk ``XDP_TX`` &libeth_sqe + * @frm: XDP Tx frame from the bulk + * @i: index on the HW queue + * @sq: XDPSQ abstraction for the queue + * @priv: private data + * + * Return: XDP Tx descriptor with the synced DMA and other info to pass to + * the driver callback. + */ +static inline struct libeth_xdp_tx_desc +libeth_xsk_tx_fill_buf(struct libeth_xdp_tx_frame frm, u32 i, + const struct libeth_xdpsq *sq, u64 priv) +{ + struct libeth_xdp_buff *xdp = frm.xsk; + struct libeth_xdp_tx_desc desc = { + .addr = xsk_buff_xdp_get_dma(&xdp->base), + .opts = frm.opts, + }; + struct libeth_sqe *sqe; + + xsk_buff_raw_dma_sync_for_device(sq->pool, desc.addr, desc.len); + + sqe = &sq->sqes[i]; + sqe->xsk = xdp; + + if (!(desc.flags & LIBETH_XDP_TX_FIRST)) { + sqe->type = LIBETH_SQE_XSK_TX_FRAG; + return desc; + } + + sqe->type = LIBETH_SQE_XSK_TX; + libeth_xdp_tx_fill_stats(sqe, &desc, + xdp_get_shared_info_from_buff(&xdp->base)); + + return desc; +} + +/** + * libeth_xsk_tx_flush_bulk - wrapper to define flush of an XSk ``XDP_TX`` bulk + * @bq: bulk to flush + * @flags: Tx flags, see __libeth_xdp_tx_flush_bulk() + * @prep: driver callback to prepare the queue + * @xmit: driver callback to fill a HW descriptor + * + * Use via LIBETH_XSK_DEFINE_FLUSH_TX() to define an XSk ``XDP_TX`` driver + * callback. + */ +#define libeth_xsk_tx_flush_bulk(bq, flags, prep, xmit) \ + __libeth_xdp_tx_flush_bulk(bq, (flags) | LIBETH_XDP_TX_XSK, prep, \ + libeth_xsk_tx_fill_buf, xmit) + +/* XSk TMO */ + +/** + * libeth_xsktmo_req_csum - XSk Tx metadata op to request checksum offload + * @csum_start: unused + * @csum_offset: unused + * @priv: &libeth_xdp_tx_desc from the filling helper + * + * Generic implementation of ::tmo_request_checksum. Works only when HW doesn't + * require filling checksum offsets and other parameters beside the checksum + * request bit. + * Consider using within @libeth_xsktmo unless the driver requires HW-specific + * callbacks. + */ +static inline void libeth_xsktmo_req_csum(u16 csum_start, u16 csum_offset, + void *priv) +{ + ((struct libeth_xdp_tx_desc *)priv)->flags |= LIBETH_XDP_TX_CSUM; +} + +/* Only to inline the callbacks below, use @libeth_xsktmo in drivers instead */ +static const struct xsk_tx_metadata_ops __libeth_xsktmo = { + .tmo_request_checksum = libeth_xsktmo_req_csum, +}; + +/** + * __libeth_xsk_xmit_fill_buf_md - internal helper to prepare XSk xmit w/meta + * @xdesc: &xdp_desc from the XSk buffer pool + * @sq: XDPSQ abstraction for the queue + * @priv: XSk Tx metadata ops + * + * Same as __libeth_xsk_xmit_fill_buf(), but requests metadata pointer and + * fills additional fields in &libeth_xdp_tx_desc to ask for metadata offload. + * + * Return: XDP Tx descriptor with the DMA, metadata request bits, and other + * info to pass to the driver callback. + */ +static __always_inline struct libeth_xdp_tx_desc +__libeth_xsk_xmit_fill_buf_md(const struct xdp_desc *xdesc, + const struct libeth_xdpsq *sq, + u64 priv) +{ + const struct xsk_tx_metadata_ops *tmo = libeth_xdp_priv_to_ptr(priv); + struct libeth_xdp_tx_desc desc; + struct xdp_desc_ctx ctx; + + ctx = xsk_buff_raw_get_ctx(sq->pool, xdesc->addr); + desc = (typeof(desc)){ + .addr = ctx.dma, + __libeth_xdp_tx_len(xdesc->len), + }; + + BUILD_BUG_ON(!__builtin_constant_p(tmo == libeth_xsktmo)); + tmo = tmo == libeth_xsktmo ? &__libeth_xsktmo : tmo; + + xsk_tx_metadata_request(ctx.meta, tmo, &desc); + + return desc; +} + +/* XSk xmit implementation */ + +/** + * __libeth_xsk_xmit_fill_buf - internal helper to prepare XSk xmit w/o meta + * @xdesc: &xdp_desc from the XSk buffer pool + * @sq: XDPSQ abstraction for the queue + * + * Return: XDP Tx descriptor with the DMA and other info to pass to + * the driver callback. + */ +static inline struct libeth_xdp_tx_desc +__libeth_xsk_xmit_fill_buf(const struct xdp_desc *xdesc, + const struct libeth_xdpsq *sq) +{ + return (struct libeth_xdp_tx_desc){ + .addr = xsk_buff_raw_get_dma(sq->pool, xdesc->addr), + __libeth_xdp_tx_len(xdesc->len), + }; +} + +/** + * libeth_xsk_xmit_fill_buf - internal helper to prepare an XSk xmit + * @frm: &xdp_desc from the XSk buffer pool + * @i: index on the HW queue + * @sq: XDPSQ abstraction for the queue + * @priv: XSk Tx metadata ops + * + * Depending on the metadata ops presence (determined at compile time), calls + * the quickest helper to build a libeth XDP Tx descriptor. + * + * Return: XDP Tx descriptor with the synced DMA, metadata request bits, + * and other info to pass to the driver callback. + */ +static __always_inline struct libeth_xdp_tx_desc +libeth_xsk_xmit_fill_buf(struct libeth_xdp_tx_frame frm, u32 i, + const struct libeth_xdpsq *sq, u64 priv) +{ + struct libeth_xdp_tx_desc desc; + + if (priv) + desc = __libeth_xsk_xmit_fill_buf_md(&frm.desc, sq, priv); + else + desc = __libeth_xsk_xmit_fill_buf(&frm.desc, sq); + + desc.flags |= xsk_is_eop_desc(&frm.desc) ? LIBETH_XDP_TX_LAST : 0; + + xsk_buff_raw_dma_sync_for_device(sq->pool, desc.addr, desc.len); + + return desc; +} + +/** + * libeth_xsk_xmit_do_bulk - send XSk xmit frames + * @pool: XSk buffer pool containing the frames to send + * @xdpsq: opaque pointer to driver's XDPSQ struct + * @budget: maximum number of frames can be sent + * @tmo: optional XSk Tx metadata ops + * @prep: driver callback to build a &libeth_xdpsq + * @xmit: driver callback to put frames to a HW queue + * @finalize: driver callback to start a transmission + * + * Implements generic XSk xmit. Always turns on XSk Tx wakeup as it's assumed + * lazy cleaning is used and interrupts are disabled for the queue. + * HW descriptor filling is unrolled by ``LIBETH_XDP_TX_BATCH`` to optimize + * writes. + * Note that unlike other XDP Tx ops, the queue must be locked and cleaned + * prior to calling this function to already know available @budget. + * @prepare must only build a &libeth_xdpsq and return ``U32_MAX``. + * + * Return: false if @budget was exhausted, true otherwise. + */ +static __always_inline bool +libeth_xsk_xmit_do_bulk(struct xsk_buff_pool *pool, void *xdpsq, u32 budget, + const struct xsk_tx_metadata_ops *tmo, + u32 (*prep)(void *xdpsq, struct libeth_xdpsq *sq), + void (*xmit)(struct libeth_xdp_tx_desc desc, u32 i, + const struct libeth_xdpsq *sq, u64 priv), + void (*finalize)(void *xdpsq, bool sent, bool flush)) +{ + const struct libeth_xdp_tx_frame *bulk; + bool wake; + u32 n; + + wake = xsk_uses_need_wakeup(pool); + if (wake) + xsk_clear_tx_need_wakeup(pool); + + n = xsk_tx_peek_release_desc_batch(pool, budget); + bulk = container_of(&pool->tx_descs[0], typeof(*bulk), desc); + + libeth_xdp_tx_xmit_bulk(bulk, xdpsq, n, true, + libeth_xdp_ptr_to_priv(tmo), prep, + libeth_xsk_xmit_fill_buf, xmit); + finalize(xdpsq, n, true); + + if (wake) + xsk_set_tx_need_wakeup(pool); + + return n < budget; +} + +/* Rx polling path */ + +/** + * libeth_xsk_tx_init_bulk - initialize an XDP Tx bulk for XSk Rx NAPI poll + * @bq: bulk to initialize + * @prog: RCU pointer to the XDP program (never %NULL) + * @dev: target &net_device + * @xdpsqs: array of driver XDPSQ structs + * @num: number of active XDPSQs, the above array length + * + * Should be called on an onstack XDP Tx bulk before the XSk NAPI polling loop. + * Initializes all the needed fields to run libeth_xdp functions. + * Never checks if @prog is %NULL or @num == 0 as XDP must always be enabled + * when hitting this path. + */ +#define libeth_xsk_tx_init_bulk(bq, prog, dev, xdpsqs, num) \ + __libeth_xdp_tx_init_bulk(bq, prog, dev, xdpsqs, num, true, \ + __UNIQUE_ID(bq_), __UNIQUE_ID(nqs_)) + +struct libeth_xdp_buff *libeth_xsk_buff_add_frag(struct libeth_xdp_buff *head, + struct libeth_xdp_buff *xdp); + +/** + * libeth_xsk_process_buff - attach an XSk Rx buffer to a &libeth_xdp_buff + * @head: head XSk buffer to attach the XSk buffer to (or %NULL) + * @xdp: XSk buffer to process + * @len: received data length from the descriptor + * + * If @head == %NULL, treats the XSk buffer as head and initializes + * the required fields. Otherwise, attaches the buffer as a frag. + * Already performs DMA sync-for-CPU and frame start prefetch + * (for head buffers only). + * + * Return: head XSk buffer on success or if the descriptor must be skipped + * (empty), %NULL if there is no space for a new frag. + */ +static inline struct libeth_xdp_buff * +libeth_xsk_process_buff(struct libeth_xdp_buff *head, + struct libeth_xdp_buff *xdp, u32 len) +{ + if (unlikely(!len)) { + libeth_xsk_buff_free_slow(xdp); + return head; + } + + xsk_buff_set_size(&xdp->base, len); + xsk_buff_dma_sync_for_cpu(&xdp->base); + + if (head) + return libeth_xsk_buff_add_frag(head, xdp); + + prefetch(xdp->data); + + return xdp; +} + +void libeth_xsk_buff_stats_frags(struct libeth_rq_napi_stats *rs, + const struct libeth_xdp_buff *xdp); + +u32 __libeth_xsk_run_prog_slow(struct libeth_xdp_buff *xdp, + const struct libeth_xdp_tx_bulk *bq, + enum xdp_action act, int ret); + +/** + * __libeth_xsk_run_prog - run XDP program on an XSk buffer + * @xdp: XSk buffer to run the prog on + * @bq: buffer bulk for ``XDP_TX`` queueing + * + * Internal inline abstraction to run XDP program on XSk Rx path. Handles + * only the most common ``XDP_REDIRECT`` inline, the rest is processed + * externally. + * Reports an XDP prog exception on errors. + * + * Return: libeth_xdp prog verdict depending on the prog's verdict. + */ +static __always_inline u32 +__libeth_xsk_run_prog(struct libeth_xdp_buff *xdp, + const struct libeth_xdp_tx_bulk *bq) +{ + enum xdp_action act; + int ret = 0; + + act = bpf_prog_run_xdp(bq->prog, &xdp->base); + if (unlikely(act != XDP_REDIRECT)) +rest: + return __libeth_xsk_run_prog_slow(xdp, bq, act, ret); + + ret = xdp_do_redirect(bq->dev, &xdp->base, bq->prog); + if (unlikely(ret)) + goto rest; + + return LIBETH_XDP_REDIRECT; +} + +/** + * libeth_xsk_run_prog - run XDP program on XSk path and handle all verdicts + * @xdp: XSk buffer to process + * @bq: XDP Tx bulk to queue ``XDP_TX`` buffers + * @fl: driver ``XDP_TX`` bulk flush callback + * + * Run the attached XDP program and handle all possible verdicts. + * Prefer using it via LIBETH_XSK_DEFINE_RUN{,_PASS,_PROG}(). + * + * Return: libeth_xdp prog verdict depending on the prog's verdict. + */ +#define libeth_xsk_run_prog(xdp, bq, fl) \ + __libeth_xdp_run_flush(xdp, bq, __libeth_xsk_run_prog, \ + libeth_xsk_tx_queue_bulk, fl) + +/** + * __libeth_xsk_run_pass - helper to run XDP program and handle the result + * @xdp: XSk buffer to process + * @bq: XDP Tx bulk to queue ``XDP_TX`` frames + * @napi: NAPI to build an skb and pass it up the stack + * @rs: onstack libeth RQ stats + * @md: metadata that should be filled to the XSk buffer + * @prep: callback for filling the metadata + * @run: driver wrapper to run XDP program + * @populate: driver callback to populate an skb with the HW descriptor data + * + * Inline abstraction, XSk's counterpart of __libeth_xdp_run_pass(), see its + * doc for details. + * + * Return: false if the polling loop must be exited due to lack of free + * buffers, true otherwise. + */ +static __always_inline bool +__libeth_xsk_run_pass(struct libeth_xdp_buff *xdp, + struct libeth_xdp_tx_bulk *bq, struct napi_struct *napi, + struct libeth_rq_napi_stats *rs, const void *md, + void (*prep)(struct libeth_xdp_buff *xdp, + const void *md), + u32 (*run)(struct libeth_xdp_buff *xdp, + struct libeth_xdp_tx_bulk *bq), + bool (*populate)(struct sk_buff *skb, + const struct libeth_xdp_buff *xdp, + struct libeth_rq_napi_stats *rs)) +{ + struct sk_buff *skb; + u32 act; + + rs->bytes += xdp->base.data_end - xdp->data; + rs->packets++; + + if (unlikely(xdp_buff_has_frags(&xdp->base))) + libeth_xsk_buff_stats_frags(rs, xdp); + + if (prep && (!__builtin_constant_p(!!md) || md)) + prep(xdp, md); + + act = run(xdp, bq); + if (unlikely(act == LIBETH_XDP_ABORTED)) + return false; + else if (likely(act != LIBETH_XDP_PASS)) + return true; + + skb = xdp_build_skb_from_zc(&xdp->base); + if (unlikely(!skb)) { + libeth_xsk_buff_free_slow(xdp); + return true; + } + + if (unlikely(!populate(skb, xdp, rs))) { + napi_consume_skb(skb, true); + return true; + } + + napi_gro_receive(napi, skb); + + return true; +} + +/** + * libeth_xsk_run_pass - helper to run XDP program and handle the result + * @xdp: XSk buffer to process + * @bq: XDP Tx bulk to queue ``XDP_TX`` frames + * @napi: NAPI to build an skb and pass it up the stack + * @rs: onstack libeth RQ stats + * @desc: pointer to the HW descriptor for that frame + * @run: driver wrapper to run XDP program + * @populate: driver callback to populate an skb with the HW descriptor data + * + * Wrapper around the underscored version when "fill the descriptor metadata" + * means just writing the pointer to the HW descriptor as @xdp->desc. + */ +#define libeth_xsk_run_pass(xdp, bq, napi, rs, desc, run, populate) \ + __libeth_xsk_run_pass(xdp, bq, napi, rs, desc, libeth_xdp_prep_desc, \ + run, populate) + +/** + * libeth_xsk_finalize_rx - finalize XDPSQ after an XSk NAPI polling loop + * @bq: ``XDP_TX`` frame bulk + * @flush: driver callback to flush the bulk + * @finalize: driver callback to start sending the frames and run the timer + * + * Flush the bulk if there are frames left to send, kick the queue and flush + * the XDP maps. + */ +#define libeth_xsk_finalize_rx(bq, flush, finalize) \ + __libeth_xdp_finalize_rx(bq, LIBETH_XDP_TX_XSK, flush, finalize) + +/* + * Helpers to reduce boilerplate code in drivers. + * + * Typical driver XSk Rx flow would be (excl. bulk and buff init, frag attach): + * + * LIBETH_XDP_DEFINE_START(); + * LIBETH_XSK_DEFINE_FLUSH_TX(static driver_xsk_flush_tx, driver_xsk_tx_prep, + * driver_xdp_xmit); + * LIBETH_XSK_DEFINE_RUN(static driver_xsk_run, driver_xsk_run_prog, + * driver_xsk_flush_tx, driver_populate_skb); + * LIBETH_XSK_DEFINE_FINALIZE(static driver_xsk_finalize_rx, + * driver_xsk_flush_tx, driver_xdp_finalize_sq); + * LIBETH_XDP_DEFINE_END(); + * + * This will build a set of 4 static functions. The compiler is free to decide + * whether to inline them. + * Then, in the NAPI polling function: + * + * while (packets < budget) { + * // ... + * if (!driver_xsk_run(xdp, &bq, napi, &rs, desc)) + * break; + * } + * driver_xsk_finalize_rx(&bq); + */ + +/** + * LIBETH_XSK_DEFINE_FLUSH_TX - define a driver XSk ``XDP_TX`` flush function + * @name: name of the function to define + * @prep: driver callback to clean an XDPSQ + * @xmit: driver callback to write a HW Tx descriptor + */ +#define LIBETH_XSK_DEFINE_FLUSH_TX(name, prep, xmit) \ + __LIBETH_XDP_DEFINE_FLUSH_TX(name, prep, xmit, xsk) + +/** + * LIBETH_XSK_DEFINE_RUN_PROG - define a driver XDP program run function + * @name: name of the function to define + * @flush: driver callback to flush an XSk ``XDP_TX`` bulk + */ +#define LIBETH_XSK_DEFINE_RUN_PROG(name, flush) \ + u32 __LIBETH_XDP_DEFINE_RUN_PROG(name, flush, xsk) + +/** + * LIBETH_XSK_DEFINE_RUN_PASS - define a driver buffer process + pass function + * @name: name of the function to define + * @run: driver callback to run XDP program (above) + * @populate: driver callback to fill an skb with HW descriptor info + */ +#define LIBETH_XSK_DEFINE_RUN_PASS(name, run, populate) \ + bool __LIBETH_XDP_DEFINE_RUN_PASS(name, run, populate, xsk) + +/** + * LIBETH_XSK_DEFINE_RUN - define a driver buffer process, run + pass function + * @name: name of the function to define + * @run: name of the XDP prog run function to define + * @flush: driver callback to flush an XSk ``XDP_TX`` bulk + * @populate: driver callback to fill an skb with HW descriptor info + */ +#define LIBETH_XSK_DEFINE_RUN(name, run, flush, populate) \ + __LIBETH_XDP_DEFINE_RUN(name, run, flush, populate, XSK) + +/** + * LIBETH_XSK_DEFINE_FINALIZE - define a driver XSk NAPI poll finalize function + * @name: name of the function to define + * @flush: driver callback to flush an XSk ``XDP_TX`` bulk + * @finalize: driver callback to finalize an XDPSQ and run the timer + */ +#define LIBETH_XSK_DEFINE_FINALIZE(name, flush, finalize) \ + __LIBETH_XDP_DEFINE_FINALIZE(name, flush, finalize, xsk) + +/* Refilling */ + +/** + * struct libeth_xskfq - structure representing an XSk buffer (fill) queue + * @fp: hotpath part of the structure + * @pool: &xsk_buff_pool for buffer management + * @fqes: array of XSk buffer pointers + * @descs: opaque pointer to the HW descriptor array + * @ntu: index of the next buffer to poll + * @count: number of descriptors/buffers the queue has + * @pending: current number of XSkFQEs to refill + * @thresh: threshold below which the queue is refilled + * @buf_len: HW-writeable length per each buffer + * @nid: ID of the closest NUMA node with memory + */ +struct libeth_xskfq { + struct_group_tagged(libeth_xskfq_fp, fp, + struct xsk_buff_pool *pool; + struct libeth_xdp_buff **fqes; + void *descs; + + u32 ntu; + u32 count; + ); + + /* Cold fields */ + u32 pending; + u32 thresh; + + u32 buf_len; + int nid; +}; + +int libeth_xskfq_create(struct libeth_xskfq *fq); +void libeth_xskfq_destroy(struct libeth_xskfq *fq); + +/** + * libeth_xsk_buff_xdp_get_dma - get DMA address for an XSk &libeth_xdp_buff + * @xdp: buffer to get the DMA addr for + */ +#define libeth_xsk_buff_xdp_get_dma(xdp) \ + xsk_buff_xdp_get_dma(&(xdp)->base) + +/** + * libeth_xskfqe_alloc - allocate @n XSk Rx buffers + * @fq: hotpath part of the XSkFQ, usually onstack + * @n: number of buffers to allocate + * @fill: driver callback to write DMA addresses to HW descriptors + * + * Note that @fq->ntu gets updated, but ::pending must be recalculated + * by the caller. + * + * Return: number of buffers refilled. + */ +static __always_inline u32 +libeth_xskfqe_alloc(struct libeth_xskfq_fp *fq, u32 n, + void (*fill)(const struct libeth_xskfq_fp *fq, u32 i)) +{ + u32 this, ret, done = 0; + struct xdp_buff **xskb; + + this = fq->count - fq->ntu; + if (likely(this > n)) + this = n; + +again: + xskb = (typeof(xskb))&fq->fqes[fq->ntu]; + ret = xsk_buff_alloc_batch(fq->pool, xskb, this); + + for (u32 i = 0, ntu = fq->ntu; likely(i < ret); i++) + fill(fq, ntu + i); + + done += ret; + fq->ntu += ret; + + if (likely(fq->ntu < fq->count) || unlikely(ret < this)) + goto out; + + fq->ntu = 0; + + if (this < n) { + this = n - this; + goto again; + } + +out: + return done; +} + +/* .ndo_xsk_wakeup */ + +void libeth_xsk_init_wakeup(call_single_data_t *csd, struct napi_struct *napi); +void libeth_xsk_wakeup(call_single_data_t *csd, u32 qid); + +/* Pool setup */ + +int libeth_xsk_setup_pool(struct net_device *dev, u32 qid, bool enable); + +#endif /* __LIBETH_XSK_H */ diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h index c022c410abe39d..269ee1788388b9 100644 --- a/include/net/page_pool/types.h +++ b/include/net/page_pool/types.h @@ -259,8 +259,7 @@ void page_pool_disable_direct_recycling(struct page_pool *pool); void page_pool_destroy(struct page_pool *pool); void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *), const struct xdp_mem_info *mem); -void page_pool_put_page_bulk(struct page_pool *pool, void **data, - int count); +void page_pool_put_page_bulk(struct page **data, u32 count); #else static inline void page_pool_destroy(struct page_pool *pool) { @@ -272,8 +271,7 @@ static inline void page_pool_use_xdp_mem(struct page_pool *pool, { } -static inline void page_pool_put_page_bulk(struct page_pool *pool, void **data, - int count) +static inline void page_pool_put_page_bulk(struct page **data, u32 count) { } #endif diff --git a/include/net/xdp.h b/include/net/xdp.h index e6770dd40c917e..accff7de46d676 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -11,6 +11,8 @@ #include #include /* skb_shared_info */ +#include + /** * DOC: XDP RX-queue information * @@ -88,7 +90,7 @@ struct xdp_buff { u32 flags; /* supported values defined in xdp_buff_flags */ }; -static __always_inline bool xdp_buff_has_frags(struct xdp_buff *xdp) +static __always_inline bool xdp_buff_has_frags(const struct xdp_buff *xdp) { return !!(xdp->flags & XDP_FLAGS_HAS_FRAGS); } @@ -103,7 +105,8 @@ static __always_inline void xdp_buff_clear_frags_flag(struct xdp_buff *xdp) xdp->flags &= ~XDP_FLAGS_HAS_FRAGS; } -static __always_inline bool xdp_buff_is_frag_pfmemalloc(struct xdp_buff *xdp) +static __always_inline bool +xdp_buff_is_frag_pfmemalloc(const struct xdp_buff *xdp) { return !!(xdp->flags & XDP_FLAGS_FRAGS_PF_MEMALLOC); } @@ -144,15 +147,16 @@ xdp_prepare_buff(struct xdp_buff *xdp, unsigned char *hard_start, SKB_DATA_ALIGN(sizeof(struct skb_shared_info))) static inline struct skb_shared_info * -xdp_get_shared_info_from_buff(struct xdp_buff *xdp) +xdp_get_shared_info_from_buff(const struct xdp_buff *xdp) { return (struct skb_shared_info *)xdp_data_hard_end(xdp); } -static __always_inline unsigned int xdp_get_buff_len(struct xdp_buff *xdp) +static __always_inline unsigned int +xdp_get_buff_len(const struct xdp_buff *xdp) { unsigned int len = xdp->data_end - xdp->data; - struct skb_shared_info *sinfo; + const struct skb_shared_info *sinfo; if (likely(!xdp_buff_has_frags(xdp))) goto out; @@ -163,26 +167,109 @@ static __always_inline unsigned int xdp_get_buff_len(struct xdp_buff *xdp) return len; } +/** + * __xdp_buff_add_frag - attach a frag to an &xdp_buff + * @xdp: XDP buffer to attach the frag to + * @page: page containing the frag + * @offset: page offset at which the frag starts + * @size: size of the frag + * @truesize: truesize (page / page frag size) of the frag + * @try_coalesce: whether to try coalescing the frags + * + * Attach a frag to an XDP buffer. If it currently has no frags attached, + * initialize the related fields, otherwise check that the frag number + * didn't reach the limit of ``MAX_SKB_FRAGS``. If possible, try coalescing + * the frag with the previous one. + * The function doesn't check/update the pfmemalloc bit. Please use the + * non-underscored wrapper in drivers. + * + * Return: true on success, false if there's no space for the frag in + * the shared info struct. + */ +static inline bool __xdp_buff_add_frag(struct xdp_buff *xdp, struct page *page, + u32 offset, u32 size, u32 truesize, + bool try_coalesce) +{ + struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp); + skb_frag_t *prev; + u32 nr_frags; + + if (!xdp_buff_has_frags(xdp)) { + xdp_buff_set_frags_flag(xdp); + + nr_frags = 0; + sinfo->xdp_frags_size = 0; + sinfo->xdp_frags_truesize = 0; + + goto fill; + } + + nr_frags = sinfo->nr_frags; + if (unlikely(nr_frags == MAX_SKB_FRAGS)) + return false; + + prev = &sinfo->frags[nr_frags - 1]; + if (try_coalesce && page == skb_frag_page(prev) && + offset == skb_frag_off(prev) + skb_frag_size(prev)) + skb_frag_size_add(prev, size); + else +fill: + __skb_fill_page_desc_noacc(sinfo, nr_frags++, page, + offset, size); + + sinfo->nr_frags = nr_frags; + sinfo->xdp_frags_size += size; + sinfo->xdp_frags_truesize += truesize; + + return true; +} + +/** + * xdp_buff_add_frag - attach a frag to an &xdp_buff + * @xdp: XDP buffer to attach the frag to + * @page: page containing the frag + * @offset: page offset at which the frag starts + * @size: size of the frag + * @truesize: truesize (page / page frag size) of the frag + * + * Version of __xdp_buff_add_frag() which takes care of the pfmemalloc bit. + * + * Return: true on success, false if there's no space for the frag in + * the shared info struct. + */ +static inline bool xdp_buff_add_frag(struct xdp_buff *xdp, struct page *page, + u32 offset, u32 size, u32 truesize) +{ + if (!__xdp_buff_add_frag(xdp, page, offset, size, truesize, true)) + return false; + + if (unlikely(page_is_pfmemalloc(page))) + xdp_buff_set_frag_pfmemalloc(xdp); + + return true; +} + struct xdp_frame { void *data; - u16 len; - u16 headroom; + u32 len; + u32 headroom; u32 metasize; /* uses lower 8-bits */ /* Lifetime of xdp_rxq_info is limited to NAPI/enqueue time, - * while mem info is valid on remote CPU. + * while mem_type is valid on remote CPU. */ - struct xdp_mem_info mem; + enum xdp_mem_type mem_type:32; struct net_device *dev_rx; /* used by cpumap */ u32 frame_sz; u32 flags; /* supported values defined in xdp_buff_flags */ }; -static __always_inline bool xdp_frame_has_frags(struct xdp_frame *frame) +static __always_inline bool xdp_frame_has_frags(const struct xdp_frame *frame) { return !!(frame->flags & XDP_FLAGS_HAS_FRAGS); } -static __always_inline bool xdp_frame_is_frag_pfmemalloc(struct xdp_frame *frame) +static __always_inline bool +xdp_frame_is_frag_pfmemalloc(const struct xdp_frame *frame) { return !!(frame->flags & XDP_FLAGS_FRAGS_PF_MEMALLOC); } @@ -190,18 +277,16 @@ static __always_inline bool xdp_frame_is_frag_pfmemalloc(struct xdp_frame *frame #define XDP_BULK_QUEUE_SIZE 16 struct xdp_frame_bulk { int count; - void *xa; - void *q[XDP_BULK_QUEUE_SIZE]; + struct page *q[XDP_BULK_QUEUE_SIZE]; }; static __always_inline void xdp_frame_bulk_init(struct xdp_frame_bulk *bq) { - /* bq->count will be zero'ed when bq->xa gets updated */ - bq->xa = NULL; + bq->count = 0; } static inline struct skb_shared_info * -xdp_get_shared_info_from_frame(struct xdp_frame *frame) +xdp_get_shared_info_from_frame(const struct xdp_frame *frame) { void *data_hard_start = frame->data - frame->headroom - sizeof(*frame); @@ -227,7 +312,13 @@ xdp_update_skb_shared_info(struct sk_buff *skb, u8 nr_frags, unsigned int size, unsigned int truesize, bool pfmemalloc) { - skb_shinfo(skb)->nr_frags = nr_frags; + struct skb_shared_info *sinfo = skb_shinfo(skb); + + sinfo->nr_frags = nr_frags; + /* ``destructor_arg`` is unionized with ``xdp_frags_{,true}size``, + * reset it after that these fields aren't used anymore. + */ + sinfo->destructor_arg = NULL; skb->len += size; skb->data_len += size; @@ -239,6 +330,8 @@ xdp_update_skb_shared_info(struct sk_buff *skb, u8 nr_frags, void xdp_warn(const char *msg, const char *func, const int line); #define XDP_WARN(msg) xdp_warn(msg, __func__, __LINE__) +struct sk_buff *xdp_build_skb_from_buff(const struct xdp_buff *xdp); +struct sk_buff *xdp_build_skb_from_zc(struct xdp_buff *xdp); struct xdp_frame *xdp_convert_zc_to_xdp_frame(struct xdp_buff *xdp); struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf, struct sk_buff *skb, @@ -249,7 +342,8 @@ int xdp_alloc_skb_bulk(void **skbs, int n_skb, gfp_t gfp); struct xdp_frame *xdpf_clone(struct xdp_frame *xdpf); static inline -void xdp_convert_frame_to_buff(struct xdp_frame *frame, struct xdp_buff *xdp) +void xdp_convert_frame_to_buff(const struct xdp_frame *frame, + struct xdp_buff *xdp) { xdp->data_hard_start = frame->data - frame->headroom - sizeof(*frame); xdp->data = frame->data; @@ -260,7 +354,7 @@ void xdp_convert_frame_to_buff(struct xdp_frame *frame, struct xdp_buff *xdp) } static inline -int xdp_update_frame_from_buff(struct xdp_buff *xdp, +int xdp_update_frame_from_buff(const struct xdp_buff *xdp, struct xdp_frame *xdp_frame) { int metasize, headroom; @@ -302,24 +396,33 @@ struct xdp_frame *xdp_convert_buff_to_frame(struct xdp_buff *xdp) if (unlikely(xdp_update_frame_from_buff(xdp, xdp_frame) < 0)) return NULL; - /* rxq only valid until napi_schedule ends, convert to xdp_mem_info */ - xdp_frame->mem = xdp->rxq->mem; + /* rxq only valid until napi_schedule ends, convert to xdp_mem_type */ + xdp_frame->mem_type = xdp->rxq->mem.type; return xdp_frame; } -void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, +void __xdp_return(void *data, enum xdp_mem_type mem_type, bool napi_direct, struct xdp_buff *xdp); void xdp_return_frame(struct xdp_frame *xdpf); void xdp_return_frame_rx_napi(struct xdp_frame *xdpf); void xdp_return_buff(struct xdp_buff *xdp); -void xdp_flush_frame_bulk(struct xdp_frame_bulk *bq); void xdp_return_frame_bulk(struct xdp_frame *xdpf, struct xdp_frame_bulk *bq); -static __always_inline unsigned int xdp_get_frame_len(struct xdp_frame *xdpf) +static inline void xdp_flush_frame_bulk(struct xdp_frame_bulk *bq) { - struct skb_shared_info *sinfo; + if (unlikely(!bq->count)) + return; + + page_pool_put_page_bulk(bq->q, bq->count); + bq->count = 0; +} + +static __always_inline unsigned int +xdp_get_frame_len(const struct xdp_frame *xdpf) +{ + const struct skb_shared_info *sinfo; unsigned int len = xdpf->len; if (likely(!xdp_frame_has_frags(xdpf))) @@ -351,6 +454,38 @@ void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq); int xdp_reg_mem_model(struct xdp_mem_info *mem, enum xdp_mem_type type, void *allocator); void xdp_unreg_mem_model(struct xdp_mem_info *mem); +int xdp_reg_page_pool(struct page_pool *pool); +void xdp_unreg_page_pool(const struct page_pool *pool); +void xdp_rxq_info_attach_page_pool(struct xdp_rxq_info *xdp_rxq, + const struct page_pool *pool); + +/** + * xdp_rxq_info_attach_mem_model - attach a registered mem info to an RxQ info + * @xdp_rxq: XDP RxQ info to attach the memory info to + * @mem: already registered memory info + * + * If a driver registers its memory providers manually, it must use this + * function instead of xdp_rxq_info_reg_mem_model(). + */ +static inline void +xdp_rxq_info_attach_mem_model(struct xdp_rxq_info *xdp_rxq, + const struct xdp_mem_info *mem) +{ + xdp_rxq->mem = *mem; +} + +/** + * xdp_rxq_info_detach_mem_model - detach a registered mem info from RxQ info + * @xdp_rxq: XDP RxQ info to detach the memory info from + * + * If a driver registers its memory providers manually and then attaches it + * via xdp_rxq_info_attach_mem_model(), it must call this function before + * xdp_rxq_info_unreg(). + */ +static inline void xdp_rxq_info_detach_mem_model(struct xdp_rxq_info *xdp_rxq) +{ + xdp_rxq->mem = (struct xdp_mem_info){ }; +} /* Drivers not supporting XDP metadata can use this helper, which * rejects any room expansion for metadata as a result. diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h index 40085afd916077..324a4bb0443169 100644 --- a/include/net/xdp_sock_drv.h +++ b/include/net/xdp_sock_drv.h @@ -101,7 +101,7 @@ static inline struct xdp_buff *xsk_buff_alloc(struct xsk_buff_pool *pool) return xp_alloc(pool); } -static inline bool xsk_is_eop_desc(struct xdp_desc *desc) +static inline bool xsk_is_eop_desc(const struct xdp_desc *desc) { return !xp_mb_desc(desc); } @@ -136,14 +136,24 @@ static inline void xsk_buff_free(struct xdp_buff *xdp) xp_free(xskb); } -static inline void xsk_buff_add_frag(struct xdp_buff *xdp) +static inline bool xsk_buff_add_frag(struct xdp_buff *head, + struct xdp_buff *xdp) { - struct xdp_buff_xsk *frag = container_of(xdp, struct xdp_buff_xsk, xdp); + const void *data = xdp->data; + struct xdp_buff_xsk *frag; + + if (!__xdp_buff_add_frag(head, virt_to_page(data), + offset_in_page(data), xdp->data_end - data, + xdp->frame_sz, false)) + return false; + frag = container_of(xdp, struct xdp_buff_xsk, xdp); list_add_tail(&frag->list_node, &frag->pool->xskb_list); + + return true; } -static inline struct xdp_buff *xsk_buff_get_frag(struct xdp_buff *first) +static inline struct xdp_buff *xsk_buff_get_frag(const struct xdp_buff *first) { struct xdp_buff_xsk *xskb = container_of(first, struct xdp_buff_xsk, xdp); struct xdp_buff *ret = NULL; @@ -195,12 +205,30 @@ static inline void *xsk_buff_raw_get_data(struct xsk_buff_pool *pool, u64 addr) return xp_raw_get_data(pool, addr); } +/** + * xsk_buff_raw_get_ctx - get &xdp_desc context + * @pool: XSk buff pool desc address belongs to + * @addr: desc address (from userspace) + * + * Wrapper for xp_raw_get_ctx() to be used in drivers, see its kdoc for + * details. + * + * Return: new &xdp_desc_ctx struct containing desc's DMA address and metadata + * pointer, if it is present and valid (initialized to %NULL otherwise). + */ +static inline struct xdp_desc_ctx +xsk_buff_raw_get_ctx(const struct xsk_buff_pool *pool, u64 addr) +{ + return xp_raw_get_ctx(pool, addr); +} + #define XDP_TXMD_FLAGS_VALID ( \ XDP_TXMD_FLAGS_TIMESTAMP | \ XDP_TXMD_FLAGS_CHECKSUM | \ 0) -static inline bool xsk_buff_valid_tx_metadata(struct xsk_tx_metadata *meta) +static inline bool +xsk_buff_valid_tx_metadata(const struct xsk_tx_metadata *meta) { return !(meta->flags & ~XDP_TXMD_FLAGS_VALID); } @@ -337,7 +365,7 @@ static inline struct xdp_buff *xsk_buff_alloc(struct xsk_buff_pool *pool) return NULL; } -static inline bool xsk_is_eop_desc(struct xdp_desc *desc) +static inline bool xsk_is_eop_desc(const struct xdp_desc *desc) { return false; } @@ -356,11 +384,13 @@ static inline void xsk_buff_free(struct xdp_buff *xdp) { } -static inline void xsk_buff_add_frag(struct xdp_buff *xdp) +static inline bool xsk_buff_add_frag(struct xdp_buff *head, + struct xdp_buff *xdp) { + return false; } -static inline struct xdp_buff *xsk_buff_get_frag(struct xdp_buff *first) +static inline struct xdp_buff *xsk_buff_get_frag(const struct xdp_buff *first) { return NULL; } @@ -389,6 +419,12 @@ static inline void *xsk_buff_raw_get_data(struct xsk_buff_pool *pool, u64 addr) return NULL; } +static inline struct xdp_desc_ctx +xsk_buff_raw_get_ctx(const struct xsk_buff_pool *pool, u64 addr) +{ + return (struct xdp_desc_ctx){ }; +} + static inline bool xsk_buff_valid_tx_metadata(struct xsk_tx_metadata *meta) { return false; diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h index bb03cee716b316..1dcd4d71468a5f 100644 --- a/include/net/xsk_buff_pool.h +++ b/include/net/xsk_buff_pool.h @@ -29,7 +29,7 @@ struct xdp_buff_xsk { dma_addr_t frame_dma; struct xsk_buff_pool *pool; struct list_head list_node; -}; +} __aligned_largest; #define XSK_CHECK_PRIV_TYPE(t) BUILD_BUG_ON(sizeof(t) > offsetofend(struct xdp_buff_xsk, cb)) #define XSK_TX_COMPL_FITS(t) BUILD_BUG_ON(sizeof(struct xsk_tx_metadata_compl) > sizeof(t)) @@ -141,6 +141,14 @@ u32 xp_alloc_batch(struct xsk_buff_pool *pool, struct xdp_buff **xdp, u32 max); bool xp_can_alloc(struct xsk_buff_pool *pool, u32 count); void *xp_raw_get_data(struct xsk_buff_pool *pool, u64 addr); dma_addr_t xp_raw_get_dma(struct xsk_buff_pool *pool, u64 addr); + +struct xdp_desc_ctx { + dma_addr_t dma; + struct xsk_tx_metadata *meta; +}; + +struct xdp_desc_ctx xp_raw_get_ctx(const struct xsk_buff_pool *pool, u64 addr); + static inline dma_addr_t xp_get_dma(struct xdp_buff_xsk *xskb) { return xskb->dma; @@ -183,7 +191,7 @@ static inline bool xp_desc_crosses_non_contig_pg(struct xsk_buff_pool *pool, !(pool->dma_pages[addr >> PAGE_SHIFT] & XSK_NEXT_PG_CONTIG_MASK); } -static inline bool xp_mb_desc(struct xdp_desc *desc) +static inline bool xp_mb_desc(const struct xdp_desc *desc) { return desc->options & XDP_PKT_CONTD; } diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index a2f46785ac3b3c..774accbd4a223b 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -190,7 +190,7 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu, int err; rxq.dev = xdpf->dev_rx; - rxq.mem = xdpf->mem; + rxq.mem.type = xdpf->mem_type; /* TODO: report queue_index to xdp_rxq_info */ xdp_convert_frame_to_buff(xdpf, &xdp); diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 7878be18e9d264..effde52bc857fa 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -678,7 +678,7 @@ int dev_map_enqueue_multi(struct xdp_frame *xdpf, struct net_device *dev_rx, } int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, - struct bpf_prog *xdp_prog) + const struct bpf_prog *xdp_prog) { int err; @@ -701,7 +701,7 @@ int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, static int dev_map_redirect_clone(struct bpf_dtab_netdev *dst, struct sk_buff *skb, - struct bpf_prog *xdp_prog) + const struct bpf_prog *xdp_prog) { struct sk_buff *nskb; int err; @@ -720,8 +720,8 @@ static int dev_map_redirect_clone(struct bpf_dtab_netdev *dst, } int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb, - struct bpf_prog *xdp_prog, struct bpf_map *map, - bool exclude_ingress) + const struct bpf_prog *xdp_prog, + struct bpf_map *map, bool exclude_ingress) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct bpf_dtab_netdev *dst, *last_dst = NULL; diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 93a822d3c468ca..1034c034899559 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -182,6 +182,7 @@ bool static_key_slow_inc_cpuslocked(struct static_key *key) } return true; } +EXPORT_SYMBOL_GPL(static_key_slow_inc_cpuslocked); bool static_key_slow_inc(struct static_key *key) { @@ -342,6 +343,7 @@ void static_key_slow_dec_cpuslocked(struct static_key *key) STATIC_KEY_CHECK_USE(key); __static_key_slow_dec_cpuslocked(key); } +EXPORT_SYMBOL_GPL(static_key_slow_dec_cpuslocked); void __static_key_slow_dec_deferred(struct static_key *key, struct delayed_work *work, diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 501ec4249fedc3..9ae2a7f1738b35 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -153,7 +153,7 @@ static void xdp_test_run_init_page(netmem_ref netmem, void *arg) new_ctx->data = new_ctx->data_meta + meta_len; xdp_update_frame_from_buff(new_ctx, frm); - frm->mem = new_ctx->rxq->mem; + frm->mem_type = new_ctx->rxq->mem.type; memcpy(&head->orig_ctx, new_ctx, sizeof(head->orig_ctx)); } @@ -246,7 +246,7 @@ static void reset_ctx(struct xdp_page_head *head) head->ctx.data_meta = head->orig_ctx.data_meta; head->ctx.data_end = head->orig_ctx.data_end; xdp_update_frame_from_buff(&head->ctx, head->frame); - head->frame->mem = head->orig_ctx.rxq->mem; + head->frame->mem_type = head->orig_ctx.rxq->mem.type; } static int xdp_recv_frames(struct xdp_frame **frames, int nframes, diff --git a/net/core/dev.c b/net/core/dev.c index 13d00fc10f5599..b93dba1e98eee7 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -460,7 +460,7 @@ EXPORT_PER_CPU_SYMBOL(softnet_data); * PP consumers must pay attention to run APIs in the appropriate context * (e.g. NAPI context). */ -static DEFINE_PER_CPU(struct page_pool *, system_page_pool); +DEFINE_PER_CPU(struct page_pool *, system_page_pool); #ifdef CONFIG_LOCKDEP /* @@ -4931,7 +4931,7 @@ static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb) } u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp, - struct bpf_prog *xdp_prog) + const struct bpf_prog *xdp_prog) { void *orig_data, *orig_data_end, *hard_start; struct netdev_rx_queue *rxqueue; @@ -5033,7 +5033,7 @@ u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp, } static int -netif_skb_check_for_xdp(struct sk_buff **pskb, struct bpf_prog *prog) +netif_skb_check_for_xdp(struct sk_buff **pskb, const struct bpf_prog *prog) { struct sk_buff *skb = *pskb; int err, hroom, troom; @@ -5057,7 +5057,7 @@ netif_skb_check_for_xdp(struct sk_buff **pskb, struct bpf_prog *prog) static u32 netif_receive_generic_xdp(struct sk_buff **pskb, struct xdp_buff *xdp, - struct bpf_prog *xdp_prog) + const struct bpf_prog *xdp_prog) { struct sk_buff *skb = *pskb; u32 mac_len, act = XDP_DROP; @@ -5110,7 +5110,7 @@ static u32 netif_receive_generic_xdp(struct sk_buff **pskb, * and DDOS attacks will be more effective. In-driver-XDP use dedicated TX * queues, so they do not have this starvation issue. */ -void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog) +void generic_xdp_tx(struct sk_buff *skb, const struct bpf_prog *xdp_prog) { struct net_device *dev = skb->dev; struct netdev_queue *txq; @@ -5135,7 +5135,7 @@ void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog) static DEFINE_STATIC_KEY_FALSE(generic_xdp_needed_key); -int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff **pskb) +int do_xdp_generic(const struct bpf_prog *xdp_prog, struct sk_buff **pskb) { struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; @@ -12146,11 +12146,18 @@ static int net_page_pool_create(int cpuid) .nid = cpu_to_mem(cpuid), }; struct page_pool *pp_ptr; + int err; pp_ptr = page_pool_create_percpu(&page_pool_params, cpuid); if (IS_ERR(pp_ptr)) return -ENOMEM; + err = xdp_reg_page_pool(pp_ptr); + if (err) { + page_pool_destroy(pp_ptr); + return err; + } + per_cpu(system_page_pool, cpuid) = pp_ptr; #endif return 0; @@ -12284,6 +12291,7 @@ static int __init net_dev_init(void) if (!pp_ptr) continue; + xdp_unreg_page_pool(pp_ptr); page_pool_destroy(pp_ptr); per_cpu(system_page_pool, i) = NULL; } diff --git a/net/core/filter.c b/net/core/filter.c index 82f92ed0dc7234..3ea82ff692fb06 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4105,13 +4105,13 @@ static int bpf_xdp_frags_increase_tail(struct xdp_buff *xdp, int offset) } static void bpf_xdp_shrink_data_zc(struct xdp_buff *xdp, int shrink, - struct xdp_mem_info *mem_info, bool release) + enum xdp_mem_type mem_type, bool release) { struct xdp_buff *zc_frag = xsk_buff_get_tail(xdp); if (release) { xsk_buff_del_tail(zc_frag); - __xdp_return(NULL, mem_info, false, zc_frag); + __xdp_return(NULL, mem_type, false, zc_frag); } else { zc_frag->data_end -= shrink; } @@ -4120,18 +4120,18 @@ static void bpf_xdp_shrink_data_zc(struct xdp_buff *xdp, int shrink, static bool bpf_xdp_shrink_data(struct xdp_buff *xdp, skb_frag_t *frag, int shrink) { - struct xdp_mem_info *mem_info = &xdp->rxq->mem; + enum xdp_mem_type mem_type = xdp->rxq->mem.type; bool release = skb_frag_size(frag) == shrink; - if (mem_info->type == MEM_TYPE_XSK_BUFF_POOL) { - bpf_xdp_shrink_data_zc(xdp, shrink, mem_info, release); + if (mem_type == MEM_TYPE_XSK_BUFF_POOL) { + bpf_xdp_shrink_data_zc(xdp, shrink, mem_type, release); goto out; } if (release) { struct page *page = skb_frag_page(frag); - __xdp_return(page_address(page), mem_info, false, NULL); + __xdp_return(page_address(page), mem_type, false, NULL); } out: @@ -4334,9 +4334,9 @@ u32 xdp_master_redirect(struct xdp_buff *xdp) EXPORT_SYMBOL_GPL(xdp_master_redirect); static inline int __xdp_do_redirect_xsk(struct bpf_redirect_info *ri, - struct net_device *dev, + const struct net_device *dev, struct xdp_buff *xdp, - struct bpf_prog *xdp_prog) + const struct bpf_prog *xdp_prog) { enum bpf_map_type map_type = ri->map_type; void *fwd = ri->tgt_value; @@ -4357,10 +4357,10 @@ static inline int __xdp_do_redirect_xsk(struct bpf_redirect_info *ri, return err; } -static __always_inline int __xdp_do_redirect_frame(struct bpf_redirect_info *ri, - struct net_device *dev, - struct xdp_frame *xdpf, - struct bpf_prog *xdp_prog) +static __always_inline int +__xdp_do_redirect_frame(struct bpf_redirect_info *ri, struct net_device *dev, + struct xdp_frame *xdpf, + const struct bpf_prog *xdp_prog) { enum bpf_map_type map_type = ri->map_type; void *fwd = ri->tgt_value; @@ -4429,7 +4429,7 @@ static __always_inline int __xdp_do_redirect_frame(struct bpf_redirect_info *ri, } int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, - struct bpf_prog *xdp_prog) + const struct bpf_prog *xdp_prog) { struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); enum bpf_map_type map_type = ri->map_type; @@ -4443,7 +4443,8 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, EXPORT_SYMBOL_GPL(xdp_do_redirect); int xdp_do_redirect_frame(struct net_device *dev, struct xdp_buff *xdp, - struct xdp_frame *xdpf, struct bpf_prog *xdp_prog) + struct xdp_frame *xdpf, + const struct bpf_prog *xdp_prog) { struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); enum bpf_map_type map_type = ri->map_type; @@ -4458,9 +4459,9 @@ EXPORT_SYMBOL_GPL(xdp_do_redirect_frame); static int xdp_do_generic_redirect_map(struct net_device *dev, struct sk_buff *skb, struct xdp_buff *xdp, - struct bpf_prog *xdp_prog, void *fwd, - enum bpf_map_type map_type, u32 map_id, - u32 flags) + const struct bpf_prog *xdp_prog, + void *fwd, enum bpf_map_type map_type, + u32 map_id, u32 flags) { struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); struct bpf_map *map; @@ -4514,7 +4515,8 @@ static int xdp_do_generic_redirect_map(struct net_device *dev, } int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, - struct xdp_buff *xdp, struct bpf_prog *xdp_prog) + struct xdp_buff *xdp, + const struct bpf_prog *xdp_prog) { struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); enum bpf_map_type map_type = ri->map_type; @@ -9061,7 +9063,8 @@ static bool xdp_is_valid_access(int off, int size, return __is_valid_xdp_access(off, size); } -void bpf_warn_invalid_xdp_action(struct net_device *dev, struct bpf_prog *prog, u32 act) +void bpf_warn_invalid_xdp_action(const struct net_device *dev, + const struct bpf_prog *prog, u32 act) { const u32 act_max = XDP_REDIRECT; diff --git a/net/core/page_pool.c b/net/core/page_pool.c index a813d30d213536..2d67c2cc3d770b 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -841,7 +841,6 @@ EXPORT_SYMBOL(page_pool_put_unrefed_page); /** * page_pool_put_page_bulk() - release references on multiple pages - * @pool: pool from which pages were allocated * @data: array holding page pointers * @count: number of pages in @data * @@ -854,35 +853,61 @@ EXPORT_SYMBOL(page_pool_put_unrefed_page); * Please note the caller must not use data area after running * page_pool_put_page_bulk(), as this function overwrites it. */ -void page_pool_put_page_bulk(struct page_pool *pool, void **data, - int count) +void page_pool_put_page_bulk(struct page **data, u32 count) { - int i, bulk_len = 0; + netmem_ref bulk[XDP_BULK_QUEUE_SIZE]; + int i, bulk_len, foreign; + struct page_pool *pool; + bool again = false; bool allow_direct; bool in_softirq; - allow_direct = page_pool_napi_local(pool); +again: + pool = NULL; + bulk_len = 0; + foreign = 0; for (i = 0; i < count; i++) { - netmem_ref netmem = page_to_netmem(virt_to_head_page(data[i])); + struct page *page; + netmem_ref netmem; + + if (!again) { + page = compound_head(data[i]); + netmem = page_to_netmem(page); - /* It is not the last user for the page frag case */ - if (!page_pool_is_last_ref(netmem)) + /* It is not the last user for the page frag case */ + if (!page_pool_is_last_ref(netmem)) + continue; + } else { + page = data[i]; + netmem = page_to_netmem(page); + } + + if (unlikely(!pool)) { + pool = page->pp; + allow_direct = page_pool_napi_local(pool); + } else if (page->pp != pool) { + /* + * If the page belongs to a different page_pool, save + * it for a second round after the main loop. + */ + data[foreign++] = page; continue; + } netmem = __page_pool_put_page(pool, netmem, -1, allow_direct); /* Approved for bulk recycling in ptr_ring cache */ if (netmem) - data[bulk_len++] = (__force void *)netmem; + bulk[bulk_len++] = netmem; } if (!bulk_len) - return; + goto out; /* Bulk producer into ptr_ring page_pool cache */ in_softirq = page_pool_producer_lock(pool); for (i = 0; i < bulk_len; i++) { - if (__ptr_ring_produce(&pool->ring, data[i])) { + if (__ptr_ring_produce(&pool->ring, (__force void *)bulk[i])) { /* ring full */ recycle_stat_inc(pool, ring_full); break; @@ -893,13 +918,22 @@ void page_pool_put_page_bulk(struct page_pool *pool, void **data, /* Hopefully all pages was return into ptr_ring */ if (likely(i == bulk_len)) - return; + goto out; /* ptr_ring cache full, free remaining pages outside producer lock * since put_page() with refcnt == 1 can be an expensive operation */ for (; i < bulk_len; i++) - page_pool_return_page(pool, (__force netmem_ref)data[i]); + page_pool_return_page(pool, bulk[i]); + +out: + if (!foreign) + return; + + count = foreign; + again = true; + + goto again; } EXPORT_SYMBOL(page_pool_put_page_bulk); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 6841e61a6bd0b6..a441613a1e6c17 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1009,7 +1009,7 @@ int skb_pp_cow_data(struct page_pool *pool, struct sk_buff **pskb, EXPORT_SYMBOL(skb_pp_cow_data); int skb_cow_data_for_xdp(struct page_pool *pool, struct sk_buff **pskb, - struct bpf_prog *prog) + const struct bpf_prog *prog) { if (!prog->aux->xdp_has_frags) return -EINVAL; diff --git a/net/core/xdp.c b/net/core/xdp.c index bcc5551c6424bd..40c8acde7e3f79 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -22,6 +22,8 @@ #include #include +#include "dev.h" + #define REG_STATE_NEW 0x0 #define REG_STATE_REGISTERED 0x1 #define REG_STATE_UNREGISTERED 0x2 @@ -358,6 +360,9 @@ int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq, if (IS_ERR(xdp_alloc)) return PTR_ERR(xdp_alloc); + if (type == MEM_TYPE_XSK_BUFF_POOL && allocator) + xsk_pool_set_rxq_info(allocator, xdp_rxq); + if (trace_mem_connect_enabled() && xdp_alloc) trace_mem_connect(xdp_alloc, xdp_rxq); return 0; @@ -365,18 +370,74 @@ int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq, EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model); +/** + * xdp_reg_page_pool - register a &page_pool as a memory provider for XDP + * @pool: &page_pool to register + * + * Can be used to register pools manually without connecting to any XDP RxQ + * info, so that the XDP layer will be aware of them. Then, they can be + * attached to an RxQ info manually via xdp_rxq_info_attach_page_pool(). + * + * Return: %0 on success, -errno on error. + */ +int xdp_reg_page_pool(struct page_pool *pool) +{ + struct xdp_mem_info mem; + + return xdp_reg_mem_model(&mem, MEM_TYPE_PAGE_POOL, pool); +} +EXPORT_SYMBOL_GPL(xdp_reg_page_pool); + +/** + * xdp_unreg_page_pool - unregister a &page_pool from the memory providers list + * @pool: &page_pool to unregister + * + * A shorthand for manual unregistering page pools. If the pool was previously + * attached to an RxQ info, it must be detached first. + */ +void xdp_unreg_page_pool(const struct page_pool *pool) +{ + struct xdp_mem_info mem = { + .type = MEM_TYPE_PAGE_POOL, + .id = pool->xdp_mem_id, + }; + + xdp_unreg_mem_model(&mem); +} +EXPORT_SYMBOL_GPL(xdp_unreg_page_pool); + +/** + * xdp_rxq_info_attach_page_pool - attach a registered pool to an RxQ info + * @xdp_rxq: XDP RxQ info to attach the pool to + * @pool: pool to attach + * + * If the pool was registered manually, this function must be called instead + * of xdp_rxq_info_reg_mem_model() to connect it to an RxQ info. + */ +void xdp_rxq_info_attach_page_pool(struct xdp_rxq_info *xdp_rxq, + const struct page_pool *pool) +{ + struct xdp_mem_info mem = { + .type = MEM_TYPE_PAGE_POOL, + .id = pool->xdp_mem_id, + }; + + xdp_rxq_info_attach_mem_model(xdp_rxq, &mem); +} +EXPORT_SYMBOL_GPL(xdp_rxq_info_attach_page_pool); + /* XDP RX runs under NAPI protection, and in different delivery error * scenarios (e.g. queue full), it is possible to return the xdp_frame * while still leveraging this protection. The @napi_direct boolean * is used for those calls sites. Thus, allowing for faster recycling * of xdp_frames/pages in those cases. */ -void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, +void __xdp_return(void *data, enum xdp_mem_type mem_type, bool napi_direct, struct xdp_buff *xdp) { struct page *page; - switch (mem->type) { + switch (mem_type) { case MEM_TYPE_PAGE_POOL: page = virt_to_head_page(data); if (napi_direct && xdp_return_frame_no_direct()) @@ -399,7 +460,7 @@ void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, break; default: /* Not possible, checked in xdp_rxq_info_reg_mem_model() */ - WARN(1, "Incorrect XDP memory type (%d) usage", mem->type); + WARN(1, "Incorrect XDP memory type (%d) usage", mem_type); break; } } @@ -416,10 +477,10 @@ void xdp_return_frame(struct xdp_frame *xdpf) for (i = 0; i < sinfo->nr_frags; i++) { struct page *page = skb_frag_page(&sinfo->frags[i]); - __xdp_return(page_address(page), &xdpf->mem, false, NULL); + __xdp_return(page_address(page), xdpf->mem_type, false, NULL); } out: - __xdp_return(xdpf->data, &xdpf->mem, false, NULL); + __xdp_return(xdpf->data, xdpf->mem_type, false, NULL); } EXPORT_SYMBOL_GPL(xdp_return_frame); @@ -435,10 +496,10 @@ void xdp_return_frame_rx_napi(struct xdp_frame *xdpf) for (i = 0; i < sinfo->nr_frags; i++) { struct page *page = skb_frag_page(&sinfo->frags[i]); - __xdp_return(page_address(page), &xdpf->mem, true, NULL); + __xdp_return(page_address(page), xdpf->mem_type, true, NULL); } out: - __xdp_return(xdpf->data, &xdpf->mem, true, NULL); + __xdp_return(xdpf->data, xdpf->mem_type, true, NULL); } EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi); @@ -452,46 +513,19 @@ EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi); * xdp_frame_bulk is usually stored/allocated on the function * call-stack to avoid locking penalties. */ -void xdp_flush_frame_bulk(struct xdp_frame_bulk *bq) -{ - struct xdp_mem_allocator *xa = bq->xa; - - if (unlikely(!xa || !bq->count)) - return; - - page_pool_put_page_bulk(xa->page_pool, bq->q, bq->count); - /* bq->xa is not cleared to save lookup, if mem.id same in next bulk */ - bq->count = 0; -} -EXPORT_SYMBOL_GPL(xdp_flush_frame_bulk); /* Must be called with rcu_read_lock held */ void xdp_return_frame_bulk(struct xdp_frame *xdpf, struct xdp_frame_bulk *bq) { - struct xdp_mem_info *mem = &xdpf->mem; - struct xdp_mem_allocator *xa; - - if (mem->type != MEM_TYPE_PAGE_POOL) { + if (xdpf->mem_type != MEM_TYPE_PAGE_POOL) { xdp_return_frame(xdpf); return; } - xa = bq->xa; - if (unlikely(!xa)) { - xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params); - bq->count = 0; - bq->xa = xa; - } - if (bq->count == XDP_BULK_QUEUE_SIZE) xdp_flush_frame_bulk(bq); - if (unlikely(mem->id != xa->mem.id)) { - xdp_flush_frame_bulk(bq); - bq->xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params); - } - if (unlikely(xdp_frame_has_frags(xdpf))) { struct skb_shared_info *sinfo; int i; @@ -500,12 +534,12 @@ void xdp_return_frame_bulk(struct xdp_frame *xdpf, for (i = 0; i < sinfo->nr_frags; i++) { skb_frag_t *frag = &sinfo->frags[i]; - bq->q[bq->count++] = skb_frag_address(frag); + bq->q[bq->count++] = skb_frag_page(frag); if (bq->count == XDP_BULK_QUEUE_SIZE) xdp_flush_frame_bulk(bq); } } - bq->q[bq->count++] = xdpf->data; + bq->q[bq->count++] = virt_to_page(xdpf->data); } EXPORT_SYMBOL_GPL(xdp_return_frame_bulk); @@ -521,10 +555,11 @@ void xdp_return_buff(struct xdp_buff *xdp) for (i = 0; i < sinfo->nr_frags; i++) { struct page *page = skb_frag_page(&sinfo->frags[i]); - __xdp_return(page_address(page), &xdp->rxq->mem, true, xdp); + __xdp_return(page_address(page), xdp->rxq->mem.type, true, + xdp); } out: - __xdp_return(xdp->data, &xdp->rxq->mem, true, xdp); + __xdp_return(xdp->data, xdp->rxq->mem.type, true, xdp); } EXPORT_SYMBOL_GPL(xdp_return_buff); @@ -570,7 +605,7 @@ struct xdp_frame *xdp_convert_zc_to_xdp_frame(struct xdp_buff *xdp) xdpf->headroom = 0; xdpf->metasize = metasize; xdpf->frame_sz = PAGE_SIZE; - xdpf->mem.type = MEM_TYPE_PAGE_ORDER0; + xdpf->mem_type = MEM_TYPE_PAGE_ORDER0; xsk_buff_free(xdp); return xdpf; @@ -594,6 +629,197 @@ int xdp_alloc_skb_bulk(void **skbs, int n_skb, gfp_t gfp) } EXPORT_SYMBOL_GPL(xdp_alloc_skb_bulk); +/** + * xdp_build_skb_from_buff - create an skb from an &xdp_buff + * @xdp: &xdp_buff to convert to an skb + * + * Perform common operations to create a new skb to pass up the stack from + * an &xdp_buff: allocate an skb head from the NAPI percpu cache, initialize + * skb data pointers and offsets, set the recycle bit if the buff is PP-backed, + * Rx queue index, protocol and update frags info. + * + * Return: new &sk_buff on success, %NULL on error. + */ +struct sk_buff *xdp_build_skb_from_buff(const struct xdp_buff *xdp) +{ + const struct xdp_rxq_info *rxq = xdp->rxq; + const struct skb_shared_info *sinfo; + struct sk_buff *skb; + u32 nr_frags = 0; + int metalen; + + if (unlikely(xdp_buff_has_frags(xdp))) { + sinfo = xdp_get_shared_info_from_buff(xdp); + nr_frags = sinfo->nr_frags; + } + + skb = napi_build_skb(xdp->data_hard_start, xdp->frame_sz); + if (unlikely(!skb)) + return NULL; + + skb_reserve(skb, xdp->data - xdp->data_hard_start); + __skb_put(skb, xdp->data_end - xdp->data); + + metalen = xdp->data - xdp->data_meta; + if (metalen > 0) + skb_metadata_set(skb, metalen); + + if (is_page_pool_compiled_in() && rxq->mem.type == MEM_TYPE_PAGE_POOL) + skb_mark_for_recycle(skb); + + skb_record_rx_queue(skb, rxq->queue_index); + + if (unlikely(nr_frags)) { + u32 tsize; + + tsize = sinfo->xdp_frags_truesize ? : nr_frags * xdp->frame_sz; + xdp_update_skb_shared_info(skb, nr_frags, + sinfo->xdp_frags_size, tsize, + xdp_buff_is_frag_pfmemalloc(xdp)); + } + + skb->protocol = eth_type_trans(skb, rxq->dev); + + return skb; +} +EXPORT_SYMBOL_GPL(xdp_build_skb_from_buff); + +/** + * xdp_copy_frags_from_zc - copy the frags from an XSk buff to an skb + * @skb: skb to copy frags to + * @xdp: XSk &xdp_buff from which the frags will be copied + * @pp: &page_pool backing page allocation, if available + * + * Copy all frags from an XSk &xdp_buff to an skb to pass it up the stack. + * Allocate a new page / page frag for each frag, copy it and attach to + * the skb. + * + * Return: true on success, false on page allocation fail. + */ +static noinline bool xdp_copy_frags_from_zc(struct sk_buff *skb, + const struct xdp_buff *xdp, + struct page_pool *pp) +{ + const struct skb_shared_info *xinfo; + struct skb_shared_info *sinfo; + u32 nr_frags, ts; + + xinfo = xdp_get_shared_info_from_buff(xdp); + nr_frags = xinfo->nr_frags; + sinfo = skb_shinfo(skb); + +#if IS_ENABLED(CONFIG_PAGE_POOL) + ts = 0; +#else + ts = xinfo->xdp_frags_truesize ? : nr_frags * xdp->frame_sz; +#endif + + for (u32 i = 0; i < nr_frags; i++) { + u32 len = skb_frag_size(&xinfo->frags[i]); + void *data; +#if IS_ENABLED(CONFIG_PAGE_POOL) + u32 truesize = len; + + data = page_pool_dev_alloc_va(pp, &truesize); + ts += truesize; +#else + data = napi_alloc_frag(len); +#endif + if (unlikely(!data)) + return false; + + memcpy(data, skb_frag_address(&xinfo->frags[i]), + LARGEST_ALIGN(len)); + __skb_fill_page_desc(skb, sinfo->nr_frags++, + virt_to_page(data), + offset_in_page(data), len); + } + + xdp_update_skb_shared_info(skb, nr_frags, xinfo->xdp_frags_size, + ts, false); + + return true; +} + +/** + * xdp_build_skb_from_zc - create an skb from an XSk &xdp_buff + * @xdp: source XSk buff + * + * Similar to xdp_build_skb_from_buff(), but for XSk frames. Allocate an skb + * head, new page for the head, copy the data and initialize the skb fields. + * If there are frags, allocate new pages for them and copy. + * If Page Pool is available, the function allocates memory from the system + * percpu pools to try recycling the pages, otherwise it uses the NAPI page + * frag caches. + * If new skb was built successfully, @xdp is returned to XSk pool's freelist. + * On error, it remains untouched and the caller must take care of this. + * + * Return: new &sk_buff on success, %NULL on error. + */ +struct sk_buff *xdp_build_skb_from_zc(struct xdp_buff *xdp) +{ + const struct xdp_rxq_info *rxq = xdp->rxq; + u32 len = xdp->data_end - xdp->data_meta; + struct page_pool *pp; + struct sk_buff *skb; + int metalen; +#if IS_ENABLED(CONFIG_PAGE_POOL) + u32 truesize; + void *data; + + pp = this_cpu_read(system_page_pool); + truesize = xdp->frame_sz; + + data = page_pool_dev_alloc_va(pp, &truesize); + if (unlikely(!data)) + return NULL; + + skb = napi_build_skb(data, truesize); + if (unlikely(!skb)) { + page_pool_free_va(pp, data, true); + return NULL; + } + + skb_mark_for_recycle(skb); + skb_reserve(skb, xdp->data_meta - xdp->data_hard_start); +#else /* !CONFIG_PAGE_POOL */ + struct napi_struct *napi; + + pp = NULL; + napi = napi_by_id(rxq->napi_id); + if (likely(napi)) + skb = napi_alloc_skb(napi, len); + else + skb = __netdev_alloc_skb_ip_align(rxq->dev, len, + GFP_ATOMIC | __GFP_NOWARN); + if (unlikely(!skb)) + return NULL; +#endif /* !CONFIG_PAGE_POOL */ + + memcpy(__skb_put(skb, len), xdp->data_meta, LARGEST_ALIGN(len)); + + metalen = xdp->data - xdp->data_meta; + if (metalen > 0) { + skb_metadata_set(skb, metalen); + __skb_pull(skb, metalen); + } + + skb_record_rx_queue(skb, rxq->queue_index); + + if (unlikely(xdp_buff_has_frags(xdp)) && + unlikely(!xdp_copy_frags_from_zc(skb, xdp, pp))) { + napi_consume_skb(skb, true); + return NULL; + } + + xsk_buff_free(xdp); + + skb->protocol = eth_type_trans(skb, rxq->dev); + + return skb; +} +EXPORT_SYMBOL_GPL(xdp_build_skb_from_zc); + struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf, struct sk_buff *skb, struct net_device *dev) @@ -640,7 +866,7 @@ struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf, * - RX ring dev queue index (skb_record_rx_queue) */ - if (xdpf->mem.type == MEM_TYPE_PAGE_POOL) + if (xdpf->mem_type == MEM_TYPE_PAGE_POOL) skb_mark_for_recycle(skb); /* Allow SKB to reuse area used by xdp_frame */ @@ -687,8 +913,7 @@ struct xdp_frame *xdpf_clone(struct xdp_frame *xdpf) nxdpf = addr; nxdpf->data = addr + headroom; nxdpf->frame_sz = PAGE_SIZE; - nxdpf->mem.type = MEM_TYPE_PAGE_ORDER0; - nxdpf->mem.id = 0; + nxdpf->mem_type = MEM_TYPE_PAGE_ORDER0; return nxdpf; } diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index ae71da7d2cd6ae..02c42caec9f41c 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -715,3 +715,43 @@ dma_addr_t xp_raw_get_dma(struct xsk_buff_pool *pool, u64 addr) (addr & ~PAGE_MASK); } EXPORT_SYMBOL(xp_raw_get_dma); + +/** + * xp_raw_get_ctx - get &xdp_desc context + * @pool: XSk buff pool desc address belongs to + * @addr: desc address (from userspace) + * + * Helper for getting desc's DMA address and metadata pointer, if present. + * Saves one call on hotpath, double calculation of the actual address, + * and inline checks for metadata presence and sanity. + * Please use xsk_buff_raw_get_ctx() in drivers instead. + * + * Return: new &xdp_desc_ctx struct containing desc's DMA address and metadata + * pointer, if it is present and valid (initialized to %NULL otherwise). + */ +struct xdp_desc_ctx xp_raw_get_ctx(const struct xsk_buff_pool *pool, u64 addr) +{ + struct xsk_tx_metadata *meta; + struct xdp_desc_ctx ret; + + addr = pool->unaligned ? xp_unaligned_add_offset_to_addr(addr) : addr; + ret = (typeof(ret)){ + /* Same logic as in xp_raw_get_dma() */ + .dma = (pool->dma_pages[addr >> PAGE_SHIFT] & + ~XSK_NEXT_PG_CONTIG_MASK) + (addr & ~PAGE_MASK), + }; + + if (!pool->tx_metadata_len) + goto out; + + /* Same logic as in xp_raw_get_data() + xsk_buff_get_metadata() */ + meta = pool->addrs + addr - pool->tx_metadata_len; + if (unlikely(!xsk_buff_valid_tx_metadata(meta))) + goto out; + + ret.meta = meta; + +out: + return ret; +} +EXPORT_SYMBOL(xp_raw_get_ctx);